xref: /qemu/fpu/softfloat.c (revision a94b783952cc493cb241aabb1da8c7a830385baa)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Select, for each generated hardfloat helper, whether operand
 * classification goes through fpclassify() (value 1) or through the
 * float32/64_* softfloat primitives (value 0). Every combination of input
 * count (1, 2 or 3) and float size (32 or 64) has its own knob.
 */

/* The 32-bit knobs have the same value on every host. */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

/* The 64-bit knobs are enabled on x86_64 hosts only. */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) over
 * float{32,64}_is_infinity (0) when !USE_FP.
 * On x86_64/aarch64 the former can yield a ~6% speedup over the latter.
 * On power64 however, isinf() reduces fp-bench performance by up to 50%,
 * so every other host keeps the softfloat primitive.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set. Hardfloat is also disabled (with a build-time warning) when
 * compiling with -ffast-math.
 *
 * QEMU_NO_HARDFLOAT is 1 when hardfloat is disabled and 0 otherwise.
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline only when hardfloat
 * is enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * Overlay of the softfloat type and the host floating-point type for the
 * same storage, so the hardfloat helpers can view one value either way.
 */
typedef union {
    float32 s;  /* softfloat view, passed to the soft/fast_op callbacks */
    float h;    /* host view, passed to the hard callback and libm tests */
} union_float32;

/* 64-bit counterpart of union_float32. */
typedef union {
    float64 s;  /* softfloat view */
    double h;   /* host view */
} union_float64;
265 
/* Predicates over a pair of operands (used as pre/post/fast_test hooks). */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand operations: softfloat flavor (takes the status)... */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* ...and host flavor (plain host float/double arithmetic). */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
/*
 * Classify a floating point number. Everything above float_class_qnan
 * is a NaN so cls >= float_class_qnan is any NaN.
 *
 * The enumerator order is significant: the NaN test relies on both NaN
 * classes being the last (largest) values. The type is packed so that it
 * occupies as little space as possible inside FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,  /* raw parts that were not canonicalized yet */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 *
 * Note that the raw unpack/pack helpers also use this structure to carry
 * the un-normalized field values straight from/to the encoding.
 */

typedef struct {
    uint64_t frac;   /* fraction bits */
    int32_t  exp;    /* exponent */
    FloatClass cls;  /* classification (zero, normal, inf, NaN kind, ...) */
    bool sign;       /* sign bit */
} FloatParts;
536 
/*
 * Layout of the decomposed 64-bit fraction: the binary point sits two bits
 * below the top of the word (bit 62), the implicit leading one occupies
 * that bit, and the bit directly above it (bit 63) is the overflow bit.
 */
#define DECOMPOSED_BINARY_POINT    62
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
540 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;      /* all bits below frac_lsb */
    uint64_t roundeven_mask;  /* frac_lsb together with all bits below it */
    bool arm_althp;
} FloatFmt;
566 
/*
 * Expand the FloatFmt designated initializers derived from the size of the
 * exponent field (E) and of the fraction field (F), both in bits:
 *   exp_bias       = 2^(E-1) - 1
 *   exp_max        = 2^E - 1
 *   frac_shift     = DECOMPOSED_BINARY_POINT - F
 *   frac_lsb       = bit frac_shift
 *   frac_lsbm1     = bit frac_shift - 1
 *   round_mask     = all bits below frac_lsb
 *   roundeven_mask = frac_lsb and all bits below it
 *
 * Both arguments are parenthesized at every use so that expression
 * arguments (e.g. FLOAT_PARAMS(8, 20 + 3)) expand with the intended
 * precedence. The expansion ends without a trailing comma so callers may
 * append further initializers (such as .arm_althp).
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field sizes as float16_params, with the arm_althp modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * @p is the canonical value, @s supplies the rounding mode, the
 * flush_to_zero and tininess-detection settings and receives the raised
 * exception flags, and @parm describes the destination format. The
 * returned parts hold the biased exponent and the fraction shifted down
 * to the format's field width.
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    /* When set, overflow produces the largest finite value, not infinity. */
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Pick the rounding increment (added to the fraction only when
         * some discarded bit is set) and the overflow behaviour.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * Add half an lsb, except on an exact tie with an even lsb
             * (lsb clear, lsbm1 set, everything below clear).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            /* Only positive values are pushed away from zero. */
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            /* Only negative values are pushed away from zero. */
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Result is normal in the destination format (or overflows). */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Largest finite value: top exponent, all-ones frac. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Result would be denormal: flush it to zero instead. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result. It counts as tiny when tininess is detected
             * before rounding, when the biased exponent is negative, or
             * when adding the increment does not carry into the overflow
             * bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /* Denormalize, keeping shifted-out bits sticky for rounding. */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have produced the implicit bit: exponent 1. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            /* Underflow is raised only for results both tiny and inexact. */
            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* The alternative half-precision format has no infinity encoding. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* The alternative half-precision format has no NaN encoding. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status)
1058 {
1059     FloatParts pa = float32_unpack_canonical(a, status);
1060     FloatParts pb = float32_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, false, status);
1062 
1063     return float32_round_pack_canonical(pr, status);
1064 }
1065 
1066 float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status)
1067 {
1068     FloatParts pa = float64_unpack_canonical(a, status);
1069     FloatParts pb = float64_unpack_canonical(b, status);
1070     FloatParts pr = addsub_floats(pa, pb, false, status);
1071 
1072     return float64_round_pack_canonical(pr, status);
1073 }
1074 
1075 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1076 {
1077     FloatParts pa = float16_unpack_canonical(a, status);
1078     FloatParts pb = float16_unpack_canonical(b, status);
1079     FloatParts pr = addsub_floats(pa, pb, true, status);
1080 
1081     return float16_round_pack_canonical(pr, status);
1082 }
1083 
1084 float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status)
1085 {
1086     FloatParts pa = float32_unpack_canonical(a, status);
1087     FloatParts pb = float32_unpack_canonical(b, status);
1088     FloatParts pr = addsub_floats(pa, pb, true, status);
1089 
1090     return float32_round_pack_canonical(pr, status);
1091 }
1092 
1093 float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status)
1094 {
1095     FloatParts pa = float64_unpack_canonical(a, status);
1096     FloatParts pb = float64_unpack_canonical(b, status);
1097     FloatParts pr = addsub_floats(pa, pb, true, status);
1098 
1099     return float64_round_pack_canonical(pr, status);
1100 }
1101 
1102 /*
1103  * Returns the result of multiplying the floating-point values `a' and
1104  * `b'. The operation is performed according to the IEC/IEEE Standard
1105  * for Binary Floating-Point Arithmetic.
1106  */
1107 
1108 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1109 {
1110     bool sign = a.sign ^ b.sign;
1111 
1112     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1113         uint64_t hi, lo;
1114         int exp = a.exp + b.exp;
1115 
1116         mul64To128(a.frac, b.frac, &hi, &lo);
1117         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1118         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1119             shift64RightJamming(lo, 1, &lo);
1120             exp += 1;
1121         }
1122 
1123         /* Re-use a */
1124         a.exp = exp;
1125         a.sign = sign;
1126         a.frac = lo;
1127         return a;
1128     }
1129     /* handle all the NaN cases */
1130     if (is_nan(a.cls) || is_nan(b.cls)) {
1131         return pick_nan(a, b, s);
1132     }
1133     /* Inf * Zero == NaN */
1134     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1135         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1136         s->float_exception_flags |= float_flag_invalid;
1137         return parts_default_nan(s);
1138     }
1139     /* Multiply by 0 or Inf */
1140     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1141         a.sign = sign;
1142         return a;
1143     }
1144     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1145         b.sign = sign;
1146         return b;
1147     }
1148     g_assert_not_reached();
1149 }
1150 
1151 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1152 {
1153     FloatParts pa = float16_unpack_canonical(a, status);
1154     FloatParts pb = float16_unpack_canonical(b, status);
1155     FloatParts pr = mul_floats(pa, pb, status);
1156 
1157     return float16_round_pack_canonical(pr, status);
1158 }
1159 
1160 float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
1161 {
1162     FloatParts pa = float32_unpack_canonical(a, status);
1163     FloatParts pb = float32_unpack_canonical(b, status);
1164     FloatParts pr = mul_floats(pa, pb, status);
1165 
1166     return float32_round_pack_canonical(pr, status);
1167 }
1168 
1169 float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
1170 {
1171     FloatParts pa = float64_unpack_canonical(a, status);
1172     FloatParts pb = float64_unpack_canonical(b, status);
1173     FloatParts pr = mul_floats(pa, pb, status);
1174 
1175     return float64_round_pack_canonical(pr, status);
1176 }
1177 
1178 /*
1179  * Returns the result of multiplying the floating-point values `a' and
1180  * `b' then adding 'c', with no intermediate rounding step after the
1181  * multiplication. The operation is performed according to the
1182  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1183  * The flags argument allows the caller to select negation of the
1184  * addend, the intermediate product, or the final result. (The
1185  * difference between this and having the caller do a separate
1186  * negation is that negating externally will flip the sign bit on
1187  * NaNs.)
1188  */
1189 
1190 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1191                                 int flags, float_status *s)
1192 {
1193     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1194                     ((1 << float_class_inf) | (1 << float_class_zero));
1195     bool p_sign;
1196     bool sign_flip = flags & float_muladd_negate_result;
1197     FloatClass p_class;
1198     uint64_t hi, lo;
1199     int p_exp;
1200 
1201     /* It is implementation-defined whether the cases of (0,inf,qnan)
1202      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1203      * they return if they do), so we have to hand this information
1204      * off to the target-specific pick-a-NaN routine.
1205      */
1206     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1207         return pick_nan_muladd(a, b, c, inf_zero, s);
1208     }
1209 
1210     if (inf_zero) {
1211         s->float_exception_flags |= float_flag_invalid;
1212         return parts_default_nan(s);
1213     }
1214 
1215     if (flags & float_muladd_negate_c) {
1216         c.sign ^= 1;
1217     }
1218 
1219     p_sign = a.sign ^ b.sign;
1220 
1221     if (flags & float_muladd_negate_product) {
1222         p_sign ^= 1;
1223     }
1224 
1225     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1226         p_class = float_class_inf;
1227     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1228         p_class = float_class_zero;
1229     } else {
1230         p_class = float_class_normal;
1231     }
1232 
1233     if (c.cls == float_class_inf) {
1234         if (p_class == float_class_inf && p_sign != c.sign) {
1235             s->float_exception_flags |= float_flag_invalid;
1236             return parts_default_nan(s);
1237         } else {
1238             a.cls = float_class_inf;
1239             a.sign = c.sign ^ sign_flip;
1240             return a;
1241         }
1242     }
1243 
1244     if (p_class == float_class_inf) {
1245         a.cls = float_class_inf;
1246         a.sign = p_sign ^ sign_flip;
1247         return a;
1248     }
1249 
1250     if (p_class == float_class_zero) {
1251         if (c.cls == float_class_zero) {
1252             if (p_sign != c.sign) {
1253                 p_sign = s->float_rounding_mode == float_round_down;
1254             }
1255             c.sign = p_sign;
1256         } else if (flags & float_muladd_halve_result) {
1257             c.exp -= 1;
1258         }
1259         c.sign ^= sign_flip;
1260         return c;
1261     }
1262 
1263     /* a & b should be normals now... */
1264     assert(a.cls == float_class_normal &&
1265            b.cls == float_class_normal);
1266 
1267     p_exp = a.exp + b.exp;
1268 
1269     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1270      * result.
1271      */
1272     mul64To128(a.frac, b.frac, &hi, &lo);
1273     /* binary point now at bit 124 */
1274 
1275     /* check for overflow */
1276     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1277         shift128RightJamming(hi, lo, 1, &hi, &lo);
1278         p_exp += 1;
1279     }
1280 
1281     /* + add/sub */
1282     if (c.cls == float_class_zero) {
1283         /* move binary point back to 62 */
1284         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1285     } else {
1286         int exp_diff = p_exp - c.exp;
1287         if (p_sign == c.sign) {
1288             /* Addition */
1289             if (exp_diff <= 0) {
1290                 shift128RightJamming(hi, lo,
1291                                      DECOMPOSED_BINARY_POINT - exp_diff,
1292                                      &hi, &lo);
1293                 lo += c.frac;
1294                 p_exp = c.exp;
1295             } else {
1296                 uint64_t c_hi, c_lo;
1297                 /* shift c to the same binary point as the product (124) */
1298                 c_hi = c.frac >> 2;
1299                 c_lo = 0;
1300                 shift128RightJamming(c_hi, c_lo,
1301                                      exp_diff,
1302                                      &c_hi, &c_lo);
1303                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1304                 /* move binary point back to 62 */
1305                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1306             }
1307 
1308             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1309                 shift64RightJamming(lo, 1, &lo);
1310                 p_exp += 1;
1311             }
1312 
1313         } else {
1314             /* Subtraction */
1315             uint64_t c_hi, c_lo;
1316             /* make C binary point match product at bit 124 */
1317             c_hi = c.frac >> 2;
1318             c_lo = 0;
1319 
1320             if (exp_diff <= 0) {
1321                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1322                 if (exp_diff == 0
1323                     &&
1324                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1325                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1326                 } else {
1327                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1328                     p_sign ^= 1;
1329                     p_exp = c.exp;
1330                 }
1331             } else {
1332                 shift128RightJamming(c_hi, c_lo,
1333                                      exp_diff,
1334                                      &c_hi, &c_lo);
1335                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1336             }
1337 
1338             if (hi == 0 && lo == 0) {
1339                 a.cls = float_class_zero;
1340                 a.sign = s->float_rounding_mode == float_round_down;
1341                 a.sign ^= sign_flip;
1342                 return a;
1343             } else {
1344                 int shift;
1345                 if (hi != 0) {
1346                     shift = clz64(hi);
1347                 } else {
1348                     shift = clz64(lo) + 64;
1349                 }
1350                 /* Normalizing to a binary point of 124 is the
1351                    correct adjust for the exponent.  However since we're
1352                    shifting, we might as well put the binary point back
1353                    at 62 where we really want it.  Therefore shift as
1354                    if we're leaving 1 bit at the top of the word, but
1355                    adjust the exponent as if we're leaving 3 bits.  */
1356                 shift -= 1;
1357                 if (shift >= 64) {
1358                     lo = lo << (shift - 64);
1359                 } else {
1360                     hi = (hi << shift) | (lo >> (64 - shift));
1361                     lo = hi | ((lo << shift) != 0);
1362                 }
1363                 p_exp -= shift - 2;
1364             }
1365         }
1366     }
1367 
1368     if (flags & float_muladd_halve_result) {
1369         p_exp -= 1;
1370     }
1371 
1372     /* finally prepare our result */
1373     a.cls = float_class_normal;
1374     a.sign = p_sign ^ sign_flip;
1375     a.exp = p_exp;
1376     a.frac = lo;
1377 
1378     return a;
1379 }
1380 
1381 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1382                                                 int flags, float_status *status)
1383 {
1384     FloatParts pa = float16_unpack_canonical(a, status);
1385     FloatParts pb = float16_unpack_canonical(b, status);
1386     FloatParts pc = float16_unpack_canonical(c, status);
1387     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1388 
1389     return float16_round_pack_canonical(pr, status);
1390 }
1391 
1392 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1393                                                 int flags, float_status *status)
1394 {
1395     FloatParts pa = float32_unpack_canonical(a, status);
1396     FloatParts pb = float32_unpack_canonical(b, status);
1397     FloatParts pc = float32_unpack_canonical(c, status);
1398     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1399 
1400     return float32_round_pack_canonical(pr, status);
1401 }
1402 
1403 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1404                                                 int flags, float_status *status)
1405 {
1406     FloatParts pa = float64_unpack_canonical(a, status);
1407     FloatParts pb = float64_unpack_canonical(b, status);
1408     FloatParts pc = float64_unpack_canonical(c, status);
1409     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1410 
1411     return float64_round_pack_canonical(pr, status);
1412 }
1413 
1414 /*
1415  * Returns the result of dividing the floating-point value `a' by the
1416  * corresponding value `b'. The operation is performed according to
1417  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1418  */
1419 
1420 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1421 {
1422     bool sign = a.sign ^ b.sign;
1423 
1424     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1425         uint64_t n0, n1, q, r;
1426         int exp = a.exp - b.exp;
1427 
1428         /*
1429          * We want a 2*N / N-bit division to produce exactly an N-bit
1430          * result, so that we do not lose any precision and so that we
1431          * do not have to renormalize afterward.  If A.frac < B.frac,
1432          * then division would produce an (N-1)-bit result; shift A left
1433          * by one to produce the an N-bit result, and decrement the
1434          * exponent to match.
1435          *
1436          * The udiv_qrnnd algorithm that we're using requires normalization,
1437          * i.e. the msb of the denominator must be set.  Since we know that
1438          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1439          * by one (more), and the remainder must be shifted right by one.
1440          */
1441         if (a.frac < b.frac) {
1442             exp -= 1;
1443             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1444         } else {
1445             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1446         }
1447         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1448 
1449         /*
1450          * Set lsb if there is a remainder, to set inexact.
1451          * As mentioned above, to find the actual value of the remainder we
1452          * would need to shift right, but (1) we are only concerned about
1453          * non-zero-ness, and (2) the remainder will always be even because
1454          * both inputs to the division primitive are even.
1455          */
1456         a.frac = q | (r != 0);
1457         a.sign = sign;
1458         a.exp = exp;
1459         return a;
1460     }
1461     /* handle all the NaN cases */
1462     if (is_nan(a.cls) || is_nan(b.cls)) {
1463         return pick_nan(a, b, s);
1464     }
1465     /* 0/0 or Inf/Inf */
1466     if (a.cls == b.cls
1467         &&
1468         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1469         s->float_exception_flags |= float_flag_invalid;
1470         return parts_default_nan(s);
1471     }
1472     /* Inf / x or 0 / x */
1473     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1474         a.sign = sign;
1475         return a;
1476     }
1477     /* Div 0 => Inf */
1478     if (b.cls == float_class_zero) {
1479         s->float_exception_flags |= float_flag_divbyzero;
1480         a.cls = float_class_inf;
1481         a.sign = sign;
1482         return a;
1483     }
1484     /* Div by Inf */
1485     if (b.cls == float_class_inf) {
1486         a.cls = float_class_zero;
1487         a.sign = sign;
1488         return a;
1489     }
1490     g_assert_not_reached();
1491 }
1492 
1493 float16 float16_div(float16 a, float16 b, float_status *status)
1494 {
1495     FloatParts pa = float16_unpack_canonical(a, status);
1496     FloatParts pb = float16_unpack_canonical(b, status);
1497     FloatParts pr = div_floats(pa, pb, status);
1498 
1499     return float16_round_pack_canonical(pr, status);
1500 }
1501 
1502 float32 float32_div(float32 a, float32 b, float_status *status)
1503 {
1504     FloatParts pa = float32_unpack_canonical(a, status);
1505     FloatParts pb = float32_unpack_canonical(b, status);
1506     FloatParts pr = div_floats(pa, pb, status);
1507 
1508     return float32_round_pack_canonical(pr, status);
1509 }
1510 
1511 float64 float64_div(float64 a, float64 b, float_status *status)
1512 {
1513     FloatParts pa = float64_unpack_canonical(a, status);
1514     FloatParts pb = float64_unpack_canonical(b, status);
1515     FloatParts pr = div_floats(pa, pb, status);
1516 
1517     return float64_round_pack_canonical(pr, status);
1518 }
1519 
1520 /*
1521  * Float to Float conversions
1522  *
1523  * Returns the result of converting one float format to another. The
1524  * conversion is performed according to the IEC/IEEE Standard for
1525  * Binary Floating-Point Arithmetic.
1526  *
1527  * The float_to_float helper only needs to take care of raising
1528  * invalid exceptions and handling the conversion on NaNs.
1529  */
1530 
1531 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1532                                  float_status *s)
1533 {
1534     if (dstf->arm_althp) {
1535         switch (a.cls) {
1536         case float_class_qnan:
1537         case float_class_snan:
1538             /* There is no NaN in the destination format.  Raise Invalid
1539              * and return a zero with the sign of the input NaN.
1540              */
1541             s->float_exception_flags |= float_flag_invalid;
1542             a.cls = float_class_zero;
1543             a.frac = 0;
1544             a.exp = 0;
1545             break;
1546 
1547         case float_class_inf:
1548             /* There is no Inf in the destination format.  Raise Invalid
1549              * and return the maximum normal with the correct sign.
1550              */
1551             s->float_exception_flags |= float_flag_invalid;
1552             a.cls = float_class_normal;
1553             a.exp = dstf->exp_max;
1554             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1555             break;
1556 
1557         default:
1558             break;
1559         }
1560     } else if (is_nan(a.cls)) {
1561         if (is_snan(a.cls)) {
1562             s->float_exception_flags |= float_flag_invalid;
1563             a = parts_silence_nan(a, s);
1564         }
1565         if (s->default_nan_mode) {
1566             return parts_default_nan(s);
1567         }
1568     }
1569     return a;
1570 }
1571 
1572 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1573 {
1574     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1575     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1576     FloatParts pr = float_to_float(p, &float32_params, s);
1577     return float32_round_pack_canonical(pr, s);
1578 }
1579 
1580 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1581 {
1582     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1583     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1584     FloatParts pr = float_to_float(p, &float64_params, s);
1585     return float64_round_pack_canonical(pr, s);
1586 }
1587 
1588 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1589 {
1590     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1591     FloatParts p = float32_unpack_canonical(a, s);
1592     FloatParts pr = float_to_float(p, fmt16, s);
1593     return float16a_round_pack_canonical(pr, s, fmt16);
1594 }
1595 
1596 float64 float32_to_float64(float32 a, float_status *s)
1597 {
1598     FloatParts p = float32_unpack_canonical(a, s);
1599     FloatParts pr = float_to_float(p, &float64_params, s);
1600     return float64_round_pack_canonical(pr, s);
1601 }
1602 
1603 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1604 {
1605     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1606     FloatParts p = float64_unpack_canonical(a, s);
1607     FloatParts pr = float_to_float(p, fmt16, s);
1608     return float16a_round_pack_canonical(pr, s, fmt16);
1609 }
1610 
1611 float32 float64_to_float32(float64 a, float_status *s)
1612 {
1613     FloatParts p = float64_unpack_canonical(a, s);
1614     FloatParts pr = float_to_float(p, &float32_params, s);
1615     return float32_round_pack_canonical(pr, s);
1616 }
1617 
1618 /*
1619  * Rounds the floating-point value `a' to an integer, and returns the
1620  * result as a floating-point value. The operation is performed
1621  * according to the IEC/IEEE Standard for Binary Floating-Point
1622  * Arithmetic.
1623  */
1624 
1625 static FloatParts round_to_int(FloatParts a, int rmode,
1626                                int scale, float_status *s)
1627 {
1628     switch (a.cls) {
1629     case float_class_qnan:
1630     case float_class_snan:
1631         return return_nan(a, s);
1632 
1633     case float_class_zero:
1634     case float_class_inf:
1635         /* already "integral" */
1636         break;
1637 
1638     case float_class_normal:
1639         scale = MIN(MAX(scale, -0x10000), 0x10000);
1640         a.exp += scale;
1641 
1642         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1643             /* already integral */
1644             break;
1645         }
1646         if (a.exp < 0) {
1647             bool one;
1648             /* all fractional */
1649             s->float_exception_flags |= float_flag_inexact;
1650             switch (rmode) {
1651             case float_round_nearest_even:
1652                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1653                 break;
1654             case float_round_ties_away:
1655                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1656                 break;
1657             case float_round_to_zero:
1658                 one = false;
1659                 break;
1660             case float_round_up:
1661                 one = !a.sign;
1662                 break;
1663             case float_round_down:
1664                 one = a.sign;
1665                 break;
1666             default:
1667                 g_assert_not_reached();
1668             }
1669 
1670             if (one) {
1671                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1672                 a.exp = 0;
1673             } else {
1674                 a.cls = float_class_zero;
1675             }
1676         } else {
1677             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1678             uint64_t frac_lsbm1 = frac_lsb >> 1;
1679             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1680             uint64_t rnd_mask = rnd_even_mask >> 1;
1681             uint64_t inc;
1682 
1683             switch (rmode) {
1684             case float_round_nearest_even:
1685                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1686                 break;
1687             case float_round_ties_away:
1688                 inc = frac_lsbm1;
1689                 break;
1690             case float_round_to_zero:
1691                 inc = 0;
1692                 break;
1693             case float_round_up:
1694                 inc = a.sign ? 0 : rnd_mask;
1695                 break;
1696             case float_round_down:
1697                 inc = a.sign ? rnd_mask : 0;
1698                 break;
1699             default:
1700                 g_assert_not_reached();
1701             }
1702 
1703             if (a.frac & rnd_mask) {
1704                 s->float_exception_flags |= float_flag_inexact;
1705                 a.frac += inc;
1706                 a.frac &= ~rnd_mask;
1707                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1708                     a.frac >>= 1;
1709                     a.exp++;
1710                 }
1711             }
1712         }
1713         break;
1714     default:
1715         g_assert_not_reached();
1716     }
1717     return a;
1718 }
1719 
1720 float16 float16_round_to_int(float16 a, float_status *s)
1721 {
1722     FloatParts pa = float16_unpack_canonical(a, s);
1723     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1724     return float16_round_pack_canonical(pr, s);
1725 }
1726 
1727 float32 float32_round_to_int(float32 a, float_status *s)
1728 {
1729     FloatParts pa = float32_unpack_canonical(a, s);
1730     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1731     return float32_round_pack_canonical(pr, s);
1732 }
1733 
1734 float64 float64_round_to_int(float64 a, float_status *s)
1735 {
1736     FloatParts pa = float64_unpack_canonical(a, s);
1737     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1738     return float64_round_pack_canonical(pr, s);
1739 }
1740 
1741 /*
1742  * Returns the result of converting the floating-point value `a' to
1743  * the two's complement integer format. The conversion is performed
1744  * according to the IEC/IEEE Standard for Binary Floating-Point
1745  * Arithmetic---which means in particular that the conversion is
1746  * rounded according to the current rounding mode. If `a' is a NaN,
1747  * the largest positive integer is returned. Otherwise, if the
1748  * conversion overflows, the largest integer with the same sign as `a'
1749  * is returned.
1750 */
1751 
1752 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1753                                      int64_t min, int64_t max,
1754                                      float_status *s)
1755 {
1756     uint64_t r;
1757     int orig_flags = get_float_exception_flags(s);
1758     FloatParts p = round_to_int(in, rmode, scale, s);
1759 
1760     switch (p.cls) {
1761     case float_class_snan:
1762     case float_class_qnan:
1763         s->float_exception_flags = orig_flags | float_flag_invalid;
1764         return max;
1765     case float_class_inf:
1766         s->float_exception_flags = orig_flags | float_flag_invalid;
1767         return p.sign ? min : max;
1768     case float_class_zero:
1769         return 0;
1770     case float_class_normal:
1771         if (p.exp < DECOMPOSED_BINARY_POINT) {
1772             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1773         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1774             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1775         } else {
1776             r = UINT64_MAX;
1777         }
1778         if (p.sign) {
1779             if (r <= -(uint64_t) min) {
1780                 return -r;
1781             } else {
1782                 s->float_exception_flags = orig_flags | float_flag_invalid;
1783                 return min;
1784             }
1785         } else {
1786             if (r <= max) {
1787                 return r;
1788             } else {
1789                 s->float_exception_flags = orig_flags | float_flag_invalid;
1790                 return max;
1791             }
1792         }
1793     default:
1794         g_assert_not_reached();
1795     }
1796 }
1797 
1798 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1799                                 float_status *s)
1800 {
1801     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1802                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1803 }
1804 
1805 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1806                                 float_status *s)
1807 {
1808     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1809                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1810 }
1811 
1812 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1813                                 float_status *s)
1814 {
1815     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1816                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1817 }
1818 
1819 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1820                                 float_status *s)
1821 {
1822     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1823                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1824 }
1825 
1826 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1827                                 float_status *s)
1828 {
1829     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1830                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1831 }
1832 
1833 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1834                                 float_status *s)
1835 {
1836     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1837                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1838 }
1839 
1840 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1841                                 float_status *s)
1842 {
1843     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1844                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1845 }
1846 
1847 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1848                                 float_status *s)
1849 {
1850     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1851                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1852 }
1853 
1854 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1855                                 float_status *s)
1856 {
1857     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1858                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1859 }
1860 
1861 int16_t float16_to_int16(float16 a, float_status *s)
1862 {
1863     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1864 }
1865 
1866 int32_t float16_to_int32(float16 a, float_status *s)
1867 {
1868     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1869 }
1870 
1871 int64_t float16_to_int64(float16 a, float_status *s)
1872 {
1873     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1874 }
1875 
1876 int16_t float32_to_int16(float32 a, float_status *s)
1877 {
1878     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1879 }
1880 
1881 int32_t float32_to_int32(float32 a, float_status *s)
1882 {
1883     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1884 }
1885 
1886 int64_t float32_to_int64(float32 a, float_status *s)
1887 {
1888     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1889 }
1890 
1891 int16_t float64_to_int16(float64 a, float_status *s)
1892 {
1893     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1894 }
1895 
1896 int32_t float64_to_int32(float64 a, float_status *s)
1897 {
1898     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1899 }
1900 
1901 int64_t float64_to_int64(float64 a, float_status *s)
1902 {
1903     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1904 }
1905 
1906 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1907 {
1908     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1909 }
1910 
1911 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1912 {
1913     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1914 }
1915 
1916 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1917 {
1918     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1919 }
1920 
1921 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1922 {
1923     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1924 }
1925 
1926 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1927 {
1928     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1929 }
1930 
1931 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1932 {
1933     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1934 }
1935 
1936 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1937 {
1938     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1939 }
1940 
1941 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1942 {
1943     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1944 }
1945 
1946 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1947 {
1948     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1949 }
1950 
1951 /*
1952  *  Returns the result of converting the floating-point value `a' to
1953  *  the unsigned integer format. The conversion is performed according
1954  *  to the IEC/IEEE Standard for Binary Floating-Point
1955  *  Arithmetic---which means in particular that the conversion is
1956  *  rounded according to the current rounding mode. If `a' is a NaN,
1957  *  the largest unsigned integer is returned. Otherwise, if the
1958  *  conversion overflows, the largest unsigned integer is returned. If
1959  *  the 'a' is negative, the result is rounded and zero is returned;
1960  *  values that do not round to zero will raise the inexact exception
1961  *  flag.
1962  */
1963 
1964 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1965                                        uint64_t max, float_status *s)
1966 {
1967     int orig_flags = get_float_exception_flags(s);
1968     FloatParts p = round_to_int(in, rmode, scale, s);
1969     uint64_t r;
1970 
1971     switch (p.cls) {
1972     case float_class_snan:
1973     case float_class_qnan:
1974         s->float_exception_flags = orig_flags | float_flag_invalid;
1975         return max;
1976     case float_class_inf:
1977         s->float_exception_flags = orig_flags | float_flag_invalid;
1978         return p.sign ? 0 : max;
1979     case float_class_zero:
1980         return 0;
1981     case float_class_normal:
1982         if (p.sign) {
1983             s->float_exception_flags = orig_flags | float_flag_invalid;
1984             return 0;
1985         }
1986 
1987         if (p.exp < DECOMPOSED_BINARY_POINT) {
1988             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1989         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1990             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1991         } else {
1992             s->float_exception_flags = orig_flags | float_flag_invalid;
1993             return max;
1994         }
1995 
1996         /* For uint64 this will never trip, but if p.exp is too large
1997          * to shift a decomposed fraction we shall have exited via the
1998          * 3rd leg above.
1999          */
2000         if (r > max) {
2001             s->float_exception_flags = orig_flags | float_flag_invalid;
2002             return max;
2003         }
2004         return r;
2005     default:
2006         g_assert_not_reached();
2007     }
2008 }
2009 
2010 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2011                                   float_status *s)
2012 {
2013     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2014                                   rmode, scale, UINT16_MAX, s);
2015 }
2016 
2017 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2018                                   float_status *s)
2019 {
2020     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2021                                   rmode, scale, UINT32_MAX, s);
2022 }
2023 
2024 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2025                                   float_status *s)
2026 {
2027     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2028                                   rmode, scale, UINT64_MAX, s);
2029 }
2030 
2031 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2032                                   float_status *s)
2033 {
2034     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2035                                   rmode, scale, UINT16_MAX, s);
2036 }
2037 
2038 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2039                                   float_status *s)
2040 {
2041     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2042                                   rmode, scale, UINT32_MAX, s);
2043 }
2044 
2045 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2046                                   float_status *s)
2047 {
2048     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2049                                   rmode, scale, UINT64_MAX, s);
2050 }
2051 
2052 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2053                                   float_status *s)
2054 {
2055     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2056                                   rmode, scale, UINT16_MAX, s);
2057 }
2058 
2059 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2060                                   float_status *s)
2061 {
2062     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2063                                   rmode, scale, UINT32_MAX, s);
2064 }
2065 
2066 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2067                                   float_status *s)
2068 {
2069     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2070                                   rmode, scale, UINT64_MAX, s);
2071 }
2072 
2073 uint16_t float16_to_uint16(float16 a, float_status *s)
2074 {
2075     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2076 }
2077 
2078 uint32_t float16_to_uint32(float16 a, float_status *s)
2079 {
2080     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2081 }
2082 
2083 uint64_t float16_to_uint64(float16 a, float_status *s)
2084 {
2085     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2086 }
2087 
2088 uint16_t float32_to_uint16(float32 a, float_status *s)
2089 {
2090     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2091 }
2092 
2093 uint32_t float32_to_uint32(float32 a, float_status *s)
2094 {
2095     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2096 }
2097 
2098 uint64_t float32_to_uint64(float32 a, float_status *s)
2099 {
2100     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2101 }
2102 
2103 uint16_t float64_to_uint16(float64 a, float_status *s)
2104 {
2105     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2106 }
2107 
2108 uint32_t float64_to_uint32(float64 a, float_status *s)
2109 {
2110     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2111 }
2112 
2113 uint64_t float64_to_uint64(float64 a, float_status *s)
2114 {
2115     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2116 }
2117 
2118 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2119 {
2120     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2121 }
2122 
2123 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2124 {
2125     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2126 }
2127 
2128 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2129 {
2130     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2131 }
2132 
2133 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2134 {
2135     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2136 }
2137 
2138 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2139 {
2140     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2141 }
2142 
2143 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2144 {
2145     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2146 }
2147 
2148 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2149 {
2150     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2151 }
2152 
2153 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2154 {
2155     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2156 }
2157 
2158 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2159 {
2160     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2161 }
2162 
2163 /*
2164  * Integer to float conversions
2165  *
2166  * Returns the result of converting the two's complement integer `a'
2167  * to the floating-point format. The conversion is performed according
2168  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2169  */
2170 
2171 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2172 {
2173     FloatParts r = { .sign = false };
2174 
2175     if (a == 0) {
2176         r.cls = float_class_zero;
2177     } else {
2178         uint64_t f = a;
2179         int shift;
2180 
2181         r.cls = float_class_normal;
2182         if (a < 0) {
2183             f = -f;
2184             r.sign = true;
2185         }
2186         shift = clz64(f) - 1;
2187         scale = MIN(MAX(scale, -0x10000), 0x10000);
2188 
2189         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2190         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2191     }
2192 
2193     return r;
2194 }
2195 
2196 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2197 {
2198     FloatParts pa = int_to_float(a, scale, status);
2199     return float16_round_pack_canonical(pa, status);
2200 }
2201 
2202 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2203 {
2204     return int64_to_float16_scalbn(a, scale, status);
2205 }
2206 
2207 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2208 {
2209     return int64_to_float16_scalbn(a, scale, status);
2210 }
2211 
2212 float16 int64_to_float16(int64_t a, float_status *status)
2213 {
2214     return int64_to_float16_scalbn(a, 0, status);
2215 }
2216 
2217 float16 int32_to_float16(int32_t a, float_status *status)
2218 {
2219     return int64_to_float16_scalbn(a, 0, status);
2220 }
2221 
2222 float16 int16_to_float16(int16_t a, float_status *status)
2223 {
2224     return int64_to_float16_scalbn(a, 0, status);
2225 }
2226 
2227 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2228 {
2229     FloatParts pa = int_to_float(a, scale, status);
2230     return float32_round_pack_canonical(pa, status);
2231 }
2232 
2233 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2234 {
2235     return int64_to_float32_scalbn(a, scale, status);
2236 }
2237 
2238 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2239 {
2240     return int64_to_float32_scalbn(a, scale, status);
2241 }
2242 
2243 float32 int64_to_float32(int64_t a, float_status *status)
2244 {
2245     return int64_to_float32_scalbn(a, 0, status);
2246 }
2247 
2248 float32 int32_to_float32(int32_t a, float_status *status)
2249 {
2250     return int64_to_float32_scalbn(a, 0, status);
2251 }
2252 
2253 float32 int16_to_float32(int16_t a, float_status *status)
2254 {
2255     return int64_to_float32_scalbn(a, 0, status);
2256 }
2257 
2258 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2259 {
2260     FloatParts pa = int_to_float(a, scale, status);
2261     return float64_round_pack_canonical(pa, status);
2262 }
2263 
2264 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2265 {
2266     return int64_to_float64_scalbn(a, scale, status);
2267 }
2268 
2269 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2270 {
2271     return int64_to_float64_scalbn(a, scale, status);
2272 }
2273 
2274 float64 int64_to_float64(int64_t a, float_status *status)
2275 {
2276     return int64_to_float64_scalbn(a, 0, status);
2277 }
2278 
2279 float64 int32_to_float64(int32_t a, float_status *status)
2280 {
2281     return int64_to_float64_scalbn(a, 0, status);
2282 }
2283 
2284 float64 int16_to_float64(int16_t a, float_status *status)
2285 {
2286     return int64_to_float64_scalbn(a, 0, status);
2287 }
2288 
2289 
2290 /*
2291  * Unsigned Integer to float conversions
2292  *
2293  * Returns the result of converting the unsigned integer `a' to the
2294  * floating-point format. The conversion is performed according to the
2295  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2296  */
2297 
2298 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2299 {
2300     FloatParts r = { .sign = false };
2301 
2302     if (a == 0) {
2303         r.cls = float_class_zero;
2304     } else {
2305         scale = MIN(MAX(scale, -0x10000), 0x10000);
2306         r.cls = float_class_normal;
2307         if ((int64_t)a < 0) {
2308             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2309             shift64RightJamming(a, 1, &a);
2310             r.frac = a;
2311         } else {
2312             int shift = clz64(a) - 1;
2313             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2314             r.frac = a << shift;
2315         }
2316     }
2317 
2318     return r;
2319 }
2320 
2321 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2322 {
2323     FloatParts pa = uint_to_float(a, scale, status);
2324     return float16_round_pack_canonical(pa, status);
2325 }
2326 
2327 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2328 {
2329     return uint64_to_float16_scalbn(a, scale, status);
2330 }
2331 
2332 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2333 {
2334     return uint64_to_float16_scalbn(a, scale, status);
2335 }
2336 
2337 float16 uint64_to_float16(uint64_t a, float_status *status)
2338 {
2339     return uint64_to_float16_scalbn(a, 0, status);
2340 }
2341 
2342 float16 uint32_to_float16(uint32_t a, float_status *status)
2343 {
2344     return uint64_to_float16_scalbn(a, 0, status);
2345 }
2346 
2347 float16 uint16_to_float16(uint16_t a, float_status *status)
2348 {
2349     return uint64_to_float16_scalbn(a, 0, status);
2350 }
2351 
2352 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2353 {
2354     FloatParts pa = uint_to_float(a, scale, status);
2355     return float32_round_pack_canonical(pa, status);
2356 }
2357 
2358 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2359 {
2360     return uint64_to_float32_scalbn(a, scale, status);
2361 }
2362 
2363 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2364 {
2365     return uint64_to_float32_scalbn(a, scale, status);
2366 }
2367 
2368 float32 uint64_to_float32(uint64_t a, float_status *status)
2369 {
2370     return uint64_to_float32_scalbn(a, 0, status);
2371 }
2372 
2373 float32 uint32_to_float32(uint32_t a, float_status *status)
2374 {
2375     return uint64_to_float32_scalbn(a, 0, status);
2376 }
2377 
2378 float32 uint16_to_float32(uint16_t a, float_status *status)
2379 {
2380     return uint64_to_float32_scalbn(a, 0, status);
2381 }
2382 
2383 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2384 {
2385     FloatParts pa = uint_to_float(a, scale, status);
2386     return float64_round_pack_canonical(pa, status);
2387 }
2388 
2389 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2390 {
2391     return uint64_to_float64_scalbn(a, scale, status);
2392 }
2393 
2394 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2395 {
2396     return uint64_to_float64_scalbn(a, scale, status);
2397 }
2398 
2399 float64 uint64_to_float64(uint64_t a, float_status *status)
2400 {
2401     return uint64_to_float64_scalbn(a, 0, status);
2402 }
2403 
2404 float64 uint32_to_float64(uint32_t a, float_status *status)
2405 {
2406     return uint64_to_float64_scalbn(a, 0, status);
2407 }
2408 
2409 float64 uint16_to_float64(uint16_t a, float_status *status)
2410 {
2411     return uint64_to_float64_scalbn(a, 0, status);
2412 }
2413 
2414 /* Float Min/Max */
2415 /* min() and max() functions. These can't be implemented as
2416  * 'compare and pick one input' because that would mishandle
2417  * NaNs and +0 vs -0.
2418  *
2419  * minnum() and maxnum() functions. These are similar to the min()
2420  * and max() functions but if one of the arguments is a QNaN and
2421  * the other is numerical then the numerical argument is returned.
2422  * SNaNs will get quietened before being returned.
2423  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2424  * and maxNum() operations. min() and max() are the typical min/max
2425  * semantics provided by many CPUs which predate that specification.
2426  *
2427  * minnummag() and maxnummag() functions correspond to minNumMag()
2428  * and maxNumMag() from the IEEE-754 2008.
2429  */
2430 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2431                                 bool ieee, bool ismag, float_status *s)
2432 {
2433     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2434         if (ieee) {
2435             /* Takes two floating-point values `a' and `b', one of
2436              * which is a NaN, and returns the appropriate NaN
2437              * result. If either `a' or `b' is a signaling NaN,
2438              * the invalid exception is raised.
2439              */
2440             if (is_snan(a.cls) || is_snan(b.cls)) {
2441                 return pick_nan(a, b, s);
2442             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2443                 return b;
2444             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2445                 return a;
2446             }
2447         }
2448         return pick_nan(a, b, s);
2449     } else {
2450         int a_exp, b_exp;
2451 
2452         switch (a.cls) {
2453         case float_class_normal:
2454             a_exp = a.exp;
2455             break;
2456         case float_class_inf:
2457             a_exp = INT_MAX;
2458             break;
2459         case float_class_zero:
2460             a_exp = INT_MIN;
2461             break;
2462         default:
2463             g_assert_not_reached();
2464             break;
2465         }
2466         switch (b.cls) {
2467         case float_class_normal:
2468             b_exp = b.exp;
2469             break;
2470         case float_class_inf:
2471             b_exp = INT_MAX;
2472             break;
2473         case float_class_zero:
2474             b_exp = INT_MIN;
2475             break;
2476         default:
2477             g_assert_not_reached();
2478             break;
2479         }
2480 
2481         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2482             bool a_less = a_exp < b_exp;
2483             if (a_exp == b_exp) {
2484                 a_less = a.frac < b.frac;
2485             }
2486             return a_less ^ ismin ? b : a;
2487         }
2488 
2489         if (a.sign == b.sign) {
2490             bool a_less = a_exp < b_exp;
2491             if (a_exp == b_exp) {
2492                 a_less = a.frac < b.frac;
2493             }
2494             return a.sign ^ a_less ^ ismin ? b : a;
2495         } else {
2496             return a.sign ^ ismin ? b : a;
2497         }
2498     }
2499 }
2500 
2501 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2502 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2503                                      float_status *s)                   \
2504 {                                                                       \
2505     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2506     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2507     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2508                                                                         \
2509     return float ## sz ## _round_pack_canonical(pr, s);                 \
2510 }
2511 
2512 MINMAX(16, min, true, false, false)
2513 MINMAX(16, minnum, true, true, false)
2514 MINMAX(16, minnummag, true, true, true)
2515 MINMAX(16, max, false, false, false)
2516 MINMAX(16, maxnum, false, true, false)
2517 MINMAX(16, maxnummag, false, true, true)
2518 
2519 MINMAX(32, min, true, false, false)
2520 MINMAX(32, minnum, true, true, false)
2521 MINMAX(32, minnummag, true, true, true)
2522 MINMAX(32, max, false, false, false)
2523 MINMAX(32, maxnum, false, true, false)
2524 MINMAX(32, maxnummag, false, true, true)
2525 
2526 MINMAX(64, min, true, false, false)
2527 MINMAX(64, minnum, true, true, false)
2528 MINMAX(64, minnummag, true, true, true)
2529 MINMAX(64, max, false, false, false)
2530 MINMAX(64, maxnum, false, true, false)
2531 MINMAX(64, maxnummag, false, true, true)
2532 
2533 #undef MINMAX
2534 
2535 /* Floating point compare */
2536 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2537                           float_status *s)
2538 {
2539     if (is_nan(a.cls) || is_nan(b.cls)) {
2540         if (!is_quiet ||
2541             a.cls == float_class_snan ||
2542             b.cls == float_class_snan) {
2543             s->float_exception_flags |= float_flag_invalid;
2544         }
2545         return float_relation_unordered;
2546     }
2547 
2548     if (a.cls == float_class_zero) {
2549         if (b.cls == float_class_zero) {
2550             return float_relation_equal;
2551         }
2552         return b.sign ? float_relation_greater : float_relation_less;
2553     } else if (b.cls == float_class_zero) {
2554         return a.sign ? float_relation_less : float_relation_greater;
2555     }
2556 
2557     /* The only really important thing about infinity is its sign. If
2558      * both are infinities the sign marks the smallest of the two.
2559      */
2560     if (a.cls == float_class_inf) {
2561         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2562             return float_relation_equal;
2563         }
2564         return a.sign ? float_relation_less : float_relation_greater;
2565     } else if (b.cls == float_class_inf) {
2566         return b.sign ? float_relation_greater : float_relation_less;
2567     }
2568 
2569     if (a.sign != b.sign) {
2570         return a.sign ? float_relation_less : float_relation_greater;
2571     }
2572 
2573     if (a.exp == b.exp) {
2574         if (a.frac == b.frac) {
2575             return float_relation_equal;
2576         }
2577         if (a.sign) {
2578             return a.frac > b.frac ?
2579                 float_relation_less : float_relation_greater;
2580         } else {
2581             return a.frac > b.frac ?
2582                 float_relation_greater : float_relation_less;
2583         }
2584     } else {
2585         if (a.sign) {
2586             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2587         } else {
2588             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2589         }
2590     }
2591 }
2592 
2593 #define COMPARE(sz)                                                     \
2594 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2595                             float_status *s)                            \
2596 {                                                                       \
2597     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2598     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2599     return compare_floats(pa, pb, false, s);                            \
2600 }                                                                       \
2601 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2602                                   float_status *s)                      \
2603 {                                                                       \
2604     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2605     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2606     return compare_floats(pa, pb, true, s);                             \
2607 }
2608 
2609 COMPARE(16)
2610 COMPARE(32)
2611 COMPARE(64)
2612 
2613 #undef COMPARE
2614 
2615 /* Multiply A by 2 raised to the power N.  */
2616 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2617 {
2618     if (unlikely(is_nan(a.cls))) {
2619         return return_nan(a, s);
2620     }
2621     if (a.cls == float_class_normal) {
2622         /* The largest float type (even though not supported by FloatParts)
2623          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2624          * still allows rounding to infinity, without allowing overflow
2625          * within the int32_t that backs FloatParts.exp.
2626          */
2627         n = MIN(MAX(n, -0x10000), 0x10000);
2628         a.exp += n;
2629     }
2630     return a;
2631 }
2632 
2633 float16 float16_scalbn(float16 a, int n, float_status *status)
2634 {
2635     FloatParts pa = float16_unpack_canonical(a, status);
2636     FloatParts pr = scalbn_decomposed(pa, n, status);
2637     return float16_round_pack_canonical(pr, status);
2638 }
2639 
2640 float32 float32_scalbn(float32 a, int n, float_status *status)
2641 {
2642     FloatParts pa = float32_unpack_canonical(a, status);
2643     FloatParts pr = scalbn_decomposed(pa, n, status);
2644     return float32_round_pack_canonical(pr, status);
2645 }
2646 
2647 float64 float64_scalbn(float64 a, int n, float_status *status)
2648 {
2649     FloatParts pa = float64_unpack_canonical(a, status);
2650     FloatParts pr = scalbn_decomposed(pa, n, status);
2651     return float64_round_pack_canonical(pr, status);
2652 }
2653 
2654 /*
2655  * Square Root
2656  *
2657  * The old softfloat code did an approximation step before zeroing in
2658  * on the final result. However for simpleness we just compute the
2659  * square root by iterating down from the implicit bit to enough extra
2660  * bits to ensure we get a correctly rounded result.
2661  *
2662  * This does mean however the calculation is slower than before,
2663  * especially for 64 bit floats.
2664  */
2665 
2666 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2667 {
2668     uint64_t a_frac, r_frac, s_frac;
2669     int bit, last_bit;
2670 
2671     if (is_nan(a.cls)) {
2672         return return_nan(a, s);
2673     }
2674     if (a.cls == float_class_zero) {
2675         return a;  /* sqrt(+-0) = +-0 */
2676     }
2677     if (a.sign) {
2678         s->float_exception_flags |= float_flag_invalid;
2679         return parts_default_nan(s);
2680     }
2681     if (a.cls == float_class_inf) {
2682         return a;  /* sqrt(+inf) = +inf */
2683     }
2684 
2685     assert(a.cls == float_class_normal);
2686 
2687     /* We need two overflow bits at the top. Adding room for that is a
2688      * right shift. If the exponent is odd, we can discard the low bit
2689      * by multiplying the fraction by 2; that's a left shift. Combine
2690      * those and we shift right if the exponent is even.
2691      */
2692     a_frac = a.frac;
2693     if (!(a.exp & 1)) {
2694         a_frac >>= 1;
2695     }
2696     a.exp >>= 1;
2697 
2698     /* Bit-by-bit computation of sqrt.  */
2699     r_frac = 0;
2700     s_frac = 0;
2701 
2702     /* Iterate from implicit bit down to the 3 extra bits to compute a
2703      * properly rounded result. Remember we've inserted one more bit
2704      * at the top, so these positions are one less.
2705      */
2706     bit = DECOMPOSED_BINARY_POINT - 1;
2707     last_bit = MAX(p->frac_shift - 4, 0);
2708     do {
2709         uint64_t q = 1ULL << bit;
2710         uint64_t t_frac = s_frac + q;
2711         if (t_frac <= a_frac) {
2712             s_frac = t_frac + q;
2713             a_frac -= t_frac;
2714             r_frac += q;
2715         }
2716         a_frac <<= 1;
2717     } while (--bit >= last_bit);
2718 
2719     /* Undo the right shift done above. If there is any remaining
2720      * fraction, the result is inexact. Set the sticky bit.
2721      */
2722     a.frac = (r_frac << 1) + (a_frac != 0);
2723 
2724     return a;
2725 }
2726 
2727 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2728 {
2729     FloatParts pa = float16_unpack_canonical(a, status);
2730     FloatParts pr = sqrt_float(pa, status, &float16_params);
2731     return float16_round_pack_canonical(pr, status);
2732 }
2733 
2734 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2735 {
2736     FloatParts pa = float32_unpack_canonical(a, status);
2737     FloatParts pr = sqrt_float(pa, status, &float32_params);
2738     return float32_round_pack_canonical(pr, status);
2739 }
2740 
2741 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2742 {
2743     FloatParts pa = float64_unpack_canonical(a, status);
2744     FloatParts pr = sqrt_float(pa, status, &float64_params);
2745     return float64_round_pack_canonical(pr, status);
2746 }
2747 
2748 /*----------------------------------------------------------------------------
2749 | The pattern for a default generated NaN.
2750 *----------------------------------------------------------------------------*/
2751 
2752 float16 float16_default_nan(float_status *status)
2753 {
2754     FloatParts p = parts_default_nan(status);
2755     p.frac >>= float16_params.frac_shift;
2756     return float16_pack_raw(p);
2757 }
2758 
2759 float32 float32_default_nan(float_status *status)
2760 {
2761     FloatParts p = parts_default_nan(status);
2762     p.frac >>= float32_params.frac_shift;
2763     return float32_pack_raw(p);
2764 }
2765 
2766 float64 float64_default_nan(float_status *status)
2767 {
2768     FloatParts p = parts_default_nan(status);
2769     p.frac >>= float64_params.frac_shift;
2770     return float64_pack_raw(p);
2771 }
2772 
2773 float128 float128_default_nan(float_status *status)
2774 {
2775     FloatParts p = parts_default_nan(status);
2776     float128 r;
2777 
2778     /* Extrapolate from the choices made by parts_default_nan to fill
2779      * in the quad-floating format.  If the low bit is set, assume we
2780      * want to set all non-snan bits.
2781      */
2782     r.low = -(p.frac & 1);
2783     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2784     r.high |= LIT64(0x7FFF000000000000);
2785     r.high |= (uint64_t)p.sign << 63;
2786 
2787     return r;
2788 }
2789 
2790 /*----------------------------------------------------------------------------
2791 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2792 *----------------------------------------------------------------------------*/
2793 
2794 float16 float16_silence_nan(float16 a, float_status *status)
2795 {
2796     FloatParts p = float16_unpack_raw(a);
2797     p.frac <<= float16_params.frac_shift;
2798     p = parts_silence_nan(p, status);
2799     p.frac >>= float16_params.frac_shift;
2800     return float16_pack_raw(p);
2801 }
2802 
2803 float32 float32_silence_nan(float32 a, float_status *status)
2804 {
2805     FloatParts p = float32_unpack_raw(a);
2806     p.frac <<= float32_params.frac_shift;
2807     p = parts_silence_nan(p, status);
2808     p.frac >>= float32_params.frac_shift;
2809     return float32_pack_raw(p);
2810 }
2811 
2812 float64 float64_silence_nan(float64 a, float_status *status)
2813 {
2814     FloatParts p = float64_unpack_raw(a);
2815     p.frac <<= float64_params.frac_shift;
2816     p = parts_silence_nan(p, status);
2817     p.frac >>= float64_params.frac_shift;
2818     return float64_pack_raw(p);
2819 }
2820 
2821 /*----------------------------------------------------------------------------
2822 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2823 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2824 | input.  If `zSign' is 1, the input is negated before being converted to an
2825 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2826 | is simply rounded to an integer, with the inexact exception raised if the
2827 | input cannot be represented exactly as an integer.  However, if the fixed-
2828 | point input is too large, the invalid exception is raised and the largest
2829 | positive or negative integer is returned.
2830 *----------------------------------------------------------------------------*/
2831 
2832 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2833 {
2834     int8_t roundingMode;
2835     flag roundNearestEven;
2836     int8_t roundIncrement, roundBits;
2837     int32_t z;
2838 
2839     roundingMode = status->float_rounding_mode;
2840     roundNearestEven = ( roundingMode == float_round_nearest_even );
2841     switch (roundingMode) {
2842     case float_round_nearest_even:
2843     case float_round_ties_away:
2844         roundIncrement = 0x40;
2845         break;
2846     case float_round_to_zero:
2847         roundIncrement = 0;
2848         break;
2849     case float_round_up:
2850         roundIncrement = zSign ? 0 : 0x7f;
2851         break;
2852     case float_round_down:
2853         roundIncrement = zSign ? 0x7f : 0;
2854         break;
2855     default:
2856         abort();
2857     }
2858     roundBits = absZ & 0x7F;
2859     absZ = ( absZ + roundIncrement )>>7;
2860     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2861     z = absZ;
2862     if ( zSign ) z = - z;
2863     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2864         float_raise(float_flag_invalid, status);
2865         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2866     }
2867     if (roundBits) {
2868         status->float_exception_flags |= float_flag_inexact;
2869     }
2870     return z;
2871 
2872 }
2873 
2874 /*----------------------------------------------------------------------------
2875 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2876 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2877 | and returns the properly rounded 64-bit integer corresponding to the input.
2878 | If `zSign' is 1, the input is negated before being converted to an integer.
2879 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2880 | the inexact exception raised if the input cannot be represented exactly as
2881 | an integer.  However, if the fixed-point input is too large, the invalid
2882 | exception is raised and the largest positive or negative integer is
2883 | returned.
2884 *----------------------------------------------------------------------------*/
2885 
2886 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2887                                float_status *status)
2888 {
2889     int8_t roundingMode;
2890     flag roundNearestEven, increment;
2891     int64_t z;
2892 
2893     roundingMode = status->float_rounding_mode;
2894     roundNearestEven = ( roundingMode == float_round_nearest_even );
2895     switch (roundingMode) {
2896     case float_round_nearest_even:
2897     case float_round_ties_away:
2898         increment = ((int64_t) absZ1 < 0);
2899         break;
2900     case float_round_to_zero:
2901         increment = 0;
2902         break;
2903     case float_round_up:
2904         increment = !zSign && absZ1;
2905         break;
2906     case float_round_down:
2907         increment = zSign && absZ1;
2908         break;
2909     default:
2910         abort();
2911     }
2912     if ( increment ) {
2913         ++absZ0;
2914         if ( absZ0 == 0 ) goto overflow;
2915         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2916     }
2917     z = absZ0;
2918     if ( zSign ) z = - z;
2919     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2920  overflow:
2921         float_raise(float_flag_invalid, status);
2922         return
2923               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2924             : LIT64( 0x7FFFFFFFFFFFFFFF );
2925     }
2926     if (absZ1) {
2927         status->float_exception_flags |= float_flag_inexact;
2928     }
2929     return z;
2930 
2931 }
2932 
2933 /*----------------------------------------------------------------------------
2934 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2935 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2936 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2937 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2938 | with the inexact exception raised if the input cannot be represented exactly
2939 | as an integer.  However, if the fixed-point input is too large, the invalid
2940 | exception is raised and the largest unsigned integer is returned.
2941 *----------------------------------------------------------------------------*/
2942 
2943 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2944                                 uint64_t absZ1, float_status *status)
2945 {
2946     int8_t roundingMode;
2947     flag roundNearestEven, increment;
2948 
2949     roundingMode = status->float_rounding_mode;
2950     roundNearestEven = (roundingMode == float_round_nearest_even);
2951     switch (roundingMode) {
2952     case float_round_nearest_even:
2953     case float_round_ties_away:
2954         increment = ((int64_t)absZ1 < 0);
2955         break;
2956     case float_round_to_zero:
2957         increment = 0;
2958         break;
2959     case float_round_up:
2960         increment = !zSign && absZ1;
2961         break;
2962     case float_round_down:
2963         increment = zSign && absZ1;
2964         break;
2965     default:
2966         abort();
2967     }
2968     if (increment) {
2969         ++absZ0;
2970         if (absZ0 == 0) {
2971             float_raise(float_flag_invalid, status);
2972             return LIT64(0xFFFFFFFFFFFFFFFF);
2973         }
2974         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2975     }
2976 
2977     if (zSign && absZ0) {
2978         float_raise(float_flag_invalid, status);
2979         return 0;
2980     }
2981 
2982     if (absZ1) {
2983         status->float_exception_flags |= float_flag_inexact;
2984     }
2985     return absZ0;
2986 }
2987 
2988 /*----------------------------------------------------------------------------
2989 | If `a' is denormal and we are in flush-to-zero mode then set the
2990 | input-denormal exception and return zero. Otherwise just return the value.
2991 *----------------------------------------------------------------------------*/
2992 float32 float32_squash_input_denormal(float32 a, float_status *status)
2993 {
2994     if (status->flush_inputs_to_zero) {
2995         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2996             float_raise(float_flag_input_denormal, status);
2997             return make_float32(float32_val(a) & 0x80000000);
2998         }
2999     }
3000     return a;
3001 }
3002 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig' by shifting it
| left by `clz32(aSig) - 8' bits.  The shifted significand is stored through
| `zSigPtr' and the matching exponent, 1 minus the shift count, through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3020 
3021 /*----------------------------------------------------------------------------
3022 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3023 | and significand `zSig', and returns the proper single-precision floating-
3024 | point value corresponding to the abstract input.  Ordinarily, the abstract
3025 | value is simply rounded and packed into the single-precision format, with
3026 | the inexact exception raised if the abstract input cannot be represented
3027 | exactly.  However, if the abstract value is too large, the overflow and
3028 | inexact exceptions are raised and an infinity or maximal finite value is
3029 | returned.  If the abstract value is too small, the input value is rounded to
3030 | a subnormal number, and the underflow and inexact exceptions are raised if
3031 | the abstract input cannot be represented exactly as a subnormal single-
3032 | precision floating-point number.
3033 |     The input significand `zSig' has its binary point between bits 30
3034 | and 29, which is 7 bits to the left of the usual location.  This shifted
3035 | significand must be normalized or smaller.  If `zSig' is not normalized,
3036 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3037 | and it must not require rounding.  In the usual case that `zSig' is
3038 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3039 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3040 | Binary Floating-Point Arithmetic.
3041 *----------------------------------------------------------------------------*/
3042 
3043 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3044                                    float_status *status)
3045 {
3046     int8_t roundingMode;
3047     flag roundNearestEven;
3048     int8_t roundIncrement, roundBits;
3049     flag isTiny;
3050 
3051     roundingMode = status->float_rounding_mode;
3052     roundNearestEven = ( roundingMode == float_round_nearest_even );
3053     switch (roundingMode) {
3054     case float_round_nearest_even:
3055     case float_round_ties_away:
3056         roundIncrement = 0x40;
3057         break;
3058     case float_round_to_zero:
3059         roundIncrement = 0;
3060         break;
3061     case float_round_up:
3062         roundIncrement = zSign ? 0 : 0x7f;
3063         break;
3064     case float_round_down:
3065         roundIncrement = zSign ? 0x7f : 0;
3066         break;
3067     default:
3068         abort();
3069         break;
3070     }
3071     roundBits = zSig & 0x7F;
3072     if ( 0xFD <= (uint16_t) zExp ) {
3073         if (    ( 0xFD < zExp )
3074              || (    ( zExp == 0xFD )
3075                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3076            ) {
3077             float_raise(float_flag_overflow | float_flag_inexact, status);
3078             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3079         }
3080         if ( zExp < 0 ) {
3081             if (status->flush_to_zero) {
3082                 float_raise(float_flag_output_denormal, status);
3083                 return packFloat32(zSign, 0, 0);
3084             }
3085             isTiny =
3086                 (status->float_detect_tininess
3087                  == float_tininess_before_rounding)
3088                 || ( zExp < -1 )
3089                 || ( zSig + roundIncrement < 0x80000000 );
3090             shift32RightJamming( zSig, - zExp, &zSig );
3091             zExp = 0;
3092             roundBits = zSig & 0x7F;
3093             if (isTiny && roundBits) {
3094                 float_raise(float_flag_underflow, status);
3095             }
3096         }
3097     }
3098     if (roundBits) {
3099         status->float_exception_flags |= float_flag_inexact;
3100     }
3101     zSig = ( zSig + roundIncrement )>>7;
3102     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3103     if ( zSig == 0 ) zExp = 0;
3104     return packFloat32( zSign, zExp, zSig );
3105 
3106 }
3107 
3108 /*----------------------------------------------------------------------------
3109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3110 | and significand `zSig', and returns the proper single-precision floating-
3111 | point value corresponding to the abstract input.  This routine is just like
3112 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3113 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3114 | floating-point exponent.
3115 *----------------------------------------------------------------------------*/
3116 
3117 static float32
3118  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3119                               float_status *status)
3120 {
3121     int8_t shiftCount;
3122 
3123     shiftCount = clz32(zSig) - 1;
3124     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3125                                status);
3126 
3127 }
3128 
3129 /*----------------------------------------------------------------------------
3130 | If `a' is denormal and we are in flush-to-zero mode then set the
3131 | input-denormal exception and return zero. Otherwise just return the value.
3132 *----------------------------------------------------------------------------*/
3133 float64 float64_squash_input_denormal(float64 a, float_status *status)
3134 {
3135     if (status->flush_inputs_to_zero) {
3136         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3137             float_raise(float_flag_input_denormal, status);
3138             return make_float64(float64_val(a) & (1ULL << 63));
3139         }
3140     }
3141     return a;
3142 }
3143 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig' by shifting it
| left by `clz64(aSig) - 11' bits.  The shifted significand is stored through
| `zSigPtr' and the matching exponent, 1 minus the shift count, through
| `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3161 
3162 /*----------------------------------------------------------------------------
3163 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3164 | double-precision floating-point value, returning the result.  After being
3165 | shifted into the proper positions, the three fields are simply added
3166 | together to form the result.  This means that any integer portion of `zSig'
3167 | will be added into the exponent.  Since a properly normalized significand
3168 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3169 | than the desired result exponent whenever `zSig' is a complete, normalized
3170 | significand.
3171 *----------------------------------------------------------------------------*/
3172 
3173 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3174 {
3175 
3176     return make_float64(
3177         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3178 
3179 }
3180 
3181 /*----------------------------------------------------------------------------
3182 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3183 | and significand `zSig', and returns the proper double-precision floating-
3184 | point value corresponding to the abstract input.  Ordinarily, the abstract
3185 | value is simply rounded and packed into the double-precision format, with
3186 | the inexact exception raised if the abstract input cannot be represented
3187 | exactly.  However, if the abstract value is too large, the overflow and
3188 | inexact exceptions are raised and an infinity or maximal finite value is
3189 | returned.  If the abstract value is too small, the input value is rounded to
3190 | a subnormal number, and the underflow and inexact exceptions are raised if
3191 | the abstract input cannot be represented exactly as a subnormal double-
3192 | precision floating-point number.
3193 |     The input significand `zSig' has its binary point between bits 62
3194 | and 61, which is 10 bits to the left of the usual location.  This shifted
3195 | significand must be normalized or smaller.  If `zSig' is not normalized,
3196 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3197 | and it must not require rounding.  In the usual case that `zSig' is
3198 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3199 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3200 | Binary Floating-Point Arithmetic.
3201 *----------------------------------------------------------------------------*/
3202 
3203 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3204                                    float_status *status)
3205 {
3206     int8_t roundingMode;
3207     flag roundNearestEven;
3208     int roundIncrement, roundBits;
3209     flag isTiny;
3210 
3211     roundingMode = status->float_rounding_mode;
3212     roundNearestEven = ( roundingMode == float_round_nearest_even );
3213     switch (roundingMode) {
3214     case float_round_nearest_even:
3215     case float_round_ties_away:
3216         roundIncrement = 0x200;
3217         break;
3218     case float_round_to_zero:
3219         roundIncrement = 0;
3220         break;
3221     case float_round_up:
3222         roundIncrement = zSign ? 0 : 0x3ff;
3223         break;
3224     case float_round_down:
3225         roundIncrement = zSign ? 0x3ff : 0;
3226         break;
3227     case float_round_to_odd:
3228         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3229         break;
3230     default:
3231         abort();
3232     }
3233     roundBits = zSig & 0x3FF;
3234     if ( 0x7FD <= (uint16_t) zExp ) {
3235         if (    ( 0x7FD < zExp )
3236              || (    ( zExp == 0x7FD )
3237                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3238            ) {
3239             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3240                                    roundIncrement != 0;
3241             float_raise(float_flag_overflow | float_flag_inexact, status);
3242             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3243         }
3244         if ( zExp < 0 ) {
3245             if (status->flush_to_zero) {
3246                 float_raise(float_flag_output_denormal, status);
3247                 return packFloat64(zSign, 0, 0);
3248             }
3249             isTiny =
3250                    (status->float_detect_tininess
3251                     == float_tininess_before_rounding)
3252                 || ( zExp < -1 )
3253                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3254             shift64RightJamming( zSig, - zExp, &zSig );
3255             zExp = 0;
3256             roundBits = zSig & 0x3FF;
3257             if (isTiny && roundBits) {
3258                 float_raise(float_flag_underflow, status);
3259             }
3260             if (roundingMode == float_round_to_odd) {
3261                 /*
3262                  * For round-to-odd case, the roundIncrement depends on
3263                  * zSig which just changed.
3264                  */
3265                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3266             }
3267         }
3268     }
3269     if (roundBits) {
3270         status->float_exception_flags |= float_flag_inexact;
3271     }
3272     zSig = ( zSig + roundIncrement )>>10;
3273     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3274     if ( zSig == 0 ) zExp = 0;
3275     return packFloat64( zSign, zExp, zSig );
3276 
3277 }
3278 
3279 /*----------------------------------------------------------------------------
3280 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3281 | and significand `zSig', and returns the proper double-precision floating-
3282 | point value corresponding to the abstract input.  This routine is just like
3283 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3284 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3285 | floating-point exponent.
3286 *----------------------------------------------------------------------------*/
3287 
3288 static float64
3289  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3290                               float_status *status)
3291 {
3292     int8_t shiftCount;
3293 
3294     shiftCount = clz64(zSig) - 1;
3295     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3296                                status);
3297 
3298 }
3299 
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision significand `aSig' by
| shifting it left by `clz64(aSig)' bits.  The shifted significand is stored
| through `zSigPtr' and the matching exponent, 1 minus the shift count,
| through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    int8_t shift = clz64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3316 
3317 /*----------------------------------------------------------------------------
3318 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3319 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3320 | and returns the proper extended double-precision floating-point value
3321 | corresponding to the abstract input.  Ordinarily, the abstract value is
3322 | rounded and packed into the extended double-precision format, with the
3323 | inexact exception raised if the abstract input cannot be represented
3324 | exactly.  However, if the abstract value is too large, the overflow and
3325 | inexact exceptions are raised and an infinity or maximal finite value is
3326 | returned.  If the abstract value is too small, the input value is rounded to
3327 | a subnormal number, and the underflow and inexact exceptions are raised if
3328 | the abstract input cannot be represented exactly as a subnormal extended
3329 | double-precision floating-point number.
3330 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3331 | number of bits as single or double precision, respectively.  Otherwise, the
3332 | result is rounded to the full precision of the extended double-precision
3333 | format.
3334 |     The input significand must be normalized or smaller.  If the input
3335 | significand is not normalized, `zExp' must be 0; in that case, the result
3336 | returned is a subnormal number, and it must not require rounding.  The
3337 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3338 | Floating-Point Arithmetic.
3339 *----------------------------------------------------------------------------*/
3340 
3341 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3342                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3343                               float_status *status)
3344 {
3345     int8_t roundingMode;
3346     flag roundNearestEven, increment, isTiny;
3347     int64_t roundIncrement, roundMask, roundBits;
3348 
         /*
          * Two code paths follow.  roundingPrecision 64 or 32 selects the
          * reduced-precision path, which rounds inside zSig0 using
          * roundMask/roundIncrement.  Every other value (80 included) jumps
          * to `precision80', which keeps all 64 bits of zSig0 and rounds on
          * zSig1.
          */
3349     roundingMode = status->float_rounding_mode;
3350     roundNearestEven = ( roundingMode == float_round_nearest_even );
3351     if ( roundingPrecision == 80 ) goto precision80;
3352     if ( roundingPrecision == 64 ) {
             /* discard the low 11 bits of zSig0; increment is half of that */
3353         roundIncrement = LIT64( 0x0000000000000400 );
3354         roundMask = LIT64( 0x00000000000007FF );
3355     }
3356     else if ( roundingPrecision == 32 ) {
             /* discard the low 40 bits of zSig0; increment is half of that */
3357         roundIncrement = LIT64( 0x0000008000000000 );
3358         roundMask = LIT64( 0x000000FFFFFFFFFF );
3359     }
3360     else {
3361         goto precision80;
3362     }
         /* fold any non-zero zSig1 into the lowest bit of zSig0 (sticky bit) */
3363     zSig0 |= ( zSig1 != 0 );
         /*
          * Nearest modes keep the half-unit increment.  Directed modes add
          * either nothing or the full mask, depending on the sign.  An
          * unrecognised rounding mode aborts.
          */
3364     switch (roundingMode) {
3365     case float_round_nearest_even:
3366     case float_round_ties_away:
3367         break;
3368     case float_round_to_zero:
3369         roundIncrement = 0;
3370         break;
3371     case float_round_up:
3372         roundIncrement = zSign ? 0 : roundMask;
3373         break;
3374     case float_round_down:
3375         roundIncrement = zSign ? roundMask : 0;
3376         break;
3377     default:
3378         abort();
3379     }
3380     roundBits = zSig0 & roundMask;
         /*
          * The unsigned compare is true when zExp <= 0 (zExp - 1 wraps to a
          * large value) or when zExp >= 0x7FFE, i.e. for every exponent that
          * may overflow or underflow.
          */
3381     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
             /* overflow: exponent too large, or the rounding add carries out
              * of 64 bits at the largest finite exponent */
3382         if (    ( 0x7FFE < zExp )
3383              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3384            ) {
3385             goto overflow;
3386         }
3387         if ( zExp <= 0 ) {
                 /* flush-to-zero: report output_denormal, return signed zero */
3388             if (status->flush_to_zero) {
3389                 float_raise(float_flag_output_denormal, status);
3390                 return packFloatx80(zSign, 0, 0);
3391             }
                 /* tiny unless tininess is detected after rounding, zExp is
                  * exactly 0 and the rounding add carries out of zSig0 */
3392             isTiny =
3393                    (status->float_detect_tininess
3394                     == float_tininess_before_rounding)
3395                 || ( zExp < 0 )
3396                 || ( zSig0 <= zSig0 + roundIncrement );
                 /* denormalise by 1 - zExp bits; the helper is presumably a
                  * right shift that keeps a sticky bit - defined elsewhere */
3397             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3398             zExp = 0;
3399             roundBits = zSig0 & roundMask;
3400             if (isTiny && roundBits) {
3401                 float_raise(float_flag_underflow, status);
3402             }
3403             if (roundBits) {
3404                 status->float_exception_flags |= float_flag_inexact;
3405             }
3406             zSig0 += roundIncrement;
                 /* rounding carried into bit 63: exponent field becomes 1 */
3407             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3408             roundIncrement = roundMask + 1;
                 /* exact tie under nearest-even: also clear the lowest kept
                  * bit so the result is even */
3409             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3410                 roundMask |= roundIncrement;
3411             }
3412             zSig0 &= ~ roundMask;
3413             return packFloatx80( zSign, zExp, zSig0 );
3414         }
3415     }
3416     if (roundBits) {
3417         status->float_exception_flags |= float_flag_inexact;
3418     }
3419     zSig0 += roundIncrement;
         /* unsigned wrap-around means the add carried out of bit 63:
          * renormalise to 0x8000... and bump the exponent */
3420     if ( zSig0 < roundIncrement ) {
3421         ++zExp;
3422         zSig0 = LIT64( 0x8000000000000000 );
3423     }
3424     roundIncrement = roundMask + 1;
         /* exact tie under nearest-even: clear the lowest kept bit as well */
3425     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3426         roundMask |= roundIncrement;
3427     }
3428     zSig0 &= ~ roundMask;
3429     if ( zSig0 == 0 ) zExp = 0;
3430     return packFloatx80( zSign, zExp, zSig0 );
3431  precision80:
         /*
          * Full-precision path: `increment' says whether zSig0 must be
          * bumped by one.  Nearest modes increment when the top bit of zSig1
          * is set; directed modes increment when zSig1 is non-zero and the
          * sign matches the rounding direction.
          */
3432     switch (roundingMode) {
3433     case float_round_nearest_even:
3434     case float_round_ties_away:
3435         increment = ((int64_t)zSig1 < 0);
3436         break;
3437     case float_round_to_zero:
3438         increment = 0;
3439         break;
3440     case float_round_up:
3441         increment = !zSign && zSig1;
3442         break;
3443     case float_round_down:
3444         increment = zSign && zSig1;
3445         break;
3446     default:
3447         abort();
3448     }
         /* same range test as above: zExp <= 0 or zExp >= 0x7FFE */
3449     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3450         if (    ( 0x7FFE < zExp )
3451              || (    ( zExp == 0x7FFE )
3452                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3453                   && increment
3454                 )
3455            ) {
                 /* full precision: ~roundMask below is then all ones */
3456             roundMask = 0;
                 /*
                  * Also reached by goto from the reduced-precision path, in
                  * which case roundMask still holds that path's mask and the
                  * largest finite significand is truncated to it.
                  */
3457  overflow:
3458             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /* modes that round toward zero for this sign return the
                  * largest finite value, all others return infinity */
3459             if (    ( roundingMode == float_round_to_zero )
3460                  || ( zSign && ( roundingMode == float_round_up ) )
3461                  || ( ! zSign && ( roundingMode == float_round_down ) )
3462                ) {
3463                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3464             }
3465             return packFloatx80(zSign,
3466                                 floatx80_infinity_high,
3467                                 floatx80_infinity_low);
3468         }
3469         if ( zExp <= 0 ) {
                 /* NOTE(review): unlike the reduced-precision path, this
                  * branch does not consult status->flush_to_zero - confirm
                  * that this is intentional */
3470             isTiny =
3471                    (status->float_detect_tininess
3472                     == float_tininess_before_rounding)
3473                 || ( zExp < 0 )
3474                 || ! increment
3475                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3476             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3477             zExp = 0;
3478             if (isTiny && zSig1) {
3479                 float_raise(float_flag_underflow, status);
3480             }
3481             if (zSig1) {
3482                 status->float_exception_flags |= float_flag_inexact;
3483             }
                 /* recompute the increment from the shifted-out bits */
3484             switch (roundingMode) {
3485             case float_round_nearest_even:
3486             case float_round_ties_away:
3487                 increment = ((int64_t)zSig1 < 0);
3488                 break;
3489             case float_round_to_zero:
3490                 increment = 0;
3491                 break;
3492             case float_round_up:
3493                 increment = !zSign && zSig1;
3494                 break;
3495             case float_round_down:
3496                 increment = zSign && zSig1;
3497                 break;
3498             default:
3499                 abort();
3500             }
3501             if ( increment ) {
3502                 ++zSig0;
                     /* nearest-even tie (only the top bit of zSig1 set):
                      * clear bit 0 so the result is even */
3503                 zSig0 &=
3504                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                     /* the increment reached bit 63: exponent field is 1 */
3505                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3506             }
3507             return packFloatx80( zSign, zExp, zSig0 );
3508         }
3509     }
3510     if (zSig1) {
3511         status->float_exception_flags |= float_flag_inexact;
3512     }
3513     if ( increment ) {
3514         ++zSig0;
             /* wrapped to zero: carry out of bit 63, renormalise */
3515         if ( zSig0 == 0 ) {
3516             ++zExp;
3517             zSig0 = LIT64( 0x8000000000000000 );
3518         }
3519         else {
                 /* nearest-even tie: clear bit 0 */
3520             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3521         }
3522     }
3523     else {
3524         if ( zSig0 == 0 ) zExp = 0;
3525     }
3526     return packFloatx80( zSign, zExp, zSig0 );
3527 
3528 }
3529 
3530 /*----------------------------------------------------------------------------
3531 | Takes an abstract floating-point value having sign `zSign', exponent
3532 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3533 | and returns the proper extended double-precision floating-point value
3534 | corresponding to the abstract input.  This routine is just like
3535 | `roundAndPackFloatx80' except that the input significand does not have to be
3536 | normalized.
3537 *----------------------------------------------------------------------------*/
3538 
3539 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3540                                        flag zSign, int32_t zExp,
3541                                        uint64_t zSig0, uint64_t zSig1,
3542                                        float_status *status)
3543 {
3544     int8_t shiftCount;
3545 
3546     if ( zSig0 == 0 ) {
3547         zSig0 = zSig1;
3548         zSig1 = 0;
3549         zExp -= 64;
3550     }
3551     shiftCount = clz64(zSig0);
3552     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3553     zExp -= shiftCount;
3554     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3555                                 zSig0, zSig1, status);
3556 
3557 }
3558 
3559 /*----------------------------------------------------------------------------
3560 | Returns the least-significant 64 fraction bits of the quadruple-precision
3561 | floating-point value `a'.
3562 *----------------------------------------------------------------------------*/
3563 
3564 static inline uint64_t extractFloat128Frac1( float128 a )
3565 {
3566 
3567     return a.low;
3568 
3569 }
3570 
3571 /*----------------------------------------------------------------------------
3572 | Returns the most-significant 48 fraction bits of the quadruple-precision
3573 | floating-point value `a'.
3574 *----------------------------------------------------------------------------*/
3575 
3576 static inline uint64_t extractFloat128Frac0( float128 a )
3577 {
3578 
3579     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3580 
3581 }
3582 
3583 /*----------------------------------------------------------------------------
3584 | Returns the exponent bits of the quadruple-precision floating-point value
3585 | `a'.
3586 *----------------------------------------------------------------------------*/
3587 
3588 static inline int32_t extractFloat128Exp( float128 a )
3589 {
3590 
3591     return ( a.high>>48 ) & 0x7FFF;
3592 
3593 }
3594 
3595 /*----------------------------------------------------------------------------
3596 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3597 *----------------------------------------------------------------------------*/
3598 
3599 static inline flag extractFloat128Sign( float128 a )
3600 {
3601 
3602     return a.high>>63;
3603 
3604 }
3605 
3606 /*----------------------------------------------------------------------------
3607 | Normalizes the subnormal quadruple-precision floating-point value
3608 | represented by the denormalized significand formed by the concatenation of
3609 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
3610 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
3611 | significand are stored at the location pointed to by `zSig0Ptr', and the
3612 | least significant 64 bits of the normalized significand are stored at the
3613 | location pointed to by `zSig1Ptr'.
3614 *----------------------------------------------------------------------------*/
3615 
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    /* Distance to move the leading set bit so that it lands on bit 48 of
     * the high output word (clz - 15); negative means a right shift. */
    int8_t shift;

    if (aSig0 != 0) {
        /* Leading bit is already in the high word: plain 128-bit left
         * shift of the pair. */
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: the low word supplies the leading bit. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* The low word has more significant bits than fit in the high
         * output word: split it across both output words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
3646 
3647 /*----------------------------------------------------------------------------
3648 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3649 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3650 | floating-point value, returning the result.  After being shifted into the
3651 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3652 | added together to form the most significant 32 bits of the result.  This
3653 | means that any integer portion of `zSig0' will be added into the exponent.
3654 | Since a properly normalized significand will have an integer portion equal
3655 | to 1, the `zExp' input should be 1 less than the desired result exponent
3656 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3657 | significand.
3658 *----------------------------------------------------------------------------*/
3659 
3660 static inline float128
3661  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3662 {
3663     float128 z;
3664 
3665     z.low = zSig1;
3666     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3667     return z;
3668 
3669 }
3670 
3671 /*----------------------------------------------------------------------------
3672 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3673 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3674 | and `zSig2', and returns the proper quadruple-precision floating-point value
3675 | corresponding to the abstract input.  Ordinarily, the abstract value is
3676 | simply rounded and packed into the quadruple-precision format, with the
3677 | inexact exception raised if the abstract input cannot be represented
3678 | exactly.  However, if the abstract value is too large, the overflow and
3679 | inexact exceptions are raised and an infinity or maximal finite value is
3680 | returned.  If the abstract value is too small, the input value is rounded to
3681 | a subnormal number, and the underflow and inexact exceptions are raised if
3682 | the abstract input cannot be represented exactly as a subnormal quadruple-
3683 | precision floating-point number.
3684 |     The input significand must be normalized or smaller.  If the input
3685 | significand is not normalized, `zExp' must be 0; in that case, the result
3686 | returned is a subnormal number, and it must not require rounding.  In the
3687 | usual case that the input significand is normalized, `zExp' must be 1 less
3688 | than the ``true'' floating-point exponent.  The handling of underflow and
3689 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3690 *----------------------------------------------------------------------------*/
3691 
3692 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3693                                      uint64_t zSig0, uint64_t zSig1,
3694                                      uint64_t zSig2, float_status *status)
3695 {
3696     int8_t roundingMode;
3697     flag roundNearestEven, increment, isTiny;
3698 
3699     roundingMode = status->float_rounding_mode;
3700     roundNearestEven = ( roundingMode == float_round_nearest_even );
         /*
          * Decide whether the 128-bit significand zSig0:zSig1 has to be
          * bumped by one, based on the extra bits in zSig2.  Nearest modes
          * increment when the top bit of zSig2 is set.  Directed modes
          * increment when zSig2 is non-zero and the sign matches the
          * direction.  Round-to-odd increments when zSig2 is non-zero and
          * bit 0 of zSig1 is clear.  An unknown mode aborts.
          */
3701     switch (roundingMode) {
3702     case float_round_nearest_even:
3703     case float_round_ties_away:
3704         increment = ((int64_t)zSig2 < 0);
3705         break;
3706     case float_round_to_zero:
3707         increment = 0;
3708         break;
3709     case float_round_up:
3710         increment = !zSign && zSig2;
3711         break;
3712     case float_round_down:
3713         increment = zSign && zSig2;
3714         break;
3715     case float_round_to_odd:
3716         increment = !(zSig1 & 0x1) && zSig2;
3717         break;
3718     default:
3719         abort();
3720     }
         /* unsigned compare: true when zExp is negative or >= 0x7FFD, i.e.
          * for every exponent that may overflow or underflow */
3721     if ( 0x7FFD <= (uint32_t) zExp ) {
             /* overflow: exponent too large, or the significand is all ones
              * at the largest exponent and is about to be incremented */
3722         if (    ( 0x7FFD < zExp )
3723              || (    ( zExp == 0x7FFD )
3724                   && eq128(
3725                          LIT64( 0x0001FFFFFFFFFFFF ),
3726                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3727                          zSig0,
3728                          zSig1
3729                      )
3730                   && increment
3731                 )
3732            ) {
3733             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /* round-to-zero, round-to-odd and directed modes pointing
                  * toward zero for this sign return the largest finite
                  * value; every other mode returns infinity */
3734             if (    ( roundingMode == float_round_to_zero )
3735                  || ( zSign && ( roundingMode == float_round_up ) )
3736                  || ( ! zSign && ( roundingMode == float_round_down ) )
3737                  || (roundingMode == float_round_to_odd)
3738                ) {
3739                 return
3740                     packFloat128(
3741                         zSign,
3742                         0x7FFE,
3743                         LIT64( 0x0000FFFFFFFFFFFF ),
3744                         LIT64( 0xFFFFFFFFFFFFFFFF )
3745                     );
3746             }
3747             return packFloat128( zSign, 0x7FFF, 0, 0 );
3748         }
3749         if ( zExp < 0 ) {
                 /* flush-to-zero: report output_denormal, return signed zero */
3750             if (status->flush_to_zero) {
3751                 float_raise(float_flag_output_denormal, status);
3752                 return packFloat128(zSign, 0, 0, 0);
3753             }
                 /* tiny unless tininess is detected after rounding, zExp is
                  * exactly -1 and incrementing an all-ones significand
                  * carries out */
3754             isTiny =
3755                    (status->float_detect_tininess
3756                     == float_tininess_before_rounding)
3757                 || ( zExp < -1 )
3758                 || ! increment
3759                 || lt128(
3760                        zSig0,
3761                        zSig1,
3762                        LIT64( 0x0001FFFFFFFFFFFF ),
3763                        LIT64( 0xFFFFFFFFFFFFFFFF )
3764                    );
                 /* denormalise by -zExp bits; the shifted-out bits end up in
                  * zSig2 (helper defined elsewhere, presumably sticky) */
3765             shift128ExtraRightJamming(
3766                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3767             zExp = 0;
3768             if (isTiny && zSig2) {
3769                 float_raise(float_flag_underflow, status);
3770             }
                 /* recompute the increment from the shifted significand */
3771             switch (roundingMode) {
3772             case float_round_nearest_even:
3773             case float_round_ties_away:
3774                 increment = ((int64_t)zSig2 < 0);
3775                 break;
3776             case float_round_to_zero:
3777                 increment = 0;
3778                 break;
3779             case float_round_up:
3780                 increment = !zSign && zSig2;
3781                 break;
3782             case float_round_down:
3783                 increment = zSign && zSig2;
3784                 break;
3785             case float_round_to_odd:
3786                 increment = !(zSig1 & 0x1) && zSig2;
3787                 break;
3788             default:
3789                 abort();
3790             }
3791         }
3792     }
         /* any non-zero extra bits make the result inexact */
3793     if (zSig2) {
3794         status->float_exception_flags |= float_flag_inexact;
3795     }
3796     if ( increment ) {
3797         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
             /* nearest-even tie (zSig2 is exactly its top bit, so doubling it
              * wraps to 0): clear bit 0 to make the result even */
3798         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3799     }
3800     else {
             /* a zero significand is packed with a zero exponent */
3801         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3802     }
3803     return packFloat128( zSign, zExp, zSig0, zSig1 );
3804 
3805 }
3806 
3807 /*----------------------------------------------------------------------------
3808 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3809 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3810 | returns the proper quadruple-precision floating-point value corresponding
3811 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3812 | except that the input significand has fewer bits and does not have to be
3813 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3814 | point exponent.
3815 *----------------------------------------------------------------------------*/
3816 
3817 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3818                                               uint64_t zSig0, uint64_t zSig1,
3819                                               float_status *status)
3820 {
3821     int8_t shiftCount;
3822     uint64_t zSig2;
3823 
3824     if ( zSig0 == 0 ) {
3825         zSig0 = zSig1;
3826         zSig1 = 0;
3827         zExp -= 64;
3828     }
3829     shiftCount = clz64(zSig0) - 15;
3830     if ( 0 <= shiftCount ) {
3831         zSig2 = 0;
3832         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3833     }
3834     else {
3835         shift128ExtraRightJamming(
3836             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3837     }
3838     zExp -= shiftCount;
3839     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3840 
3841 }
3842 
3843 
3844 /*----------------------------------------------------------------------------
3845 | Returns the result of converting the 32-bit two's complement integer `a'
3846 | to the extended double-precision floating-point format.  The conversion
3847 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3848 | Arithmetic.
3849 *----------------------------------------------------------------------------*/
3850 
3851 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3852 {
3853     flag zSign;
3854     uint32_t absA;
3855     int8_t shiftCount;
3856     uint64_t zSig;
3857 
3858     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3859     zSign = ( a < 0 );
3860     absA = zSign ? - a : a;
3861     shiftCount = clz32(absA) + 32;
3862     zSig = absA;
3863     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3864 
3865 }
3866 
3867 /*----------------------------------------------------------------------------
3868 | Returns the result of converting the 32-bit two's complement integer `a' to
3869 | the quadruple-precision floating-point format.  The conversion is performed
3870 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3871 *----------------------------------------------------------------------------*/
3872 
3873 float128 int32_to_float128(int32_t a, float_status *status)
3874 {
3875     flag zSign;
3876     uint32_t absA;
3877     int8_t shiftCount;
3878     uint64_t zSig0;
3879 
3880     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3881     zSign = ( a < 0 );
3882     absA = zSign ? - a : a;
3883     shiftCount = clz32(absA) + 17;
3884     zSig0 = absA;
3885     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3886 
3887 }
3888 
3889 /*----------------------------------------------------------------------------
3890 | Returns the result of converting the 64-bit two's complement integer `a'
3891 | to the extended double-precision floating-point format.  The conversion
3892 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3893 | Arithmetic.
3894 *----------------------------------------------------------------------------*/
3895 
3896 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3897 {
3898     flag zSign;
3899     uint64_t absA;
3900     int8_t shiftCount;
3901 
3902     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3903     zSign = ( a < 0 );
3904     absA = zSign ? - a : a;
3905     shiftCount = clz64(absA);
3906     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3907 
3908 }
3909 
3910 /*----------------------------------------------------------------------------
3911 | Returns the result of converting the 64-bit two's complement integer `a' to
3912 | the quadruple-precision floating-point format.  The conversion is performed
3913 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3914 *----------------------------------------------------------------------------*/
3915 
3916 float128 int64_to_float128(int64_t a, float_status *status)
3917 {
3918     flag zSign;
3919     uint64_t absA;
3920     int8_t shiftCount;
3921     int32_t zExp;
3922     uint64_t zSig0, zSig1;
3923 
3924     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3925     zSign = ( a < 0 );
3926     absA = zSign ? - a : a;
3927     shiftCount = clz64(absA) + 49;
3928     zExp = 0x406E - shiftCount;
3929     if ( 64 <= shiftCount ) {
3930         zSig1 = 0;
3931         zSig0 = absA;
3932         shiftCount -= 64;
3933     }
3934     else {
3935         zSig1 = absA;
3936         zSig0 = 0;
3937     }
3938     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3939     return packFloat128( zSign, zExp, zSig0, zSig1 );
3940 
3941 }
3942 
3943 /*----------------------------------------------------------------------------
3944 | Returns the result of converting the 64-bit unsigned integer `a'
3945 | to the quadruple-precision floating-point format.  The conversion is performed
3946 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3947 *----------------------------------------------------------------------------*/
3948 
3949 float128 uint64_to_float128(uint64_t a, float_status *status)
3950 {
3951     if (a == 0) {
3952         return float128_zero;
3953     }
3954     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3955 }
3956 
3957 /*----------------------------------------------------------------------------
3958 | Returns the result of converting the single-precision floating-point value
3959 | `a' to the extended double-precision floating-point format.  The conversion
3960 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3961 | Arithmetic.
3962 *----------------------------------------------------------------------------*/
3963 
3964 floatx80 float32_to_floatx80(float32 a, float_status *status)
3965 {
3966     flag aSign;
3967     int aExp;
3968     uint32_t aSig;
3969 
3970     a = float32_squash_input_denormal(a, status);
3971     aSig = extractFloat32Frac( a );
3972     aExp = extractFloat32Exp( a );
3973     aSign = extractFloat32Sign( a );
3974     if ( aExp == 0xFF ) {
3975         if (aSig) {
3976             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3977         }
3978         return packFloatx80(aSign,
3979                             floatx80_infinity_high,
3980                             floatx80_infinity_low);
3981     }
3982     if ( aExp == 0 ) {
3983         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3984         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3985     }
3986     aSig |= 0x00800000;
3987     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3988 
3989 }
3990 
3991 /*----------------------------------------------------------------------------
3992 | Returns the result of converting the single-precision floating-point value
3993 | `a' to the quadruple-precision floating-point format.  The conversion is
3994 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3995 | Arithmetic.
3996 *----------------------------------------------------------------------------*/
3997 
3998 float128 float32_to_float128(float32 a, float_status *status)
3999 {
4000     flag aSign;
4001     int aExp;
4002     uint32_t aSig;
4003 
4004     a = float32_squash_input_denormal(a, status);
4005     aSig = extractFloat32Frac( a );
4006     aExp = extractFloat32Exp( a );
4007     aSign = extractFloat32Sign( a );
4008     if ( aExp == 0xFF ) {
4009         if (aSig) {
4010             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4011         }
4012         return packFloat128( aSign, 0x7FFF, 0, 0 );
4013     }
4014     if ( aExp == 0 ) {
4015         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4016         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4017         --aExp;
4018     }
4019     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4020 
4021 }
4022 
4023 /*----------------------------------------------------------------------------
4024 | Returns the remainder of the single-precision floating-point value `a'
4025 | with respect to the corresponding value `b'.  The operation is performed
4026 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4027 *----------------------------------------------------------------------------*/
4028 
4029 float32 float32_rem(float32 a, float32 b, float_status *status)
4030 {
4031     flag aSign, zSign;
4032     int aExp, bExp, expDiff;
4033     uint32_t aSig, bSig;
4034     uint32_t q;
4035     uint64_t aSig64, bSig64, q64;
4036     uint32_t alternateASig;
4037     int32_t sigMean;
4038     a = float32_squash_input_denormal(a, status);
4039     b = float32_squash_input_denormal(b, status);
4040 
4041     aSig = extractFloat32Frac( a );
4042     aExp = extractFloat32Exp( a );
4043     aSign = extractFloat32Sign( a );
4044     bSig = extractFloat32Frac( b );
4045     bExp = extractFloat32Exp( b );
         /* a is NaN or infinity: propagate a NaN operand if there is one,
          * otherwise (a is infinite) the operation is invalid */
4046     if ( aExp == 0xFF ) {
4047         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4048             return propagateFloat32NaN(a, b, status);
4049         }
4050         float_raise(float_flag_invalid, status);
4051         return float32_default_nan(status);
4052     }
         /* b is NaN: propagate; b is infinite: the remainder is a itself */
4053     if ( bExp == 0xFF ) {
4054         if (bSig) {
4055             return propagateFloat32NaN(a, b, status);
4056         }
4057         return a;
4058     }
         /* b is zero: invalid; b is subnormal: normalise it */
4059     if ( bExp == 0 ) {
4060         if ( bSig == 0 ) {
4061             float_raise(float_flag_invalid, status);
4062             return float32_default_nan(status);
4063         }
4064         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4065     }
         /* a is zero: return it unchanged; a is subnormal: normalise it */
4066     if ( aExp == 0 ) {
4067         if ( aSig == 0 ) return a;
4068         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4069     }
4070     expDiff = aExp - bExp;
         /* set bit 23 (the implicit leading one) in both significands */
4071     aSig |= 0x00800000;
4072     bSig |= 0x00800000;
         /* small exponent difference: work in 32-bit arithmetic with the
          * significands moved up to the top of the word */
4073     if ( expDiff < 32 ) {
4074         aSig <<= 8;
4075         bSig <<= 8;
4076         if ( expDiff < 0 ) {
                 /* a's exponent is at least 2 below b's, so |a| < |b|/2 and
                  * a is already the remainder */
4077             if ( expDiff < -1 ) return a;
4078             aSig >>= 1;
4079         }
             /* first quotient bit */
4080         q = ( bSig <= aSig );
4081         if ( q ) aSig -= bSig;
4082         if ( 0 < expDiff ) {
                 /* remaining quotient bits come from one 64-by-32 division;
                  * only the top expDiff bits are kept */
4083             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4084             q >>= 32 - expDiff;
4085             bSig >>= 2;
4086             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4087         }
4088         else {
4089             aSig >>= 2;
4090             bSig >>= 2;
4091         }
4092     }
         /* large exponent difference: reduce in 64-bit arithmetic, taking
          * 62 exponent steps per loop iteration */
4093     else {
4094         if ( bSig <= aSig ) aSig -= bSig;
4095         aSig64 = ( (uint64_t) aSig )<<40;
4096         bSig64 = ( (uint64_t) bSig )<<40;
4097         expDiff -= 64;
4098         while ( 0 < expDiff ) {
4099             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
                 /* back the estimate off by 2 (clamped at 0); presumably
                  * this keeps the partial remainder from going negative -
                  * TODO confirm against estimateDiv128To64's error bound */
4100             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4101             aSig64 = - ( ( bSig * q64 )<<38 );
4102             expDiff -= 62;
4103         }
4104         expDiff += 64;
4105         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4106         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4107         q = q64>>( 64 - expDiff );
4108         bSig <<= 6;
4109         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4110     }
         /* keep subtracting bSig until the remainder goes negative,
          * remembering the last non-negative value in alternateASig */
4111     do {
4112         alternateASig = aSig;
4113         ++q;
4114         aSig -= bSig;
4115     } while ( 0 <= (int32_t) aSig );
         /* of the two candidates take the one closer to zero; on an exact
          * tie take the one whose quotient is even */
4116     sigMean = aSig + alternateASig;
4117     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4118         aSig = alternateASig;
4119     }
         /* a negative remainder flips the result sign relative to a */
4120     zSign = ( (int32_t) aSig < 0 );
4121     if ( zSign ) aSig = - aSig;
4122     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4123 }
4124 
4125 
4126 
4127 /*----------------------------------------------------------------------------
4128 | Returns the binary exponential of the single-precision floating-point value
4129 | `a'. The operation is performed according to the IEC/IEEE Standard for
4130 | Binary Floating-Point Arithmetic.
4131 |
4132 | Uses the following identities:
4133 |
4134 | 1. -------------------------------------------------------------------------
4135 |      x    x*ln(2)
4136 |     2  = e
4137 |
4138 | 2. -------------------------------------------------------------------------
4139 |                      2     3     4     5           n
4140 |      x        x     x     x     x     x           x
4141 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4142 |               1!    2!    3!    4!    5!          n!
4143 *----------------------------------------------------------------------------*/
4144 
4145 static const float64 float32_exp2_coefficients[15] =
4146 {
4147     const_float64( 0x3ff0000000000000ll ), /*  1 */
4148     const_float64( 0x3fe0000000000000ll ), /*  2 */
4149     const_float64( 0x3fc5555555555555ll ), /*  3 */
4150     const_float64( 0x3fa5555555555555ll ), /*  4 */
4151     const_float64( 0x3f81111111111111ll ), /*  5 */
4152     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4153     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4154     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4155     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4156     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4157     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4158     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4159     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4160     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4161     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4162 };
4163 
4164 float32 float32_exp2(float32 a, float_status *status)
4165 {
4166     flag aSign;
4167     int aExp;
4168     uint32_t aSig;
4169     float64 r, x, xn;
4170     int i;
4171     a = float32_squash_input_denormal(a, status);
4172 
4173     aSig = extractFloat32Frac( a );
4174     aExp = extractFloat32Exp( a );
4175     aSign = extractFloat32Sign( a );
4176 
4177     if ( aExp == 0xFF) {
4178         if (aSig) {
4179             return propagateFloat32NaN(a, float32_zero, status);
4180         }
4181         return (aSign) ? float32_zero : a;
4182     }
4183     if (aExp == 0) {
4184         if (aSig == 0) return float32_one;
4185     }
4186 
4187     float_raise(float_flag_inexact, status);
4188 
4189     /* ******************************* */
4190     /* using float64 for approximation */
4191     /* ******************************* */
4192     x = float32_to_float64(a, status);
4193     x = float64_mul(x, float64_ln2, status);
4194 
4195     xn = x;
4196     r = float64_one;
4197     for (i = 0 ; i < 15 ; i++) {
4198         float64 f;
4199 
4200         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4201         r = float64_add(r, f, status);
4202 
4203         xn = float64_mul(xn, x, status);
4204     }
4205 
4206     return float64_to_float32(r, status);
4207 }
4208 
4209 /*----------------------------------------------------------------------------
4210 | Returns the binary log of the single-precision floating-point value `a'.
4211 | The operation is performed according to the IEC/IEEE Standard for Binary
4212 | Floating-Point Arithmetic.
4213 *----------------------------------------------------------------------------*/
4214 float32 float32_log2(float32 a, float_status *status)
4215 {
4216     flag aSign, zSign;
4217     int aExp;
4218     uint32_t aSig, zSig, i;
4219 
4220     a = float32_squash_input_denormal(a, status);
4221     aSig = extractFloat32Frac( a );
4222     aExp = extractFloat32Exp( a );
4223     aSign = extractFloat32Sign( a );
4224 
4225     if ( aExp == 0 ) {
4226         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4227         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4228     }
4229     if ( aSign ) {
4230         float_raise(float_flag_invalid, status);
4231         return float32_default_nan(status);
4232     }
4233     if ( aExp == 0xFF ) {
4234         if (aSig) {
4235             return propagateFloat32NaN(a, float32_zero, status);
4236         }
4237         return a;
4238     }
4239 
4240     aExp -= 0x7F;
4241     aSig |= 0x00800000;
4242     zSign = aExp < 0;
4243     zSig = aExp << 23;
4244 
4245     for (i = 1 << 22; i > 0; i >>= 1) {
4246         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4247         if ( aSig & 0x01000000 ) {
4248             aSig >>= 1;
4249             zSig |= i;
4250         }
4251     }
4252 
4253     if ( zSign )
4254         zSig = -zSig;
4255 
4256     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4257 }
4258 
4259 /*----------------------------------------------------------------------------
4260 | Returns 1 if the single-precision floating-point value `a' is equal to
4261 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4262 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4263 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4264 *----------------------------------------------------------------------------*/
4265 
4266 int float32_eq(float32 a, float32 b, float_status *status)
4267 {
4268     uint32_t av, bv;
4269     a = float32_squash_input_denormal(a, status);
4270     b = float32_squash_input_denormal(b, status);
4271 
4272     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4273          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4274        ) {
4275         float_raise(float_flag_invalid, status);
4276         return 0;
4277     }
4278     av = float32_val(a);
4279     bv = float32_val(b);
4280     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4281 }
4282 
4283 /*----------------------------------------------------------------------------
4284 | Returns 1 if the single-precision floating-point value `a' is less than
4285 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4286 | exception is raised if either operand is a NaN.  The comparison is performed
4287 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4288 *----------------------------------------------------------------------------*/
4289 
4290 int float32_le(float32 a, float32 b, float_status *status)
4291 {
4292     flag aSign, bSign;
4293     uint32_t av, bv;
4294     a = float32_squash_input_denormal(a, status);
4295     b = float32_squash_input_denormal(b, status);
4296 
4297     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4298          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4299        ) {
4300         float_raise(float_flag_invalid, status);
4301         return 0;
4302     }
4303     aSign = extractFloat32Sign( a );
4304     bSign = extractFloat32Sign( b );
4305     av = float32_val(a);
4306     bv = float32_val(b);
4307     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4308     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4309 
4310 }
4311 
4312 /*----------------------------------------------------------------------------
4313 | Returns 1 if the single-precision floating-point value `a' is less than
4314 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4315 | raised if either operand is a NaN.  The comparison is performed according
4316 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4317 *----------------------------------------------------------------------------*/
4318 
4319 int float32_lt(float32 a, float32 b, float_status *status)
4320 {
4321     flag aSign, bSign;
4322     uint32_t av, bv;
4323     a = float32_squash_input_denormal(a, status);
4324     b = float32_squash_input_denormal(b, status);
4325 
4326     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4327          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4328        ) {
4329         float_raise(float_flag_invalid, status);
4330         return 0;
4331     }
4332     aSign = extractFloat32Sign( a );
4333     bSign = extractFloat32Sign( b );
4334     av = float32_val(a);
4335     bv = float32_val(b);
4336     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4337     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4338 
4339 }
4340 
4341 /*----------------------------------------------------------------------------
4342 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4343 | be compared, and 0 otherwise.  The invalid exception is raised if either
4344 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4345 | Standard for Binary Floating-Point Arithmetic.
4346 *----------------------------------------------------------------------------*/
4347 
4348 int float32_unordered(float32 a, float32 b, float_status *status)
4349 {
4350     a = float32_squash_input_denormal(a, status);
4351     b = float32_squash_input_denormal(b, status);
4352 
4353     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4354          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4355        ) {
4356         float_raise(float_flag_invalid, status);
4357         return 1;
4358     }
4359     return 0;
4360 }
4361 
4362 /*----------------------------------------------------------------------------
4363 | Returns 1 if the single-precision floating-point value `a' is equal to
4364 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4365 | exception.  The comparison is performed according to the IEC/IEEE Standard
4366 | for Binary Floating-Point Arithmetic.
4367 *----------------------------------------------------------------------------*/
4368 
4369 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4370 {
4371     a = float32_squash_input_denormal(a, status);
4372     b = float32_squash_input_denormal(b, status);
4373 
4374     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4375          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4376        ) {
4377         if (float32_is_signaling_nan(a, status)
4378          || float32_is_signaling_nan(b, status)) {
4379             float_raise(float_flag_invalid, status);
4380         }
4381         return 0;
4382     }
4383     return ( float32_val(a) == float32_val(b) ) ||
4384             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4385 }
4386 
4387 /*----------------------------------------------------------------------------
4388 | Returns 1 if the single-precision floating-point value `a' is less than or
4389 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4390 | cause an exception.  Otherwise, the comparison is performed according to the
4391 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4392 *----------------------------------------------------------------------------*/
4393 
4394 int float32_le_quiet(float32 a, float32 b, float_status *status)
4395 {
4396     flag aSign, bSign;
4397     uint32_t av, bv;
4398     a = float32_squash_input_denormal(a, status);
4399     b = float32_squash_input_denormal(b, status);
4400 
4401     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4402          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4403        ) {
4404         if (float32_is_signaling_nan(a, status)
4405          || float32_is_signaling_nan(b, status)) {
4406             float_raise(float_flag_invalid, status);
4407         }
4408         return 0;
4409     }
4410     aSign = extractFloat32Sign( a );
4411     bSign = extractFloat32Sign( b );
4412     av = float32_val(a);
4413     bv = float32_val(b);
4414     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4415     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4416 
4417 }
4418 
4419 /*----------------------------------------------------------------------------
4420 | Returns 1 if the single-precision floating-point value `a' is less than
4421 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4422 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4423 | Standard for Binary Floating-Point Arithmetic.
4424 *----------------------------------------------------------------------------*/
4425 
4426 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4427 {
4428     flag aSign, bSign;
4429     uint32_t av, bv;
4430     a = float32_squash_input_denormal(a, status);
4431     b = float32_squash_input_denormal(b, status);
4432 
4433     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4434          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4435        ) {
4436         if (float32_is_signaling_nan(a, status)
4437          || float32_is_signaling_nan(b, status)) {
4438             float_raise(float_flag_invalid, status);
4439         }
4440         return 0;
4441     }
4442     aSign = extractFloat32Sign( a );
4443     bSign = extractFloat32Sign( b );
4444     av = float32_val(a);
4445     bv = float32_val(b);
4446     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4447     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4448 
4449 }
4450 
4451 /*----------------------------------------------------------------------------
4452 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4453 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4454 | comparison is performed according to the IEC/IEEE Standard for Binary
4455 | Floating-Point Arithmetic.
4456 *----------------------------------------------------------------------------*/
4457 
4458 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4459 {
4460     a = float32_squash_input_denormal(a, status);
4461     b = float32_squash_input_denormal(b, status);
4462 
4463     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4464          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4465        ) {
4466         if (float32_is_signaling_nan(a, status)
4467          || float32_is_signaling_nan(b, status)) {
4468             float_raise(float_flag_invalid, status);
4469         }
4470         return 1;
4471     }
4472     return 0;
4473 }
4474 
4475 /*----------------------------------------------------------------------------
4476 | If `a' is denormal and we are in flush-to-zero mode then set the
4477 | input-denormal exception and return zero. Otherwise just return the value.
4478 *----------------------------------------------------------------------------*/
4479 float16 float16_squash_input_denormal(float16 a, float_status *status)
4480 {
4481     if (status->flush_inputs_to_zero) {
4482         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4483             float_raise(float_flag_input_denormal, status);
4484             return make_float16(float16_val(a) & 0x8000);
4485         }
4486     }
4487     return a;
4488 }
4489 
4490 /*----------------------------------------------------------------------------
4491 | Returns the result of converting the double-precision floating-point value
4492 | `a' to the extended double-precision floating-point format.  The conversion
4493 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4494 | Arithmetic.
4495 *----------------------------------------------------------------------------*/
4496 
4497 floatx80 float64_to_floatx80(float64 a, float_status *status)
4498 {
4499     flag aSign;
4500     int aExp;
4501     uint64_t aSig;
4502 
4503     a = float64_squash_input_denormal(a, status);
4504     aSig = extractFloat64Frac( a );
4505     aExp = extractFloat64Exp( a );
4506     aSign = extractFloat64Sign( a );
4507     if ( aExp == 0x7FF ) {
4508         if (aSig) {
4509             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4510         }
4511         return packFloatx80(aSign,
4512                             floatx80_infinity_high,
4513                             floatx80_infinity_low);
4514     }
4515     if ( aExp == 0 ) {
4516         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4517         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4518     }
4519     return
4520         packFloatx80(
4521             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4522 
4523 }
4524 
4525 /*----------------------------------------------------------------------------
4526 | Returns the result of converting the double-precision floating-point value
4527 | `a' to the quadruple-precision floating-point format.  The conversion is
4528 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4529 | Arithmetic.
4530 *----------------------------------------------------------------------------*/
4531 
4532 float128 float64_to_float128(float64 a, float_status *status)
4533 {
4534     flag aSign;
4535     int aExp;
4536     uint64_t aSig, zSig0, zSig1;
4537 
4538     a = float64_squash_input_denormal(a, status);
4539     aSig = extractFloat64Frac( a );
4540     aExp = extractFloat64Exp( a );
4541     aSign = extractFloat64Sign( a );
4542     if ( aExp == 0x7FF ) {
4543         if (aSig) {
4544             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4545         }
4546         return packFloat128( aSign, 0x7FFF, 0, 0 );
4547     }
4548     if ( aExp == 0 ) {
4549         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4551         --aExp;
4552     }
4553     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4554     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4555 
4556 }
4557 
4558 
4559 /*----------------------------------------------------------------------------
4560 | Returns the remainder of the double-precision floating-point value `a'
4561 | with respect to the corresponding value `b'.  The operation is performed
4562 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4563 *----------------------------------------------------------------------------*/
4564 
4565 float64 float64_rem(float64 a, float64 b, float_status *status)
4566 {
4567     flag aSign, zSign;
4568     int aExp, bExp, expDiff;
4569     uint64_t aSig, bSig;
4570     uint64_t q, alternateASig;
4571     int64_t sigMean;
4572 
4573     a = float64_squash_input_denormal(a, status);
4574     b = float64_squash_input_denormal(b, status);
4575     aSig = extractFloat64Frac( a );
4576     aExp = extractFloat64Exp( a );
4577     aSign = extractFloat64Sign( a );
4578     bSig = extractFloat64Frac( b );
4579     bExp = extractFloat64Exp( b );
4580     if ( aExp == 0x7FF ) {
4581         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4582             return propagateFloat64NaN(a, b, status);
4583         }
4584         float_raise(float_flag_invalid, status);
4585         return float64_default_nan(status);
4586     }
4587     if ( bExp == 0x7FF ) {
4588         if (bSig) {
4589             return propagateFloat64NaN(a, b, status);
4590         }
4591         return a;
4592     }
4593     if ( bExp == 0 ) {
4594         if ( bSig == 0 ) {
4595             float_raise(float_flag_invalid, status);
4596             return float64_default_nan(status);
4597         }
4598         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4599     }
4600     if ( aExp == 0 ) {
4601         if ( aSig == 0 ) return a;
4602         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4603     }
4604     expDiff = aExp - bExp;
4605     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4606     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4607     if ( expDiff < 0 ) {
4608         if ( expDiff < -1 ) return a;
4609         aSig >>= 1;
4610     }
4611     q = ( bSig <= aSig );
4612     if ( q ) aSig -= bSig;
4613     expDiff -= 64;
4614     while ( 0 < expDiff ) {
4615         q = estimateDiv128To64( aSig, 0, bSig );
4616         q = ( 2 < q ) ? q - 2 : 0;
4617         aSig = - ( ( bSig>>2 ) * q );
4618         expDiff -= 62;
4619     }
4620     expDiff += 64;
4621     if ( 0 < expDiff ) {
4622         q = estimateDiv128To64( aSig, 0, bSig );
4623         q = ( 2 < q ) ? q - 2 : 0;
4624         q >>= 64 - expDiff;
4625         bSig >>= 2;
4626         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4627     }
4628     else {
4629         aSig >>= 2;
4630         bSig >>= 2;
4631     }
4632     do {
4633         alternateASig = aSig;
4634         ++q;
4635         aSig -= bSig;
4636     } while ( 0 <= (int64_t) aSig );
4637     sigMean = aSig + alternateASig;
4638     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4639         aSig = alternateASig;
4640     }
4641     zSign = ( (int64_t) aSig < 0 );
4642     if ( zSign ) aSig = - aSig;
4643     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4644 
4645 }
4646 
4647 /*----------------------------------------------------------------------------
4648 | Returns the binary log of the double-precision floating-point value `a'.
4649 | The operation is performed according to the IEC/IEEE Standard for Binary
4650 | Floating-Point Arithmetic.
4651 *----------------------------------------------------------------------------*/
4652 float64 float64_log2(float64 a, float_status *status)
4653 {
4654     flag aSign, zSign;
4655     int aExp;
4656     uint64_t aSig, aSig0, aSig1, zSig, i;
4657     a = float64_squash_input_denormal(a, status);
4658 
4659     aSig = extractFloat64Frac( a );
4660     aExp = extractFloat64Exp( a );
4661     aSign = extractFloat64Sign( a );
4662 
4663     if ( aExp == 0 ) {
4664         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4665         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4666     }
4667     if ( aSign ) {
4668         float_raise(float_flag_invalid, status);
4669         return float64_default_nan(status);
4670     }
4671     if ( aExp == 0x7FF ) {
4672         if (aSig) {
4673             return propagateFloat64NaN(a, float64_zero, status);
4674         }
4675         return a;
4676     }
4677 
4678     aExp -= 0x3FF;
4679     aSig |= LIT64( 0x0010000000000000 );
4680     zSign = aExp < 0;
4681     zSig = (uint64_t)aExp << 52;
4682     for (i = 1LL << 51; i > 0; i >>= 1) {
4683         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4684         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4685         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4686             aSig >>= 1;
4687             zSig |= i;
4688         }
4689     }
4690 
4691     if ( zSign )
4692         zSig = -zSig;
4693     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4694 }
4695 
4696 /*----------------------------------------------------------------------------
4697 | Returns 1 if the double-precision floating-point value `a' is equal to the
4698 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4699 | if either operand is a NaN.  Otherwise, the comparison is performed
4700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4701 *----------------------------------------------------------------------------*/
4702 
4703 int float64_eq(float64 a, float64 b, float_status *status)
4704 {
4705     uint64_t av, bv;
4706     a = float64_squash_input_denormal(a, status);
4707     b = float64_squash_input_denormal(b, status);
4708 
4709     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4710          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4711        ) {
4712         float_raise(float_flag_invalid, status);
4713         return 0;
4714     }
4715     av = float64_val(a);
4716     bv = float64_val(b);
4717     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4718 
4719 }
4720 
4721 /*----------------------------------------------------------------------------
4722 | Returns 1 if the double-precision floating-point value `a' is less than or
4723 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4724 | exception is raised if either operand is a NaN.  The comparison is performed
4725 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4726 *----------------------------------------------------------------------------*/
4727 
4728 int float64_le(float64 a, float64 b, float_status *status)
4729 {
4730     flag aSign, bSign;
4731     uint64_t av, bv;
4732     a = float64_squash_input_denormal(a, status);
4733     b = float64_squash_input_denormal(b, status);
4734 
4735     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4736          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4737        ) {
4738         float_raise(float_flag_invalid, status);
4739         return 0;
4740     }
4741     aSign = extractFloat64Sign( a );
4742     bSign = extractFloat64Sign( b );
4743     av = float64_val(a);
4744     bv = float64_val(b);
4745     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4746     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4747 
4748 }
4749 
4750 /*----------------------------------------------------------------------------
4751 | Returns 1 if the double-precision floating-point value `a' is less than
4752 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4753 | raised if either operand is a NaN.  The comparison is performed according
4754 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4755 *----------------------------------------------------------------------------*/
4756 
4757 int float64_lt(float64 a, float64 b, float_status *status)
4758 {
4759     flag aSign, bSign;
4760     uint64_t av, bv;
4761 
4762     a = float64_squash_input_denormal(a, status);
4763     b = float64_squash_input_denormal(b, status);
4764     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4765          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4766        ) {
4767         float_raise(float_flag_invalid, status);
4768         return 0;
4769     }
4770     aSign = extractFloat64Sign( a );
4771     bSign = extractFloat64Sign( b );
4772     av = float64_val(a);
4773     bv = float64_val(b);
4774     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4775     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4776 
4777 }
4778 
4779 /*----------------------------------------------------------------------------
4780 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4781 | be compared, and 0 otherwise.  The invalid exception is raised if either
4782 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4783 | Standard for Binary Floating-Point Arithmetic.
4784 *----------------------------------------------------------------------------*/
4785 
4786 int float64_unordered(float64 a, float64 b, float_status *status)
4787 {
4788     a = float64_squash_input_denormal(a, status);
4789     b = float64_squash_input_denormal(b, status);
4790 
4791     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4792          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4793        ) {
4794         float_raise(float_flag_invalid, status);
4795         return 1;
4796     }
4797     return 0;
4798 }
4799 
4800 /*----------------------------------------------------------------------------
4801 | Returns 1 if the double-precision floating-point value `a' is equal to the
4802 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4803 | exception.The comparison is performed according to the IEC/IEEE Standard
4804 | for Binary Floating-Point Arithmetic.
4805 *----------------------------------------------------------------------------*/
4806 
4807 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4808 {
4809     uint64_t av, bv;
4810     a = float64_squash_input_denormal(a, status);
4811     b = float64_squash_input_denormal(b, status);
4812 
4813     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4814          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4815        ) {
4816         if (float64_is_signaling_nan(a, status)
4817          || float64_is_signaling_nan(b, status)) {
4818             float_raise(float_flag_invalid, status);
4819         }
4820         return 0;
4821     }
4822     av = float64_val(a);
4823     bv = float64_val(b);
4824     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4825 
4826 }
4827 
4828 /*----------------------------------------------------------------------------
4829 | Returns 1 if the double-precision floating-point value `a' is less than or
4830 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4831 | cause an exception.  Otherwise, the comparison is performed according to the
4832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4833 *----------------------------------------------------------------------------*/
4834 
4835 int float64_le_quiet(float64 a, float64 b, float_status *status)
4836 {
4837     flag aSign, bSign;
4838     uint64_t av, bv;
4839     a = float64_squash_input_denormal(a, status);
4840     b = float64_squash_input_denormal(b, status);
4841 
4842     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4843          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4844        ) {
4845         if (float64_is_signaling_nan(a, status)
4846          || float64_is_signaling_nan(b, status)) {
4847             float_raise(float_flag_invalid, status);
4848         }
4849         return 0;
4850     }
4851     aSign = extractFloat64Sign( a );
4852     bSign = extractFloat64Sign( b );
4853     av = float64_val(a);
4854     bv = float64_val(b);
4855     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4856     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4857 
4858 }
4859 
4860 /*----------------------------------------------------------------------------
4861 | Returns 1 if the double-precision floating-point value `a' is less than
4862 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4863 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4864 | Standard for Binary Floating-Point Arithmetic.
4865 *----------------------------------------------------------------------------*/
4866 
4867 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4868 {
4869     flag aSign, bSign;
4870     uint64_t av, bv;
4871     a = float64_squash_input_denormal(a, status);
4872     b = float64_squash_input_denormal(b, status);
4873 
4874     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4875          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4876        ) {
4877         if (float64_is_signaling_nan(a, status)
4878          || float64_is_signaling_nan(b, status)) {
4879             float_raise(float_flag_invalid, status);
4880         }
4881         return 0;
4882     }
4883     aSign = extractFloat64Sign( a );
4884     bSign = extractFloat64Sign( b );
4885     av = float64_val(a);
4886     bv = float64_val(b);
4887     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4888     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4889 
4890 }
4891 
4892 /*----------------------------------------------------------------------------
4893 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4894 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4895 | comparison is performed according to the IEC/IEEE Standard for Binary
4896 | Floating-Point Arithmetic.
4897 *----------------------------------------------------------------------------*/
4898 
4899 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4900 {
4901     a = float64_squash_input_denormal(a, status);
4902     b = float64_squash_input_denormal(b, status);
4903 
4904     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4905          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4906        ) {
4907         if (float64_is_signaling_nan(a, status)
4908          || float64_is_signaling_nan(b, status)) {
4909             float_raise(float_flag_invalid, status);
4910         }
4911         return 1;
4912     }
4913     return 0;
4914 }
4915 
4916 /*----------------------------------------------------------------------------
4917 | Returns the result of converting the extended double-precision floating-
4918 | point value `a' to the 32-bit two's complement integer format.  The
4919 | conversion is performed according to the IEC/IEEE Standard for Binary
4920 | Floating-Point Arithmetic---which means in particular that the conversion
4921 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4922 | largest positive integer is returned.  Otherwise, if the conversion
4923 | overflows, the largest integer with the same sign as `a' is returned.
4924 *----------------------------------------------------------------------------*/
4925 
4926 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4927 {
4928     flag aSign;
4929     int32_t aExp, shiftCount;
4930     uint64_t aSig;
4931 
4932     if (floatx80_invalid_encoding(a)) {
4933         float_raise(float_flag_invalid, status);
4934         return 1 << 31;
4935     }
4936     aSig = extractFloatx80Frac( a );
4937     aExp = extractFloatx80Exp( a );
4938     aSign = extractFloatx80Sign( a );
4939     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4940     shiftCount = 0x4037 - aExp;
4941     if ( shiftCount <= 0 ) shiftCount = 1;
4942     shift64RightJamming( aSig, shiftCount, &aSig );
4943     return roundAndPackInt32(aSign, aSig, status);
4944 
4945 }
4946 
4947 /*----------------------------------------------------------------------------
4948 | Returns the result of converting the extended double-precision floating-
4949 | point value `a' to the 32-bit two's complement integer format.  The
4950 | conversion is performed according to the IEC/IEEE Standard for Binary
4951 | Floating-Point Arithmetic, except that the conversion is always rounded
4952 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4953 | Otherwise, if the conversion overflows, the largest integer with the same
4954 | sign as `a' is returned.
4955 *----------------------------------------------------------------------------*/
4956 
4957 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4958 {
4959     flag aSign;
4960     int32_t aExp, shiftCount;
4961     uint64_t aSig, savedASig;
4962     int32_t z;
4963 
4964     if (floatx80_invalid_encoding(a)) {
4965         float_raise(float_flag_invalid, status);
4966         return 1 << 31;
4967     }
4968     aSig = extractFloatx80Frac( a );
4969     aExp = extractFloatx80Exp( a );
4970     aSign = extractFloatx80Sign( a );
4971     if ( 0x401E < aExp ) {
4972         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4973         goto invalid;
4974     }
4975     else if ( aExp < 0x3FFF ) {
4976         if (aExp || aSig) {
4977             status->float_exception_flags |= float_flag_inexact;
4978         }
4979         return 0;
4980     }
4981     shiftCount = 0x403E - aExp;
4982     savedASig = aSig;
4983     aSig >>= shiftCount;
4984     z = aSig;
4985     if ( aSign ) z = - z;
4986     if ( ( z < 0 ) ^ aSign ) {
4987  invalid:
4988         float_raise(float_flag_invalid, status);
4989         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4990     }
4991     if ( ( aSig<<shiftCount ) != savedASig ) {
4992         status->float_exception_flags |= float_flag_inexact;
4993     }
4994     return z;
4995 
4996 }
4997 
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of converting the extended double-precision floating-
5000 | point value `a' to the 64-bit two's complement integer format.  The
5001 | conversion is performed according to the IEC/IEEE Standard for Binary
5002 | Floating-Point Arithmetic---which means in particular that the conversion
5003 | is rounded according to the current rounding mode.  If `a' is a NaN,
5004 | the largest positive integer is returned.  Otherwise, if the conversion
5005 | overflows, the largest integer with the same sign as `a' is returned.
5006 *----------------------------------------------------------------------------*/
5007 
5008 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5009 {
5010     flag aSign;
5011     int32_t aExp, shiftCount;
5012     uint64_t aSig, aSigExtra;
5013 
5014     if (floatx80_invalid_encoding(a)) {
5015         float_raise(float_flag_invalid, status);
5016         return 1ULL << 63;
5017     }
5018     aSig = extractFloatx80Frac( a );
5019     aExp = extractFloatx80Exp( a );
5020     aSign = extractFloatx80Sign( a );
5021     shiftCount = 0x403E - aExp;
5022     if ( shiftCount <= 0 ) {
5023         if ( shiftCount ) {
5024             float_raise(float_flag_invalid, status);
5025             if (!aSign || floatx80_is_any_nan(a)) {
5026                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5027             }
5028             return (int64_t) LIT64( 0x8000000000000000 );
5029         }
5030         aSigExtra = 0;
5031     }
5032     else {
5033         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5034     }
5035     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5036 
5037 }
5038 
5039 /*----------------------------------------------------------------------------
5040 | Returns the result of converting the extended double-precision floating-
5041 | point value `a' to the 64-bit two's complement integer format.  The
5042 | conversion is performed according to the IEC/IEEE Standard for Binary
5043 | Floating-Point Arithmetic, except that the conversion is always rounded
5044 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5045 | Otherwise, if the conversion overflows, the largest integer with the same
5046 | sign as `a' is returned.
5047 *----------------------------------------------------------------------------*/
5048 
5049 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5050 {
5051     flag aSign;
5052     int32_t aExp, shiftCount;
5053     uint64_t aSig;
5054     int64_t z;
5055 
5056     if (floatx80_invalid_encoding(a)) {
5057         float_raise(float_flag_invalid, status);
5058         return 1ULL << 63;
5059     }
5060     aSig = extractFloatx80Frac( a );
5061     aExp = extractFloatx80Exp( a );
5062     aSign = extractFloatx80Sign( a );
5063     shiftCount = aExp - 0x403E;
5064     if ( 0 <= shiftCount ) {
5065         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5066         if ( ( a.high != 0xC03E ) || aSig ) {
5067             float_raise(float_flag_invalid, status);
5068             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5069                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5070             }
5071         }
5072         return (int64_t) LIT64( 0x8000000000000000 );
5073     }
5074     else if ( aExp < 0x3FFF ) {
5075         if (aExp | aSig) {
5076             status->float_exception_flags |= float_flag_inexact;
5077         }
5078         return 0;
5079     }
5080     z = aSig>>( - shiftCount );
5081     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5082         status->float_exception_flags |= float_flag_inexact;
5083     }
5084     if ( aSign ) z = - z;
5085     return z;
5086 
5087 }
5088 
5089 /*----------------------------------------------------------------------------
5090 | Returns the result of converting the extended double-precision floating-
5091 | point value `a' to the single-precision floating-point format.  The
5092 | conversion is performed according to the IEC/IEEE Standard for Binary
5093 | Floating-Point Arithmetic.
5094 *----------------------------------------------------------------------------*/
5095 
5096 float32 floatx80_to_float32(floatx80 a, float_status *status)
5097 {
5098     flag aSign;
5099     int32_t aExp;
5100     uint64_t aSig;
5101 
5102     if (floatx80_invalid_encoding(a)) {
5103         float_raise(float_flag_invalid, status);
5104         return float32_default_nan(status);
5105     }
5106     aSig = extractFloatx80Frac( a );
5107     aExp = extractFloatx80Exp( a );
5108     aSign = extractFloatx80Sign( a );
5109     if ( aExp == 0x7FFF ) {
5110         if ( (uint64_t) ( aSig<<1 ) ) {
5111             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5112         }
5113         return packFloat32( aSign, 0xFF, 0 );
5114     }
5115     shift64RightJamming( aSig, 33, &aSig );
5116     if ( aExp || aSig ) aExp -= 0x3F81;
5117     return roundAndPackFloat32(aSign, aExp, aSig, status);
5118 
5119 }
5120 
5121 /*----------------------------------------------------------------------------
5122 | Returns the result of converting the extended double-precision floating-
5123 | point value `a' to the double-precision floating-point format.  The
5124 | conversion is performed according to the IEC/IEEE Standard for Binary
5125 | Floating-Point Arithmetic.
5126 *----------------------------------------------------------------------------*/
5127 
5128 float64 floatx80_to_float64(floatx80 a, float_status *status)
5129 {
5130     flag aSign;
5131     int32_t aExp;
5132     uint64_t aSig, zSig;
5133 
5134     if (floatx80_invalid_encoding(a)) {
5135         float_raise(float_flag_invalid, status);
5136         return float64_default_nan(status);
5137     }
5138     aSig = extractFloatx80Frac( a );
5139     aExp = extractFloatx80Exp( a );
5140     aSign = extractFloatx80Sign( a );
5141     if ( aExp == 0x7FFF ) {
5142         if ( (uint64_t) ( aSig<<1 ) ) {
5143             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5144         }
5145         return packFloat64( aSign, 0x7FF, 0 );
5146     }
5147     shift64RightJamming( aSig, 1, &zSig );
5148     if ( aExp || aSig ) aExp -= 0x3C01;
5149     return roundAndPackFloat64(aSign, aExp, zSig, status);
5150 
5151 }
5152 
5153 /*----------------------------------------------------------------------------
5154 | Returns the result of converting the extended double-precision floating-
5155 | point value `a' to the quadruple-precision floating-point format.  The
5156 | conversion is performed according to the IEC/IEEE Standard for Binary
5157 | Floating-Point Arithmetic.
5158 *----------------------------------------------------------------------------*/
5159 
5160 float128 floatx80_to_float128(floatx80 a, float_status *status)
5161 {
5162     flag aSign;
5163     int aExp;
5164     uint64_t aSig, zSig0, zSig1;
5165 
5166     if (floatx80_invalid_encoding(a)) {
5167         float_raise(float_flag_invalid, status);
5168         return float128_default_nan(status);
5169     }
5170     aSig = extractFloatx80Frac( a );
5171     aExp = extractFloatx80Exp( a );
5172     aSign = extractFloatx80Sign( a );
5173     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5174         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5175     }
5176     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5177     return packFloat128( aSign, aExp, zSig0, zSig1 );
5178 
5179 }
5180 
5181 /*----------------------------------------------------------------------------
5182 | Rounds the extended double-precision floating-point value `a'
5183 | to the precision provided by floatx80_rounding_precision and returns the
5184 | result as an extended double-precision floating-point value.
5185 | The operation is performed according to the IEC/IEEE Standard for Binary
5186 | Floating-Point Arithmetic.
5187 *----------------------------------------------------------------------------*/
5188 
5189 floatx80 floatx80_round(floatx80 a, float_status *status)
5190 {
5191     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5192                                 extractFloatx80Sign(a),
5193                                 extractFloatx80Exp(a),
5194                                 extractFloatx80Frac(a), 0, status);
5195 }
5196 
5197 /*----------------------------------------------------------------------------
5198 | Rounds the extended double-precision floating-point value `a' to an integer,
5199 | and returns the result as an extended quadruple-precision floating-point
5200 | value.  The operation is performed according to the IEC/IEEE Standard for
5201 | Binary Floating-Point Arithmetic.
5202 *----------------------------------------------------------------------------*/
5203 
5204 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5205 {
5206     flag aSign;
5207     int32_t aExp;
5208     uint64_t lastBitMask, roundBitsMask;
5209     floatx80 z;
5210 
5211     if (floatx80_invalid_encoding(a)) {
5212         float_raise(float_flag_invalid, status);
5213         return floatx80_default_nan(status);
5214     }
5215     aExp = extractFloatx80Exp( a );
5216     if ( 0x403E <= aExp ) {
5217         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5218             return propagateFloatx80NaN(a, a, status);
5219         }
5220         return a;
5221     }
5222     if ( aExp < 0x3FFF ) {
5223         if (    ( aExp == 0 )
5224              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5225             return a;
5226         }
5227         status->float_exception_flags |= float_flag_inexact;
5228         aSign = extractFloatx80Sign( a );
5229         switch (status->float_rounding_mode) {
5230          case float_round_nearest_even:
5231             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5232                ) {
5233                 return
5234                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5235             }
5236             break;
5237         case float_round_ties_away:
5238             if (aExp == 0x3FFE) {
5239                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5240             }
5241             break;
5242          case float_round_down:
5243             return
5244                   aSign ?
5245                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5246                 : packFloatx80( 0, 0, 0 );
5247          case float_round_up:
5248             return
5249                   aSign ? packFloatx80( 1, 0, 0 )
5250                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5251         }
5252         return packFloatx80( aSign, 0, 0 );
5253     }
5254     lastBitMask = 1;
5255     lastBitMask <<= 0x403E - aExp;
5256     roundBitsMask = lastBitMask - 1;
5257     z = a;
5258     switch (status->float_rounding_mode) {
5259     case float_round_nearest_even:
5260         z.low += lastBitMask>>1;
5261         if ((z.low & roundBitsMask) == 0) {
5262             z.low &= ~lastBitMask;
5263         }
5264         break;
5265     case float_round_ties_away:
5266         z.low += lastBitMask >> 1;
5267         break;
5268     case float_round_to_zero:
5269         break;
5270     case float_round_up:
5271         if (!extractFloatx80Sign(z)) {
5272             z.low += roundBitsMask;
5273         }
5274         break;
5275     case float_round_down:
5276         if (extractFloatx80Sign(z)) {
5277             z.low += roundBitsMask;
5278         }
5279         break;
5280     default:
5281         abort();
5282     }
5283     z.low &= ~ roundBitsMask;
5284     if ( z.low == 0 ) {
5285         ++z.high;
5286         z.low = LIT64( 0x8000000000000000 );
5287     }
5288     if (z.low != a.low) {
5289         status->float_exception_flags |= float_flag_inexact;
5290     }
5291     return z;
5292 
5293 }
5294 
5295 /*----------------------------------------------------------------------------
5296 | Returns the result of adding the absolute values of the extended double-
5297 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5298 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5299 | The addition is performed according to the IEC/IEEE Standard for Binary
5300 | Floating-Point Arithmetic.
5301 *----------------------------------------------------------------------------*/
5302 
5303 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5304                                 float_status *status)
5305 {
5306     int32_t aExp, bExp, zExp;
5307     uint64_t aSig, bSig, zSig0, zSig1;
5308     int32_t expDiff;
5309 
5310     aSig = extractFloatx80Frac( a );
5311     aExp = extractFloatx80Exp( a );
5312     bSig = extractFloatx80Frac( b );
5313     bExp = extractFloatx80Exp( b );
5314     expDiff = aExp - bExp;
5315     if ( 0 < expDiff ) {
5316         if ( aExp == 0x7FFF ) {
5317             if ((uint64_t)(aSig << 1)) {
5318                 return propagateFloatx80NaN(a, b, status);
5319             }
5320             return a;
5321         }
5322         if ( bExp == 0 ) --expDiff;
5323         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5324         zExp = aExp;
5325     }
5326     else if ( expDiff < 0 ) {
5327         if ( bExp == 0x7FFF ) {
5328             if ((uint64_t)(bSig << 1)) {
5329                 return propagateFloatx80NaN(a, b, status);
5330             }
5331             return packFloatx80(zSign,
5332                                 floatx80_infinity_high,
5333                                 floatx80_infinity_low);
5334         }
5335         if ( aExp == 0 ) ++expDiff;
5336         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5337         zExp = bExp;
5338     }
5339     else {
5340         if ( aExp == 0x7FFF ) {
5341             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5342                 return propagateFloatx80NaN(a, b, status);
5343             }
5344             return a;
5345         }
5346         zSig1 = 0;
5347         zSig0 = aSig + bSig;
5348         if ( aExp == 0 ) {
5349             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5350             goto roundAndPack;
5351         }
5352         zExp = aExp;
5353         goto shiftRight1;
5354     }
5355     zSig0 = aSig + bSig;
5356     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5357  shiftRight1:
5358     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5359     zSig0 |= LIT64( 0x8000000000000000 );
5360     ++zExp;
5361  roundAndPack:
5362     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5363                                 zSign, zExp, zSig0, zSig1, status);
5364 }
5365 
5366 /*----------------------------------------------------------------------------
5367 | Returns the result of subtracting the absolute values of the extended
5368 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5369 | difference is negated before being returned.  `zSign' is ignored if the
5370 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5371 | Standard for Binary Floating-Point Arithmetic.
5372 *----------------------------------------------------------------------------*/
5373 
5374 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5375                                 float_status *status)
5376 {
5377     int32_t aExp, bExp, zExp;
5378     uint64_t aSig, bSig, zSig0, zSig1;
5379     int32_t expDiff;
5380 
5381     aSig = extractFloatx80Frac( a );
5382     aExp = extractFloatx80Exp( a );
5383     bSig = extractFloatx80Frac( b );
5384     bExp = extractFloatx80Exp( b );
5385     expDiff = aExp - bExp;
5386     if ( 0 < expDiff ) goto aExpBigger;
5387     if ( expDiff < 0 ) goto bExpBigger;
5388     if ( aExp == 0x7FFF ) {
5389         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5390             return propagateFloatx80NaN(a, b, status);
5391         }
5392         float_raise(float_flag_invalid, status);
5393         return floatx80_default_nan(status);
5394     }
5395     if ( aExp == 0 ) {
5396         aExp = 1;
5397         bExp = 1;
5398     }
5399     zSig1 = 0;
5400     if ( bSig < aSig ) goto aBigger;
5401     if ( aSig < bSig ) goto bBigger;
5402     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5403  bExpBigger:
5404     if ( bExp == 0x7FFF ) {
5405         if ((uint64_t)(bSig << 1)) {
5406             return propagateFloatx80NaN(a, b, status);
5407         }
5408         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5409                             floatx80_infinity_low);
5410     }
5411     if ( aExp == 0 ) ++expDiff;
5412     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5413  bBigger:
5414     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5415     zExp = bExp;
5416     zSign ^= 1;
5417     goto normalizeRoundAndPack;
5418  aExpBigger:
5419     if ( aExp == 0x7FFF ) {
5420         if ((uint64_t)(aSig << 1)) {
5421             return propagateFloatx80NaN(a, b, status);
5422         }
5423         return a;
5424     }
5425     if ( bExp == 0 ) --expDiff;
5426     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5427  aBigger:
5428     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5429     zExp = aExp;
5430  normalizeRoundAndPack:
5431     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5432                                          zSign, zExp, zSig0, zSig1, status);
5433 }
5434 
5435 /*----------------------------------------------------------------------------
5436 | Returns the result of adding the extended double-precision floating-point
5437 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5438 | Standard for Binary Floating-Point Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 
5441 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5442 {
5443     flag aSign, bSign;
5444 
5445     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5446         float_raise(float_flag_invalid, status);
5447         return floatx80_default_nan(status);
5448     }
5449     aSign = extractFloatx80Sign( a );
5450     bSign = extractFloatx80Sign( b );
5451     if ( aSign == bSign ) {
5452         return addFloatx80Sigs(a, b, aSign, status);
5453     }
5454     else {
5455         return subFloatx80Sigs(a, b, aSign, status);
5456     }
5457 
5458 }
5459 
5460 /*----------------------------------------------------------------------------
5461 | Returns the result of subtracting the extended double-precision floating-
5462 | point values `a' and `b'.  The operation is performed according to the
5463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5464 *----------------------------------------------------------------------------*/
5465 
5466 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5467 {
5468     flag aSign, bSign;
5469 
5470     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5471         float_raise(float_flag_invalid, status);
5472         return floatx80_default_nan(status);
5473     }
5474     aSign = extractFloatx80Sign( a );
5475     bSign = extractFloatx80Sign( b );
5476     if ( aSign == bSign ) {
5477         return subFloatx80Sigs(a, b, aSign, status);
5478     }
5479     else {
5480         return addFloatx80Sigs(a, b, aSign, status);
5481     }
5482 
5483 }
5484 
5485 /*----------------------------------------------------------------------------
5486 | Returns the result of multiplying the extended double-precision floating-
5487 | point values `a' and `b'.  The operation is performed according to the
5488 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5489 *----------------------------------------------------------------------------*/
5490 
5491 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5492 {
5493     flag aSign, bSign, zSign;
5494     int32_t aExp, bExp, zExp;
5495     uint64_t aSig, bSig, zSig0, zSig1;
5496 
5497     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5498         float_raise(float_flag_invalid, status);
5499         return floatx80_default_nan(status);
5500     }
5501     aSig = extractFloatx80Frac( a );
5502     aExp = extractFloatx80Exp( a );
5503     aSign = extractFloatx80Sign( a );
5504     bSig = extractFloatx80Frac( b );
5505     bExp = extractFloatx80Exp( b );
5506     bSign = extractFloatx80Sign( b );
5507     zSign = aSign ^ bSign;
5508     if ( aExp == 0x7FFF ) {
5509         if (    (uint64_t) ( aSig<<1 )
5510              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5511             return propagateFloatx80NaN(a, b, status);
5512         }
5513         if ( ( bExp | bSig ) == 0 ) goto invalid;
5514         return packFloatx80(zSign, floatx80_infinity_high,
5515                                    floatx80_infinity_low);
5516     }
5517     if ( bExp == 0x7FFF ) {
5518         if ((uint64_t)(bSig << 1)) {
5519             return propagateFloatx80NaN(a, b, status);
5520         }
5521         if ( ( aExp | aSig ) == 0 ) {
5522  invalid:
5523             float_raise(float_flag_invalid, status);
5524             return floatx80_default_nan(status);
5525         }
5526         return packFloatx80(zSign, floatx80_infinity_high,
5527                                    floatx80_infinity_low);
5528     }
5529     if ( aExp == 0 ) {
5530         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5531         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5532     }
5533     if ( bExp == 0 ) {
5534         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5535         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5536     }
5537     zExp = aExp + bExp - 0x3FFE;
5538     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5539     if ( 0 < (int64_t) zSig0 ) {
5540         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5541         --zExp;
5542     }
5543     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5544                                 zSign, zExp, zSig0, zSig1, status);
5545 }
5546 
5547 /*----------------------------------------------------------------------------
5548 | Returns the result of dividing the extended double-precision floating-point
5549 | value `a' by the corresponding value `b'.  The operation is performed
5550 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5551 *----------------------------------------------------------------------------*/
5552 
5553 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5554 {
5555     flag aSign, bSign, zSign;
5556     int32_t aExp, bExp, zExp;
5557     uint64_t aSig, bSig, zSig0, zSig1;
5558     uint64_t rem0, rem1, rem2, term0, term1, term2;
5559 
5560     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5561         float_raise(float_flag_invalid, status);
5562         return floatx80_default_nan(status);
5563     }
5564     aSig = extractFloatx80Frac( a );
5565     aExp = extractFloatx80Exp( a );
5566     aSign = extractFloatx80Sign( a );
5567     bSig = extractFloatx80Frac( b );
5568     bExp = extractFloatx80Exp( b );
5569     bSign = extractFloatx80Sign( b );
5570     zSign = aSign ^ bSign;
5571     if ( aExp == 0x7FFF ) {
5572         if ((uint64_t)(aSig << 1)) {
5573             return propagateFloatx80NaN(a, b, status);
5574         }
5575         if ( bExp == 0x7FFF ) {
5576             if ((uint64_t)(bSig << 1)) {
5577                 return propagateFloatx80NaN(a, b, status);
5578             }
5579             goto invalid;
5580         }
5581         return packFloatx80(zSign, floatx80_infinity_high,
5582                                    floatx80_infinity_low);
5583     }
5584     if ( bExp == 0x7FFF ) {
5585         if ((uint64_t)(bSig << 1)) {
5586             return propagateFloatx80NaN(a, b, status);
5587         }
5588         return packFloatx80( zSign, 0, 0 );
5589     }
5590     if ( bExp == 0 ) {
5591         if ( bSig == 0 ) {
5592             if ( ( aExp | aSig ) == 0 ) {
5593  invalid:
5594                 float_raise(float_flag_invalid, status);
5595                 return floatx80_default_nan(status);
5596             }
5597             float_raise(float_flag_divbyzero, status);
5598             return packFloatx80(zSign, floatx80_infinity_high,
5599                                        floatx80_infinity_low);
5600         }
5601         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5602     }
5603     if ( aExp == 0 ) {
5604         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5605         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5606     }
5607     zExp = aExp - bExp + 0x3FFE;
5608     rem1 = 0;
5609     if ( bSig <= aSig ) {
5610         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5611         ++zExp;
5612     }
5613     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5614     mul64To128( bSig, zSig0, &term0, &term1 );
5615     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5616     while ( (int64_t) rem0 < 0 ) {
5617         --zSig0;
5618         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5619     }
5620     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5621     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5622         mul64To128( bSig, zSig1, &term1, &term2 );
5623         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5624         while ( (int64_t) rem1 < 0 ) {
5625             --zSig1;
5626             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5627         }
5628         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5629     }
5630     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5631                                 zSign, zExp, zSig0, zSig1, status);
5632 }
5633 
5634 /*----------------------------------------------------------------------------
5635 | Returns the remainder of the extended double-precision floating-point value
5636 | `a' with respect to the corresponding value `b'.  The operation is performed
5637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5638 *----------------------------------------------------------------------------*/
5639 
5640 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5641 {
5642     flag aSign, zSign;
5643     int32_t aExp, bExp, expDiff;
5644     uint64_t aSig0, aSig1, bSig;
5645     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5646 
5647     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5648         float_raise(float_flag_invalid, status);
5649         return floatx80_default_nan(status);
5650     }
5651     aSig0 = extractFloatx80Frac( a );
5652     aExp = extractFloatx80Exp( a );
5653     aSign = extractFloatx80Sign( a );
5654     bSig = extractFloatx80Frac( b );
5655     bExp = extractFloatx80Exp( b );
5656     if ( aExp == 0x7FFF ) {
5657         if (    (uint64_t) ( aSig0<<1 )
5658              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5659             return propagateFloatx80NaN(a, b, status);
5660         }
5661         goto invalid;
5662     }
5663     if ( bExp == 0x7FFF ) {
5664         if ((uint64_t)(bSig << 1)) {
5665             return propagateFloatx80NaN(a, b, status);
5666         }
5667         return a;
5668     }
5669     if ( bExp == 0 ) {
5670         if ( bSig == 0 ) {
5671  invalid:
5672             float_raise(float_flag_invalid, status);
5673             return floatx80_default_nan(status);
5674         }
5675         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5676     }
5677     if ( aExp == 0 ) {
5678         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5679         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5680     }
5681     bSig |= LIT64( 0x8000000000000000 );
5682     zSign = aSign;
5683     expDiff = aExp - bExp;
5684     aSig1 = 0;
5685     if ( expDiff < 0 ) {
5686         if ( expDiff < -1 ) return a;
5687         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5688         expDiff = 0;
5689     }
5690     q = ( bSig <= aSig0 );
5691     if ( q ) aSig0 -= bSig;
5692     expDiff -= 64;
5693     while ( 0 < expDiff ) {
5694         q = estimateDiv128To64( aSig0, aSig1, bSig );
5695         q = ( 2 < q ) ? q - 2 : 0;
5696         mul64To128( bSig, q, &term0, &term1 );
5697         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5698         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5699         expDiff -= 62;
5700     }
5701     expDiff += 64;
5702     if ( 0 < expDiff ) {
5703         q = estimateDiv128To64( aSig0, aSig1, bSig );
5704         q = ( 2 < q ) ? q - 2 : 0;
5705         q >>= 64 - expDiff;
5706         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5707         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5708         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5709         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5710             ++q;
5711             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5712         }
5713     }
5714     else {
5715         term1 = 0;
5716         term0 = bSig;
5717     }
5718     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5719     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5720          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5721               && ( q & 1 ) )
5722        ) {
5723         aSig0 = alternateASig0;
5724         aSig1 = alternateASig1;
5725         zSign = ! zSign;
5726     }
5727     return
5728         normalizeRoundAndPackFloatx80(
5729             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5730 
5731 }
5732 
5733 /*----------------------------------------------------------------------------
5734 | Returns the square root of the extended double-precision floating-point
5735 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5736 | for Binary Floating-Point Arithmetic.
5737 *----------------------------------------------------------------------------*/
5738 
5739 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5740 {
5741     flag aSign;
5742     int32_t aExp, zExp;
5743     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5744     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5745 
5746     if (floatx80_invalid_encoding(a)) {
5747         float_raise(float_flag_invalid, status);
5748         return floatx80_default_nan(status);
5749     }
5750     aSig0 = extractFloatx80Frac( a );
5751     aExp = extractFloatx80Exp( a );
5752     aSign = extractFloatx80Sign( a );
5753     if ( aExp == 0x7FFF ) {
5754         if ((uint64_t)(aSig0 << 1)) {
5755             return propagateFloatx80NaN(a, a, status);
5756         }
5757         if ( ! aSign ) return a;
5758         goto invalid;
5759     }
5760     if ( aSign ) {
5761         if ( ( aExp | aSig0 ) == 0 ) return a;
5762  invalid:
5763         float_raise(float_flag_invalid, status);
5764         return floatx80_default_nan(status);
5765     }
5766     if ( aExp == 0 ) {
5767         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5768         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5769     }
5770     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5771     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5772     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5773     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5774     doubleZSig0 = zSig0<<1;
5775     mul64To128( zSig0, zSig0, &term0, &term1 );
5776     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5777     while ( (int64_t) rem0 < 0 ) {
5778         --zSig0;
5779         doubleZSig0 -= 2;
5780         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5781     }
5782     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5783     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5784         if ( zSig1 == 0 ) zSig1 = 1;
5785         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5786         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5787         mul64To128( zSig1, zSig1, &term2, &term3 );
5788         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5789         while ( (int64_t) rem1 < 0 ) {
5790             --zSig1;
5791             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5792             term3 |= 1;
5793             term2 |= doubleZSig0;
5794             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5795         }
5796         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5797     }
5798     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5799     zSig0 |= doubleZSig0;
5800     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5801                                 0, zExp, zSig0, zSig1, status);
5802 }
5803 
5804 /*----------------------------------------------------------------------------
5805 | Returns 1 if the extended double-precision floating-point value `a' is equal
5806 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5807 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5808 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5809 *----------------------------------------------------------------------------*/
5810 
5811 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5812 {
5813 
5814     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5815         || (extractFloatx80Exp(a) == 0x7FFF
5816             && (uint64_t) (extractFloatx80Frac(a) << 1))
5817         || (extractFloatx80Exp(b) == 0x7FFF
5818             && (uint64_t) (extractFloatx80Frac(b) << 1))
5819        ) {
5820         float_raise(float_flag_invalid, status);
5821         return 0;
5822     }
5823     return
5824            ( a.low == b.low )
5825         && (    ( a.high == b.high )
5826              || (    ( a.low == 0 )
5827                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5828            );
5829 
5830 }
5831 
5832 /*----------------------------------------------------------------------------
5833 | Returns 1 if the extended double-precision floating-point value `a' is
5834 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5835 | invalid exception is raised if either operand is a NaN.  The comparison is
5836 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5837 | Arithmetic.
5838 *----------------------------------------------------------------------------*/
5839 
5840 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5841 {
5842     flag aSign, bSign;
5843 
5844     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5845         || (extractFloatx80Exp(a) == 0x7FFF
5846             && (uint64_t) (extractFloatx80Frac(a) << 1))
5847         || (extractFloatx80Exp(b) == 0x7FFF
5848             && (uint64_t) (extractFloatx80Frac(b) << 1))
5849        ) {
5850         float_raise(float_flag_invalid, status);
5851         return 0;
5852     }
5853     aSign = extractFloatx80Sign( a );
5854     bSign = extractFloatx80Sign( b );
5855     if ( aSign != bSign ) {
5856         return
5857                aSign
5858             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5859                  == 0 );
5860     }
5861     return
5862           aSign ? le128( b.high, b.low, a.high, a.low )
5863         : le128( a.high, a.low, b.high, b.low );
5864 
5865 }
5866 
5867 /*----------------------------------------------------------------------------
5868 | Returns 1 if the extended double-precision floating-point value `a' is
5869 | less than the corresponding value `b', and 0 otherwise.  The invalid
5870 | exception is raised if either operand is a NaN.  The comparison is performed
5871 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5872 *----------------------------------------------------------------------------*/
5873 
5874 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5875 {
5876     flag aSign, bSign;
5877 
5878     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5879         || (extractFloatx80Exp(a) == 0x7FFF
5880             && (uint64_t) (extractFloatx80Frac(a) << 1))
5881         || (extractFloatx80Exp(b) == 0x7FFF
5882             && (uint64_t) (extractFloatx80Frac(b) << 1))
5883        ) {
5884         float_raise(float_flag_invalid, status);
5885         return 0;
5886     }
5887     aSign = extractFloatx80Sign( a );
5888     bSign = extractFloatx80Sign( b );
5889     if ( aSign != bSign ) {
5890         return
5891                aSign
5892             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5893                  != 0 );
5894     }
5895     return
5896           aSign ? lt128( b.high, b.low, a.high, a.low )
5897         : lt128( a.high, a.low, b.high, b.low );
5898 
5899 }
5900 
5901 /*----------------------------------------------------------------------------
5902 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5903 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5904 | either operand is a NaN.   The comparison is performed according to the
5905 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5906 *----------------------------------------------------------------------------*/
5907 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5908 {
5909     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5910         || (extractFloatx80Exp(a) == 0x7FFF
5911             && (uint64_t) (extractFloatx80Frac(a) << 1))
5912         || (extractFloatx80Exp(b) == 0x7FFF
5913             && (uint64_t) (extractFloatx80Frac(b) << 1))
5914        ) {
5915         float_raise(float_flag_invalid, status);
5916         return 1;
5917     }
5918     return 0;
5919 }
5920 
5921 /*----------------------------------------------------------------------------
5922 | Returns 1 if the extended double-precision floating-point value `a' is
5923 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5924 | cause an exception.  The comparison is performed according to the IEC/IEEE
5925 | Standard for Binary Floating-Point Arithmetic.
5926 *----------------------------------------------------------------------------*/
5927 
5928 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5929 {
5930 
5931     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5932         float_raise(float_flag_invalid, status);
5933         return 0;
5934     }
5935     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5936               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5937          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5938               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5939        ) {
5940         if (floatx80_is_signaling_nan(a, status)
5941          || floatx80_is_signaling_nan(b, status)) {
5942             float_raise(float_flag_invalid, status);
5943         }
5944         return 0;
5945     }
5946     return
5947            ( a.low == b.low )
5948         && (    ( a.high == b.high )
5949              || (    ( a.low == 0 )
5950                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5951            );
5952 
5953 }
5954 
5955 /*----------------------------------------------------------------------------
5956 | Returns 1 if the extended double-precision floating-point value `a' is less
5957 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5958 | do not cause an exception.  Otherwise, the comparison is performed according
5959 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5960 *----------------------------------------------------------------------------*/
5961 
5962 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5963 {
5964     flag aSign, bSign;
5965 
5966     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5967         float_raise(float_flag_invalid, status);
5968         return 0;
5969     }
5970     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5971               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5972          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5973               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5974        ) {
5975         if (floatx80_is_signaling_nan(a, status)
5976          || floatx80_is_signaling_nan(b, status)) {
5977             float_raise(float_flag_invalid, status);
5978         }
5979         return 0;
5980     }
5981     aSign = extractFloatx80Sign( a );
5982     bSign = extractFloatx80Sign( b );
5983     if ( aSign != bSign ) {
5984         return
5985                aSign
5986             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5987                  == 0 );
5988     }
5989     return
5990           aSign ? le128( b.high, b.low, a.high, a.low )
5991         : le128( a.high, a.low, b.high, b.low );
5992 
5993 }
5994 
5995 /*----------------------------------------------------------------------------
5996 | Returns 1 if the extended double-precision floating-point value `a' is less
5997 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5998 | an exception.  Otherwise, the comparison is performed according to the
5999 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6000 *----------------------------------------------------------------------------*/
6001 
6002 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6003 {
6004     flag aSign, bSign;
6005 
6006     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6007         float_raise(float_flag_invalid, status);
6008         return 0;
6009     }
6010     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6011               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6012          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6013               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6014        ) {
6015         if (floatx80_is_signaling_nan(a, status)
6016          || floatx80_is_signaling_nan(b, status)) {
6017             float_raise(float_flag_invalid, status);
6018         }
6019         return 0;
6020     }
6021     aSign = extractFloatx80Sign( a );
6022     bSign = extractFloatx80Sign( b );
6023     if ( aSign != bSign ) {
6024         return
6025                aSign
6026             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6027                  != 0 );
6028     }
6029     return
6030           aSign ? lt128( b.high, b.low, a.high, a.low )
6031         : lt128( a.high, a.low, b.high, b.low );
6032 
6033 }
6034 
6035 /*----------------------------------------------------------------------------
6036 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6037 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6038 | The comparison is performed according to the IEC/IEEE Standard for Binary
6039 | Floating-Point Arithmetic.
6040 *----------------------------------------------------------------------------*/
6041 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6042 {
6043     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6044         float_raise(float_flag_invalid, status);
6045         return 1;
6046     }
6047     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6048               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6049          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6050               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6051        ) {
6052         if (floatx80_is_signaling_nan(a, status)
6053          || floatx80_is_signaling_nan(b, status)) {
6054             float_raise(float_flag_invalid, status);
6055         }
6056         return 1;
6057     }
6058     return 0;
6059 }
6060 
6061 /*----------------------------------------------------------------------------
6062 | Returns the result of converting the quadruple-precision floating-point
6063 | value `a' to the 32-bit two's complement integer format.  The conversion
6064 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6065 | Arithmetic---which means in particular that the conversion is rounded
6066 | according to the current rounding mode.  If `a' is a NaN, the largest
6067 | positive integer is returned.  Otherwise, if the conversion overflows, the
6068 | largest integer with the same sign as `a' is returned.
6069 *----------------------------------------------------------------------------*/
6070 
6071 int32_t float128_to_int32(float128 a, float_status *status)
6072 {
6073     flag aSign;
6074     int32_t aExp, shiftCount;
6075     uint64_t aSig0, aSig1;
6076 
6077     aSig1 = extractFloat128Frac1( a );
6078     aSig0 = extractFloat128Frac0( a );
6079     aExp = extractFloat128Exp( a );
6080     aSign = extractFloat128Sign( a );
6081     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6082     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6083     aSig0 |= ( aSig1 != 0 );
6084     shiftCount = 0x4028 - aExp;
6085     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6086     return roundAndPackInt32(aSign, aSig0, status);
6087 
6088 }
6089 
6090 /*----------------------------------------------------------------------------
6091 | Returns the result of converting the quadruple-precision floating-point
6092 | value `a' to the 32-bit two's complement integer format.  The conversion
6093 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6094 | Arithmetic, except that the conversion is always rounded toward zero.  If
6095 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6096 | conversion overflows, the largest integer with the same sign as `a' is
6097 | returned.
6098 *----------------------------------------------------------------------------*/
6099 
6100 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6101 {
6102     flag aSign;
6103     int32_t aExp, shiftCount;
6104     uint64_t aSig0, aSig1, savedASig;
6105     int32_t z;
6106 
6107     aSig1 = extractFloat128Frac1( a );
6108     aSig0 = extractFloat128Frac0( a );
6109     aExp = extractFloat128Exp( a );
6110     aSign = extractFloat128Sign( a );
6111     aSig0 |= ( aSig1 != 0 );
6112     if ( 0x401E < aExp ) {
6113         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6114         goto invalid;
6115     }
6116     else if ( aExp < 0x3FFF ) {
6117         if (aExp || aSig0) {
6118             status->float_exception_flags |= float_flag_inexact;
6119         }
6120         return 0;
6121     }
6122     aSig0 |= LIT64( 0x0001000000000000 );
6123     shiftCount = 0x402F - aExp;
6124     savedASig = aSig0;
6125     aSig0 >>= shiftCount;
6126     z = aSig0;
6127     if ( aSign ) z = - z;
6128     if ( ( z < 0 ) ^ aSign ) {
6129  invalid:
6130         float_raise(float_flag_invalid, status);
6131         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6132     }
6133     if ( ( aSig0<<shiftCount ) != savedASig ) {
6134         status->float_exception_flags |= float_flag_inexact;
6135     }
6136     return z;
6137 
6138 }
6139 
6140 /*----------------------------------------------------------------------------
6141 | Returns the result of converting the quadruple-precision floating-point
6142 | value `a' to the 64-bit two's complement integer format.  The conversion
6143 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6144 | Arithmetic---which means in particular that the conversion is rounded
6145 | according to the current rounding mode.  If `a' is a NaN, the largest
6146 | positive integer is returned.  Otherwise, if the conversion overflows, the
6147 | largest integer with the same sign as `a' is returned.
6148 *----------------------------------------------------------------------------*/
6149 
6150 int64_t float128_to_int64(float128 a, float_status *status)
6151 {
6152     flag aSign;
6153     int32_t aExp, shiftCount;
6154     uint64_t aSig0, aSig1;
6155 
6156     aSig1 = extractFloat128Frac1( a );
6157     aSig0 = extractFloat128Frac0( a );
6158     aExp = extractFloat128Exp( a );
6159     aSign = extractFloat128Sign( a );
6160     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6161     shiftCount = 0x402F - aExp;
6162     if ( shiftCount <= 0 ) {
6163         if ( 0x403E < aExp ) {
6164             float_raise(float_flag_invalid, status);
6165             if (    ! aSign
6166                  || (    ( aExp == 0x7FFF )
6167                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6168                     )
6169                ) {
6170                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6171             }
6172             return (int64_t) LIT64( 0x8000000000000000 );
6173         }
6174         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6175     }
6176     else {
6177         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6178     }
6179     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6180 
6181 }
6182 
6183 /*----------------------------------------------------------------------------
6184 | Returns the result of converting the quadruple-precision floating-point
6185 | value `a' to the 64-bit two's complement integer format.  The conversion
6186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6187 | Arithmetic, except that the conversion is always rounded toward zero.
6188 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6189 | the conversion overflows, the largest integer with the same sign as `a' is
6190 | returned.
6191 *----------------------------------------------------------------------------*/
6192 
6193 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6194 {
6195     flag aSign;
6196     int32_t aExp, shiftCount;
6197     uint64_t aSig0, aSig1;
6198     int64_t z;
6199 
6200     aSig1 = extractFloat128Frac1( a );
6201     aSig0 = extractFloat128Frac0( a );
6202     aExp = extractFloat128Exp( a );
6203     aSign = extractFloat128Sign( a );
6204     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6205     shiftCount = aExp - 0x402F;
6206     if ( 0 < shiftCount ) {
6207         if ( 0x403E <= aExp ) {
6208             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6209             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6210                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6211                 if (aSig1) {
6212                     status->float_exception_flags |= float_flag_inexact;
6213                 }
6214             }
6215             else {
6216                 float_raise(float_flag_invalid, status);
6217                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6218                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6219                 }
6220             }
6221             return (int64_t) LIT64( 0x8000000000000000 );
6222         }
6223         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6224         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6225             status->float_exception_flags |= float_flag_inexact;
6226         }
6227     }
6228     else {
6229         if ( aExp < 0x3FFF ) {
6230             if ( aExp | aSig0 | aSig1 ) {
6231                 status->float_exception_flags |= float_flag_inexact;
6232             }
6233             return 0;
6234         }
6235         z = aSig0>>( - shiftCount );
6236         if (    aSig1
6237              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6238             status->float_exception_flags |= float_flag_inexact;
6239         }
6240     }
6241     if ( aSign ) z = - z;
6242     return z;
6243 
6244 }
6245 
6246 /*----------------------------------------------------------------------------
6247 | Returns the result of converting the quadruple-precision floating-point value
6248 | `a' to the 64-bit unsigned integer format.  The conversion is
6249 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6250 | Arithmetic---which means in particular that the conversion is rounded
6251 | according to the current rounding mode.  If `a' is a NaN, the largest
6252 | positive integer is returned.  If the conversion overflows, the
6253 | largest unsigned integer is returned.  If 'a' is negative, the value is
6254 | rounded and zero is returned; negative values that do not round to zero
6255 | will raise the inexact exception.
6256 *----------------------------------------------------------------------------*/
6257 
6258 uint64_t float128_to_uint64(float128 a, float_status *status)
6259 {
6260     flag aSign;
6261     int aExp;
6262     int shiftCount;
6263     uint64_t aSig0, aSig1;
6264 
6265     aSig0 = extractFloat128Frac0(a);
6266     aSig1 = extractFloat128Frac1(a);
6267     aExp = extractFloat128Exp(a);
6268     aSign = extractFloat128Sign(a);
6269     if (aSign && (aExp > 0x3FFE)) {
6270         float_raise(float_flag_invalid, status);
6271         if (float128_is_any_nan(a)) {
6272             return LIT64(0xFFFFFFFFFFFFFFFF);
6273         } else {
6274             return 0;
6275         }
6276     }
6277     if (aExp) {
6278         aSig0 |= LIT64(0x0001000000000000);
6279     }
6280     shiftCount = 0x402F - aExp;
6281     if (shiftCount <= 0) {
6282         if (0x403E < aExp) {
6283             float_raise(float_flag_invalid, status);
6284             return LIT64(0xFFFFFFFFFFFFFFFF);
6285         }
6286         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6287     } else {
6288         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6289     }
6290     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6291 }
6292 
6293 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6294 {
6295     uint64_t v;
6296     signed char current_rounding_mode = status->float_rounding_mode;
6297 
6298     set_float_rounding_mode(float_round_to_zero, status);
6299     v = float128_to_uint64(a, status);
6300     set_float_rounding_mode(current_rounding_mode, status);
6301 
6302     return v;
6303 }
6304 
6305 /*----------------------------------------------------------------------------
6306 | Returns the result of converting the quadruple-precision floating-point
6307 | value `a' to the 32-bit unsigned integer format.  The conversion
6308 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6309 | Arithmetic except that the conversion is always rounded toward zero.
6310 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6311 | if the conversion overflows, the largest unsigned integer is returned.
6312 | If 'a' is negative, the value is rounded and zero is returned; negative
6313 | values that do not round to zero will raise the inexact exception.
6314 *----------------------------------------------------------------------------*/
6315 
6316 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6317 {
6318     uint64_t v;
6319     uint32_t res;
6320     int old_exc_flags = get_float_exception_flags(status);
6321 
6322     v = float128_to_uint64_round_to_zero(a, status);
6323     if (v > 0xffffffff) {
6324         res = 0xffffffff;
6325     } else {
6326         return v;
6327     }
6328     set_float_exception_flags(old_exc_flags, status);
6329     float_raise(float_flag_invalid, status);
6330     return res;
6331 }
6332 
6333 /*----------------------------------------------------------------------------
6334 | Returns the result of converting the quadruple-precision floating-point
6335 | value `a' to the single-precision floating-point format.  The conversion
6336 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6337 | Arithmetic.
6338 *----------------------------------------------------------------------------*/
6339 
6340 float32 float128_to_float32(float128 a, float_status *status)
6341 {
6342     flag aSign;
6343     int32_t aExp;
6344     uint64_t aSig0, aSig1;
6345     uint32_t zSig;
6346 
6347     aSig1 = extractFloat128Frac1( a );
6348     aSig0 = extractFloat128Frac0( a );
6349     aExp = extractFloat128Exp( a );
6350     aSign = extractFloat128Sign( a );
6351     if ( aExp == 0x7FFF ) {
6352         if ( aSig0 | aSig1 ) {
6353             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6354         }
6355         return packFloat32( aSign, 0xFF, 0 );
6356     }
6357     aSig0 |= ( aSig1 != 0 );
6358     shift64RightJamming( aSig0, 18, &aSig0 );
6359     zSig = aSig0;
6360     if ( aExp || zSig ) {
6361         zSig |= 0x40000000;
6362         aExp -= 0x3F81;
6363     }
6364     return roundAndPackFloat32(aSign, aExp, zSig, status);
6365 
6366 }
6367 
6368 /*----------------------------------------------------------------------------
6369 | Returns the result of converting the quadruple-precision floating-point
6370 | value `a' to the double-precision floating-point format.  The conversion
6371 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6372 | Arithmetic.
6373 *----------------------------------------------------------------------------*/
6374 
6375 float64 float128_to_float64(float128 a, float_status *status)
6376 {
6377     flag aSign;
6378     int32_t aExp;
6379     uint64_t aSig0, aSig1;
6380 
6381     aSig1 = extractFloat128Frac1( a );
6382     aSig0 = extractFloat128Frac0( a );
6383     aExp = extractFloat128Exp( a );
6384     aSign = extractFloat128Sign( a );
6385     if ( aExp == 0x7FFF ) {
6386         if ( aSig0 | aSig1 ) {
6387             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6388         }
6389         return packFloat64( aSign, 0x7FF, 0 );
6390     }
6391     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6392     aSig0 |= ( aSig1 != 0 );
6393     if ( aExp || aSig0 ) {
6394         aSig0 |= LIT64( 0x4000000000000000 );
6395         aExp -= 0x3C01;
6396     }
6397     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6398 
6399 }
6400 
6401 /*----------------------------------------------------------------------------
6402 | Returns the result of converting the quadruple-precision floating-point
6403 | value `a' to the extended double-precision floating-point format.  The
6404 | conversion is performed according to the IEC/IEEE Standard for Binary
6405 | Floating-Point Arithmetic.
6406 *----------------------------------------------------------------------------*/
6407 
6408 floatx80 float128_to_floatx80(float128 a, float_status *status)
6409 {
6410     flag aSign;
6411     int32_t aExp;
6412     uint64_t aSig0, aSig1;
6413 
6414     aSig1 = extractFloat128Frac1( a );
6415     aSig0 = extractFloat128Frac0( a );
6416     aExp = extractFloat128Exp( a );
6417     aSign = extractFloat128Sign( a );
6418     if ( aExp == 0x7FFF ) {
6419         if ( aSig0 | aSig1 ) {
6420             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6421         }
6422         return packFloatx80(aSign, floatx80_infinity_high,
6423                                    floatx80_infinity_low);
6424     }
6425     if ( aExp == 0 ) {
6426         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6427         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6428     }
6429     else {
6430         aSig0 |= LIT64( 0x0001000000000000 );
6431     }
6432     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6433     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6434 
6435 }
6436 
6437 /*----------------------------------------------------------------------------
6438 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6439 | returns the result as a quadruple-precision floating-point value.  The
6440 | operation is performed according to the IEC/IEEE Standard for Binary
6441 | Floating-Point Arithmetic.
     |
     | The rounding direction is read from status->float_rounding_mode.  The
     | inexact flag is raised whenever the returned value differs from `a'.  A
     | NaN input is handed to propagateFloat128NaN.  In the two mask-based paths
     | an unrecognised rounding mode calls abort(); in the path for magnitudes
     | below one, modes without an explicit case yield a signed zero.
6442 *----------------------------------------------------------------------------*/
6443 
6444 float128 float128_round_to_int(float128 a, float_status *status)
6445 {
6446     flag aSign;
6447     int32_t aExp;
6448     uint64_t lastBitMask, roundBitsMask;
6449     float128 z;
6450 
6451     aExp = extractFloat128Exp( a );
         /* Biased exponent 0x402F and above: the rounding masks below are applied
          * to the low word (with carries into the high word). */
6452     if ( 0x402F <= aExp ) {
             /* Biased exponent 0x406F and above: the input is returned as is,
              * except that a NaN goes through propagateFloat128NaN. */
6453         if ( 0x406F <= aExp ) {
6454             if (    ( aExp == 0x7FFF )
6455                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6456                ) {
6457                 return propagateFloat128NaN(a, a, status);
6458             }
6459             return a;
6460         }
             /* lastBitMask selects the lowest bit that is kept; roundBitsMask
              * covers every bit below it.  The shift is split in two so that for
              * aExp == 0x402F the total of 64 produces 0 instead of a single
              * out-of-range shift; lastBitMask == 0 then means the lowest kept
              * bit is bit 0 of z.high and roundBitsMask is all ones. */
6461         lastBitMask = 1;
6462         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6463         roundBitsMask = lastBitMask - 1;
6464         z = a;
6465         switch (status->float_rounding_mode) {
6466         case float_round_nearest_even:
6467             if ( lastBitMask ) {
                     /* Add half of the last kept bit; if no rounding bits remain
                      * set afterwards it was an exact tie, so clear the last
                      * kept bit to land on the even value. */
6468                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6469                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6470             }
6471             else {
                     /* Whole low word is fraction: its top bit is the half bit. */
6472                 if ( (int64_t) z.low < 0 ) {
6473                     ++z.high;
                         /* Only the half bit was set: tie, force an even result. */
6474                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6475                 }
6476             }
6477             break;
6478         case float_round_ties_away:
                 /* Same as above but without the tie-to-even correction. */
6479             if (lastBitMask) {
6480                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6481             } else {
6482                 if ((int64_t) z.low < 0) {
6483                     ++z.high;
6484                 }
6485             }
6486             break;
6487         case float_round_to_zero:
                 /* Truncation: the masking after the switch does all the work. */
6488             break;
6489         case float_round_up:
                 /* Sign bit clear: adding the all-ones rounding mask carries
                  * into the kept bits whenever any rounding bit is set. */
6490             if (!extractFloat128Sign(z)) {
6491                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6492             }
6493             break;
6494         case float_round_down:
                 /* Sign bit set: same increment of the magnitude. */
6495             if (extractFloat128Sign(z)) {
6496                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6497             }
6498             break;
6499         default:
6500             abort();
6501         }
             /* Discard the rounding bits. */
6502         z.low &= ~ roundBitsMask;
6503     }
6504     else {
             /* Biased exponent below 0x3FFF: magnitude is less than one. */
6505         if ( aExp < 0x3FFF ) {
                 /* Everything except the sign bit is zero: return the zero as is. */
6506             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6507             status->float_exception_flags |= float_flag_inexact;
6508             aSign = extractFloat128Sign( a );
6509             switch (status->float_rounding_mode) {
6510              case float_round_nearest_even:
                     /* Exponent 0x3FFE with a nonzero fraction rounds to a
                      * value with exponent 0x3FFF and zero fraction; exponent
                      * 0x3FFE with a zero fraction falls through to zero. */
6511                 if (    ( aExp == 0x3FFE )
6512                      && (   extractFloat128Frac0( a )
6513                           | extractFloat128Frac1( a ) )
6514                    ) {
6515                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6516                 }
6517                 break;
6518             case float_round_ties_away:
                     /* Any value with exponent 0x3FFE rounds away from zero. */
6519                 if (aExp == 0x3FFE) {
6520                     return packFloat128(aSign, 0x3FFF, 0, 0);
6521                 }
6522                 break;
6523              case float_round_down:
6524                 return
6525                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6526                     : packFloat128( 0, 0, 0, 0 );
6527              case float_round_up:
6528                 return
6529                       aSign ? packFloat128( 1, 0, 0, 0 )
6530                     : packFloat128( 0, 0x3FFF, 0, 0 );
6531             }
                 /* Remaining cases (including modes with no case above) give a
                  * zero carrying the sign of the input. */
6532             return packFloat128( aSign, 0, 0, 0 );
6533         }
             /* 0x3FFF <= aExp < 0x402F: the lowest kept bit lies in the high
              * word, so the low word is dropped entirely and the masks are
              * applied to z.high. */
6534         lastBitMask = 1;
6535         lastBitMask <<= 0x402F - aExp;
6536         roundBitsMask = lastBitMask - 1;
6537         z.low = 0;
6538         z.high = a.high;
6539         switch (status->float_rounding_mode) {
6540         case float_round_nearest_even:
6541             z.high += lastBitMask>>1;
                 /* Exact tie only if the discarded low word was zero as well. */
6542             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6543                 z.high &= ~ lastBitMask;
6544             }
6545             break;
6546         case float_round_ties_away:
6547             z.high += lastBitMask>>1;
6548             break;
6549         case float_round_to_zero:
6550             break;
6551         case float_round_up:
6552             if (!extractFloat128Sign(z)) {
                     /* Fold "low word nonzero" into bit 0 (always a rounding bit
                      * here, since the shift count is at least 1) so that the
                      * addition below carries for it too. */
6553                 z.high |= ( a.low != 0 );
6554                 z.high += roundBitsMask;
6555             }
6556             break;
6557         case float_round_down:
6558             if (extractFloat128Sign(z)) {
6559                 z.high |= (a.low != 0);
6560                 z.high += roundBitsMask;
6561             }
6562             break;
6563         default:
6564             abort();
6565         }
6566         z.high &= ~ roundBitsMask;
6567     }
         /* Any change relative to the input means the rounding was inexact. */
6568     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6569         status->float_exception_flags |= float_flag_inexact;
6570     }
6571     return z;
6572 
6573 }
6574 
6575 /*----------------------------------------------------------------------------
6576 | Returns the result of adding the absolute values of the quadruple-precision
6577 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6578 | before being returned.  `zSign' is ignored if the result is a NaN.
6579 | The addition is performed according to the IEC/IEEE Standard for Binary
6580 | Floating-Point Arithmetic.
     |
     | A NaN operand is passed to propagateFloat128NaN.  When both exponent
     | fields are zero the fractions are added directly; if status->flush_to_zero
     | is set that sum is replaced by a signed zero and
     | float_flag_output_denormal is raised when the sum was nonzero.  All other
     | results go through roundAndPackFloat128.
6581 *----------------------------------------------------------------------------*/
6582 
6583 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6584                                 float_status *status)
6585 {
6586     int32_t aExp, bExp, zExp;
6587     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6588     int32_t expDiff;
6589 
6590     aSig1 = extractFloat128Frac1( a );
6591     aSig0 = extractFloat128Frac0( a );
6592     aExp = extractFloat128Exp( a );
6593     bSig1 = extractFloat128Frac1( b );
6594     bSig0 = extractFloat128Frac0( b );
6595     bExp = extractFloat128Exp( b );
6596     expDiff = aExp - bExp;
         /* Case 1: `a' has the larger exponent; `b' is shifted right to match. */
6597     if ( 0 < expDiff ) {
6598         if ( aExp == 0x7FFF ) {
6599             if (aSig0 | aSig1) {
6600                 return propagateFloat128NaN(a, b, status);
6601             }
                 /* `a' is an infinity and `b' is finite here: return `a'. */
6602             return a;
6603         }
             /* A zero exponent field has no implicit bit and needs one less
              * position of shift; otherwise make the implicit bit explicit. */
6604         if ( bExp == 0 ) {
6605             --expDiff;
6606         }
6607         else {
6608             bSig0 |= LIT64( 0x0001000000000000 );
6609         }
             /* NOTE(review): zSig2 presumably receives the bits shifted out,
              * with sticky jamming; the helper is defined elsewhere. */
6610         shift128ExtraRightJamming(
6611             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6612         zExp = aExp;
6613     }
         /* Case 2: `b' has the larger exponent; mirror image of case 1. */
6614     else if ( expDiff < 0 ) {
6615         if ( bExp == 0x7FFF ) {
6616             if (bSig0 | bSig1) {
6617                 return propagateFloat128NaN(a, b, status);
6618             }
6619             return packFloat128( zSign, 0x7FFF, 0, 0 );
6620         }
6621         if ( aExp == 0 ) {
6622             ++expDiff;
6623         }
6624         else {
6625             aSig0 |= LIT64( 0x0001000000000000 );
6626         }
6627         shift128ExtraRightJamming(
6628             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6629         zExp = bExp;
6630     }
         /* Case 3: equal exponent fields. */
6631     else {
6632         if ( aExp == 0x7FFF ) {
6633             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6634                 return propagateFloat128NaN(a, b, status);
6635             }
6636             return a;
6637         }
6638         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6639         if ( aExp == 0 ) {
                 /* Both exponent fields are zero: the raw fraction sum is the
                  * result, packed with a zero exponent field. */
6640             if (status->flush_to_zero) {
6641                 if (zSig0 | zSig1) {
6642                     float_raise(float_flag_output_denormal, status);
6643                 }
6644                 return packFloat128(zSign, 0, 0, 0);
6645             }
6646             return packFloat128( zSign, 0, zSig0, zSig1 );
6647         }
             /* Two implicit bits (0x0001... each) add up to 0x0002...; the sum
              * therefore always needs the one-bit right shift below. */
6648         zSig2 = 0;
6649         zSig0 |= LIT64( 0x0002000000000000 );
6650         zExp = aExp;
6651         goto shiftRight1;
6652     }
         /* Reached from cases 1 and 2.  The implicit bit of the larger-exponent
          * operand is still missing and is OR-ed into aSig0 in both cases.
          * NOTE(review): in case 2 this relies on bit 48 of the shifted aSig0
          * being clear, so that the OR acts as the addition of b's implicit bit
          * (assumes extractFloat128Frac0 yields only the 48 fraction bits) --
          * confirm. */
6653     aSig0 |= LIT64( 0x0001000000000000 );
6654     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
         /* Tentatively lower the exponent; it is restored and the significand
          * shifted right by one when the sum reached 0x0002000000000000. */
6655     --zExp;
6656     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6657     ++zExp;
6658  shiftRight1:
6659     shift128ExtraRightJamming(
6660         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6661  roundAndPack:
6662     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6663 
6664 }
6665 
6666 /*----------------------------------------------------------------------------
6667 | Returns the result of subtracting the absolute values of the quadruple-
6668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6669 | difference is negated before being returned.  `zSign' is ignored if the
6670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6671 | Standard for Binary Floating-Point Arithmetic.
     |
     | A NaN operand is passed to propagateFloat128NaN.  Subtracting two
     | infinities raises float_flag_invalid and returns the default NaN.  An
     | exactly zero difference is returned as a zero whose sign bit is set only
     | when the rounding mode is float_round_down.  When `b' has the larger
     | magnitude the result sign is `zSign' inverted.
6672 *----------------------------------------------------------------------------*/
6673 
6674 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6675                                 float_status *status)
6676 {
6677     int32_t aExp, bExp, zExp;
6678     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6679     int32_t expDiff;
6680 
6681     aSig1 = extractFloat128Frac1( a );
6682     aSig0 = extractFloat128Frac0( a );
6683     aExp = extractFloat128Exp( a );
6684     bSig1 = extractFloat128Frac1( b );
6685     bSig0 = extractFloat128Frac0( b );
6686     bExp = extractFloat128Exp( b );
6687     expDiff = aExp - bExp;
         /* Move both fractions up by 14 bits; the implicit bit then sits at
          * 0x4000000000000000 and the final exponent is compensated by -14. */
6688     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6689     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6690     if ( 0 < expDiff ) goto aExpBigger;
6691     if ( expDiff < 0 ) goto bExpBigger;
         /* Equal exponent fields from here to the bExpBigger label. */
6692     if ( aExp == 0x7FFF ) {
6693         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6694             return propagateFloat128NaN(a, b, status);
6695         }
             /* Infinity minus infinity. */
6696         float_raise(float_flag_invalid, status);
6697         return float128_default_nan(status);
6698     }
         /* Both exponent fields zero: use exponent 1 for the result. */
6699     if ( aExp == 0 ) {
6700         aExp = 1;
6701         bExp = 1;
6702     }
         /* 128-bit magnitude comparison, high word first.  No implicit bits are
          * set on this path; they would be identical and cancel out. */
6703     if ( bSig0 < aSig0 ) goto aBigger;
6704     if ( aSig0 < bSig0 ) goto bBigger;
6705     if ( bSig1 < aSig1 ) goto aBigger;
6706     if ( aSig1 < bSig1 ) goto bBigger;
         /* Exact cancellation: zero, negative only under float_round_down. */
6707     return packFloat128(status->float_rounding_mode == float_round_down,
6708                         0, 0, 0);
6709  bExpBigger:
6710     if ( bExp == 0x7FFF ) {
6711         if (bSig0 | bSig1) {
6712             return propagateFloat128NaN(a, b, status);
6713         }
             /* Finite minus infinity: infinity with the opposite sign. */
6714         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6715     }
         /* A zero exponent field has no implicit bit and needs one less
          * position of shift. */
6716     if ( aExp == 0 ) {
6717         ++expDiff;
6718     }
6719     else {
6720         aSig0 |= LIT64( 0x4000000000000000 );
6721     }
6722     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6723     bSig0 |= LIT64( 0x4000000000000000 );
6724  bBigger:
         /* |b| > |a|: compute b - a and flip the result sign. */
6725     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6726     zExp = bExp;
6727     zSign ^= 1;
6728     goto normalizeRoundAndPack;
6729  aExpBigger:
6730     if ( aExp == 0x7FFF ) {
6731         if (aSig0 | aSig1) {
6732             return propagateFloat128NaN(a, b, status);
6733         }
             /* Infinity minus finite: return `a'. */
6734         return a;
6735     }
6736     if ( bExp == 0 ) {
6737         --expDiff;
6738     }
6739     else {
6740         bSig0 |= LIT64( 0x4000000000000000 );
6741     }
6742     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6743     aSig0 |= LIT64( 0x4000000000000000 );
6744  aBigger:
         /* |a| > |b|: compute a - b, sign stays `zSign'. */
6745     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6746     zExp = aExp;
6747  normalizeRoundAndPack:
         /* The exponent passed on is zExp - 1 - 14; the 14 matches the left
          * shift applied to both fractions at the top of the function. */
6748     --zExp;
6749     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6750                                          status);
6751 
6752 }
6753 
6754 /*----------------------------------------------------------------------------
6755 | Returns the result of adding the quadruple-precision floating-point values
6756 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6757 | for Binary Floating-Point Arithmetic.
6758 *----------------------------------------------------------------------------*/
6759 
6760 float128 float128_add(float128 a, float128 b, float_status *status)
6761 {
6762     flag aSign, bSign;
6763 
6764     aSign = extractFloat128Sign( a );
6765     bSign = extractFloat128Sign( b );
6766     if ( aSign == bSign ) {
6767         return addFloat128Sigs(a, b, aSign, status);
6768     }
6769     else {
6770         return subFloat128Sigs(a, b, aSign, status);
6771     }
6772 
6773 }
6774 
6775 /*----------------------------------------------------------------------------
6776 | Returns the result of subtracting the quadruple-precision floating-point
6777 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6778 | Standard for Binary Floating-Point Arithmetic.
6779 *----------------------------------------------------------------------------*/
6780 
6781 float128 float128_sub(float128 a, float128 b, float_status *status)
6782 {
6783     flag aSign, bSign;
6784 
6785     aSign = extractFloat128Sign( a );
6786     bSign = extractFloat128Sign( b );
6787     if ( aSign == bSign ) {
6788         return subFloat128Sigs(a, b, aSign, status);
6789     }
6790     else {
6791         return addFloat128Sigs(a, b, aSign, status);
6792     }
6793 
6794 }
6795 
6796 /*----------------------------------------------------------------------------
6797 | Returns the result of multiplying the quadruple-precision floating-point
6798 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6799 | Standard for Binary Floating-Point Arithmetic.
     |
     | A NaN operand is passed to propagateFloat128NaN.  Infinity times zero
     | raises float_flag_invalid and returns the default NaN; infinity times
     | anything else gives an infinity, and a zero operand gives a zero, both
     | with the exclusive-or of the operand signs.
6800 *----------------------------------------------------------------------------*/
6801 
6802 float128 float128_mul(float128 a, float128 b, float_status *status)
6803 {
6804     flag aSign, bSign, zSign;
6805     int32_t aExp, bExp, zExp;
6806     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6807 
6808     aSig1 = extractFloat128Frac1( a );
6809     aSig0 = extractFloat128Frac0( a );
6810     aExp = extractFloat128Exp( a );
6811     aSign = extractFloat128Sign( a );
6812     bSig1 = extractFloat128Frac1( b );
6813     bSig0 = extractFloat128Frac0( b );
6814     bExp = extractFloat128Exp( b );
6815     bSign = extractFloat128Sign( b );
6816     zSign = aSign ^ bSign;
         /* `a' is a NaN or an infinity. */
6817     if ( aExp == 0x7FFF ) {
6818         if (    ( aSig0 | aSig1 )
6819              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6820             return propagateFloat128NaN(a, b, status);
6821         }
             /* Infinity times zero is invalid. */
6822         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6823         return packFloat128( zSign, 0x7FFF, 0, 0 );
6824     }
         /* `b' is a NaN or an infinity (and `a' is finite). */
6825     if ( bExp == 0x7FFF ) {
6826         if (bSig0 | bSig1) {
6827             return propagateFloat128NaN(a, b, status);
6828         }
6829         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6830  invalid:
6831             float_raise(float_flag_invalid, status);
6832             return float128_default_nan(status);
6833         }
6834         return packFloat128( zSign, 0x7FFF, 0, 0 );
6835     }
         /* Zero operands return a signed zero; other zero-exponent operands
          * are normalized by the helper. */
6836     if ( aExp == 0 ) {
6837         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6838         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6839     }
6840     if ( bExp == 0 ) {
6841         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6842         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6843     }
6844     zExp = aExp + bExp - 0x4000;
         /* `a' gets its implicit bit; `b' is shifted up by 16 bits WITHOUT its
          * implicit bit, which would not fit in bSig0 after the shift.
          * NOTE(review): the add128 after the multiplication appears to supply
          * that missing term, since b's implicit bit times `a' equals `a'
          * placed in the two upper product words -- confirm. */
6845     aSig0 |= LIT64( 0x0001000000000000 );
6846     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6847     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6848     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
         /* Fold the lowest product word into zSig2 as a sticky bit. */
6849     zSig2 |= ( zSig3 != 0 );
         /* Significand reached 0x0002000000000000: shift right by one and
          * bump the exponent. */
6850     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6851         shift128ExtraRightJamming(
6852             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6853         ++zExp;
6854     }
6855     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6856 
6857 }
6858 
6859 /*----------------------------------------------------------------------------
6860 | Returns the result of dividing the quadruple-precision floating-point value
6861 | `a' by the corresponding value `b'.  The operation is performed according to
6862 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
     |
     | A NaN operand is passed to propagateFloat128NaN.  Infinity divided by
     | infinity and zero divided by zero raise float_flag_invalid and return the
     | default NaN.  A nonzero finite value divided by zero raises
     | float_flag_divbyzero and returns a signed infinity.  A finite value
     | divided by infinity, and zero divided by a nonzero value, return a
     | signed zero.
6863 *----------------------------------------------------------------------------*/
6864 
6865 float128 float128_div(float128 a, float128 b, float_status *status)
6866 {
6867     flag aSign, bSign, zSign;
6868     int32_t aExp, bExp, zExp;
6869     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6870     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6871 
6872     aSig1 = extractFloat128Frac1( a );
6873     aSig0 = extractFloat128Frac0( a );
6874     aExp = extractFloat128Exp( a );
6875     aSign = extractFloat128Sign( a );
6876     bSig1 = extractFloat128Frac1( b );
6877     bSig0 = extractFloat128Frac0( b );
6878     bExp = extractFloat128Exp( b );
6879     bSign = extractFloat128Sign( b );
6880     zSign = aSign ^ bSign;
         /* `a' is a NaN or an infinity. */
6881     if ( aExp == 0x7FFF ) {
6882         if (aSig0 | aSig1) {
6883             return propagateFloat128NaN(a, b, status);
6884         }
6885         if ( bExp == 0x7FFF ) {
6886             if (bSig0 | bSig1) {
6887                 return propagateFloat128NaN(a, b, status);
6888             }
                 /* Infinity divided by infinity. */
6889             goto invalid;
6890         }
6891         return packFloat128( zSign, 0x7FFF, 0, 0 );
6892     }
         /* `b' is a NaN or an infinity (and `a' is finite). */
6893     if ( bExp == 0x7FFF ) {
6894         if (bSig0 | bSig1) {
6895             return propagateFloat128NaN(a, b, status);
6896         }
6897         return packFloat128( zSign, 0, 0, 0 );
6898     }
6899     if ( bExp == 0 ) {
6900         if ( ( bSig0 | bSig1 ) == 0 ) {
                 /* Divisor is zero: 0/0 is invalid, anything else is a
                  * division by zero. */
6901             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6902  invalid:
6903                 float_raise(float_flag_invalid, status);
6904                 return float128_default_nan(status);
6905             }
6906             float_raise(float_flag_divbyzero, status);
6907             return packFloat128( zSign, 0x7FFF, 0, 0 );
6908         }
6909         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6910     }
6911     if ( aExp == 0 ) {
6912         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6913         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6914     }
6915     zExp = aExp - bExp + 0x3FFD;
         /* Add the implicit bits and move both significands up by 15 bits. */
6916     shortShift128Left(
6917         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6918     shortShift128Left(
6919         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* If the dividend significand is not smaller than the divisor's,
          * halve it and compensate in the exponent. */
6920     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6921         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6922         ++zExp;
6923     }
         /* Upper 64 quotient bits: take an estimate, compute the remainder and
          * step the estimate down while that remainder is negative. */
6924     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6925     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6926     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6927     while ( (int64_t) rem0 < 0 ) {
6928         --zSig0;
6929         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6930     }
         /* Lower 64 quotient bits.  The exact correction is done only when the
          * low 14 bits of the estimate are at most 4.
          * NOTE(review): presumably the estimate's error bound makes other
          * values safe for rounding as they are -- confirm against
          * estimateDiv128To64. */
6931     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6932     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6933         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6934         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6935         while ( (int64_t) rem1 < 0 ) {
6936             --zSig1;
6937             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6938         }
             /* A nonzero final remainder is recorded as a sticky bit. */
6939         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6940     }
         /* Move the quotient back down by 15 bits, producing the extra word
          * zSig2 that is passed to the rounding routine. */
6941     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6942     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6943 
6944 }
6945 
6946 /*----------------------------------------------------------------------------
6947 | Returns the remainder of the quadruple-precision floating-point value `a'
6948 | with respect to the corresponding value `b'.  The operation is performed
6949 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
     |
     | A NaN operand is passed to propagateFloat128NaN.  An infinite `a' or a
     | zero `b' raises float_flag_invalid and returns the default NaN.  An
     | infinite `b' (with finite `a') and a zero `a' return `a' unchanged, as
     | does any `a' whose exponent is at least two below that of `b'.
6950 *----------------------------------------------------------------------------*/
6951 
6952 float128 float128_rem(float128 a, float128 b, float_status *status)
6953 {
6954     flag aSign, zSign;
6955     int32_t aExp, bExp, expDiff;
6956     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6957     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6958     int64_t sigMean0;
6959 
6960     aSig1 = extractFloat128Frac1( a );
6961     aSig0 = extractFloat128Frac0( a );
6962     aExp = extractFloat128Exp( a );
6963     aSign = extractFloat128Sign( a );
6964     bSig1 = extractFloat128Frac1( b );
6965     bSig0 = extractFloat128Frac0( b );
6966     bExp = extractFloat128Exp( b );
         /* `a' is a NaN or an infinity. */
6967     if ( aExp == 0x7FFF ) {
6968         if (    ( aSig0 | aSig1 )
6969              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6970             return propagateFloat128NaN(a, b, status);
6971         }
6972         goto invalid;
6973     }
         /* `b' is a NaN or an infinity (and `a' is finite). */
6974     if ( bExp == 0x7FFF ) {
6975         if (bSig0 | bSig1) {
6976             return propagateFloat128NaN(a, b, status);
6977         }
6978         return a;
6979     }
6980     if ( bExp == 0 ) {
6981         if ( ( bSig0 | bSig1 ) == 0 ) {
6982  invalid:
6983             float_raise(float_flag_invalid, status);
6984             return float128_default_nan(status);
6985         }
6986         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6987     }
6988     if ( aExp == 0 ) {
6989         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6990         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6991     }
6992     expDiff = aExp - bExp;
         /* |a| is below half of |b|: `a' is already the remainder. */
6993     if ( expDiff < -1 ) return a;
         /* Add the implicit bits and move both significands up by 15 bits; `a'
          * is moved by only 14 when its exponent is one below that of `b'. */
6994     shortShift128Left(
6995         aSig0 | LIT64( 0x0001000000000000 ),
6996         aSig1,
6997         15 - ( expDiff < 0 ),
6998         &aSig0,
6999         &aSig1
7000     );
7001     shortShift128Left(
7002         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
         /* First quotient bit: subtract `b' once if it fits. */
7003     q = le128( bSig0, bSig1, aSig0, aSig1 );
7004     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7005     expDiff -= 64;
         /* Main reduction loop: each pass lowers expDiff by 61.  The quotient
          * estimate is reduced by 4 (clamped at 0).
          * NOTE(review): presumably this keeps the partial remainder from going
          * negative given the estimate's error bound -- confirm against
          * estimateDiv128To64. */
7006     while ( 0 < expDiff ) {
7007         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7008         q = ( 4 < q ) ? q - 4 : 0;
7009         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7010         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7011         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7012         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7013         expDiff -= 61;
7014     }
         /* Final partial step: fewer than 64 quotient bits remain, so the
          * estimate is shifted right by -expDiff (between 0 and 63 here). */
7015     if ( -64 < expDiff ) {
7016         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7017         q = ( 4 < q ) ? q - 4 : 0;
7018         q >>= - expDiff;
7019         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7020         expDiff += 52;
7021         if ( expDiff < 0 ) {
7022             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7023         }
7024         else {
7025             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7026         }
7027         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7028         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7029     }
7030     else {
             /* No quotient bits left; q still holds the le128 result from
              * above.  Both significands are moved down by 12 bits. */
7031         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7032         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7033     }
         /* Subtract `b' until the remainder turns negative, counting in q.  The
          * last non-negative remainder is kept in alternateASig0/1. */
7034     do {
7035         alternateASig0 = aSig0;
7036         alternateASig1 = aSig1;
7037         ++q;
7038         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7039     } while ( 0 <= (int64_t) aSig0 );
         /* Choose between the negative remainder and the saved non-negative
          * one using their sum: a negative sum, or a zero sum with q odd,
          * selects the saved remainder. */
7040     add128(
7041         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7042     if (    ( sigMean0 < 0 )
7043          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7044         aSig0 = alternateASig0;
7045         aSig1 = alternateASig1;
7046     }
         /* Convert a negative remainder to sign and magnitude; the result sign
          * is the sign of `a' flipped when the remainder was negative. */
7047     zSign = ( (int64_t) aSig0 < 0 );
7048     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7049     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7050                                          status);
7051 }
7052 
7053 /*----------------------------------------------------------------------------
7054 | Returns the square root of the quadruple-precision floating-point value `a'.
7055 | The operation is performed according to the IEC/IEEE Standard for Binary
7056 | Floating-Point Arithmetic.
     |
     | A NaN is passed to propagateFloat128NaN.  Positive infinity and negative
     | zero are returned unchanged, and positive zero gives positive zero.  Any
     | other input with the sign bit set raises float_flag_invalid and returns
     | the default NaN.
7057 *----------------------------------------------------------------------------*/
7058 
7059 float128 float128_sqrt(float128 a, float_status *status)
7060 {
7061     flag aSign;
7062     int32_t aExp, zExp;
7063     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7064     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7065 
7066     aSig1 = extractFloat128Frac1( a );
7067     aSig0 = extractFloat128Frac0( a );
7068     aExp = extractFloat128Exp( a );
7069     aSign = extractFloat128Sign( a );
         /* NaN, or infinity (valid only when positive). */
7070     if ( aExp == 0x7FFF ) {
7071         if (aSig0 | aSig1) {
7072             return propagateFloat128NaN(a, a, status);
7073         }
7074         if ( ! aSign ) return a;
7075         goto invalid;
7076     }
7077     if ( aSign ) {
             /* Negative zero is returned as is; other negatives are invalid. */
7078         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7079  invalid:
7080         float_raise(float_flag_invalid, status);
7081         return float128_default_nan(status);
7082     }
7083     if ( aExp == 0 ) {
7084         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7085         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7086     }
         /* Result exponent: half of the unbiased input exponent, re-biased. */
7087     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7088     aSig0 |= LIT64( 0x0001000000000000 );
         /* Initial estimate from estimateSqrt32, widened with
          * estimateDiv128To64.  The significand shift is 13, or 12 when the
          * exponent field is odd. */
7089     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7090     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7091     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7092     doubleZSig0 = zSig0<<1;
         /* Remainder a - zSig0^2; step zSig0 down while it is negative. */
7093     mul64To128( zSig0, zSig0, &term0, &term1 );
7094     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7095     while ( (int64_t) rem0 < 0 ) {
7096         --zSig0;
7097         doubleZSig0 -= 2;
7098         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7099     }
         /* Lower 64 result bits.  The exact correction is done only when the
          * low 13 bits of the estimate are at most 5.
          * NOTE(review): presumably other values are safe for rounding as they
          * are, given the estimate's error bound -- confirm. */
7100     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7101     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7102         if ( zSig1 == 0 ) zSig1 = 1;
7103         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7104         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7105         mul64To128( zSig1, zSig1, &term2, &term3 );
7106         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7107         while ( (int64_t) rem1 < 0 ) {
7108             --zSig1;
7109             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7110             term3 |= 1;
7111             term2 |= doubleZSig0;
7112             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7113         }
             /* A nonzero final remainder is recorded as a sticky bit. */
7114         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7115     }
         /* Move the root down by 14 bits, producing the extra word zSig2 that
          * is passed to the rounding routine; the result sign is always 0. */
7116     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7117     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7118 
7119 }
7120 
7121 /*----------------------------------------------------------------------------
7122 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7123 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7124 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7125 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7126 *----------------------------------------------------------------------------*/
7127 
7128 int float128_eq(float128 a, float128 b, float_status *status)
7129 {
7130 
7131     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7132               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7133          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7134               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7135        ) {
7136         float_raise(float_flag_invalid, status);
7137         return 0;
7138     }
7139     return
7140            ( a.low == b.low )
7141         && (    ( a.high == b.high )
7142              || (    ( a.low == 0 )
7143                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7144            );
7145 
7146 }
7147 
7148 /*----------------------------------------------------------------------------
7149 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7150 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7151 | exception is raised if either operand is a NaN.  The comparison is performed
7152 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7153 *----------------------------------------------------------------------------*/
7154 
7155 int float128_le(float128 a, float128 b, float_status *status)
7156 {
7157     flag aSign, bSign;
7158 
7159     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7160               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7161          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7162               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7163        ) {
7164         float_raise(float_flag_invalid, status);
7165         return 0;
7166     }
7167     aSign = extractFloat128Sign( a );
7168     bSign = extractFloat128Sign( b );
7169     if ( aSign != bSign ) {
7170         return
7171                aSign
7172             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7173                  == 0 );
7174     }
7175     return
7176           aSign ? le128( b.high, b.low, a.high, a.low )
7177         : le128( a.high, a.low, b.high, b.low );
7178 
7179 }
7180 
7181 /*----------------------------------------------------------------------------
7182 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7183 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7184 | raised if either operand is a NaN.  The comparison is performed according
7185 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7186 *----------------------------------------------------------------------------*/
7187 
7188 int float128_lt(float128 a, float128 b, float_status *status)
7189 {
7190     flag aSign, bSign;
7191 
7192     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7193               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7194          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7195               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7196        ) {
7197         float_raise(float_flag_invalid, status);
7198         return 0;
7199     }
7200     aSign = extractFloat128Sign( a );
7201     bSign = extractFloat128Sign( b );
7202     if ( aSign != bSign ) {
7203         return
7204                aSign
7205             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7206                  != 0 );
7207     }
7208     return
7209           aSign ? lt128( b.high, b.low, a.high, a.low )
7210         : lt128( a.high, a.low, b.high, b.low );
7211 
7212 }
7213 
7214 /*----------------------------------------------------------------------------
7215 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7216 | be compared, and 0 otherwise.  The invalid exception is raised if either
7217 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7218 | Standard for Binary Floating-Point Arithmetic.
7219 *----------------------------------------------------------------------------*/
7220 
7221 int float128_unordered(float128 a, float128 b, float_status *status)
7222 {
7223     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7224               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7225          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7226               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7227        ) {
7228         float_raise(float_flag_invalid, status);
7229         return 1;
7230     }
7231     return 0;
7232 }
7233 
7234 /*----------------------------------------------------------------------------
7235 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7236 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7237 | exception.  The comparison is performed according to the IEC/IEEE Standard
7238 | for Binary Floating-Point Arithmetic.
7239 *----------------------------------------------------------------------------*/
7240 
7241 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7242 {
7243 
7244     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7245               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7246          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7247               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7248        ) {
7249         if (float128_is_signaling_nan(a, status)
7250          || float128_is_signaling_nan(b, status)) {
7251             float_raise(float_flag_invalid, status);
7252         }
7253         return 0;
7254     }
7255     return
7256            ( a.low == b.low )
7257         && (    ( a.high == b.high )
7258              || (    ( a.low == 0 )
7259                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7260            );
7261 
7262 }
7263 
7264 /*----------------------------------------------------------------------------
7265 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7266 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7267 | cause an exception.  Otherwise, the comparison is performed according to the
7268 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7269 *----------------------------------------------------------------------------*/
7270 
7271 int float128_le_quiet(float128 a, float128 b, float_status *status)
7272 {
7273     flag aSign, bSign;
7274 
7275     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7276               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7277          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7278               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7279        ) {
7280         if (float128_is_signaling_nan(a, status)
7281          || float128_is_signaling_nan(b, status)) {
7282             float_raise(float_flag_invalid, status);
7283         }
7284         return 0;
7285     }
7286     aSign = extractFloat128Sign( a );
7287     bSign = extractFloat128Sign( b );
7288     if ( aSign != bSign ) {
7289         return
7290                aSign
7291             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7292                  == 0 );
7293     }
7294     return
7295           aSign ? le128( b.high, b.low, a.high, a.low )
7296         : le128( a.high, a.low, b.high, b.low );
7297 
7298 }
7299 
7300 /*----------------------------------------------------------------------------
7301 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7302 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7303 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7304 | Standard for Binary Floating-Point Arithmetic.
7305 *----------------------------------------------------------------------------*/
7306 
7307 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7308 {
7309     flag aSign, bSign;
7310 
7311     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7312               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7313          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7314               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7315        ) {
7316         if (float128_is_signaling_nan(a, status)
7317          || float128_is_signaling_nan(b, status)) {
7318             float_raise(float_flag_invalid, status);
7319         }
7320         return 0;
7321     }
7322     aSign = extractFloat128Sign( a );
7323     bSign = extractFloat128Sign( b );
7324     if ( aSign != bSign ) {
7325         return
7326                aSign
7327             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7328                  != 0 );
7329     }
7330     return
7331           aSign ? lt128( b.high, b.low, a.high, a.low )
7332         : lt128( a.high, a.low, b.high, b.low );
7333 
7334 }
7335 
7336 /*----------------------------------------------------------------------------
7337 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7338 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7339 | comparison is performed according to the IEC/IEEE Standard for Binary
7340 | Floating-Point Arithmetic.
7341 *----------------------------------------------------------------------------*/
7342 
7343 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7344 {
7345     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7346               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7347          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7348               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7349        ) {
7350         if (float128_is_signaling_nan(a, status)
7351          || float128_is_signaling_nan(b, status)) {
7352             float_raise(float_flag_invalid, status);
7353         }
7354         return 1;
7355     }
7356     return 0;
7357 }
7358 
7359 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7360                                             int is_quiet, float_status *status)
7361 {
7362     flag aSign, bSign;
7363 
7364     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7365         float_raise(float_flag_invalid, status);
7366         return float_relation_unordered;
7367     }
7368     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7369           ( extractFloatx80Frac( a )<<1 ) ) ||
7370         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7371           ( extractFloatx80Frac( b )<<1 ) )) {
7372         if (!is_quiet ||
7373             floatx80_is_signaling_nan(a, status) ||
7374             floatx80_is_signaling_nan(b, status)) {
7375             float_raise(float_flag_invalid, status);
7376         }
7377         return float_relation_unordered;
7378     }
7379     aSign = extractFloatx80Sign( a );
7380     bSign = extractFloatx80Sign( b );
7381     if ( aSign != bSign ) {
7382 
7383         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7384              ( ( a.low | b.low ) == 0 ) ) {
7385             /* zero case */
7386             return float_relation_equal;
7387         } else {
7388             return 1 - (2 * aSign);
7389         }
7390     } else {
7391         if (a.low == b.low && a.high == b.high) {
7392             return float_relation_equal;
7393         } else {
7394             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7395         }
7396     }
7397 }
7398 
7399 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7400 {
7401     return floatx80_compare_internal(a, b, 0, status);
7402 }
7403 
7404 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7405 {
7406     return floatx80_compare_internal(a, b, 1, status);
7407 }
7408 
7409 static inline int float128_compare_internal(float128 a, float128 b,
7410                                             int is_quiet, float_status *status)
7411 {
7412     flag aSign, bSign;
7413 
7414     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7415           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7416         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7417           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7418         if (!is_quiet ||
7419             float128_is_signaling_nan(a, status) ||
7420             float128_is_signaling_nan(b, status)) {
7421             float_raise(float_flag_invalid, status);
7422         }
7423         return float_relation_unordered;
7424     }
7425     aSign = extractFloat128Sign( a );
7426     bSign = extractFloat128Sign( b );
7427     if ( aSign != bSign ) {
7428         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7429             /* zero case */
7430             return float_relation_equal;
7431         } else {
7432             return 1 - (2 * aSign);
7433         }
7434     } else {
7435         if (a.low == b.low && a.high == b.high) {
7436             return float_relation_equal;
7437         } else {
7438             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7439         }
7440     }
7441 }
7442 
7443 int float128_compare(float128 a, float128 b, float_status *status)
7444 {
7445     return float128_compare_internal(a, b, 0, status);
7446 }
7447 
7448 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7449 {
7450     return float128_compare_internal(a, b, 1, status);
7451 }
7452 
7453 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7454 {
7455     flag aSign;
7456     int32_t aExp;
7457     uint64_t aSig;
7458 
7459     if (floatx80_invalid_encoding(a)) {
7460         float_raise(float_flag_invalid, status);
7461         return floatx80_default_nan(status);
7462     }
7463     aSig = extractFloatx80Frac( a );
7464     aExp = extractFloatx80Exp( a );
7465     aSign = extractFloatx80Sign( a );
7466 
7467     if ( aExp == 0x7FFF ) {
7468         if ( aSig<<1 ) {
7469             return propagateFloatx80NaN(a, a, status);
7470         }
7471         return a;
7472     }
7473 
7474     if (aExp == 0) {
7475         if (aSig == 0) {
7476             return a;
7477         }
7478         aExp++;
7479     }
7480 
7481     if (n > 0x10000) {
7482         n = 0x10000;
7483     } else if (n < -0x10000) {
7484         n = -0x10000;
7485     }
7486 
7487     aExp += n;
7488     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7489                                          aSign, aExp, aSig, 0, status);
7490 }
7491 
7492 float128 float128_scalbn(float128 a, int n, float_status *status)
7493 {
7494     flag aSign;
7495     int32_t aExp;
7496     uint64_t aSig0, aSig1;
7497 
7498     aSig1 = extractFloat128Frac1( a );
7499     aSig0 = extractFloat128Frac0( a );
7500     aExp = extractFloat128Exp( a );
7501     aSign = extractFloat128Sign( a );
7502     if ( aExp == 0x7FFF ) {
7503         if ( aSig0 | aSig1 ) {
7504             return propagateFloat128NaN(a, a, status);
7505         }
7506         return a;
7507     }
7508     if (aExp != 0) {
7509         aSig0 |= LIT64( 0x0001000000000000 );
7510     } else if (aSig0 == 0 && aSig1 == 0) {
7511         return a;
7512     } else {
7513         aExp++;
7514     }
7515 
7516     if (n > 0x10000) {
7517         n = 0x10000;
7518     } else if (n < -0x10000) {
7519         n = -0x10000;
7520     }
7521 
7522     aExp += n - 1;
7523     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7524                                          , status);
7525 
7526 }
7527