xref: /qemu/fpu/softfloat.c (revision a90119b5a2c174250601be6503b91e5c9df6e83b)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "fpu/softfloat.h"
87 
88 /* We only need stdlib for abort() */
89 
90 /*----------------------------------------------------------------------------
91 | Primitive arithmetic functions, including multi-word arithmetic, and
92 | division and square root approximations.  (Can be specialized to target if
93 | desired.)
94 *----------------------------------------------------------------------------*/
95 #include "softfloat-macros.h"
96 
97 /*----------------------------------------------------------------------------
98 | Functions and definitions to determine:  (1) whether tininess for underflow
99 | is detected before or after rounding by default, (2) what (if anything)
100 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
101 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
102 | are propagated from function inputs to output.  These details are target-
103 | specific.
104 *----------------------------------------------------------------------------*/
105 #include "softfloat-specialize.h"
106 
107 /*----------------------------------------------------------------------------
108 | Returns the fraction bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
110 
111 static inline uint32_t extractFloat16Frac(float16 a)
112 {
113     return float16_val(a) & 0x3ff;
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the exponent bits of the half-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
119 
120 static inline int extractFloat16Exp(float16 a)
121 {
122     return (float16_val(a) >> 10) & 0x1f;
123 }
124 
125 /*----------------------------------------------------------------------------
126 | Returns the sign bit of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
128 
129 static inline flag extractFloat16Sign(float16 a)
130 {
131     return float16_val(a)>>15;
132 }
133 
134 /*----------------------------------------------------------------------------
135 | Returns the fraction bits of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
137 
138 static inline uint32_t extractFloat32Frac(float32 a)
139 {
140     return float32_val(a) & 0x007FFFFF;
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Returns the exponent bits of the single-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
146 
147 static inline int extractFloat32Exp(float32 a)
148 {
149     return (float32_val(a) >> 23) & 0xFF;
150 }
151 
152 /*----------------------------------------------------------------------------
153 | Returns the sign bit of the single-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
155 
156 static inline flag extractFloat32Sign(float32 a)
157 {
158     return float32_val(a) >> 31;
159 }
160 
161 /*----------------------------------------------------------------------------
162 | Returns the fraction bits of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
164 
165 static inline uint64_t extractFloat64Frac(float64 a)
166 {
167     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
168 }
169 
170 /*----------------------------------------------------------------------------
171 | Returns the exponent bits of the double-precision floating-point value `a'.
172 *----------------------------------------------------------------------------*/
173 
174 static inline int extractFloat64Exp(float64 a)
175 {
176     return (float64_val(a) >> 52) & 0x7FF;
177 }
178 
179 /*----------------------------------------------------------------------------
180 | Returns the sign bit of the double-precision floating-point value `a'.
181 *----------------------------------------------------------------------------*/
182 
183 static inline flag extractFloat64Sign(float64 a)
184 {
185     return float64_val(a) >> 63;
186 }
187 
/*
 * Classification of a floating point number.  The NaN kinds are grouped
 * at the end of the enumeration, starting with float_class_qnan, so
 * "cls >= float_class_qnan" is true for any NaN.  The enum is packed so
 * that it occupies as little storage as possible.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    /* all NaNs from here */
    float_class_qnan         = 4,
    float_class_snan         = 5,
    float_class_dnan         = 6,
    float_class_msnan        = 7,  /* maybe silenced */
} FloatClass;
203 
204 /*
205  * Structure holding all of the decomposed parts of a float. The
206  * exponent is unbiased and the fraction is normalized. All
207  * calculations are done with a 64 bit fraction and then rounded as
208  * appropriate for the final format.
209  *
210  * Thanks to the packed FloatClass a decent compiler should be able to
211  * fit the whole structure into registers and avoid using the stack
212  * for parameter passing.
213  */
214 
215 typedef struct {
216     uint64_t frac;      /* normalized fraction */
217     int32_t  exp;       /* unbiased exponent */
218     FloatClass cls;     /* one of the FloatClass categories */
219     bool sign;          /* sign of the value; presumably true means negative -- TODO confirm */
220 } FloatParts;
221 
222 #define DECOMPOSED_BINARY_POINT    (64 - 2) /* evaluates to 62 */
223 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT) /* only bit 62 set */
224 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1) /* only bit 63 set */
225 
226 /* Structure holding all of the relevant parameters for a format.
227  *   exp_size: the size of the exponent field (in bits)
228  *   exp_bias: the offset applied to the exponent field
229  *   exp_max: the largest value of the exponent field, (1 << exp_size) - 1
230  *   frac_size: the size of the fraction field (in bits)
231  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
232  * The following are computed based on the size of the fraction
233  *   frac_lsb: least significant bit of fraction
234  *   frac_lsbm1: the bit below the least significant bit (for rounding)
235  *   round_mask/roundeven_mask: masks used for rounding
236  */
237 typedef struct {
238     int exp_size;
239     int exp_bias;
240     int exp_max;
241     int frac_size;
242     int frac_shift;
243     uint64_t frac_lsb;
244     uint64_t frac_lsbm1;
245     uint64_t round_mask;
246     uint64_t roundeven_mask;
247 } FloatFmt;
248 
/*
 * Expand to the designated initializers of a FloatFmt for a format with
 * E exponent bits and F fraction bits.  Requires DECOMPOSED_BINARY_POINT
 * to be defined at the point of use.
 *
 * Every use of an argument is parenthesized so that an expression
 * argument (for example "20 + 3") expands with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
260 
261 static const FloatFmt float16_params = { /* 5 exponent bits, 10 fraction bits */
262     FLOAT_PARAMS(5, 10)
263 };
264 
265 static const FloatFmt float32_params = { /* 8 exponent bits, 23 fraction bits */
266     FLOAT_PARAMS(8, 23)
267 };
268 
269 static const FloatFmt float64_params = { /* 11 exponent bits, 52 fraction bits */
270     FLOAT_PARAMS(11, 52)
271 };
272 
273 /*----------------------------------------------------------------------------
274 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
275 | and 7, and returns the properly rounded 32-bit integer corresponding to the
276 | input.  If `zSign' is 1, the input is negated before being converted to an
277 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
278 | is simply rounded to an integer, with the inexact exception raised if the
279 | input cannot be represented exactly as an integer.  However, if the fixed-
280 | point input is too large, the invalid exception is raised and the largest
281 | positive or negative integer is returned.
282 *----------------------------------------------------------------------------*/
283 
284 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
285 {
286     int8_t roundingMode;
287     flag roundNearestEven;
288     int8_t roundIncrement, roundBits;
289     int32_t z;
290 
291     roundingMode = status->float_rounding_mode;
292     roundNearestEven = ( roundingMode == float_round_nearest_even );
293     switch (roundingMode) {
294     case float_round_nearest_even:
295     case float_round_ties_away:
296         roundIncrement = 0x40;
297         break;
298     case float_round_to_zero:
299         roundIncrement = 0;
300         break;
301     case float_round_up:
302         roundIncrement = zSign ? 0 : 0x7f;
303         break;
304     case float_round_down:
305         roundIncrement = zSign ? 0x7f : 0;
306         break;
307     default:
308         abort();
309     }
310     roundBits = absZ & 0x7F;
311     absZ = ( absZ + roundIncrement )>>7;
312     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313     z = absZ;
314     if ( zSign ) z = - z;
315     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
316         float_raise(float_flag_invalid, status);
317         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
318     }
319     if (roundBits) {
320         status->float_exception_flags |= float_flag_inexact;
321     }
322     return z;
323 
324 }
325 
326 /*----------------------------------------------------------------------------
327 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
328 | `absZ1', with binary point between bits 63 and 64 (between the input words),
329 | and returns the properly rounded 64-bit integer corresponding to the input.
330 | If `zSign' is 1, the input is negated before being converted to an integer.
331 | Ordinarily, the fixed-point input is simply rounded to an integer, with
332 | the inexact exception raised if the input cannot be represented exactly as
333 | an integer.  However, if the fixed-point input is too large, the invalid
334 | exception is raised and the largest positive or negative integer is
335 | returned.
336 *----------------------------------------------------------------------------*/
337 
338 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
339                                float_status *status)
340 {
341     int8_t roundingMode;
342     flag roundNearestEven, increment;
343     int64_t z;
344 
345     roundingMode = status->float_rounding_mode;
346     roundNearestEven = ( roundingMode == float_round_nearest_even );
347     switch (roundingMode) {
348     case float_round_nearest_even:
349     case float_round_ties_away:
350         increment = ((int64_t) absZ1 < 0);
351         break;
352     case float_round_to_zero:
353         increment = 0;
354         break;
355     case float_round_up:
356         increment = !zSign && absZ1;
357         break;
358     case float_round_down:
359         increment = zSign && absZ1;
360         break;
361     default:
362         abort();
363     }
364     if ( increment ) {
365         ++absZ0;
366         if ( absZ0 == 0 ) goto overflow;
367         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
368     }
369     z = absZ0;
370     if ( zSign ) z = - z;
371     if ( z && ( ( z < 0 ) ^ zSign ) ) {
372  overflow:
373         float_raise(float_flag_invalid, status);
374         return
375               zSign ? (int64_t) LIT64( 0x8000000000000000 )
376             : LIT64( 0x7FFFFFFFFFFFFFFF );
377     }
378     if (absZ1) {
379         status->float_exception_flags |= float_flag_inexact;
380     }
381     return z;
382 
383 }
384 
385 /*----------------------------------------------------------------------------
386 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
387 | `absZ1', with binary point between bits 63 and 64 (between the input words),
388 | and returns the properly rounded 64-bit unsigned integer corresponding to the
389 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
390 | with the inexact exception raised if the input cannot be represented exactly
391 | as an integer.  However, if the fixed-point input is too large, the invalid
392 | exception is raised and the largest unsigned integer is returned.
393 *----------------------------------------------------------------------------*/
394 
395 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
396                                 uint64_t absZ1, float_status *status)
397 {
398     int8_t roundingMode;
399     flag roundNearestEven, increment;
400 
401     roundingMode = status->float_rounding_mode;
402     roundNearestEven = (roundingMode == float_round_nearest_even);
403     switch (roundingMode) {
404     case float_round_nearest_even:
405     case float_round_ties_away:
406         increment = ((int64_t)absZ1 < 0);
407         break;
408     case float_round_to_zero:
409         increment = 0;
410         break;
411     case float_round_up:
412         increment = !zSign && absZ1;
413         break;
414     case float_round_down:
415         increment = zSign && absZ1;
416         break;
417     default:
418         abort();
419     }
420     if (increment) {
421         ++absZ0;
422         if (absZ0 == 0) {
423             float_raise(float_flag_invalid, status);
424             return LIT64(0xFFFFFFFFFFFFFFFF);
425         }
426         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
427     }
428 
429     if (zSign && absZ0) {
430         float_raise(float_flag_invalid, status);
431         return 0;
432     }
433 
434     if (absZ1) {
435         status->float_exception_flags |= float_flag_inexact;
436     }
437     return absZ0;
438 }
439 
440 /*----------------------------------------------------------------------------
441 | If `a' is denormal and we are in flush-to-zero mode then set the
442 | input-denormal exception and return zero. Otherwise just return the value.
443 *----------------------------------------------------------------------------*/
444 float32 float32_squash_input_denormal(float32 a, float_status *status)
445 {
446     if (status->flush_inputs_to_zero) {
447         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
448             float_raise(float_flag_input_denormal, status);
449             return make_float32(float32_val(a) & 0x80000000);
450         }
451     }
452     return a;
453 }
454 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision significand `aSig'.  The
| significand is shifted left until its leading set bit reaches bit 23; the
| shifted significand is stored through `zSigPtr' and the matching exponent
| (1 minus the shift distance) is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* 8 leading zero bits remain once the top set bit sits at bit 23. */
    const int8_t shift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
472 
473 /*----------------------------------------------------------------------------
474 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
475 | single-precision floating-point value, returning the result.  After being
476 | shifted into the proper positions, the three fields are simply added
477 | together to form the result.  This means that any integer portion of `zSig'
478 | will be added into the exponent.  Since a properly normalized significand
479 | will have an integer portion equal to 1, the `zExp' input should be 1 less
480 | than the desired result exponent whenever `zSig' is a complete, normalized
481 | significand.
482 *----------------------------------------------------------------------------*/
483 
484 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
485 {
486 
487     return make_float32(
488           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
489 
490 }
491 
492 /*----------------------------------------------------------------------------
493 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
494 | and significand `zSig', and returns the proper single-precision floating-
495 | point value corresponding to the abstract input.  Ordinarily, the abstract
496 | value is simply rounded and packed into the single-precision format, with
497 | the inexact exception raised if the abstract input cannot be represented
498 | exactly.  However, if the abstract value is too large, the overflow and
499 | inexact exceptions are raised and an infinity or maximal finite value is
500 | returned.  If the abstract value is too small, the input value is rounded to
501 | a subnormal number, and the underflow and inexact exceptions are raised if
502 | the abstract input cannot be represented exactly as a subnormal single-
503 | precision floating-point number.
504 |     The input significand `zSig' has its binary point between bits 30
505 | and 29, which is 7 bits to the left of the usual location.  This shifted
506 | significand must be normalized or smaller.  If `zSig' is not normalized,
507 | `zExp' must be 0; in that case, the result returned is a subnormal number,
508 | and it must not require rounding.  In the usual case that `zSig' is
509 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
510 | The handling of underflow and overflow follows the IEC/IEEE Standard for
511 | Binary Floating-Point Arithmetic.
512 *----------------------------------------------------------------------------*/
513 
514 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
515                                    float_status *status)
516 {
517     int8_t roundingMode;
518     flag roundNearestEven;
519     int8_t roundIncrement, roundBits;
520     flag isTiny;
521 
522     roundingMode = status->float_rounding_mode;
523     roundNearestEven = ( roundingMode == float_round_nearest_even );
524     switch (roundingMode) {
525     case float_round_nearest_even:
526     case float_round_ties_away:
527         roundIncrement = 0x40;
528         break;
529     case float_round_to_zero:
530         roundIncrement = 0;
531         break;
532     case float_round_up:
533         roundIncrement = zSign ? 0 : 0x7f;
534         break;
535     case float_round_down:
536         roundIncrement = zSign ? 0x7f : 0;
537         break;
538     default:
539         abort();
540         break;
541     }
542     roundBits = zSig & 0x7F;
543     if ( 0xFD <= (uint16_t) zExp ) {
544         if (    ( 0xFD < zExp )
545              || (    ( zExp == 0xFD )
546                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
547            ) {
548             float_raise(float_flag_overflow | float_flag_inexact, status);
549             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
550         }
551         if ( zExp < 0 ) {
552             if (status->flush_to_zero) {
553                 float_raise(float_flag_output_denormal, status);
554                 return packFloat32(zSign, 0, 0);
555             }
556             isTiny =
557                 (status->float_detect_tininess
558                  == float_tininess_before_rounding)
559                 || ( zExp < -1 )
560                 || ( zSig + roundIncrement < 0x80000000 );
561             shift32RightJamming( zSig, - zExp, &zSig );
562             zExp = 0;
563             roundBits = zSig & 0x7F;
564             if (isTiny && roundBits) {
565                 float_raise(float_flag_underflow, status);
566             }
567         }
568     }
569     if (roundBits) {
570         status->float_exception_flags |= float_flag_inexact;
571     }
572     zSig = ( zSig + roundIncrement )>>7;
573     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
574     if ( zSig == 0 ) zExp = 0;
575     return packFloat32( zSign, zExp, zSig );
576 
577 }
578 
579 /*----------------------------------------------------------------------------
580 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
581 | and significand `zSig', and returns the proper single-precision floating-
582 | point value corresponding to the abstract input.  This routine is just like
583 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
584 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
585 | floating-point exponent.
586 *----------------------------------------------------------------------------*/
587 
588 static float32
589  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
590                               float_status *status)
591 {
592     int8_t shiftCount;
593 
594     shiftCount = countLeadingZeros32( zSig ) - 1;
595     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
596                                status);
597 
598 }
599 
600 /*----------------------------------------------------------------------------
601 | If `a' is denormal and we are in flush-to-zero mode then set the
602 | input-denormal exception and return zero. Otherwise just return the value.
603 *----------------------------------------------------------------------------*/
604 float64 float64_squash_input_denormal(float64 a, float_status *status)
605 {
606     if (status->flush_inputs_to_zero) {
607         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
608             float_raise(float_flag_input_denormal, status);
609             return make_float64(float64_val(a) & (1ULL << 63));
610         }
611     }
612     return a;
613 }
614 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision significand `aSig'.  The
| significand is shifted left until its leading set bit reaches bit 52; the
| shifted significand is stored through `zSigPtr' and the matching exponent
| (1 minus the shift distance) is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr,
                                      uint64_t *zSigPtr)
{
    /* 11 leading zero bits remain once the top set bit sits at bit 52. */
    const int8_t shift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
632 
633 /*----------------------------------------------------------------------------
634 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
635 | double-precision floating-point value, returning the result.  After being
636 | shifted into the proper positions, the three fields are simply added
637 | together to form the result.  This means that any integer portion of `zSig'
638 | will be added into the exponent.  Since a properly normalized significand
639 | will have an integer portion equal to 1, the `zExp' input should be 1 less
640 | than the desired result exponent whenever `zSig' is a complete, normalized
641 | significand.
642 *----------------------------------------------------------------------------*/
643 
644 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
645 {
646 
647     return make_float64(
648         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
649 
650 }
651 
652 /*----------------------------------------------------------------------------
653 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
654 | and significand `zSig', and returns the proper double-precision floating-
655 | point value corresponding to the abstract input.  Ordinarily, the abstract
656 | value is simply rounded and packed into the double-precision format, with
657 | the inexact exception raised if the abstract input cannot be represented
658 | exactly.  However, if the abstract value is too large, the overflow and
659 | inexact exceptions are raised and an infinity or maximal finite value is
660 | returned.  If the abstract value is too small, the input value is rounded to
661 | a subnormal number, and the underflow and inexact exceptions are raised if
662 | the abstract input cannot be represented exactly as a subnormal double-
663 | precision floating-point number.
664 |     The input significand `zSig' has its binary point between bits 62
665 | and 61, which is 10 bits to the left of the usual location.  This shifted
666 | significand must be normalized or smaller.  If `zSig' is not normalized,
667 | `zExp' must be 0; in that case, the result returned is a subnormal number,
668 | and it must not require rounding.  In the usual case that `zSig' is
669 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
670 | The handling of underflow and overflow follows the IEC/IEEE Standard for
671 | Binary Floating-Point Arithmetic.
672 *----------------------------------------------------------------------------*/
673 
674 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
675                                    float_status *status)
676 {
677     int8_t roundingMode;
678     flag roundNearestEven;
679     int roundIncrement, roundBits;
680     flag isTiny;
681 
682     roundingMode = status->float_rounding_mode;
683     roundNearestEven = ( roundingMode == float_round_nearest_even );
684     switch (roundingMode) {
685     case float_round_nearest_even:
686     case float_round_ties_away:
687         roundIncrement = 0x200;
688         break;
689     case float_round_to_zero:
690         roundIncrement = 0;
691         break;
692     case float_round_up:
693         roundIncrement = zSign ? 0 : 0x3ff;
694         break;
695     case float_round_down:
696         roundIncrement = zSign ? 0x3ff : 0;
697         break;
698     case float_round_to_odd:
699         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
700         break;
701     default:
702         abort();
703     }
704     roundBits = zSig & 0x3FF;
705     if ( 0x7FD <= (uint16_t) zExp ) {
706         if (    ( 0x7FD < zExp )
707              || (    ( zExp == 0x7FD )
708                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
709            ) {
710             bool overflow_to_inf = roundingMode != float_round_to_odd &&
711                                    roundIncrement != 0;
712             float_raise(float_flag_overflow | float_flag_inexact, status);
713             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
714         }
715         if ( zExp < 0 ) {
716             if (status->flush_to_zero) {
717                 float_raise(float_flag_output_denormal, status);
718                 return packFloat64(zSign, 0, 0);
719             }
720             isTiny =
721                    (status->float_detect_tininess
722                     == float_tininess_before_rounding)
723                 || ( zExp < -1 )
724                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
725             shift64RightJamming( zSig, - zExp, &zSig );
726             zExp = 0;
727             roundBits = zSig & 0x3FF;
728             if (isTiny && roundBits) {
729                 float_raise(float_flag_underflow, status);
730             }
731             if (roundingMode == float_round_to_odd) {
732                 /*
733                  * For round-to-odd case, the roundIncrement depends on
734                  * zSig which just changed.
735                  */
736                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
737             }
738         }
739     }
740     if (roundBits) {
741         status->float_exception_flags |= float_flag_inexact;
742     }
743     zSig = ( zSig + roundIncrement )>>10;
744     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
745     if ( zSig == 0 ) zExp = 0;
746     return packFloat64( zSign, zExp, zSig );
747 
748 }
749 
750 /*----------------------------------------------------------------------------
751 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
752 | and significand `zSig', and returns the proper double-precision floating-
753 | point value corresponding to the abstract input.  This routine is just like
754 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
755 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
756 | floating-point exponent.
757 *----------------------------------------------------------------------------*/
758 
759 static float64
760  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
761                               float_status *status)
762 {
763     int8_t shiftCount;
764 
765     shiftCount = countLeadingZeros64( zSig ) - 1;
766     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
767                                status);
768 
769 }
770 
771 /*----------------------------------------------------------------------------
772 | Returns the fraction bits of the extended double-precision floating-point
773 | value `a'.
774 *----------------------------------------------------------------------------*/
775 
776 static inline uint64_t extractFloatx80Frac( floatx80 a )
777 {
778 
779     return a.low;
780 
781 }
782 
783 /*----------------------------------------------------------------------------
784 | Returns the exponent bits of the extended double-precision floating-point
785 | value `a'.
786 *----------------------------------------------------------------------------*/
787 
788 static inline int32_t extractFloatx80Exp( floatx80 a )
789 {
790 
791     return a.high & 0x7FFF;
792 
793 }
794 
795 /*----------------------------------------------------------------------------
796 | Returns the sign bit of the extended double-precision floating-point value
797 | `a'.
798 *----------------------------------------------------------------------------*/
799 
800 static inline flag extractFloatx80Sign( floatx80 a )
801 {
802 
803     return a.high>>15;
804 
805 }
806 
807 /*----------------------------------------------------------------------------
808 | Normalizes the subnormal extended double-precision floating-point value
809 | represented by the denormalized significand `aSig'.  The normalized exponent
810 | and significand are stored at the locations pointed to by `zExpPtr' and
811 | `zSigPtr', respectively.
812 *----------------------------------------------------------------------------*/
813 
static void
 normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
{
    /* Left shift that brings the highest set bit of `aSig' up to bit 63. */
    const int8_t lshift = countLeadingZeros64(aSig);

    /* Each bit shifted in lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
824 
825 /*----------------------------------------------------------------------------
826 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
827 | extended double-precision floating-point value, returning the result.
828 *----------------------------------------------------------------------------*/
829 
830 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
831 {
832     floatx80 z;
833 
834     z.low = zSig;
835     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
836     return z;
837 
838 }
839 
840 /*----------------------------------------------------------------------------
841 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
842 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
843 | and returns the proper extended double-precision floating-point value
844 | corresponding to the abstract input.  Ordinarily, the abstract value is
845 | rounded and packed into the extended double-precision format, with the
846 | inexact exception raised if the abstract input cannot be represented
847 | exactly.  However, if the abstract value is too large, the overflow and
848 | inexact exceptions are raised and an infinity or maximal finite value is
849 | returned.  If the abstract value is too small, the input value is rounded to
850 | a subnormal number, and the underflow and inexact exceptions are raised if
851 | the abstract input cannot be represented exactly as a subnormal extended
852 | double-precision floating-point number.
853 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
854 | number of bits as single or double precision, respectively.  Otherwise, the
855 | result is rounded to the full precision of the extended double-precision
856 | format.
857 |     The input significand must be normalized or smaller.  If the input
858 | significand is not normalized, `zExp' must be 0; in that case, the result
859 | returned is a subnormal number, and it must not require rounding.  The
860 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
861 | Floating-Point Arithmetic.
862 *----------------------------------------------------------------------------*/
863 
864 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
865                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
866                                      float_status *status)
867 {
        /*
         * Structure: the first half handles roundingPrecision 32 and 64 by
         * rounding inside zSig0 with a mask; the second half (label
         * `precision80') handles every other roundingPrecision value and
         * rounds the full 64-bit zSig0 using zSig1 as the discarded bits.
         * Both halves share the `overflow' label, which lives inside the
         * second half.
         */
868     int8_t roundingMode;
869     flag roundNearestEven, increment, isTiny;
870     int64_t roundIncrement, roundMask, roundBits;
871 
872     roundingMode = status->float_rounding_mode;
873     roundNearestEven = ( roundingMode == float_round_nearest_even );
874     if ( roundingPrecision == 80 ) goto precision80;
875     if ( roundingPrecision == 64 ) {
            /* Low 11 bits of zSig0 are rounded away (53 bits kept). */
876         roundIncrement = LIT64( 0x0000000000000400 );
877         roundMask = LIT64( 0x00000000000007FF );
878     }
879     else if ( roundingPrecision == 32 ) {
            /* Low 40 bits of zSig0 are rounded away (24 bits kept). */
880         roundIncrement = LIT64( 0x0000008000000000 );
881         roundMask = LIT64( 0x000000FFFFFFFFFF );
882     }
883     else {
            /* Any unrecognised precision is treated as full precision. */
884         goto precision80;
885     }
        /* Fold any nonzero zSig1 into bit 0 of zSig0 as a sticky bit. */
886     zSig0 |= ( zSig1 != 0 );
        /*
         * For the nearest modes the half-ulp roundIncrement chosen above is
         * kept.  float_round_to_odd has no case here and reaches abort().
         */
887     switch (roundingMode) {
888     case float_round_nearest_even:
889     case float_round_ties_away:
890         break;
891     case float_round_to_zero:
892         roundIncrement = 0;
893         break;
894     case float_round_up:
895         roundIncrement = zSign ? 0 : roundMask;
896         break;
897     case float_round_down:
898         roundIncrement = zSign ? roundMask : 0;
899         break;
900     default:
901         abort();
902     }
903     roundBits = zSig0 & roundMask;
        /*
         * The unsigned compare is true when zExp >= 0x7FFE or zExp <= 0
         * (zExp - 1 wraps to a large unsigned value), i.e. for every exponent
         * that may need overflow or subnormal handling.
         */
904     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
            /*
             * Overflow: exponent too large, or at the largest exponent with
             * the rounding addition carrying out of the 64-bit significand.
             */
905         if (    ( 0x7FFE < zExp )
906              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
907            ) {
                /* Jumps into the precision80 half; roundMask is still in use there. */
908             goto overflow;
909         }
910         if ( zExp <= 0 ) {
911             if (status->flush_to_zero) {
912                 float_raise(float_flag_output_denormal, status);
913                 return packFloatx80(zSign, 0, 0);
914             }
                /*
                 * Tiny if tininess is detected before rounding, if zExp < 0,
                 * or if adding roundIncrement does not carry out of zSig0.
                 */
915             isTiny =
916                    (status->float_detect_tininess
917                     == float_tininess_before_rounding)
918                 || ( zExp < 0 )
919                 || ( zSig0 <= zSig0 + roundIncrement );
                /* Denormalise: shift right by 1 - zExp (helper defined elsewhere). */
920             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
921             zExp = 0;
922             roundBits = zSig0 & roundMask;
923             if (isTiny && roundBits) {
924                 float_raise(float_flag_underflow, status);
925             }
926             if (roundBits) {
927                 status->float_exception_flags |= float_flag_inexact;
928             }
929             zSig0 += roundIncrement;
                /* Rounding set bit 63: the exponent field becomes 1. */
930             if ( (int64_t) zSig0 < 0 ) zExp = 1;
                /*
                 * roundMask + 1 is the lowest kept bit.  On an exact tie in
                 * nearest-even mode that bit is added to the mask so the
                 * result's lowest kept bit is cleared.
                 */
931             roundIncrement = roundMask + 1;
932             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
933                 roundMask |= roundIncrement;
934             }
935             zSig0 &= ~ roundMask;
936             return packFloatx80( zSign, zExp, zSig0 );
937         }
938     }
939     if (roundBits) {
940         status->float_exception_flags |= float_flag_inexact;
941     }
942     zSig0 += roundIncrement;
        /* The addition wrapped: renormalise to 0x8000... and bump the exponent. */
943     if ( zSig0 < roundIncrement ) {
944         ++zExp;
945         zSig0 = LIT64( 0x8000000000000000 );
946     }
        /* Same tie-to-even mask widening as in the subnormal case above. */
947     roundIncrement = roundMask + 1;
948     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
949         roundMask |= roundIncrement;
950     }
951     zSig0 &= ~ roundMask;
952     if ( zSig0 == 0 ) zExp = 0;
953     return packFloatx80( zSign, zExp, zSig0 );
954  precision80:
        /*
         * Full-precision rounding: `increment' says whether zSig0 must be
         * bumped by one, judged from the discarded bits in zSig1.  Nearest
         * modes test the top bit of zSig1; directed modes test for any
         * nonzero zSig1 in the matching sign direction.
         * float_round_to_odd has no case here and reaches abort().
         */
955     switch (roundingMode) {
956     case float_round_nearest_even:
957     case float_round_ties_away:
958         increment = ((int64_t)zSig1 < 0);
959         break;
960     case float_round_to_zero:
961         increment = 0;
962         break;
963     case float_round_up:
964         increment = !zSign && zSig1;
965         break;
966     case float_round_down:
967         increment = zSign && zSig1;
968         break;
969     default:
970         abort();
971     }
        /* Same range test as above: zExp >= 0x7FFE or zExp <= 0. */
972     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
973         if (    ( 0x7FFE < zExp )
974              || (    ( zExp == 0x7FFE )
975                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
976                   && increment
977                 )
978            ) {
                /* With roundMask 0, ~roundMask below is an all-ones significand. */
979             roundMask = 0;
980  overflow:
                /*
                 * Shared overflow exit (also reached by goto from the
                 * reduced-precision half).  Round-to-zero, and directed
                 * rounding that points away from the result's sign, return
                 * exponent 0x7FFE with significand ~roundMask; every other
                 * mode returns exponent 0x7FFF with significand 0x8000....
                 */
981             float_raise(float_flag_overflow | float_flag_inexact, status);
982             if (    ( roundingMode == float_round_to_zero )
983                  || ( zSign && ( roundingMode == float_round_up ) )
984                  || ( ! zSign && ( roundingMode == float_round_down ) )
985                ) {
986                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
987             }
988             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
989         }
990         if ( zExp <= 0 ) {
                /*
                 * NOTE(review): unlike the reduced-precision subnormal branch,
                 * this branch does not test status->flush_to_zero - confirm
                 * that is intentional.
                 */
991             isTiny =
992                    (status->float_detect_tininess
993                     == float_tininess_before_rounding)
994                 || ( zExp < 0 )
995                 || ! increment
996                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
                /* Denormalise the 128-bit zSig0:zSig1 by 1 - zExp (helper defined elsewhere). */
997             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
998             zExp = 0;
999             if (isTiny && zSig1) {
1000                 float_raise(float_flag_underflow, status);
1001             }
1002             if (zSig1) {
1003                 status->float_exception_flags |= float_flag_inexact;
1004             }
                /* zSig1 changed in the shift, so the rounding decision is recomputed. */
1005             switch (roundingMode) {
1006             case float_round_nearest_even:
1007             case float_round_ties_away:
1008                 increment = ((int64_t)zSig1 < 0);
1009                 break;
1010             case float_round_to_zero:
1011                 increment = 0;
1012                 break;
1013             case float_round_up:
1014                 increment = !zSign && zSig1;
1015                 break;
1016             case float_round_down:
1017                 increment = zSign && zSig1;
1018                 break;
1019             default:
1020                 abort();
1021             }
1022             if ( increment ) {
1023                 ++zSig0;
                     /*
                      * In nearest-even mode, if nothing but the top bit of
                      * zSig1 is set (an exact tie), clear bit 0 of zSig0.
                      */
1024                 zSig0 &=
1025                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                     /* Bit 63 now set: the exponent field becomes 1. */
1026                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
1027             }
1028             return packFloatx80( zSign, zExp, zSig0 );
1029         }
1030     }
1031     if (zSig1) {
1032         status->float_exception_flags |= float_flag_inexact;
1033     }
1034     if ( increment ) {
1035         ++zSig0;
             /* Wrapped to zero: carry into the exponent and restore bit 63. */
1036         if ( zSig0 == 0 ) {
1037             ++zExp;
1038             zSig0 = LIT64( 0x8000000000000000 );
1039         }
1040         else {
                 /* Exact tie in nearest-even mode: clear bit 0. */
1041             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
1042         }
1043     }
1044     else {
             /* A zero significand is packed with a zero exponent. */
1045         if ( zSig0 == 0 ) zExp = 0;
1046     }
1047     return packFloatx80( zSign, zExp, zSig0 );
1048 
1049 }
1050 
1051 /*----------------------------------------------------------------------------
1052 | Takes an abstract floating-point value having sign `zSign', exponent
1053 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
1054 | and returns the proper extended double-precision floating-point value
1055 | corresponding to the abstract input.  This routine is just like
1056 | `roundAndPackFloatx80' except that the input significand does not have to be
1057 | normalized.
1058 *----------------------------------------------------------------------------*/
1059 
1060 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
1061                                               flag zSign, int32_t zExp,
1062                                               uint64_t zSig0, uint64_t zSig1,
1063                                               float_status *status)
1064 {
1065     int8_t shiftCount;
1066 
1067     if ( zSig0 == 0 ) {
1068         zSig0 = zSig1;
1069         zSig1 = 0;
1070         zExp -= 64;
1071     }
1072     shiftCount = countLeadingZeros64( zSig0 );
1073     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1074     zExp -= shiftCount;
1075     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1076                                 zSig0, zSig1, status);
1077 
1078 }
1079 
1080 /*----------------------------------------------------------------------------
1081 | Returns the least-significant 64 fraction bits of the quadruple-precision
1082 | floating-point value `a'.
1083 *----------------------------------------------------------------------------*/
1084 
1085 static inline uint64_t extractFloat128Frac1( float128 a )
1086 {
1087 
1088     return a.low;
1089 
1090 }
1091 
1092 /*----------------------------------------------------------------------------
1093 | Returns the most-significant 48 fraction bits of the quadruple-precision
1094 | floating-point value `a'.
1095 *----------------------------------------------------------------------------*/
1096 
1097 static inline uint64_t extractFloat128Frac0( float128 a )
1098 {
1099 
1100     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1101 
1102 }
1103 
1104 /*----------------------------------------------------------------------------
1105 | Returns the exponent bits of the quadruple-precision floating-point value
1106 | `a'.
1107 *----------------------------------------------------------------------------*/
1108 
1109 static inline int32_t extractFloat128Exp( float128 a )
1110 {
1111 
1112     return ( a.high>>48 ) & 0x7FFF;
1113 
1114 }
1115 
1116 /*----------------------------------------------------------------------------
1117 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1118 *----------------------------------------------------------------------------*/
1119 
1120 static inline flag extractFloat128Sign( float128 a )
1121 {
1122 
1123     return a.high>>63;
1124 
1125 }
1126 
1127 /*----------------------------------------------------------------------------
1128 | Normalizes the subnormal quadruple-precision floating-point value
1129 | represented by the denormalized significand formed by the concatenation of
1130 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1131 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1132 | significand are stored at the location pointed to by `zSig0Ptr', and the
1133 | least significant 64 bits of the normalized significand are stored at the
1134 | location pointed to by `zSig1Ptr'.
1135 *----------------------------------------------------------------------------*/
1136 
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t lshift;

    /*
     * In both cases the target is a leading 1 at bit 48 of the high word,
     * hence the "- 15" applied to the leading-zero count.
     */
    if (aSig0 != 0) {
        /* High word is populated: one short 128-bit left shift suffices. */
        lshift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /* High word is empty: everything comes from aSig1. */
    lshift = countLeadingZeros64(aSig1) - 15;
    *zExpPtr = -lshift - 63;
    if (lshift >= 0) {
        /* aSig1 fits entirely in the high output word. */
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* aSig1 straddles both output words; split it across them. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
}
1167 
1168 /*----------------------------------------------------------------------------
1169 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1170 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1171 | floating-point value, returning the result.  After being shifted into the
1172 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1173 | added together to form the most significant 64 bits of the result.  This
1174 | means that any integer portion of `zSig0' will be added into the exponent.
1175 | Since a properly normalized significand will have an integer portion equal
1176 | to 1, the `zExp' input should be 1 less than the desired result exponent
1177 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1178 | significand.
1179 *----------------------------------------------------------------------------*/
1180 
1181 static inline float128
1182  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1183 {
1184     float128 z;
1185 
1186     z.low = zSig1;
1187     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1188     return z;
1189 
1190 }
1191 
1192 /*----------------------------------------------------------------------------
1193 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1194 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1195 | and `zSig2', and returns the proper quadruple-precision floating-point value
1196 | corresponding to the abstract input.  Ordinarily, the abstract value is
1197 | simply rounded and packed into the quadruple-precision format, with the
1198 | inexact exception raised if the abstract input cannot be represented
1199 | exactly.  However, if the abstract value is too large, the overflow and
1200 | inexact exceptions are raised and an infinity or maximal finite value is
1201 | returned.  If the abstract value is too small, the input value is rounded to
1202 | a subnormal number, and the underflow and inexact exceptions are raised if
1203 | the abstract input cannot be represented exactly as a subnormal quadruple-
1204 | precision floating-point number.
1205 |     The input significand must be normalized or smaller.  If the input
1206 | significand is not normalized, `zExp' must be 0; in that case, the result
1207 | returned is a subnormal number, and it must not require rounding.  In the
1208 | usual case that the input significand is normalized, `zExp' must be 1 less
1209 | than the ``true'' floating-point exponent.  The handling of underflow and
1210 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1211 *----------------------------------------------------------------------------*/
1212 
1213 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1214                                      uint64_t zSig0, uint64_t zSig1,
1215                                      uint64_t zSig2, float_status *status)
1216 {
1217     int8_t roundingMode;
1218     flag roundNearestEven, increment, isTiny;
1219 
1220     roundingMode = status->float_rounding_mode;
1221     roundNearestEven = ( roundingMode == float_round_nearest_even );
1222     switch (roundingMode) {
1223     case float_round_nearest_even:
1224     case float_round_ties_away:
1225         increment = ((int64_t)zSig2 < 0);
1226         break;
1227     case float_round_to_zero:
1228         increment = 0;
1229         break;
1230     case float_round_up:
1231         increment = !zSign && zSig2;
1232         break;
1233     case float_round_down:
1234         increment = zSign && zSig2;
1235         break;
1236     case float_round_to_odd:
1237         increment = !(zSig1 & 0x1) && zSig2;
1238         break;
1239     default:
1240         abort();
1241     }
1242     if ( 0x7FFD <= (uint32_t) zExp ) {
1243         if (    ( 0x7FFD < zExp )
1244              || (    ( zExp == 0x7FFD )
1245                   && eq128(
1246                          LIT64( 0x0001FFFFFFFFFFFF ),
1247                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1248                          zSig0,
1249                          zSig1
1250                      )
1251                   && increment
1252                 )
1253            ) {
1254             float_raise(float_flag_overflow | float_flag_inexact, status);
1255             if (    ( roundingMode == float_round_to_zero )
1256                  || ( zSign && ( roundingMode == float_round_up ) )
1257                  || ( ! zSign && ( roundingMode == float_round_down ) )
1258                  || (roundingMode == float_round_to_odd)
1259                ) {
1260                 return
1261                     packFloat128(
1262                         zSign,
1263                         0x7FFE,
1264                         LIT64( 0x0000FFFFFFFFFFFF ),
1265                         LIT64( 0xFFFFFFFFFFFFFFFF )
1266                     );
1267             }
1268             return packFloat128( zSign, 0x7FFF, 0, 0 );
1269         }
1270         if ( zExp < 0 ) {
1271             if (status->flush_to_zero) {
1272                 float_raise(float_flag_output_denormal, status);
1273                 return packFloat128(zSign, 0, 0, 0);
1274             }
1275             isTiny =
1276                    (status->float_detect_tininess
1277                     == float_tininess_before_rounding)
1278                 || ( zExp < -1 )
1279                 || ! increment
1280                 || lt128(
1281                        zSig0,
1282                        zSig1,
1283                        LIT64( 0x0001FFFFFFFFFFFF ),
1284                        LIT64( 0xFFFFFFFFFFFFFFFF )
1285                    );
1286             shift128ExtraRightJamming(
1287                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1288             zExp = 0;
1289             if (isTiny && zSig2) {
1290                 float_raise(float_flag_underflow, status);
1291             }
1292             switch (roundingMode) {
1293             case float_round_nearest_even:
1294             case float_round_ties_away:
1295                 increment = ((int64_t)zSig2 < 0);
1296                 break;
1297             case float_round_to_zero:
1298                 increment = 0;
1299                 break;
1300             case float_round_up:
1301                 increment = !zSign && zSig2;
1302                 break;
1303             case float_round_down:
1304                 increment = zSign && zSig2;
1305                 break;
1306             case float_round_to_odd:
1307                 increment = !(zSig1 & 0x1) && zSig2;
1308                 break;
1309             default:
1310                 abort();
1311             }
1312         }
1313     }
1314     if (zSig2) {
1315         status->float_exception_flags |= float_flag_inexact;
1316     }
1317     if ( increment ) {
1318         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1319         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1320     }
1321     else {
1322         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1323     }
1324     return packFloat128( zSign, zExp, zSig0, zSig1 );
1325 
1326 }
1327 
1328 /*----------------------------------------------------------------------------
1329 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1330 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1331 | returns the proper quadruple-precision floating-point value corresponding
1332 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1333 | except that the input significand has fewer bits and does not have to be
1334 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1335 | point exponent.
1336 *----------------------------------------------------------------------------*/
1337 
1338 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1339                                               uint64_t zSig0, uint64_t zSig1,
1340                                               float_status *status)
1341 {
1342     int8_t shiftCount;
1343     uint64_t zSig2;
1344 
1345     if ( zSig0 == 0 ) {
1346         zSig0 = zSig1;
1347         zSig1 = 0;
1348         zExp -= 64;
1349     }
1350     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1351     if ( 0 <= shiftCount ) {
1352         zSig2 = 0;
1353         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1354     }
1355     else {
1356         shift128ExtraRightJamming(
1357             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1358     }
1359     zExp -= shiftCount;
1360     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1361 
1362 }
1363 
1364 /*----------------------------------------------------------------------------
1365 | Returns the result of converting the 32-bit two's complement integer `a'
1366 | to the single-precision floating-point format.  The conversion is performed
1367 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1368 *----------------------------------------------------------------------------*/
1369 
1370 float32 int32_to_float32(int32_t a, float_status *status)
1371 {
1372     flag zSign;
1373 
1374     if ( a == 0 ) return float32_zero;
1375     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1376     zSign = ( a < 0 );
1377     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1378 }
1379 
1380 /*----------------------------------------------------------------------------
1381 | Returns the result of converting the 32-bit two's complement integer `a'
1382 | to the double-precision floating-point format.  The conversion is performed
1383 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1384 *----------------------------------------------------------------------------*/
1385 
1386 float64 int32_to_float64(int32_t a, float_status *status)
1387 {
1388     flag zSign;
1389     uint32_t absA;
1390     int8_t shiftCount;
1391     uint64_t zSig;
1392 
1393     if ( a == 0 ) return float64_zero;
1394     zSign = ( a < 0 );
1395     absA = zSign ? - a : a;
1396     shiftCount = countLeadingZeros32( absA ) + 21;
1397     zSig = absA;
1398     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1399 
1400 }
1401 
1402 /*----------------------------------------------------------------------------
1403 | Returns the result of converting the 32-bit two's complement integer `a'
1404 | to the extended double-precision floating-point format.  The conversion
1405 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1406 | Arithmetic.
1407 *----------------------------------------------------------------------------*/
1408 
1409 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1410 {
1411     flag zSign;
1412     uint32_t absA;
1413     int8_t shiftCount;
1414     uint64_t zSig;
1415 
1416     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1417     zSign = ( a < 0 );
1418     absA = zSign ? - a : a;
1419     shiftCount = countLeadingZeros32( absA ) + 32;
1420     zSig = absA;
1421     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1422 
1423 }
1424 
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 32-bit two's complement integer `a' to
1427 | the quadruple-precision floating-point format.  The conversion is performed
1428 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1429 *----------------------------------------------------------------------------*/
1430 
1431 float128 int32_to_float128(int32_t a, float_status *status)
1432 {
1433     flag zSign;
1434     uint32_t absA;
1435     int8_t shiftCount;
1436     uint64_t zSig0;
1437 
1438     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1439     zSign = ( a < 0 );
1440     absA = zSign ? - a : a;
1441     shiftCount = countLeadingZeros32( absA ) + 17;
1442     zSig0 = absA;
1443     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1444 
1445 }
1446 
1447 /*----------------------------------------------------------------------------
1448 | Returns the result of converting the 64-bit two's complement integer `a'
1449 | to the single-precision floating-point format.  The conversion is performed
1450 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1451 *----------------------------------------------------------------------------*/
1452 
1453 float32 int64_to_float32(int64_t a, float_status *status)
1454 {
1455     flag zSign;
1456     uint64_t absA;
1457     int8_t shiftCount;
1458 
1459     if ( a == 0 ) return float32_zero;
1460     zSign = ( a < 0 );
1461     absA = zSign ? - a : a;
1462     shiftCount = countLeadingZeros64( absA ) - 40;
1463     if ( 0 <= shiftCount ) {
1464         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1465     }
1466     else {
1467         shiftCount += 7;
1468         if ( shiftCount < 0 ) {
1469             shift64RightJamming( absA, - shiftCount, &absA );
1470         }
1471         else {
1472             absA <<= shiftCount;
1473         }
1474         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1475     }
1476 
1477 }
1478 
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit two's complement integer `a'
1481 | to the double-precision floating-point format.  The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484 
1485 float64 int64_to_float64(int64_t a, float_status *status)
1486 {
1487     flag zSign;
1488 
1489     if ( a == 0 ) return float64_zero;
1490     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1491         return packFloat64( 1, 0x43E, 0 );
1492     }
1493     zSign = ( a < 0 );
1494     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1495 }
1496 
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the 64-bit two's complement integer `a'
1499 | to the extended double-precision floating-point format.  The conversion
1500 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic.
1502 *----------------------------------------------------------------------------*/
1503 
1504 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1505 {
1506     flag zSign;
1507     uint64_t absA;
1508     int8_t shiftCount;
1509 
1510     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1511     zSign = ( a < 0 );
1512     absA = zSign ? - a : a;
1513     shiftCount = countLeadingZeros64( absA );
1514     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1515 
1516 }
1517 
1518 /*----------------------------------------------------------------------------
1519 | Returns the result of converting the 64-bit two's complement integer `a' to
1520 | the quadruple-precision floating-point format.  The conversion is performed
1521 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1522 *----------------------------------------------------------------------------*/
1523 
1524 float128 int64_to_float128(int64_t a, float_status *status)
1525 {
1526     flag zSign;
1527     uint64_t absA;
1528     int8_t shiftCount;
1529     int32_t zExp;
1530     uint64_t zSig0, zSig1;
1531 
1532     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1533     zSign = ( a < 0 );
1534     absA = zSign ? - a : a;
1535     shiftCount = countLeadingZeros64( absA ) + 49;
1536     zExp = 0x406E - shiftCount;
1537     if ( 64 <= shiftCount ) {
1538         zSig1 = 0;
1539         zSig0 = absA;
1540         shiftCount -= 64;
1541     }
1542     else {
1543         zSig1 = absA;
1544         zSig0 = 0;
1545     }
1546     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1547     return packFloat128( zSign, zExp, zSig0, zSig1 );
1548 
1549 }
1550 
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the 64-bit unsigned integer `a'
1553 | to the single-precision floating-point format.  The conversion is performed
1554 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1555 *----------------------------------------------------------------------------*/
1556 
1557 float32 uint64_to_float32(uint64_t a, float_status *status)
1558 {
1559     int shiftcount;
1560 
1561     if (a == 0) {
1562         return float32_zero;
1563     }
1564 
1565     /* Determine (left) shift needed to put first set bit into bit posn 23
1566      * (since packFloat32() expects the binary point between bits 23 and 22);
1567      * this is the fast case for smallish numbers.
1568      */
1569     shiftcount = countLeadingZeros64(a) - 40;
1570     if (shiftcount >= 0) {
1571         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1572     }
1573     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1574      * expects the binary point between bits 30 and 29, hence the + 7.
1575      */
1576     shiftcount += 7;
1577     if (shiftcount < 0) {
1578         shift64RightJamming(a, -shiftcount, &a);
1579     } else {
1580         a <<= shiftcount;
1581     }
1582 
1583     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1584 }
1585 
1586 /*----------------------------------------------------------------------------
1587 | Returns the result of converting the 64-bit unsigned integer `a'
1588 | to the double-precision floating-point format.  The conversion is performed
1589 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1590 *----------------------------------------------------------------------------*/
1591 
1592 float64 uint64_to_float64(uint64_t a, float_status *status)
1593 {
1594     int exp = 0x43C;
1595     int shiftcount;
1596 
1597     if (a == 0) {
1598         return float64_zero;
1599     }
1600 
1601     shiftcount = countLeadingZeros64(a) - 1;
1602     if (shiftcount < 0) {
1603         shift64RightJamming(a, -shiftcount, &a);
1604     } else {
1605         a <<= shiftcount;
1606     }
1607     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1608 }
1609 
1610 /*----------------------------------------------------------------------------
1611 | Returns the result of converting the 64-bit unsigned integer `a'
1612 | to the quadruple-precision floating-point format.  The conversion is performed
1613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1614 *----------------------------------------------------------------------------*/
1615 
1616 float128 uint64_to_float128(uint64_t a, float_status *status)
1617 {
1618     if (a == 0) {
1619         return float128_zero;
1620     }
1621     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1622 }
1623 
1624 /*----------------------------------------------------------------------------
1625 | Returns the result of converting the single-precision floating-point value
1626 | `a' to the 32-bit two's complement integer format.  The conversion is
1627 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1628 | Arithmetic---which means in particular that the conversion is rounded
1629 | according to the current rounding mode.  If `a' is a NaN, the largest
1630 | positive integer is returned.  Otherwise, if the conversion overflows, the
1631 | largest integer with the same sign as `a' is returned.
1632 *----------------------------------------------------------------------------*/
1633 
1634 int32_t float32_to_int32(float32 a, float_status *status)
1635 {
1636     flag aSign;
1637     int aExp;
1638     int shiftCount;
1639     uint32_t aSig;
1640     uint64_t aSig64;
1641 
1642     a = float32_squash_input_denormal(a, status);
1643     aSig = extractFloat32Frac( a );
1644     aExp = extractFloat32Exp( a );
1645     aSign = extractFloat32Sign( a );
1646     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1647     if ( aExp ) aSig |= 0x00800000;
1648     shiftCount = 0xAF - aExp;
1649     aSig64 = aSig;
1650     aSig64 <<= 32;
1651     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1652     return roundAndPackInt32(aSign, aSig64, status);
1653 
1654 }
1655 
1656 /*----------------------------------------------------------------------------
1657 | Returns the result of converting the single-precision floating-point value
1658 | `a' to the 32-bit two's complement integer format.  The conversion is
1659 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1660 | Arithmetic, except that the conversion is always rounded toward zero.
1661 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1662 | the conversion overflows, the largest integer with the same sign as `a' is
1663 | returned.
1664 *----------------------------------------------------------------------------*/
1665 
1666 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1667 {
1668     flag aSign;
1669     int aExp;
1670     int shiftCount;
1671     uint32_t aSig;
1672     int32_t z;
1673     a = float32_squash_input_denormal(a, status);
1674 
1675     aSig = extractFloat32Frac( a );
1676     aExp = extractFloat32Exp( a );
1677     aSign = extractFloat32Sign( a );
1678     shiftCount = aExp - 0x9E;
1679     if ( 0 <= shiftCount ) {
1680         if ( float32_val(a) != 0xCF000000 ) {
1681             float_raise(float_flag_invalid, status);
1682             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1683         }
1684         return (int32_t) 0x80000000;
1685     }
1686     else if ( aExp <= 0x7E ) {
1687         if (aExp | aSig) {
1688             status->float_exception_flags |= float_flag_inexact;
1689         }
1690         return 0;
1691     }
1692     aSig = ( aSig | 0x00800000 )<<8;
1693     z = aSig>>( - shiftCount );
1694     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1695         status->float_exception_flags |= float_flag_inexact;
1696     }
1697     if ( aSign ) z = - z;
1698     return z;
1699 
1700 }
1701 
1702 /*----------------------------------------------------------------------------
1703 | Returns the result of converting the single-precision floating-point value
1704 | `a' to the 16-bit two's complement integer format.  The conversion is
1705 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1706 | Arithmetic, except that the conversion is always rounded toward zero.
1707 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1708 | the conversion overflows, the largest integer with the same sign as `a' is
1709 | returned.
1710 *----------------------------------------------------------------------------*/
1711 
1712 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1713 {
1714     flag aSign;
1715     int aExp;
1716     int shiftCount;
1717     uint32_t aSig;
1718     int32_t z;
1719 
1720     aSig = extractFloat32Frac( a );
1721     aExp = extractFloat32Exp( a );
1722     aSign = extractFloat32Sign( a );
1723     shiftCount = aExp - 0x8E;
1724     if ( 0 <= shiftCount ) {
1725         if ( float32_val(a) != 0xC7000000 ) {
1726             float_raise(float_flag_invalid, status);
1727             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1728                 return 0x7FFF;
1729             }
1730         }
1731         return (int32_t) 0xffff8000;
1732     }
1733     else if ( aExp <= 0x7E ) {
1734         if ( aExp | aSig ) {
1735             status->float_exception_flags |= float_flag_inexact;
1736         }
1737         return 0;
1738     }
1739     shiftCount -= 0x10;
1740     aSig = ( aSig | 0x00800000 )<<8;
1741     z = aSig>>( - shiftCount );
1742     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1743         status->float_exception_flags |= float_flag_inexact;
1744     }
1745     if ( aSign ) {
1746         z = - z;
1747     }
1748     return z;
1749 
1750 }
1751 
1752 /*----------------------------------------------------------------------------
1753 | Returns the result of converting the single-precision floating-point value
1754 | `a' to the 64-bit two's complement integer format.  The conversion is
1755 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1756 | Arithmetic---which means in particular that the conversion is rounded
1757 | according to the current rounding mode.  If `a' is a NaN, the largest
1758 | positive integer is returned.  Otherwise, if the conversion overflows, the
1759 | largest integer with the same sign as `a' is returned.
1760 *----------------------------------------------------------------------------*/
1761 
1762 int64_t float32_to_int64(float32 a, float_status *status)
1763 {
1764     flag aSign;
1765     int aExp;
1766     int shiftCount;
1767     uint32_t aSig;
1768     uint64_t aSig64, aSigExtra;
1769     a = float32_squash_input_denormal(a, status);
1770 
1771     aSig = extractFloat32Frac( a );
1772     aExp = extractFloat32Exp( a );
1773     aSign = extractFloat32Sign( a );
1774     shiftCount = 0xBE - aExp;
1775     if ( shiftCount < 0 ) {
1776         float_raise(float_flag_invalid, status);
1777         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1778             return LIT64( 0x7FFFFFFFFFFFFFFF );
1779         }
1780         return (int64_t) LIT64( 0x8000000000000000 );
1781     }
1782     if ( aExp ) aSig |= 0x00800000;
1783     aSig64 = aSig;
1784     aSig64 <<= 40;
1785     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1786     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1787 
1788 }
1789 
1790 /*----------------------------------------------------------------------------
1791 | Returns the result of converting the single-precision floating-point value
1792 | `a' to the 64-bit unsigned integer format.  The conversion is
1793 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1794 | Arithmetic---which means in particular that the conversion is rounded
1795 | according to the current rounding mode.  If `a' is a NaN, the largest
1796 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1797 | largest unsigned integer is returned.  If the 'a' is negative, the result
1798 | is rounded and zero is returned; values that do not round to zero will
1799 | raise the inexact exception flag.
1800 *----------------------------------------------------------------------------*/
1801 
1802 uint64_t float32_to_uint64(float32 a, float_status *status)
1803 {
1804     flag aSign;
1805     int aExp;
1806     int shiftCount;
1807     uint32_t aSig;
1808     uint64_t aSig64, aSigExtra;
1809     a = float32_squash_input_denormal(a, status);
1810 
1811     aSig = extractFloat32Frac(a);
1812     aExp = extractFloat32Exp(a);
1813     aSign = extractFloat32Sign(a);
1814     if ((aSign) && (aExp > 126)) {
1815         float_raise(float_flag_invalid, status);
1816         if (float32_is_any_nan(a)) {
1817             return LIT64(0xFFFFFFFFFFFFFFFF);
1818         } else {
1819             return 0;
1820         }
1821     }
1822     shiftCount = 0xBE - aExp;
1823     if (aExp) {
1824         aSig |= 0x00800000;
1825     }
1826     if (shiftCount < 0) {
1827         float_raise(float_flag_invalid, status);
1828         return LIT64(0xFFFFFFFFFFFFFFFF);
1829     }
1830 
1831     aSig64 = aSig;
1832     aSig64 <<= 40;
1833     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1834     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1835 }
1836 
1837 /*----------------------------------------------------------------------------
1838 | Returns the result of converting the single-precision floating-point value
1839 | `a' to the 64-bit unsigned integer format.  The conversion is
1840 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1841 | Arithmetic, except that the conversion is always rounded toward zero.  If
1842 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1843 | conversion overflows, the largest unsigned integer is returned.  If the
1844 | 'a' is negative, the result is rounded and zero is returned; values that do
1845 | not round to zero will raise the inexact flag.
1846 *----------------------------------------------------------------------------*/
1847 
1848 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1849 {
1850     signed char current_rounding_mode = status->float_rounding_mode;
1851     set_float_rounding_mode(float_round_to_zero, status);
1852     int64_t v = float32_to_uint64(a, status);
1853     set_float_rounding_mode(current_rounding_mode, status);
1854     return v;
1855 }
1856 
1857 /*----------------------------------------------------------------------------
1858 | Returns the result of converting the single-precision floating-point value
1859 | `a' to the 64-bit two's complement integer format.  The conversion is
1860 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1861 | Arithmetic, except that the conversion is always rounded toward zero.  If
1862 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1863 | conversion overflows, the largest integer with the same sign as `a' is
1864 | returned.
1865 *----------------------------------------------------------------------------*/
1866 
1867 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1868 {
1869     flag aSign;
1870     int aExp;
1871     int shiftCount;
1872     uint32_t aSig;
1873     uint64_t aSig64;
1874     int64_t z;
1875     a = float32_squash_input_denormal(a, status);
1876 
1877     aSig = extractFloat32Frac( a );
1878     aExp = extractFloat32Exp( a );
1879     aSign = extractFloat32Sign( a );
1880     shiftCount = aExp - 0xBE;
1881     if ( 0 <= shiftCount ) {
1882         if ( float32_val(a) != 0xDF000000 ) {
1883             float_raise(float_flag_invalid, status);
1884             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1885                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1886             }
1887         }
1888         return (int64_t) LIT64( 0x8000000000000000 );
1889     }
1890     else if ( aExp <= 0x7E ) {
1891         if (aExp | aSig) {
1892             status->float_exception_flags |= float_flag_inexact;
1893         }
1894         return 0;
1895     }
1896     aSig64 = aSig | 0x00800000;
1897     aSig64 <<= 40;
1898     z = aSig64>>( - shiftCount );
1899     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1900         status->float_exception_flags |= float_flag_inexact;
1901     }
1902     if ( aSign ) z = - z;
1903     return z;
1904 
1905 }
1906 
1907 /*----------------------------------------------------------------------------
1908 | Returns the result of converting the single-precision floating-point value
1909 | `a' to the double-precision floating-point format.  The conversion is
1910 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1911 | Arithmetic.
1912 *----------------------------------------------------------------------------*/
1913 
1914 float64 float32_to_float64(float32 a, float_status *status)
1915 {
1916     flag aSign;
1917     int aExp;
1918     uint32_t aSig;
1919     a = float32_squash_input_denormal(a, status);
1920 
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     if ( aExp == 0xFF ) {
1925         if (aSig) {
1926             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1927         }
1928         return packFloat64( aSign, 0x7FF, 0 );
1929     }
1930     if ( aExp == 0 ) {
1931         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1932         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1933         --aExp;
1934     }
1935     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1936 
1937 }
1938 
1939 /*----------------------------------------------------------------------------
1940 | Returns the result of converting the single-precision floating-point value
1941 | `a' to the extended double-precision floating-point format.  The conversion
1942 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1943 | Arithmetic.
1944 *----------------------------------------------------------------------------*/
1945 
1946 floatx80 float32_to_floatx80(float32 a, float_status *status)
1947 {
1948     flag aSign;
1949     int aExp;
1950     uint32_t aSig;
1951 
1952     a = float32_squash_input_denormal(a, status);
1953     aSig = extractFloat32Frac( a );
1954     aExp = extractFloat32Exp( a );
1955     aSign = extractFloat32Sign( a );
1956     if ( aExp == 0xFF ) {
1957         if (aSig) {
1958             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1959         }
1960         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1961     }
1962     if ( aExp == 0 ) {
1963         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1964         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1965     }
1966     aSig |= 0x00800000;
1967     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1968 
1969 }
1970 
1971 /*----------------------------------------------------------------------------
1972 | Returns the result of converting the single-precision floating-point value
1973 | `a' to the double-precision floating-point format.  The conversion is
1974 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1975 | Arithmetic.
1976 *----------------------------------------------------------------------------*/
1977 
1978 float128 float32_to_float128(float32 a, float_status *status)
1979 {
1980     flag aSign;
1981     int aExp;
1982     uint32_t aSig;
1983 
1984     a = float32_squash_input_denormal(a, status);
1985     aSig = extractFloat32Frac( a );
1986     aExp = extractFloat32Exp( a );
1987     aSign = extractFloat32Sign( a );
1988     if ( aExp == 0xFF ) {
1989         if (aSig) {
1990             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1991         }
1992         return packFloat128( aSign, 0x7FFF, 0, 0 );
1993     }
1994     if ( aExp == 0 ) {
1995         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1996         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1997         --aExp;
1998     }
1999     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
2000 
2001 }
2002 
2003 /*----------------------------------------------------------------------------
2004 | Rounds the single-precision floating-point value `a' to an integer, and
2005 | returns the result as a single-precision floating-point value.  The
2006 | operation is performed according to the IEC/IEEE Standard for Binary
2007 | Floating-Point Arithmetic.
2008 *----------------------------------------------------------------------------*/
2009 
2010 float32 float32_round_to_int(float32 a, float_status *status)
2011 {
2012     flag aSign;
2013     int aExp;
2014     uint32_t lastBitMask, roundBitsMask;
2015     uint32_t z;
2016     a = float32_squash_input_denormal(a, status);
2017 
2018     aExp = extractFloat32Exp( a );
2019     if ( 0x96 <= aExp ) {
2020         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
2021             return propagateFloat32NaN(a, a, status);
2022         }
2023         return a;
2024     }
2025     if ( aExp <= 0x7E ) {
2026         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
2027         status->float_exception_flags |= float_flag_inexact;
2028         aSign = extractFloat32Sign( a );
2029         switch (status->float_rounding_mode) {
2030          case float_round_nearest_even:
2031             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
2032                 return packFloat32( aSign, 0x7F, 0 );
2033             }
2034             break;
2035         case float_round_ties_away:
2036             if (aExp == 0x7E) {
2037                 return packFloat32(aSign, 0x7F, 0);
2038             }
2039             break;
2040          case float_round_down:
2041             return make_float32(aSign ? 0xBF800000 : 0);
2042          case float_round_up:
2043             return make_float32(aSign ? 0x80000000 : 0x3F800000);
2044         }
2045         return packFloat32( aSign, 0, 0 );
2046     }
2047     lastBitMask = 1;
2048     lastBitMask <<= 0x96 - aExp;
2049     roundBitsMask = lastBitMask - 1;
2050     z = float32_val(a);
2051     switch (status->float_rounding_mode) {
2052     case float_round_nearest_even:
2053         z += lastBitMask>>1;
2054         if ((z & roundBitsMask) == 0) {
2055             z &= ~lastBitMask;
2056         }
2057         break;
2058     case float_round_ties_away:
2059         z += lastBitMask >> 1;
2060         break;
2061     case float_round_to_zero:
2062         break;
2063     case float_round_up:
2064         if (!extractFloat32Sign(make_float32(z))) {
2065             z += roundBitsMask;
2066         }
2067         break;
2068     case float_round_down:
2069         if (extractFloat32Sign(make_float32(z))) {
2070             z += roundBitsMask;
2071         }
2072         break;
2073     default:
2074         abort();
2075     }
2076     z &= ~ roundBitsMask;
2077     if (z != float32_val(a)) {
2078         status->float_exception_flags |= float_flag_inexact;
2079     }
2080     return make_float32(z);
2081 
2082 }
2083 
2084 /*----------------------------------------------------------------------------
2085 | Returns the result of adding the absolute values of the single-precision
2086 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2087 | before being returned.  `zSign' is ignored if the result is a NaN.
2088 | The addition is performed according to the IEC/IEEE Standard for Binary
2089 | Floating-Point Arithmetic.
2090 *----------------------------------------------------------------------------*/
2091 
2092 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2093                               float_status *status)
2094 {
2095     int aExp, bExp, zExp;
2096     uint32_t aSig, bSig, zSig;
2097     int expDiff;
2098 
2099     aSig = extractFloat32Frac( a );
2100     aExp = extractFloat32Exp( a );
2101     bSig = extractFloat32Frac( b );
2102     bExp = extractFloat32Exp( b );
2103     expDiff = aExp - bExp;
2104     aSig <<= 6;
2105     bSig <<= 6;
2106     if ( 0 < expDiff ) {
2107         if ( aExp == 0xFF ) {
2108             if (aSig) {
2109                 return propagateFloat32NaN(a, b, status);
2110             }
2111             return a;
2112         }
2113         if ( bExp == 0 ) {
2114             --expDiff;
2115         }
2116         else {
2117             bSig |= 0x20000000;
2118         }
2119         shift32RightJamming( bSig, expDiff, &bSig );
2120         zExp = aExp;
2121     }
2122     else if ( expDiff < 0 ) {
2123         if ( bExp == 0xFF ) {
2124             if (bSig) {
2125                 return propagateFloat32NaN(a, b, status);
2126             }
2127             return packFloat32( zSign, 0xFF, 0 );
2128         }
2129         if ( aExp == 0 ) {
2130             ++expDiff;
2131         }
2132         else {
2133             aSig |= 0x20000000;
2134         }
2135         shift32RightJamming( aSig, - expDiff, &aSig );
2136         zExp = bExp;
2137     }
2138     else {
2139         if ( aExp == 0xFF ) {
2140             if (aSig | bSig) {
2141                 return propagateFloat32NaN(a, b, status);
2142             }
2143             return a;
2144         }
2145         if ( aExp == 0 ) {
2146             if (status->flush_to_zero) {
2147                 if (aSig | bSig) {
2148                     float_raise(float_flag_output_denormal, status);
2149                 }
2150                 return packFloat32(zSign, 0, 0);
2151             }
2152             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2153         }
2154         zSig = 0x40000000 + aSig + bSig;
2155         zExp = aExp;
2156         goto roundAndPack;
2157     }
2158     aSig |= 0x20000000;
2159     zSig = ( aSig + bSig )<<1;
2160     --zExp;
2161     if ( (int32_t) zSig < 0 ) {
2162         zSig = aSig + bSig;
2163         ++zExp;
2164     }
2165  roundAndPack:
2166     return roundAndPackFloat32(zSign, zExp, zSig, status);
2167 
2168 }
2169 
2170 /*----------------------------------------------------------------------------
2171 | Returns the result of subtracting the absolute values of the single-
2172 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2173 | difference is negated before being returned.  `zSign' is ignored if the
2174 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2175 | Standard for Binary Floating-Point Arithmetic.
2176 *----------------------------------------------------------------------------*/
2177 
2178 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2179                               float_status *status)
2180 {
2181     int aExp, bExp, zExp;
2182     uint32_t aSig, bSig, zSig;
2183     int expDiff;
2184 
2185     aSig = extractFloat32Frac( a );
2186     aExp = extractFloat32Exp( a );
2187     bSig = extractFloat32Frac( b );
2188     bExp = extractFloat32Exp( b );
2189     expDiff = aExp - bExp;
2190     aSig <<= 7;
2191     bSig <<= 7;
2192     if ( 0 < expDiff ) goto aExpBigger;
2193     if ( expDiff < 0 ) goto bExpBigger;
2194     if ( aExp == 0xFF ) {
2195         if (aSig | bSig) {
2196             return propagateFloat32NaN(a, b, status);
2197         }
2198         float_raise(float_flag_invalid, status);
2199         return float32_default_nan(status);
2200     }
2201     if ( aExp == 0 ) {
2202         aExp = 1;
2203         bExp = 1;
2204     }
2205     if ( bSig < aSig ) goto aBigger;
2206     if ( aSig < bSig ) goto bBigger;
2207     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2208  bExpBigger:
2209     if ( bExp == 0xFF ) {
2210         if (bSig) {
2211             return propagateFloat32NaN(a, b, status);
2212         }
2213         return packFloat32( zSign ^ 1, 0xFF, 0 );
2214     }
2215     if ( aExp == 0 ) {
2216         ++expDiff;
2217     }
2218     else {
2219         aSig |= 0x40000000;
2220     }
2221     shift32RightJamming( aSig, - expDiff, &aSig );
2222     bSig |= 0x40000000;
2223  bBigger:
2224     zSig = bSig - aSig;
2225     zExp = bExp;
2226     zSign ^= 1;
2227     goto normalizeRoundAndPack;
2228  aExpBigger:
2229     if ( aExp == 0xFF ) {
2230         if (aSig) {
2231             return propagateFloat32NaN(a, b, status);
2232         }
2233         return a;
2234     }
2235     if ( bExp == 0 ) {
2236         --expDiff;
2237     }
2238     else {
2239         bSig |= 0x40000000;
2240     }
2241     shift32RightJamming( bSig, expDiff, &bSig );
2242     aSig |= 0x40000000;
2243  aBigger:
2244     zSig = aSig - bSig;
2245     zExp = aExp;
2246  normalizeRoundAndPack:
2247     --zExp;
2248     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2249 
2250 }
2251 
2252 /*----------------------------------------------------------------------------
2253 | Returns the result of adding the single-precision floating-point values `a'
2254 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2255 | Binary Floating-Point Arithmetic.
2256 *----------------------------------------------------------------------------*/
2257 
2258 float32 float32_add(float32 a, float32 b, float_status *status)
2259 {
2260     flag aSign, bSign;
2261     a = float32_squash_input_denormal(a, status);
2262     b = float32_squash_input_denormal(b, status);
2263 
2264     aSign = extractFloat32Sign( a );
2265     bSign = extractFloat32Sign( b );
2266     if ( aSign == bSign ) {
2267         return addFloat32Sigs(a, b, aSign, status);
2268     }
2269     else {
2270         return subFloat32Sigs(a, b, aSign, status);
2271     }
2272 
2273 }
2274 
2275 /*----------------------------------------------------------------------------
2276 | Returns the result of subtracting the single-precision floating-point values
2277 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2278 | for Binary Floating-Point Arithmetic.
2279 *----------------------------------------------------------------------------*/
2280 
2281 float32 float32_sub(float32 a, float32 b, float_status *status)
2282 {
2283     flag aSign, bSign;
2284     a = float32_squash_input_denormal(a, status);
2285     b = float32_squash_input_denormal(b, status);
2286 
2287     aSign = extractFloat32Sign( a );
2288     bSign = extractFloat32Sign( b );
2289     if ( aSign == bSign ) {
2290         return subFloat32Sigs(a, b, aSign, status);
2291     }
2292     else {
2293         return addFloat32Sigs(a, b, aSign, status);
2294     }
2295 
2296 }
2297 
2298 /*----------------------------------------------------------------------------
2299 | Returns the result of multiplying the single-precision floating-point values
2300 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2301 | for Binary Floating-Point Arithmetic.
2302 *----------------------------------------------------------------------------*/
2303 
2304 float32 float32_mul(float32 a, float32 b, float_status *status)
2305 {
2306     flag aSign, bSign, zSign;
2307     int aExp, bExp, zExp;
2308     uint32_t aSig, bSig;
2309     uint64_t zSig64;
2310     uint32_t zSig;
2311 
2312     a = float32_squash_input_denormal(a, status);
2313     b = float32_squash_input_denormal(b, status);
2314 
2315     aSig = extractFloat32Frac( a );
2316     aExp = extractFloat32Exp( a );
2317     aSign = extractFloat32Sign( a );
2318     bSig = extractFloat32Frac( b );
2319     bExp = extractFloat32Exp( b );
2320     bSign = extractFloat32Sign( b );
2321     zSign = aSign ^ bSign;
2322     if ( aExp == 0xFF ) {
2323         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2324             return propagateFloat32NaN(a, b, status);
2325         }
2326         if ( ( bExp | bSig ) == 0 ) {
2327             float_raise(float_flag_invalid, status);
2328             return float32_default_nan(status);
2329         }
2330         return packFloat32( zSign, 0xFF, 0 );
2331     }
2332     if ( bExp == 0xFF ) {
2333         if (bSig) {
2334             return propagateFloat32NaN(a, b, status);
2335         }
2336         if ( ( aExp | aSig ) == 0 ) {
2337             float_raise(float_flag_invalid, status);
2338             return float32_default_nan(status);
2339         }
2340         return packFloat32( zSign, 0xFF, 0 );
2341     }
2342     if ( aExp == 0 ) {
2343         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345     }
2346     if ( bExp == 0 ) {
2347         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2348         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2349     }
2350     zExp = aExp + bExp - 0x7F;
2351     aSig = ( aSig | 0x00800000 )<<7;
2352     bSig = ( bSig | 0x00800000 )<<8;
2353     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2354     zSig = zSig64;
2355     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2356         zSig <<= 1;
2357         --zExp;
2358     }
2359     return roundAndPackFloat32(zSign, zExp, zSig, status);
2360 
2361 }
2362 
2363 /*----------------------------------------------------------------------------
2364 | Returns the result of dividing the single-precision floating-point value `a'
2365 | by the corresponding value `b'.  The operation is performed according to the
2366 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2368 
2369 float32 float32_div(float32 a, float32 b, float_status *status)
2370 {
2371     flag aSign, bSign, zSign;
2372     int aExp, bExp, zExp;
2373     uint32_t aSig, bSig, zSig;
2374     a = float32_squash_input_denormal(a, status);
2375     b = float32_squash_input_denormal(b, status);
2376 
2377     aSig = extractFloat32Frac( a );
2378     aExp = extractFloat32Exp( a );
2379     aSign = extractFloat32Sign( a );
2380     bSig = extractFloat32Frac( b );
2381     bExp = extractFloat32Exp( b );
2382     bSign = extractFloat32Sign( b );
2383     zSign = aSign ^ bSign;
2384     if ( aExp == 0xFF ) {
2385         if (aSig) {
2386             return propagateFloat32NaN(a, b, status);
2387         }
2388         if ( bExp == 0xFF ) {
2389             if (bSig) {
2390                 return propagateFloat32NaN(a, b, status);
2391             }
2392             float_raise(float_flag_invalid, status);
2393             return float32_default_nan(status);
2394         }
2395         return packFloat32( zSign, 0xFF, 0 );
2396     }
2397     if ( bExp == 0xFF ) {
2398         if (bSig) {
2399             return propagateFloat32NaN(a, b, status);
2400         }
2401         return packFloat32( zSign, 0, 0 );
2402     }
2403     if ( bExp == 0 ) {
2404         if ( bSig == 0 ) {
2405             if ( ( aExp | aSig ) == 0 ) {
2406                 float_raise(float_flag_invalid, status);
2407                 return float32_default_nan(status);
2408             }
2409             float_raise(float_flag_divbyzero, status);
2410             return packFloat32( zSign, 0xFF, 0 );
2411         }
2412         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2413     }
2414     if ( aExp == 0 ) {
2415         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2416         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2417     }
2418     zExp = aExp - bExp + 0x7D;
2419     aSig = ( aSig | 0x00800000 )<<7;
2420     bSig = ( bSig | 0x00800000 )<<8;
2421     if ( bSig <= ( aSig + aSig ) ) {
2422         aSig >>= 1;
2423         ++zExp;
2424     }
2425     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2426     if ( ( zSig & 0x3F ) == 0 ) {
2427         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2428     }
2429     return roundAndPackFloat32(zSign, zExp, zSig, status);
2430 
2431 }
2432 
2433 /*----------------------------------------------------------------------------
2434 | Returns the remainder of the single-precision floating-point value `a'
2435 | with respect to the corresponding value `b'.  The operation is performed
2436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2437 *----------------------------------------------------------------------------*/
2438 
2439 float32 float32_rem(float32 a, float32 b, float_status *status)
2440 {
2441     flag aSign, zSign;
2442     int aExp, bExp, expDiff;
2443     uint32_t aSig, bSig;
2444     uint32_t q;
2445     uint64_t aSig64, bSig64, q64;
2446     uint32_t alternateASig;
2447     int32_t sigMean;
2448     a = float32_squash_input_denormal(a, status);
2449     b = float32_squash_input_denormal(b, status);
2450 
2451     aSig = extractFloat32Frac( a );
2452     aExp = extractFloat32Exp( a );
2453     aSign = extractFloat32Sign( a );
2454     bSig = extractFloat32Frac( b );
2455     bExp = extractFloat32Exp( b );
2456     if ( aExp == 0xFF ) {
2457         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2458             return propagateFloat32NaN(a, b, status);
2459         }
2460         float_raise(float_flag_invalid, status);
2461         return float32_default_nan(status);
2462     }
2463     if ( bExp == 0xFF ) {
2464         if (bSig) {
2465             return propagateFloat32NaN(a, b, status);
2466         }
2467         return a;
2468     }
2469     if ( bExp == 0 ) {
2470         if ( bSig == 0 ) {
2471             float_raise(float_flag_invalid, status);
2472             return float32_default_nan(status);
2473         }
2474         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2475     }
2476     if ( aExp == 0 ) {
2477         if ( aSig == 0 ) return a;
2478         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2479     }
2480     expDiff = aExp - bExp;
2481     aSig |= 0x00800000;
2482     bSig |= 0x00800000;
2483     if ( expDiff < 32 ) {
2484         aSig <<= 8;
2485         bSig <<= 8;
2486         if ( expDiff < 0 ) {
2487             if ( expDiff < -1 ) return a;
2488             aSig >>= 1;
2489         }
2490         q = ( bSig <= aSig );
2491         if ( q ) aSig -= bSig;
2492         if ( 0 < expDiff ) {
2493             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2494             q >>= 32 - expDiff;
2495             bSig >>= 2;
2496             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2497         }
2498         else {
2499             aSig >>= 2;
2500             bSig >>= 2;
2501         }
2502     }
2503     else {
2504         if ( bSig <= aSig ) aSig -= bSig;
2505         aSig64 = ( (uint64_t) aSig )<<40;
2506         bSig64 = ( (uint64_t) bSig )<<40;
2507         expDiff -= 64;
2508         while ( 0 < expDiff ) {
2509             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2510             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2511             aSig64 = - ( ( bSig * q64 )<<38 );
2512             expDiff -= 62;
2513         }
2514         expDiff += 64;
2515         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2516         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2517         q = q64>>( 64 - expDiff );
2518         bSig <<= 6;
2519         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2520     }
2521     do {
2522         alternateASig = aSig;
2523         ++q;
2524         aSig -= bSig;
2525     } while ( 0 <= (int32_t) aSig );
2526     sigMean = aSig + alternateASig;
2527     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2528         aSig = alternateASig;
2529     }
2530     zSign = ( (int32_t) aSig < 0 );
2531     if ( zSign ) aSig = - aSig;
2532     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2533 }
2534 
2535 /*----------------------------------------------------------------------------
2536 | Returns the result of multiplying the single-precision floating-point values
2537 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2538 | multiplication.  The operation is performed according to the IEC/IEEE
2539 | Standard for Binary Floating-Point Arithmetic 754-2008.
2540 | The flags argument allows the caller to select negation of the
2541 | addend, the intermediate product, or the final result. (The difference
2542 | between this and having the caller do a separate negation is that negating
2543 | externally will flip the sign bit on NaNs.)
2544 *----------------------------------------------------------------------------*/
2545 
2546 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2547                        float_status *status)
2548 {
2549     flag aSign, bSign, cSign, zSign;
2550     int aExp, bExp, cExp, pExp, zExp, expDiff;
2551     uint32_t aSig, bSig, cSig;
2552     flag pInf, pZero, pSign;
2553     uint64_t pSig64, cSig64, zSig64;
2554     uint32_t pSig;
2555     int shiftcount;
2556     flag signflip, infzero;
2557 
2558     a = float32_squash_input_denormal(a, status);
2559     b = float32_squash_input_denormal(b, status);
2560     c = float32_squash_input_denormal(c, status);
2561     aSig = extractFloat32Frac(a);
2562     aExp = extractFloat32Exp(a);
2563     aSign = extractFloat32Sign(a);
2564     bSig = extractFloat32Frac(b);
2565     bExp = extractFloat32Exp(b);
2566     bSign = extractFloat32Sign(b);
2567     cSig = extractFloat32Frac(c);
2568     cExp = extractFloat32Exp(c);
2569     cSign = extractFloat32Sign(c);
2570 
2571     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2572                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2573 
2574     /* It is implementation-defined whether the cases of (0,inf,qnan)
2575      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2576      * they return if they do), so we have to hand this information
2577      * off to the target-specific pick-a-NaN routine.
2578      */
2579     if (((aExp == 0xff) && aSig) ||
2580         ((bExp == 0xff) && bSig) ||
2581         ((cExp == 0xff) && cSig)) {
2582         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2583     }
2584 
2585     if (infzero) {
2586         float_raise(float_flag_invalid, status);
2587         return float32_default_nan(status);
2588     }
2589 
2590     if (flags & float_muladd_negate_c) {
2591         cSign ^= 1;
2592     }
2593 
2594     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2595 
2596     /* Work out the sign and type of the product */
2597     pSign = aSign ^ bSign;
2598     if (flags & float_muladd_negate_product) {
2599         pSign ^= 1;
2600     }
2601     pInf = (aExp == 0xff) || (bExp == 0xff);
2602     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2603 
2604     if (cExp == 0xff) {
2605         if (pInf && (pSign ^ cSign)) {
2606             /* addition of opposite-signed infinities => InvalidOperation */
2607             float_raise(float_flag_invalid, status);
2608             return float32_default_nan(status);
2609         }
2610         /* Otherwise generate an infinity of the same sign */
2611         return packFloat32(cSign ^ signflip, 0xff, 0);
2612     }
2613 
2614     if (pInf) {
2615         return packFloat32(pSign ^ signflip, 0xff, 0);
2616     }
2617 
2618     if (pZero) {
2619         if (cExp == 0) {
2620             if (cSig == 0) {
2621                 /* Adding two exact zeroes */
2622                 if (pSign == cSign) {
2623                     zSign = pSign;
2624                 } else if (status->float_rounding_mode == float_round_down) {
2625                     zSign = 1;
2626                 } else {
2627                     zSign = 0;
2628                 }
2629                 return packFloat32(zSign ^ signflip, 0, 0);
2630             }
2631             /* Exact zero plus a denorm */
2632             if (status->flush_to_zero) {
2633                 float_raise(float_flag_output_denormal, status);
2634                 return packFloat32(cSign ^ signflip, 0, 0);
2635             }
2636         }
2637         /* Zero plus something non-zero : just return the something */
2638         if (flags & float_muladd_halve_result) {
2639             if (cExp == 0) {
2640                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2641             }
2642             /* Subtract one to halve, and one again because roundAndPackFloat32
2643              * wants one less than the true exponent.
2644              */
2645             cExp -= 2;
2646             cSig = (cSig | 0x00800000) << 7;
2647             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2648         }
2649         return packFloat32(cSign ^ signflip, cExp, cSig);
2650     }
2651 
2652     if (aExp == 0) {
2653         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2654     }
2655     if (bExp == 0) {
2656         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2657     }
2658 
2659     /* Calculate the actual result a * b + c */
2660 
2661     /* Multiply first; this is easy. */
2662     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2663      * because we want the true exponent, not the "one-less-than"
2664      * flavour that roundAndPackFloat32() takes.
2665      */
2666     pExp = aExp + bExp - 0x7e;
2667     aSig = (aSig | 0x00800000) << 7;
2668     bSig = (bSig | 0x00800000) << 8;
2669     pSig64 = (uint64_t)aSig * bSig;
2670     if ((int64_t)(pSig64 << 1) >= 0) {
2671         pSig64 <<= 1;
2672         pExp--;
2673     }
2674 
2675     zSign = pSign ^ signflip;
2676 
2677     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2678      * position 62.
2679      */
2680     if (cExp == 0) {
2681         if (!cSig) {
2682             /* Throw out the special case of c being an exact zero now */
2683             shift64RightJamming(pSig64, 32, &pSig64);
2684             pSig = pSig64;
2685             if (flags & float_muladd_halve_result) {
2686                 pExp--;
2687             }
2688             return roundAndPackFloat32(zSign, pExp - 1,
2689                                        pSig, status);
2690         }
2691         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2692     }
2693 
2694     cSig64 = (uint64_t)cSig << (62 - 23);
2695     cSig64 |= LIT64(0x4000000000000000);
2696     expDiff = pExp - cExp;
2697 
2698     if (pSign == cSign) {
2699         /* Addition */
2700         if (expDiff > 0) {
2701             /* scale c to match p */
2702             shift64RightJamming(cSig64, expDiff, &cSig64);
2703             zExp = pExp;
2704         } else if (expDiff < 0) {
2705             /* scale p to match c */
2706             shift64RightJamming(pSig64, -expDiff, &pSig64);
2707             zExp = cExp;
2708         } else {
2709             /* no scaling needed */
2710             zExp = cExp;
2711         }
2712         /* Add significands and make sure explicit bit ends up in posn 62 */
2713         zSig64 = pSig64 + cSig64;
2714         if ((int64_t)zSig64 < 0) {
2715             shift64RightJamming(zSig64, 1, &zSig64);
2716         } else {
2717             zExp--;
2718         }
2719     } else {
2720         /* Subtraction */
2721         if (expDiff > 0) {
2722             shift64RightJamming(cSig64, expDiff, &cSig64);
2723             zSig64 = pSig64 - cSig64;
2724             zExp = pExp;
2725         } else if (expDiff < 0) {
2726             shift64RightJamming(pSig64, -expDiff, &pSig64);
2727             zSig64 = cSig64 - pSig64;
2728             zExp = cExp;
2729             zSign ^= 1;
2730         } else {
2731             zExp = pExp;
2732             if (cSig64 < pSig64) {
2733                 zSig64 = pSig64 - cSig64;
2734             } else if (pSig64 < cSig64) {
2735                 zSig64 = cSig64 - pSig64;
2736                 zSign ^= 1;
2737             } else {
2738                 /* Exact zero */
2739                 zSign = signflip;
2740                 if (status->float_rounding_mode == float_round_down) {
2741                     zSign ^= 1;
2742                 }
2743                 return packFloat32(zSign, 0, 0);
2744             }
2745         }
2746         --zExp;
2747         /* Normalize to put the explicit bit back into bit 62. */
2748         shiftcount = countLeadingZeros64(zSig64) - 1;
2749         zSig64 <<= shiftcount;
2750         zExp -= shiftcount;
2751     }
2752     if (flags & float_muladd_halve_result) {
2753         zExp--;
2754     }
2755 
2756     shift64RightJamming(zSig64, 32, &zSig64);
2757     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2758 }
2759 
2760 
2761 /*----------------------------------------------------------------------------
2762 | Returns the square root of the single-precision floating-point value `a'.
2763 | The operation is performed according to the IEC/IEEE Standard for Binary
2764 | Floating-Point Arithmetic.
2765 *----------------------------------------------------------------------------*/
2766 
2767 float32 float32_sqrt(float32 a, float_status *status)
2768 {
2769     flag aSign;
2770     int aExp, zExp;
2771     uint32_t aSig, zSig;
2772     uint64_t rem, term;
2773     a = float32_squash_input_denormal(a, status);
2774 
2775     aSig = extractFloat32Frac( a );
2776     aExp = extractFloat32Exp( a );
2777     aSign = extractFloat32Sign( a );
2778     if ( aExp == 0xFF ) {
2779         if (aSig) {
2780             return propagateFloat32NaN(a, float32_zero, status);
2781         }
2782         if ( ! aSign ) return a;
2783         float_raise(float_flag_invalid, status);
2784         return float32_default_nan(status);
2785     }
2786     if ( aSign ) {
2787         if ( ( aExp | aSig ) == 0 ) return a;
2788         float_raise(float_flag_invalid, status);
2789         return float32_default_nan(status);
2790     }
2791     if ( aExp == 0 ) {
2792         if ( aSig == 0 ) return float32_zero;
2793         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2794     }
2795     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2796     aSig = ( aSig | 0x00800000 )<<8;
2797     zSig = estimateSqrt32( aExp, aSig ) + 2;
2798     if ( ( zSig & 0x7F ) <= 5 ) {
2799         if ( zSig < 2 ) {
2800             zSig = 0x7FFFFFFF;
2801             goto roundAndPack;
2802         }
2803         aSig >>= aExp & 1;
2804         term = ( (uint64_t) zSig ) * zSig;
2805         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2806         while ( (int64_t) rem < 0 ) {
2807             --zSig;
2808             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2809         }
2810         zSig |= ( rem != 0 );
2811     }
2812     shift32RightJamming( zSig, 1, &zSig );
2813  roundAndPack:
2814     return roundAndPackFloat32(0, zExp, zSig, status);
2815 
2816 }
2817 
2818 /*----------------------------------------------------------------------------
2819 | Returns the binary exponential of the single-precision floating-point value
2820 | `a'. The operation is performed according to the IEC/IEEE Standard for
2821 | Binary Floating-Point Arithmetic.
2822 |
2823 | Uses the following identities:
2824 |
2825 | 1. -------------------------------------------------------------------------
2826 |      x    x*ln(2)
2827 |     2  = e
2828 |
2829 | 2. -------------------------------------------------------------------------
2830 |                      2     3     4     5           n
2831 |      x        x     x     x     x     x           x
2832 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2833 |               1!    2!    3!    4!    5!          n!
2834 *----------------------------------------------------------------------------*/
2835 
2836 static const float64 float32_exp2_coefficients[15] =
2837 {
2838     const_float64( 0x3ff0000000000000ll ), /*  1 */
2839     const_float64( 0x3fe0000000000000ll ), /*  2 */
2840     const_float64( 0x3fc5555555555555ll ), /*  3 */
2841     const_float64( 0x3fa5555555555555ll ), /*  4 */
2842     const_float64( 0x3f81111111111111ll ), /*  5 */
2843     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2844     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2845     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2846     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2847     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2848     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2849     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2850     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2851     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2852     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2853 };
2854 
2855 float32 float32_exp2(float32 a, float_status *status)
2856 {
2857     flag aSign;
2858     int aExp;
2859     uint32_t aSig;
2860     float64 r, x, xn;
2861     int i;
2862     a = float32_squash_input_denormal(a, status);
2863 
2864     aSig = extractFloat32Frac( a );
2865     aExp = extractFloat32Exp( a );
2866     aSign = extractFloat32Sign( a );
2867 
2868     if ( aExp == 0xFF) {
2869         if (aSig) {
2870             return propagateFloat32NaN(a, float32_zero, status);
2871         }
2872         return (aSign) ? float32_zero : a;
2873     }
2874     if (aExp == 0) {
2875         if (aSig == 0) return float32_one;
2876     }
2877 
2878     float_raise(float_flag_inexact, status);
2879 
2880     /* ******************************* */
2881     /* using float64 for approximation */
2882     /* ******************************* */
2883     x = float32_to_float64(a, status);
2884     x = float64_mul(x, float64_ln2, status);
2885 
2886     xn = x;
2887     r = float64_one;
2888     for (i = 0 ; i < 15 ; i++) {
2889         float64 f;
2890 
2891         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2892         r = float64_add(r, f, status);
2893 
2894         xn = float64_mul(xn, x, status);
2895     }
2896 
2897     return float64_to_float32(r, status);
2898 }
2899 
2900 /*----------------------------------------------------------------------------
2901 | Returns the binary log of the single-precision floating-point value `a'.
2902 | The operation is performed according to the IEC/IEEE Standard for Binary
2903 | Floating-Point Arithmetic.
2904 *----------------------------------------------------------------------------*/
2905 float32 float32_log2(float32 a, float_status *status)
2906 {
2907     flag aSign, zSign;
2908     int aExp;
2909     uint32_t aSig, zSig, i;
2910 
2911     a = float32_squash_input_denormal(a, status);
2912     aSig = extractFloat32Frac( a );
2913     aExp = extractFloat32Exp( a );
2914     aSign = extractFloat32Sign( a );
2915 
2916     if ( aExp == 0 ) {
2917         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2918         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2919     }
2920     if ( aSign ) {
2921         float_raise(float_flag_invalid, status);
2922         return float32_default_nan(status);
2923     }
2924     if ( aExp == 0xFF ) {
2925         if (aSig) {
2926             return propagateFloat32NaN(a, float32_zero, status);
2927         }
2928         return a;
2929     }
2930 
2931     aExp -= 0x7F;
2932     aSig |= 0x00800000;
2933     zSign = aExp < 0;
2934     zSig = aExp << 23;
2935 
2936     for (i = 1 << 22; i > 0; i >>= 1) {
2937         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2938         if ( aSig & 0x01000000 ) {
2939             aSig >>= 1;
2940             zSig |= i;
2941         }
2942     }
2943 
2944     if ( zSign )
2945         zSig = -zSig;
2946 
2947     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2948 }
2949 
2950 /*----------------------------------------------------------------------------
2951 | Returns 1 if the single-precision floating-point value `a' is equal to
2952 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2953 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2954 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2955 *----------------------------------------------------------------------------*/
2956 
2957 int float32_eq(float32 a, float32 b, float_status *status)
2958 {
2959     uint32_t av, bv;
2960     a = float32_squash_input_denormal(a, status);
2961     b = float32_squash_input_denormal(b, status);
2962 
2963     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2964          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2965        ) {
2966         float_raise(float_flag_invalid, status);
2967         return 0;
2968     }
2969     av = float32_val(a);
2970     bv = float32_val(b);
2971     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2972 }
2973 
2974 /*----------------------------------------------------------------------------
2975 | Returns 1 if the single-precision floating-point value `a' is less than
2976 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2977 | exception is raised if either operand is a NaN.  The comparison is performed
2978 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2979 *----------------------------------------------------------------------------*/
2980 
2981 int float32_le(float32 a, float32 b, float_status *status)
2982 {
2983     flag aSign, bSign;
2984     uint32_t av, bv;
2985     a = float32_squash_input_denormal(a, status);
2986     b = float32_squash_input_denormal(b, status);
2987 
2988     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2989          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2990        ) {
2991         float_raise(float_flag_invalid, status);
2992         return 0;
2993     }
2994     aSign = extractFloat32Sign( a );
2995     bSign = extractFloat32Sign( b );
2996     av = float32_val(a);
2997     bv = float32_val(b);
2998     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2999     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3000 
3001 }
3002 
3003 /*----------------------------------------------------------------------------
3004 | Returns 1 if the single-precision floating-point value `a' is less than
3005 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3006 | raised if either operand is a NaN.  The comparison is performed according
3007 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3008 *----------------------------------------------------------------------------*/
3009 
3010 int float32_lt(float32 a, float32 b, float_status *status)
3011 {
3012     flag aSign, bSign;
3013     uint32_t av, bv;
3014     a = float32_squash_input_denormal(a, status);
3015     b = float32_squash_input_denormal(b, status);
3016 
3017     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3018          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3019        ) {
3020         float_raise(float_flag_invalid, status);
3021         return 0;
3022     }
3023     aSign = extractFloat32Sign( a );
3024     bSign = extractFloat32Sign( b );
3025     av = float32_val(a);
3026     bv = float32_val(b);
3027     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3028     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3029 
3030 }
3031 
3032 /*----------------------------------------------------------------------------
3033 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3034 | be compared, and 0 otherwise.  The invalid exception is raised if either
3035 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3036 | Standard for Binary Floating-Point Arithmetic.
3037 *----------------------------------------------------------------------------*/
3038 
3039 int float32_unordered(float32 a, float32 b, float_status *status)
3040 {
3041     a = float32_squash_input_denormal(a, status);
3042     b = float32_squash_input_denormal(b, status);
3043 
3044     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3045          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3046        ) {
3047         float_raise(float_flag_invalid, status);
3048         return 1;
3049     }
3050     return 0;
3051 }
3052 
3053 /*----------------------------------------------------------------------------
3054 | Returns 1 if the single-precision floating-point value `a' is equal to
3055 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3056 | exception.  The comparison is performed according to the IEC/IEEE Standard
3057 | for Binary Floating-Point Arithmetic.
3058 *----------------------------------------------------------------------------*/
3059 
3060 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3061 {
3062     a = float32_squash_input_denormal(a, status);
3063     b = float32_squash_input_denormal(b, status);
3064 
3065     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3066          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3067        ) {
3068         if (float32_is_signaling_nan(a, status)
3069          || float32_is_signaling_nan(b, status)) {
3070             float_raise(float_flag_invalid, status);
3071         }
3072         return 0;
3073     }
3074     return ( float32_val(a) == float32_val(b) ) ||
3075             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3076 }
3077 
3078 /*----------------------------------------------------------------------------
3079 | Returns 1 if the single-precision floating-point value `a' is less than or
3080 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3081 | cause an exception.  Otherwise, the comparison is performed according to the
3082 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3083 *----------------------------------------------------------------------------*/
3084 
3085 int float32_le_quiet(float32 a, float32 b, float_status *status)
3086 {
3087     flag aSign, bSign;
3088     uint32_t av, bv;
3089     a = float32_squash_input_denormal(a, status);
3090     b = float32_squash_input_denormal(b, status);
3091 
3092     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3093          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3094        ) {
3095         if (float32_is_signaling_nan(a, status)
3096          || float32_is_signaling_nan(b, status)) {
3097             float_raise(float_flag_invalid, status);
3098         }
3099         return 0;
3100     }
3101     aSign = extractFloat32Sign( a );
3102     bSign = extractFloat32Sign( b );
3103     av = float32_val(a);
3104     bv = float32_val(b);
3105     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3106     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3107 
3108 }
3109 
3110 /*----------------------------------------------------------------------------
3111 | Returns 1 if the single-precision floating-point value `a' is less than
3112 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3113 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3114 | Standard for Binary Floating-Point Arithmetic.
3115 *----------------------------------------------------------------------------*/
3116 
3117 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3118 {
3119     flag aSign, bSign;
3120     uint32_t av, bv;
3121     a = float32_squash_input_denormal(a, status);
3122     b = float32_squash_input_denormal(b, status);
3123 
3124     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3125          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3126        ) {
3127         if (float32_is_signaling_nan(a, status)
3128          || float32_is_signaling_nan(b, status)) {
3129             float_raise(float_flag_invalid, status);
3130         }
3131         return 0;
3132     }
3133     aSign = extractFloat32Sign( a );
3134     bSign = extractFloat32Sign( b );
3135     av = float32_val(a);
3136     bv = float32_val(b);
3137     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3138     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3139 
3140 }
3141 
3142 /*----------------------------------------------------------------------------
3143 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3144 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3145 | comparison is performed according to the IEC/IEEE Standard for Binary
3146 | Floating-Point Arithmetic.
3147 *----------------------------------------------------------------------------*/
3148 
3149 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3150 {
3151     a = float32_squash_input_denormal(a, status);
3152     b = float32_squash_input_denormal(b, status);
3153 
3154     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3155          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3156        ) {
3157         if (float32_is_signaling_nan(a, status)
3158          || float32_is_signaling_nan(b, status)) {
3159             float_raise(float_flag_invalid, status);
3160         }
3161         return 1;
3162     }
3163     return 0;
3164 }
3165 
3166 /*----------------------------------------------------------------------------
3167 | Returns the result of converting the double-precision floating-point value
3168 | `a' to the 32-bit two's complement integer format.  The conversion is
3169 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3170 | Arithmetic---which means in particular that the conversion is rounded
3171 | according to the current rounding mode.  If `a' is a NaN, the largest
3172 | positive integer is returned.  Otherwise, if the conversion overflows, the
3173 | largest integer with the same sign as `a' is returned.
3174 *----------------------------------------------------------------------------*/
3175 
3176 int32_t float64_to_int32(float64 a, float_status *status)
3177 {
3178     flag aSign;
3179     int aExp;
3180     int shiftCount;
3181     uint64_t aSig;
3182     a = float64_squash_input_denormal(a, status);
3183 
3184     aSig = extractFloat64Frac( a );
3185     aExp = extractFloat64Exp( a );
3186     aSign = extractFloat64Sign( a );
3187     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3188     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3189     shiftCount = 0x42C - aExp;
3190     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3191     return roundAndPackInt32(aSign, aSig, status);
3192 
3193 }
3194 
3195 /*----------------------------------------------------------------------------
3196 | Returns the result of converting the double-precision floating-point value
3197 | `a' to the 32-bit two's complement integer format.  The conversion is
3198 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3199 | Arithmetic, except that the conversion is always rounded toward zero.
3200 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3201 | the conversion overflows, the largest integer with the same sign as `a' is
3202 | returned.
3203 *----------------------------------------------------------------------------*/
3204 
3205 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3206 {
3207     flag aSign;
3208     int aExp;
3209     int shiftCount;
3210     uint64_t aSig, savedASig;
3211     int32_t z;
3212     a = float64_squash_input_denormal(a, status);
3213 
3214     aSig = extractFloat64Frac( a );
3215     aExp = extractFloat64Exp( a );
3216     aSign = extractFloat64Sign( a );
3217     if ( 0x41E < aExp ) {
3218         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3219         goto invalid;
3220     }
3221     else if ( aExp < 0x3FF ) {
3222         if (aExp || aSig) {
3223             status->float_exception_flags |= float_flag_inexact;
3224         }
3225         return 0;
3226     }
3227     aSig |= LIT64( 0x0010000000000000 );
3228     shiftCount = 0x433 - aExp;
3229     savedASig = aSig;
3230     aSig >>= shiftCount;
3231     z = aSig;
3232     if ( aSign ) z = - z;
3233     if ( ( z < 0 ) ^ aSign ) {
3234  invalid:
3235         float_raise(float_flag_invalid, status);
3236         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3237     }
3238     if ( ( aSig<<shiftCount ) != savedASig ) {
3239         status->float_exception_flags |= float_flag_inexact;
3240     }
3241     return z;
3242 
3243 }
3244 
3245 /*----------------------------------------------------------------------------
3246 | Returns the result of converting the double-precision floating-point value
3247 | `a' to the 16-bit two's complement integer format.  The conversion is
3248 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3249 | Arithmetic, except that the conversion is always rounded toward zero.
3250 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3251 | the conversion overflows, the largest integer with the same sign as `a' is
3252 | returned.
3253 *----------------------------------------------------------------------------*/
3254 
3255 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3256 {
3257     flag aSign;
3258     int aExp;
3259     int shiftCount;
3260     uint64_t aSig, savedASig;
3261     int32_t z;
3262 
3263     aSig = extractFloat64Frac( a );
3264     aExp = extractFloat64Exp( a );
3265     aSign = extractFloat64Sign( a );
3266     if ( 0x40E < aExp ) {
3267         if ( ( aExp == 0x7FF ) && aSig ) {
3268             aSign = 0;
3269         }
3270         goto invalid;
3271     }
3272     else if ( aExp < 0x3FF ) {
3273         if ( aExp || aSig ) {
3274             status->float_exception_flags |= float_flag_inexact;
3275         }
3276         return 0;
3277     }
3278     aSig |= LIT64( 0x0010000000000000 );
3279     shiftCount = 0x433 - aExp;
3280     savedASig = aSig;
3281     aSig >>= shiftCount;
3282     z = aSig;
3283     if ( aSign ) {
3284         z = - z;
3285     }
3286     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3287  invalid:
3288         float_raise(float_flag_invalid, status);
3289         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3290     }
3291     if ( ( aSig<<shiftCount ) != savedASig ) {
3292         status->float_exception_flags |= float_flag_inexact;
3293     }
3294     return z;
3295 }
3296 
3297 /*----------------------------------------------------------------------------
3298 | Returns the result of converting the double-precision floating-point value
3299 | `a' to the 64-bit two's complement integer format.  The conversion is
3300 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3301 | Arithmetic---which means in particular that the conversion is rounded
3302 | according to the current rounding mode.  If `a' is a NaN, the largest
3303 | positive integer is returned.  Otherwise, if the conversion overflows, the
3304 | largest integer with the same sign as `a' is returned.
3305 *----------------------------------------------------------------------------*/
3306 
3307 int64_t float64_to_int64(float64 a, float_status *status)
3308 {
3309     flag aSign;
3310     int aExp;
3311     int shiftCount;
3312     uint64_t aSig, aSigExtra;
3313     a = float64_squash_input_denormal(a, status);
3314 
3315     aSig = extractFloat64Frac( a );
3316     aExp = extractFloat64Exp( a );
3317     aSign = extractFloat64Sign( a );
3318     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3319     shiftCount = 0x433 - aExp;
3320     if ( shiftCount <= 0 ) {
3321         if ( 0x43E < aExp ) {
3322             float_raise(float_flag_invalid, status);
3323             if (    ! aSign
3324                  || (    ( aExp == 0x7FF )
3325                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3326                ) {
3327                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3328             }
3329             return (int64_t) LIT64( 0x8000000000000000 );
3330         }
3331         aSigExtra = 0;
3332         aSig <<= - shiftCount;
3333     }
3334     else {
3335         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3336     }
3337     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3338 
3339 }
3340 
3341 /*----------------------------------------------------------------------------
3342 | Returns the result of converting the double-precision floating-point value
3343 | `a' to the 64-bit two's complement integer format.  The conversion is
3344 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3345 | Arithmetic, except that the conversion is always rounded toward zero.
3346 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3347 | the conversion overflows, the largest integer with the same sign as `a' is
3348 | returned.
3349 *----------------------------------------------------------------------------*/
3350 
3351 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3352 {
3353     flag aSign;
3354     int aExp;
3355     int shiftCount;
3356     uint64_t aSig;
3357     int64_t z;
3358     a = float64_squash_input_denormal(a, status);
3359 
3360     aSig = extractFloat64Frac( a );
3361     aExp = extractFloat64Exp( a );
3362     aSign = extractFloat64Sign( a );
3363     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3364     shiftCount = aExp - 0x433;
3365     if ( 0 <= shiftCount ) {
3366         if ( 0x43E <= aExp ) {
3367             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3368                 float_raise(float_flag_invalid, status);
3369                 if (    ! aSign
3370                      || (    ( aExp == 0x7FF )
3371                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3372                    ) {
3373                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3374                 }
3375             }
3376             return (int64_t) LIT64( 0x8000000000000000 );
3377         }
3378         z = aSig<<shiftCount;
3379     }
3380     else {
3381         if ( aExp < 0x3FE ) {
3382             if (aExp | aSig) {
3383                 status->float_exception_flags |= float_flag_inexact;
3384             }
3385             return 0;
3386         }
3387         z = aSig>>( - shiftCount );
3388         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3389             status->float_exception_flags |= float_flag_inexact;
3390         }
3391     }
3392     if ( aSign ) z = - z;
3393     return z;
3394 
3395 }
3396 
3397 /*----------------------------------------------------------------------------
3398 | Returns the result of converting the double-precision floating-point value
3399 | `a' to the single-precision floating-point format.  The conversion is
3400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3401 | Arithmetic.
3402 *----------------------------------------------------------------------------*/
3403 
3404 float32 float64_to_float32(float64 a, float_status *status)
3405 {
3406     flag aSign;
3407     int aExp;
3408     uint64_t aSig;
3409     uint32_t zSig;
3410     a = float64_squash_input_denormal(a, status);
3411 
3412     aSig = extractFloat64Frac( a );
3413     aExp = extractFloat64Exp( a );
3414     aSign = extractFloat64Sign( a );
3415     if ( aExp == 0x7FF ) {
3416         if (aSig) {
3417             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3418         }
3419         return packFloat32( aSign, 0xFF, 0 );
3420     }
3421     shift64RightJamming( aSig, 22, &aSig );
3422     zSig = aSig;
3423     if ( aExp || zSig ) {
3424         zSig |= 0x40000000;
3425         aExp -= 0x381;
3426     }
3427     return roundAndPackFloat32(aSign, aExp, zSig, status);
3428 
3429 }
3430 
3431 
3432 /*----------------------------------------------------------------------------
3433 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3434 | half-precision floating-point value, returning the result.  After being
3435 | shifted into the proper positions, the three fields are simply added
3436 | together to form the result.  This means that any integer portion of `zSig'
3437 | will be added into the exponent.  Since a properly normalized significand
3438 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3439 | than the desired result exponent whenever `zSig' is a complete, normalized
3440 | significand.
3441 *----------------------------------------------------------------------------*/
3442 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3443 {
3444     return make_float16(
3445         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3446 }
3447 
3448 /*----------------------------------------------------------------------------
3449 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3450 | and significand `zSig', and returns the proper half-precision floating-
3451 | point value corresponding to the abstract input.  Ordinarily, the abstract
3452 | value is simply rounded and packed into the half-precision format, with
3453 | the inexact exception raised if the abstract input cannot be represented
3454 | exactly.  However, if the abstract value is too large, the overflow and
3455 | inexact exceptions are raised and an infinity or maximal finite value is
3456 | returned.  If the abstract value is too small, the input value is rounded to
3457 | a subnormal number, and the underflow and inexact exceptions are raised if
3458 | the abstract input cannot be represented exactly as a subnormal half-
3459 | precision floating-point number.
3460 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3461 | ARM-style "alternative representation", which omits the NaN and Inf
3462 | encodings in order to raise the maximum representable exponent by one.
3463 |     The input significand `zSig' has its binary point between bits 22
3464 | and 23, which is 13 bits to the left of the usual location.  This shifted
3465 | significand must be normalized or smaller.  If `zSig' is not normalized,
3466 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3467 | and it must not require rounding.  In the usual case that `zSig' is
3468 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3469 | Note the slightly odd position of the binary point in zSig compared with the
3470 | other roundAndPackFloat functions. This should probably be fixed if we
3471 | need to implement more float16 routines than just conversion.
3472 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3473 | Binary Floating-Point Arithmetic.
3474 *----------------------------------------------------------------------------*/
3475 
3476 static float16 roundAndPackFloat16(flag zSign, int zExp,
3477                                    uint32_t zSig, flag ieee,
3478                                    float_status *status)
3479 {
3480     int maxexp = ieee ? 29 : 30;
3481     uint32_t mask;
3482     uint32_t increment;
3483     bool rounding_bumps_exp;
3484     bool is_tiny = false;
3485 
3486     /* Calculate the mask of bits of the mantissa which are not
3487      * representable in half-precision and will be lost.
3488      */
3489     if (zExp < 1) {
3490         /* Will be denormal in halfprec */
3491         mask = 0x00ffffff;
3492         if (zExp >= -11) {
3493             mask >>= 11 + zExp;
3494         }
3495     } else {
3496         /* Normal number in halfprec */
3497         mask = 0x00001fff;
3498     }
3499 
3500     switch (status->float_rounding_mode) {
3501     case float_round_nearest_even:
3502         increment = (mask + 1) >> 1;
3503         if ((zSig & mask) == increment) {
3504             increment = zSig & (increment << 1);
3505         }
3506         break;
3507     case float_round_ties_away:
3508         increment = (mask + 1) >> 1;
3509         break;
3510     case float_round_up:
3511         increment = zSign ? 0 : mask;
3512         break;
3513     case float_round_down:
3514         increment = zSign ? mask : 0;
3515         break;
3516     default: /* round_to_zero */
3517         increment = 0;
3518         break;
3519     }
3520 
3521     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3522 
3523     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3524         if (ieee) {
3525             float_raise(float_flag_overflow | float_flag_inexact, status);
3526             return packFloat16(zSign, 0x1f, 0);
3527         } else {
3528             float_raise(float_flag_invalid, status);
3529             return packFloat16(zSign, 0x1f, 0x3ff);
3530         }
3531     }
3532 
3533     if (zExp < 0) {
3534         /* Note that flush-to-zero does not affect half-precision results */
3535         is_tiny =
3536             (status->float_detect_tininess == float_tininess_before_rounding)
3537             || (zExp < -1)
3538             || (!rounding_bumps_exp);
3539     }
3540     if (zSig & mask) {
3541         float_raise(float_flag_inexact, status);
3542         if (is_tiny) {
3543             float_raise(float_flag_underflow, status);
3544         }
3545     }
3546 
3547     zSig += increment;
3548     if (rounding_bumps_exp) {
3549         zSig >>= 1;
3550         zExp++;
3551     }
3552 
3553     if (zExp < -10) {
3554         return packFloat16(zSign, 0, 0);
3555     }
3556     if (zExp < 0) {
3557         zSig >>= -zExp;
3558         zExp = 0;
3559     }
3560     return packFloat16(zSign, zExp, zSig >> 13);
3561 }
3562 
3563 /*----------------------------------------------------------------------------
3564 | If `a' is denormal and we are in flush-to-zero mode then set the
3565 | input-denormal exception and return zero. Otherwise just return the value.
3566 *----------------------------------------------------------------------------*/
3567 float16 float16_squash_input_denormal(float16 a, float_status *status)
3568 {
3569     if (status->flush_inputs_to_zero) {
3570         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3571             float_raise(float_flag_input_denormal, status);
3572             return make_float16(float16_val(a) & 0x8000);
3573         }
3574     }
3575     return a;
3576 }
3577 
/* Normalizes the subnormal half-precision significand `aSig'.  The
 * significand is shifted left by the number of leading zeros beyond 21 and
 * stored through `zSigPtr'; the matching exponent, 1 minus the shift
 * distance, is stored through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3585 
3586 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3587    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3588 
3589 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3590 {
3591     flag aSign;
3592     int aExp;
3593     uint32_t aSig;
3594 
3595     aSign = extractFloat16Sign(a);
3596     aExp = extractFloat16Exp(a);
3597     aSig = extractFloat16Frac(a);
3598 
3599     if (aExp == 0x1f && ieee) {
3600         if (aSig) {
3601             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3602         }
3603         return packFloat32(aSign, 0xff, 0);
3604     }
3605     if (aExp == 0) {
3606         if (aSig == 0) {
3607             return packFloat32(aSign, 0, 0);
3608         }
3609 
3610         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3611         aExp--;
3612     }
3613     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3614 }
3615 
3616 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3617 {
3618     flag aSign;
3619     int aExp;
3620     uint32_t aSig;
3621 
3622     a = float32_squash_input_denormal(a, status);
3623 
3624     aSig = extractFloat32Frac( a );
3625     aExp = extractFloat32Exp( a );
3626     aSign = extractFloat32Sign( a );
3627     if ( aExp == 0xFF ) {
3628         if (aSig) {
3629             /* Input is a NaN */
3630             if (!ieee) {
3631                 float_raise(float_flag_invalid, status);
3632                 return packFloat16(aSign, 0, 0);
3633             }
3634             return commonNaNToFloat16(
3635                 float32ToCommonNaN(a, status), status);
3636         }
3637         /* Infinity */
3638         if (!ieee) {
3639             float_raise(float_flag_invalid, status);
3640             return packFloat16(aSign, 0x1f, 0x3ff);
3641         }
3642         return packFloat16(aSign, 0x1f, 0);
3643     }
3644     if (aExp == 0 && aSig == 0) {
3645         return packFloat16(aSign, 0, 0);
3646     }
3647     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3648      * even if the input is denormal; however this is harmless because
3649      * the largest possible single-precision denormal is still smaller
3650      * than the smallest representable half-precision denormal, and so we
3651      * will end up ignoring aSig and returning via the "always return zero"
3652      * codepath.
3653      */
3654     aSig |= 0x00800000;
3655     aExp -= 0x71;
3656 
3657     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3658 }
3659 
3660 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3661 {
3662     flag aSign;
3663     int aExp;
3664     uint32_t aSig;
3665 
3666     aSign = extractFloat16Sign(a);
3667     aExp = extractFloat16Exp(a);
3668     aSig = extractFloat16Frac(a);
3669 
3670     if (aExp == 0x1f && ieee) {
3671         if (aSig) {
3672             return commonNaNToFloat64(
3673                 float16ToCommonNaN(a, status), status);
3674         }
3675         return packFloat64(aSign, 0x7ff, 0);
3676     }
3677     if (aExp == 0) {
3678         if (aSig == 0) {
3679             return packFloat64(aSign, 0, 0);
3680         }
3681 
3682         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3683         aExp--;
3684     }
3685     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3686 }
3687 
3688 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3689 {
3690     flag aSign;
3691     int aExp;
3692     uint64_t aSig;
3693     uint32_t zSig;
3694 
3695     a = float64_squash_input_denormal(a, status);
3696 
3697     aSig = extractFloat64Frac(a);
3698     aExp = extractFloat64Exp(a);
3699     aSign = extractFloat64Sign(a);
3700     if (aExp == 0x7FF) {
3701         if (aSig) {
3702             /* Input is a NaN */
3703             if (!ieee) {
3704                 float_raise(float_flag_invalid, status);
3705                 return packFloat16(aSign, 0, 0);
3706             }
3707             return commonNaNToFloat16(
3708                 float64ToCommonNaN(a, status), status);
3709         }
3710         /* Infinity */
3711         if (!ieee) {
3712             float_raise(float_flag_invalid, status);
3713             return packFloat16(aSign, 0x1f, 0x3ff);
3714         }
3715         return packFloat16(aSign, 0x1f, 0);
3716     }
3717     shift64RightJamming(aSig, 29, &aSig);
3718     zSig = aSig;
3719     if (aExp == 0 && zSig == 0) {
3720         return packFloat16(aSign, 0, 0);
3721     }
3722     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3723      * even if the input is denormal; however this is harmless because
3724      * the largest possible single-precision denormal is still smaller
3725      * than the smallest representable half-precision denormal, and so we
3726      * will end up ignoring aSig and returning via the "always return zero"
3727      * codepath.
3728      */
3729     zSig |= 0x00800000;
3730     aExp -= 0x3F1;
3731 
3732     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3733 }
3734 
3735 /*----------------------------------------------------------------------------
3736 | Returns the result of converting the double-precision floating-point value
3737 | `a' to the extended double-precision floating-point format.  The conversion
3738 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3739 | Arithmetic.
3740 *----------------------------------------------------------------------------*/
3741 
3742 floatx80 float64_to_floatx80(float64 a, float_status *status)
3743 {
3744     flag aSign;
3745     int aExp;
3746     uint64_t aSig;
3747 
3748     a = float64_squash_input_denormal(a, status);
3749     aSig = extractFloat64Frac( a );
3750     aExp = extractFloat64Exp( a );
3751     aSign = extractFloat64Sign( a );
3752     if ( aExp == 0x7FF ) {
3753         if (aSig) {
3754             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3755         }
3756         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3757     }
3758     if ( aExp == 0 ) {
3759         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3760         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3761     }
3762     return
3763         packFloatx80(
3764             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3765 
3766 }
3767 
3768 /*----------------------------------------------------------------------------
3769 | Returns the result of converting the double-precision floating-point value
3770 | `a' to the quadruple-precision floating-point format.  The conversion is
3771 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3772 | Arithmetic.
3773 *----------------------------------------------------------------------------*/
3774 
3775 float128 float64_to_float128(float64 a, float_status *status)
3776 {
3777     flag aSign;
3778     int aExp;
3779     uint64_t aSig, zSig0, zSig1;
3780 
3781     a = float64_squash_input_denormal(a, status);
3782     aSig = extractFloat64Frac( a );
3783     aExp = extractFloat64Exp( a );
3784     aSign = extractFloat64Sign( a );
3785     if ( aExp == 0x7FF ) {
3786         if (aSig) {
3787             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3788         }
3789         return packFloat128( aSign, 0x7FFF, 0, 0 );
3790     }
3791     if ( aExp == 0 ) {
3792         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3793         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3794         --aExp;
3795     }
3796     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3797     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3798 
3799 }
3800 
3801 /*----------------------------------------------------------------------------
3802 | Rounds the double-precision floating-point value `a' to an integer, and
3803 | returns the result as a double-precision floating-point value.  The
3804 | operation is performed according to the IEC/IEEE Standard for Binary
3805 | Floating-Point Arithmetic.
3806 *----------------------------------------------------------------------------*/
3807 
3808 float64 float64_round_to_int(float64 a, float_status *status)
3809 {
3810     flag aSign;
3811     int aExp;
3812     uint64_t lastBitMask, roundBitsMask;
3813     uint64_t z;
3814     a = float64_squash_input_denormal(a, status);
3815 
3816     aExp = extractFloat64Exp( a );
3817     if ( 0x433 <= aExp ) {
3818         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3819             return propagateFloat64NaN(a, a, status);
3820         }
3821         return a;
3822     }
3823     if ( aExp < 0x3FF ) {
3824         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3825         status->float_exception_flags |= float_flag_inexact;
3826         aSign = extractFloat64Sign( a );
3827         switch (status->float_rounding_mode) {
3828          case float_round_nearest_even:
3829             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3830                 return packFloat64( aSign, 0x3FF, 0 );
3831             }
3832             break;
3833         case float_round_ties_away:
3834             if (aExp == 0x3FE) {
3835                 return packFloat64(aSign, 0x3ff, 0);
3836             }
3837             break;
3838          case float_round_down:
3839             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3840          case float_round_up:
3841             return make_float64(
3842             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3843         }
3844         return packFloat64( aSign, 0, 0 );
3845     }
3846     lastBitMask = 1;
3847     lastBitMask <<= 0x433 - aExp;
3848     roundBitsMask = lastBitMask - 1;
3849     z = float64_val(a);
3850     switch (status->float_rounding_mode) {
3851     case float_round_nearest_even:
3852         z += lastBitMask >> 1;
3853         if ((z & roundBitsMask) == 0) {
3854             z &= ~lastBitMask;
3855         }
3856         break;
3857     case float_round_ties_away:
3858         z += lastBitMask >> 1;
3859         break;
3860     case float_round_to_zero:
3861         break;
3862     case float_round_up:
3863         if (!extractFloat64Sign(make_float64(z))) {
3864             z += roundBitsMask;
3865         }
3866         break;
3867     case float_round_down:
3868         if (extractFloat64Sign(make_float64(z))) {
3869             z += roundBitsMask;
3870         }
3871         break;
3872     default:
3873         abort();
3874     }
3875     z &= ~ roundBitsMask;
3876     if (z != float64_val(a)) {
3877         status->float_exception_flags |= float_flag_inexact;
3878     }
3879     return make_float64(z);
3880 
3881 }
3882 
3883 float64 float64_trunc_to_int(float64 a, float_status *status)
3884 {
3885     int oldmode;
3886     float64 res;
3887     oldmode = status->float_rounding_mode;
3888     status->float_rounding_mode = float_round_to_zero;
3889     res = float64_round_to_int(a, status);
3890     status->float_rounding_mode = oldmode;
3891     return res;
3892 }
3893 
3894 /*----------------------------------------------------------------------------
3895 | Returns the result of adding the absolute values of the double-precision
3896 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3897 | before being returned.  `zSign' is ignored if the result is a NaN.
3898 | The addition is performed according to the IEC/IEEE Standard for Binary
3899 | Floating-Point Arithmetic.
3900 *----------------------------------------------------------------------------*/
3901 
3902 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3903                               float_status *status)
3904 {
3905     int aExp, bExp, zExp;
3906     uint64_t aSig, bSig, zSig;
3907     int expDiff;
3908 
3909     aSig = extractFloat64Frac( a );
3910     aExp = extractFloat64Exp( a );
3911     bSig = extractFloat64Frac( b );
3912     bExp = extractFloat64Exp( b );
3913     expDiff = aExp - bExp;
3914     aSig <<= 9;
3915     bSig <<= 9;
3916     if ( 0 < expDiff ) {
3917         if ( aExp == 0x7FF ) {
3918             if (aSig) {
3919                 return propagateFloat64NaN(a, b, status);
3920             }
3921             return a;
3922         }
3923         if ( bExp == 0 ) {
3924             --expDiff;
3925         }
3926         else {
3927             bSig |= LIT64( 0x2000000000000000 );
3928         }
3929         shift64RightJamming( bSig, expDiff, &bSig );
3930         zExp = aExp;
3931     }
3932     else if ( expDiff < 0 ) {
3933         if ( bExp == 0x7FF ) {
3934             if (bSig) {
3935                 return propagateFloat64NaN(a, b, status);
3936             }
3937             return packFloat64( zSign, 0x7FF, 0 );
3938         }
3939         if ( aExp == 0 ) {
3940             ++expDiff;
3941         }
3942         else {
3943             aSig |= LIT64( 0x2000000000000000 );
3944         }
3945         shift64RightJamming( aSig, - expDiff, &aSig );
3946         zExp = bExp;
3947     }
3948     else {
3949         if ( aExp == 0x7FF ) {
3950             if (aSig | bSig) {
3951                 return propagateFloat64NaN(a, b, status);
3952             }
3953             return a;
3954         }
3955         if ( aExp == 0 ) {
3956             if (status->flush_to_zero) {
3957                 if (aSig | bSig) {
3958                     float_raise(float_flag_output_denormal, status);
3959                 }
3960                 return packFloat64(zSign, 0, 0);
3961             }
3962             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3963         }
3964         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3965         zExp = aExp;
3966         goto roundAndPack;
3967     }
3968     aSig |= LIT64( 0x2000000000000000 );
3969     zSig = ( aSig + bSig )<<1;
3970     --zExp;
3971     if ( (int64_t) zSig < 0 ) {
3972         zSig = aSig + bSig;
3973         ++zExp;
3974     }
3975  roundAndPack:
3976     return roundAndPackFloat64(zSign, zExp, zSig, status);
3977 
3978 }
3979 
3980 /*----------------------------------------------------------------------------
3981 | Returns the result of subtracting the absolute values of the double-
3982 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3983 | difference is negated before being returned.  `zSign' is ignored if the
3984 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3985 | Standard for Binary Floating-Point Arithmetic.
3986 *----------------------------------------------------------------------------*/
3987 
3988 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3989                               float_status *status)
3990 {
3991     int aExp, bExp, zExp;
3992     uint64_t aSig, bSig, zSig;
3993     int expDiff;
3994 
3995     aSig = extractFloat64Frac( a );
3996     aExp = extractFloat64Exp( a );
3997     bSig = extractFloat64Frac( b );
3998     bExp = extractFloat64Exp( b );
3999     expDiff = aExp - bExp;
4000     aSig <<= 10;
4001     bSig <<= 10;
4002     if ( 0 < expDiff ) goto aExpBigger;
4003     if ( expDiff < 0 ) goto bExpBigger;
4004     if ( aExp == 0x7FF ) {
4005         if (aSig | bSig) {
4006             return propagateFloat64NaN(a, b, status);
4007         }
4008         float_raise(float_flag_invalid, status);
4009         return float64_default_nan(status);
4010     }
4011     if ( aExp == 0 ) {
4012         aExp = 1;
4013         bExp = 1;
4014     }
4015     if ( bSig < aSig ) goto aBigger;
4016     if ( aSig < bSig ) goto bBigger;
4017     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
4018  bExpBigger:
4019     if ( bExp == 0x7FF ) {
4020         if (bSig) {
4021             return propagateFloat64NaN(a, b, status);
4022         }
4023         return packFloat64( zSign ^ 1, 0x7FF, 0 );
4024     }
4025     if ( aExp == 0 ) {
4026         ++expDiff;
4027     }
4028     else {
4029         aSig |= LIT64( 0x4000000000000000 );
4030     }
4031     shift64RightJamming( aSig, - expDiff, &aSig );
4032     bSig |= LIT64( 0x4000000000000000 );
4033  bBigger:
4034     zSig = bSig - aSig;
4035     zExp = bExp;
4036     zSign ^= 1;
4037     goto normalizeRoundAndPack;
4038  aExpBigger:
4039     if ( aExp == 0x7FF ) {
4040         if (aSig) {
4041             return propagateFloat64NaN(a, b, status);
4042         }
4043         return a;
4044     }
4045     if ( bExp == 0 ) {
4046         --expDiff;
4047     }
4048     else {
4049         bSig |= LIT64( 0x4000000000000000 );
4050     }
4051     shift64RightJamming( bSig, expDiff, &bSig );
4052     aSig |= LIT64( 0x4000000000000000 );
4053  aBigger:
4054     zSig = aSig - bSig;
4055     zExp = aExp;
4056  normalizeRoundAndPack:
4057     --zExp;
4058     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
4059 
4060 }
4061 
4062 /*----------------------------------------------------------------------------
4063 | Returns the result of adding the double-precision floating-point values `a'
4064 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
4065 | Binary Floating-Point Arithmetic.
4066 *----------------------------------------------------------------------------*/
4067 
4068 float64 float64_add(float64 a, float64 b, float_status *status)
4069 {
4070     flag aSign, bSign;
4071     a = float64_squash_input_denormal(a, status);
4072     b = float64_squash_input_denormal(b, status);
4073 
4074     aSign = extractFloat64Sign( a );
4075     bSign = extractFloat64Sign( b );
4076     if ( aSign == bSign ) {
4077         return addFloat64Sigs(a, b, aSign, status);
4078     }
4079     else {
4080         return subFloat64Sigs(a, b, aSign, status);
4081     }
4082 
4083 }
4084 
4085 /*----------------------------------------------------------------------------
4086 | Returns the result of subtracting the double-precision floating-point values
4087 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4088 | for Binary Floating-Point Arithmetic.
4089 *----------------------------------------------------------------------------*/
4090 
4091 float64 float64_sub(float64 a, float64 b, float_status *status)
4092 {
4093     flag aSign, bSign;
4094     a = float64_squash_input_denormal(a, status);
4095     b = float64_squash_input_denormal(b, status);
4096 
4097     aSign = extractFloat64Sign( a );
4098     bSign = extractFloat64Sign( b );
4099     if ( aSign == bSign ) {
4100         return subFloat64Sigs(a, b, aSign, status);
4101     }
4102     else {
4103         return addFloat64Sigs(a, b, aSign, status);
4104     }
4105 
4106 }
4107 
4108 /*----------------------------------------------------------------------------
4109 | Returns the result of multiplying the double-precision floating-point values
4110 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4111 | for Binary Floating-Point Arithmetic.
4112 *----------------------------------------------------------------------------*/
4113 
4114 float64 float64_mul(float64 a, float64 b, float_status *status)
4115 {
4116     flag aSign, bSign, zSign;
4117     int aExp, bExp, zExp;
4118     uint64_t aSig, bSig, zSig0, zSig1;
4119 
4120     a = float64_squash_input_denormal(a, status);
4121     b = float64_squash_input_denormal(b, status);
4122 
4123     aSig = extractFloat64Frac( a );
4124     aExp = extractFloat64Exp( a );
4125     aSign = extractFloat64Sign( a );
4126     bSig = extractFloat64Frac( b );
4127     bExp = extractFloat64Exp( b );
4128     bSign = extractFloat64Sign( b );
4129     zSign = aSign ^ bSign;
4130     if ( aExp == 0x7FF ) {
4131         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4132             return propagateFloat64NaN(a, b, status);
4133         }
4134         if ( ( bExp | bSig ) == 0 ) {
4135             float_raise(float_flag_invalid, status);
4136             return float64_default_nan(status);
4137         }
4138         return packFloat64( zSign, 0x7FF, 0 );
4139     }
4140     if ( bExp == 0x7FF ) {
4141         if (bSig) {
4142             return propagateFloat64NaN(a, b, status);
4143         }
4144         if ( ( aExp | aSig ) == 0 ) {
4145             float_raise(float_flag_invalid, status);
4146             return float64_default_nan(status);
4147         }
4148         return packFloat64( zSign, 0x7FF, 0 );
4149     }
4150     if ( aExp == 0 ) {
4151         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4152         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4153     }
4154     if ( bExp == 0 ) {
4155         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4156         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4157     }
4158     zExp = aExp + bExp - 0x3FF;
4159     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4160     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4161     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4162     zSig0 |= ( zSig1 != 0 );
4163     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4164         zSig0 <<= 1;
4165         --zExp;
4166     }
4167     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4168 
4169 }
4170 
4171 /*----------------------------------------------------------------------------
4172 | Returns the result of dividing the double-precision floating-point value `a'
4173 | by the corresponding value `b'.  The operation is performed according to
4174 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4175 *----------------------------------------------------------------------------*/
4176 
4177 float64 float64_div(float64 a, float64 b, float_status *status)
4178 {
4179     flag aSign, bSign, zSign;
4180     int aExp, bExp, zExp;
4181     uint64_t aSig, bSig, zSig;
4182     uint64_t rem0, rem1;
4183     uint64_t term0, term1;
4184     a = float64_squash_input_denormal(a, status);
4185     b = float64_squash_input_denormal(b, status);
4186 
4187     aSig = extractFloat64Frac( a );
4188     aExp = extractFloat64Exp( a );
4189     aSign = extractFloat64Sign( a );
4190     bSig = extractFloat64Frac( b );
4191     bExp = extractFloat64Exp( b );
4192     bSign = extractFloat64Sign( b );
4193     zSign = aSign ^ bSign;
4194     if ( aExp == 0x7FF ) {
4195         if (aSig) {
4196             return propagateFloat64NaN(a, b, status);
4197         }
4198         if ( bExp == 0x7FF ) {
4199             if (bSig) {
4200                 return propagateFloat64NaN(a, b, status);
4201             }
4202             float_raise(float_flag_invalid, status);
4203             return float64_default_nan(status);
4204         }
4205         return packFloat64( zSign, 0x7FF, 0 );
4206     }
4207     if ( bExp == 0x7FF ) {
4208         if (bSig) {
4209             return propagateFloat64NaN(a, b, status);
4210         }
4211         return packFloat64( zSign, 0, 0 );
4212     }
4213     if ( bExp == 0 ) {
4214         if ( bSig == 0 ) {
4215             if ( ( aExp | aSig ) == 0 ) {
4216                 float_raise(float_flag_invalid, status);
4217                 return float64_default_nan(status);
4218             }
4219             float_raise(float_flag_divbyzero, status);
4220             return packFloat64( zSign, 0x7FF, 0 );
4221         }
4222         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4223     }
4224     if ( aExp == 0 ) {
4225         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4226         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4227     }
4228     zExp = aExp - bExp + 0x3FD;
4229     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4230     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4231     if ( bSig <= ( aSig + aSig ) ) {
4232         aSig >>= 1;
4233         ++zExp;
4234     }
4235     zSig = estimateDiv128To64( aSig, 0, bSig );
4236     if ( ( zSig & 0x1FF ) <= 2 ) {
4237         mul64To128( bSig, zSig, &term0, &term1 );
4238         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4239         while ( (int64_t) rem0 < 0 ) {
4240             --zSig;
4241             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4242         }
4243         zSig |= ( rem1 != 0 );
4244     }
4245     return roundAndPackFloat64(zSign, zExp, zSig, status);
4246 
4247 }
4248 
4249 /*----------------------------------------------------------------------------
4250 | Returns the remainder of the double-precision floating-point value `a'
4251 | with respect to the corresponding value `b'.  The operation is performed
4252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4253 *----------------------------------------------------------------------------*/
4254 
4255 float64 float64_rem(float64 a, float64 b, float_status *status)
4256 {
4257     flag aSign, zSign;
4258     int aExp, bExp, expDiff;
4259     uint64_t aSig, bSig;
4260     uint64_t q, alternateASig;
4261     int64_t sigMean;
4262 
4263     a = float64_squash_input_denormal(a, status);
4264     b = float64_squash_input_denormal(b, status);
4265     aSig = extractFloat64Frac( a );
4266     aExp = extractFloat64Exp( a );
4267     aSign = extractFloat64Sign( a );
4268     bSig = extractFloat64Frac( b );
4269     bExp = extractFloat64Exp( b );
4270     if ( aExp == 0x7FF ) {
4271         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4272             return propagateFloat64NaN(a, b, status);
4273         }
4274         float_raise(float_flag_invalid, status);
4275         return float64_default_nan(status);
4276     }
4277     if ( bExp == 0x7FF ) {
4278         if (bSig) {
4279             return propagateFloat64NaN(a, b, status);
4280         }
4281         return a;
4282     }
4283     if ( bExp == 0 ) {
4284         if ( bSig == 0 ) {
4285             float_raise(float_flag_invalid, status);
4286             return float64_default_nan(status);
4287         }
4288         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4289     }
4290     if ( aExp == 0 ) {
4291         if ( aSig == 0 ) return a;
4292         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4293     }
4294     expDiff = aExp - bExp;
4295     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4296     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4297     if ( expDiff < 0 ) {
4298         if ( expDiff < -1 ) return a;
4299         aSig >>= 1;
4300     }
4301     q = ( bSig <= aSig );
4302     if ( q ) aSig -= bSig;
4303     expDiff -= 64;
4304     while ( 0 < expDiff ) {
4305         q = estimateDiv128To64( aSig, 0, bSig );
4306         q = ( 2 < q ) ? q - 2 : 0;
4307         aSig = - ( ( bSig>>2 ) * q );
4308         expDiff -= 62;
4309     }
4310     expDiff += 64;
4311     if ( 0 < expDiff ) {
4312         q = estimateDiv128To64( aSig, 0, bSig );
4313         q = ( 2 < q ) ? q - 2 : 0;
4314         q >>= 64 - expDiff;
4315         bSig >>= 2;
4316         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4317     }
4318     else {
4319         aSig >>= 2;
4320         bSig >>= 2;
4321     }
4322     do {
4323         alternateASig = aSig;
4324         ++q;
4325         aSig -= bSig;
4326     } while ( 0 <= (int64_t) aSig );
4327     sigMean = aSig + alternateASig;
4328     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4329         aSig = alternateASig;
4330     }
4331     zSign = ( (int64_t) aSig < 0 );
4332     if ( zSign ) aSig = - aSig;
4333     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4334 
4335 }
4336 
4337 /*----------------------------------------------------------------------------
4338 | Returns the result of multiplying the double-precision floating-point values
4339 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4340 | multiplication.  The operation is performed according to the IEC/IEEE
4341 | Standard for Binary Floating-Point Arithmetic 754-2008.
4342 | The flags argument allows the caller to select negation of the
4343 | addend, the intermediate product, or the final result. (The difference
4344 | between this and having the caller do a separate negation is that negating
4345 | externally will flip the sign bit on NaNs.)
4346 *----------------------------------------------------------------------------*/
4347 
4348 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4349                        float_status *status)
4350 {
4351     flag aSign, bSign, cSign, zSign;
4352     int aExp, bExp, cExp, pExp, zExp, expDiff;
4353     uint64_t aSig, bSig, cSig;
4354     flag pInf, pZero, pSign;
4355     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4356     int shiftcount;
4357     flag signflip, infzero;
4358 
4359     a = float64_squash_input_denormal(a, status);
4360     b = float64_squash_input_denormal(b, status);
4361     c = float64_squash_input_denormal(c, status);
4362     aSig = extractFloat64Frac(a);
4363     aExp = extractFloat64Exp(a);
4364     aSign = extractFloat64Sign(a);
4365     bSig = extractFloat64Frac(b);
4366     bExp = extractFloat64Exp(b);
4367     bSign = extractFloat64Sign(b);
4368     cSig = extractFloat64Frac(c);
4369     cExp = extractFloat64Exp(c);
4370     cSign = extractFloat64Sign(c);
4371 
4372     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4373                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4374 
4375     /* It is implementation-defined whether the cases of (0,inf,qnan)
4376      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4377      * they return if they do), so we have to hand this information
4378      * off to the target-specific pick-a-NaN routine.
4379      */
4380     if (((aExp == 0x7ff) && aSig) ||
4381         ((bExp == 0x7ff) && bSig) ||
4382         ((cExp == 0x7ff) && cSig)) {
4383         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4384     }
4385 
4386     if (infzero) {
4387         float_raise(float_flag_invalid, status);
4388         return float64_default_nan(status);
4389     }
4390 
4391     if (flags & float_muladd_negate_c) {
4392         cSign ^= 1;
4393     }
4394 
4395     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4396 
4397     /* Work out the sign and type of the product */
4398     pSign = aSign ^ bSign;
4399     if (flags & float_muladd_negate_product) {
4400         pSign ^= 1;
4401     }
4402     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4403     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4404 
4405     if (cExp == 0x7ff) {
4406         if (pInf && (pSign ^ cSign)) {
4407             /* addition of opposite-signed infinities => InvalidOperation */
4408             float_raise(float_flag_invalid, status);
4409             return float64_default_nan(status);
4410         }
4411         /* Otherwise generate an infinity of the same sign */
4412         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4413     }
4414 
4415     if (pInf) {
4416         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4417     }
4418 
4419     if (pZero) {
4420         if (cExp == 0) {
4421             if (cSig == 0) {
4422                 /* Adding two exact zeroes */
4423                 if (pSign == cSign) {
4424                     zSign = pSign;
4425                 } else if (status->float_rounding_mode == float_round_down) {
4426                     zSign = 1;
4427                 } else {
4428                     zSign = 0;
4429                 }
4430                 return packFloat64(zSign ^ signflip, 0, 0);
4431             }
4432             /* Exact zero plus a denorm */
4433             if (status->flush_to_zero) {
4434                 float_raise(float_flag_output_denormal, status);
4435                 return packFloat64(cSign ^ signflip, 0, 0);
4436             }
4437         }
4438         /* Zero plus something non-zero : just return the something */
4439         if (flags & float_muladd_halve_result) {
4440             if (cExp == 0) {
4441                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4442             }
4443             /* Subtract one to halve, and one again because roundAndPackFloat64
4444              * wants one less than the true exponent.
4445              */
4446             cExp -= 2;
4447             cSig = (cSig | 0x0010000000000000ULL) << 10;
4448             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4449         }
4450         return packFloat64(cSign ^ signflip, cExp, cSig);
4451     }
4452 
4453     if (aExp == 0) {
4454         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4455     }
4456     if (bExp == 0) {
4457         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4458     }
4459 
4460     /* Calculate the actual result a * b + c */
4461 
4462     /* Multiply first; this is easy. */
4463     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4464      * because we want the true exponent, not the "one-less-than"
4465      * flavour that roundAndPackFloat64() takes.
4466      */
4467     pExp = aExp + bExp - 0x3fe;
4468     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4469     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4470     mul64To128(aSig, bSig, &pSig0, &pSig1);
4471     if ((int64_t)(pSig0 << 1) >= 0) {
4472         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4473         pExp--;
4474     }
4475 
4476     zSign = pSign ^ signflip;
4477 
4478     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4479      * bit in position 126.
4480      */
4481     if (cExp == 0) {
4482         if (!cSig) {
4483             /* Throw out the special case of c being an exact zero now */
4484             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4485             if (flags & float_muladd_halve_result) {
4486                 pExp--;
4487             }
4488             return roundAndPackFloat64(zSign, pExp - 1,
4489                                        pSig1, status);
4490         }
4491         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4492     }
4493 
4494     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4495      * significand of the addend, with the explicit bit in position 126.
4496      */
4497     cSig0 = cSig << (126 - 64 - 52);
4498     cSig1 = 0;
4499     cSig0 |= LIT64(0x4000000000000000);
4500     expDiff = pExp - cExp;
4501 
4502     if (pSign == cSign) {
4503         /* Addition */
4504         if (expDiff > 0) {
4505             /* scale c to match p */
4506             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4507             zExp = pExp;
4508         } else if (expDiff < 0) {
4509             /* scale p to match c */
4510             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4511             zExp = cExp;
4512         } else {
4513             /* no scaling needed */
4514             zExp = cExp;
4515         }
4516         /* Add significands and make sure explicit bit ends up in posn 126 */
4517         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4518         if ((int64_t)zSig0 < 0) {
4519             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4520         } else {
4521             zExp--;
4522         }
4523         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4524         if (flags & float_muladd_halve_result) {
4525             zExp--;
4526         }
4527         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4528     } else {
4529         /* Subtraction */
4530         if (expDiff > 0) {
4531             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4532             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4533             zExp = pExp;
4534         } else if (expDiff < 0) {
4535             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4536             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4537             zExp = cExp;
4538             zSign ^= 1;
4539         } else {
4540             zExp = pExp;
4541             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4542                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4543             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4544                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4545                 zSign ^= 1;
4546             } else {
4547                 /* Exact zero */
4548                 zSign = signflip;
4549                 if (status->float_rounding_mode == float_round_down) {
4550                     zSign ^= 1;
4551                 }
4552                 return packFloat64(zSign, 0, 0);
4553             }
4554         }
4555         --zExp;
4556         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4557          * starting with the significand in a pair of uint64_t.
4558          */
4559         if (zSig0) {
4560             shiftcount = countLeadingZeros64(zSig0) - 1;
4561             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4562             if (zSig1) {
4563                 zSig0 |= 1;
4564             }
4565             zExp -= shiftcount;
4566         } else {
4567             shiftcount = countLeadingZeros64(zSig1);
4568             if (shiftcount == 0) {
4569                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4570                 zExp -= 63;
4571             } else {
4572                 shiftcount--;
4573                 zSig0 = zSig1 << shiftcount;
4574                 zExp -= (shiftcount + 64);
4575             }
4576         }
4577         if (flags & float_muladd_halve_result) {
4578             zExp--;
4579         }
4580         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4581     }
4582 }
4583 
4584 /*----------------------------------------------------------------------------
4585 | Returns the square root of the double-precision floating-point value `a'.
4586 | The operation is performed according to the IEC/IEEE Standard for Binary
4587 | Floating-Point Arithmetic.
4588 *----------------------------------------------------------------------------*/
4589 
4590 float64 float64_sqrt(float64 a, float_status *status)
4591 {
4592     flag aSign;
4593     int aExp, zExp;
4594     uint64_t aSig, zSig, doubleZSig;
4595     uint64_t rem0, rem1, term0, term1;
4596     a = float64_squash_input_denormal(a, status);
4597 
4598     aSig = extractFloat64Frac( a );
4599     aExp = extractFloat64Exp( a );
4600     aSign = extractFloat64Sign( a );
4601     if ( aExp == 0x7FF ) {
4602         if (aSig) {
4603             return propagateFloat64NaN(a, a, status);
4604         }
4605         if ( ! aSign ) return a;
4606         float_raise(float_flag_invalid, status);
4607         return float64_default_nan(status);
4608     }
4609     if ( aSign ) {
4610         if ( ( aExp | aSig ) == 0 ) return a;
4611         float_raise(float_flag_invalid, status);
4612         return float64_default_nan(status);
4613     }
4614     if ( aExp == 0 ) {
4615         if ( aSig == 0 ) return float64_zero;
4616         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4617     }
4618     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4619     aSig |= LIT64( 0x0010000000000000 );
4620     zSig = estimateSqrt32( aExp, aSig>>21 );
4621     aSig <<= 9 - ( aExp & 1 );
4622     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4623     if ( ( zSig & 0x1FF ) <= 5 ) {
4624         doubleZSig = zSig<<1;
4625         mul64To128( zSig, zSig, &term0, &term1 );
4626         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4627         while ( (int64_t) rem0 < 0 ) {
4628             --zSig;
4629             doubleZSig -= 2;
4630             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4631         }
4632         zSig |= ( ( rem0 | rem1 ) != 0 );
4633     }
4634     return roundAndPackFloat64(0, zExp, zSig, status);
4635 
4636 }
4637 
4638 /*----------------------------------------------------------------------------
4639 | Returns the binary log of the double-precision floating-point value `a'.
4640 | The operation is performed according to the IEC/IEEE Standard for Binary
4641 | Floating-Point Arithmetic.
4642 *----------------------------------------------------------------------------*/
4643 float64 float64_log2(float64 a, float_status *status)
4644 {
4645     flag aSign, zSign;
4646     int aExp;
4647     uint64_t aSig, aSig0, aSig1, zSig, i;
4648     a = float64_squash_input_denormal(a, status);
4649 
4650     aSig = extractFloat64Frac( a );
4651     aExp = extractFloat64Exp( a );
4652     aSign = extractFloat64Sign( a );
4653 
4654     if ( aExp == 0 ) {
4655         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4656         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4657     }
4658     if ( aSign ) {
4659         float_raise(float_flag_invalid, status);
4660         return float64_default_nan(status);
4661     }
4662     if ( aExp == 0x7FF ) {
4663         if (aSig) {
4664             return propagateFloat64NaN(a, float64_zero, status);
4665         }
4666         return a;
4667     }
4668 
4669     aExp -= 0x3FF;
4670     aSig |= LIT64( 0x0010000000000000 );
4671     zSign = aExp < 0;
4672     zSig = (uint64_t)aExp << 52;
4673     for (i = 1LL << 51; i > 0; i >>= 1) {
4674         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4675         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4676         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4677             aSig >>= 1;
4678             zSig |= i;
4679         }
4680     }
4681 
4682     if ( zSign )
4683         zSig = -zSig;
4684     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4685 }
4686 
4687 /*----------------------------------------------------------------------------
4688 | Returns 1 if the double-precision floating-point value `a' is equal to the
4689 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4690 | if either operand is a NaN.  Otherwise, the comparison is performed
4691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4692 *----------------------------------------------------------------------------*/
4693 
4694 int float64_eq(float64 a, float64 b, float_status *status)
4695 {
4696     uint64_t av, bv;
4697     a = float64_squash_input_denormal(a, status);
4698     b = float64_squash_input_denormal(b, status);
4699 
4700     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4701          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4702        ) {
4703         float_raise(float_flag_invalid, status);
4704         return 0;
4705     }
4706     av = float64_val(a);
4707     bv = float64_val(b);
4708     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4709 
4710 }
4711 
4712 /*----------------------------------------------------------------------------
4713 | Returns 1 if the double-precision floating-point value `a' is less than or
4714 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4715 | exception is raised if either operand is a NaN.  The comparison is performed
4716 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4717 *----------------------------------------------------------------------------*/
4718 
4719 int float64_le(float64 a, float64 b, float_status *status)
4720 {
4721     flag aSign, bSign;
4722     uint64_t av, bv;
4723     a = float64_squash_input_denormal(a, status);
4724     b = float64_squash_input_denormal(b, status);
4725 
4726     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4727          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4728        ) {
4729         float_raise(float_flag_invalid, status);
4730         return 0;
4731     }
4732     aSign = extractFloat64Sign( a );
4733     bSign = extractFloat64Sign( b );
4734     av = float64_val(a);
4735     bv = float64_val(b);
4736     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4737     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4738 
4739 }
4740 
4741 /*----------------------------------------------------------------------------
4742 | Returns 1 if the double-precision floating-point value `a' is less than
4743 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4744 | raised if either operand is a NaN.  The comparison is performed according
4745 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4746 *----------------------------------------------------------------------------*/
4747 
4748 int float64_lt(float64 a, float64 b, float_status *status)
4749 {
4750     flag aSign, bSign;
4751     uint64_t av, bv;
4752 
4753     a = float64_squash_input_denormal(a, status);
4754     b = float64_squash_input_denormal(b, status);
4755     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4756          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4757        ) {
4758         float_raise(float_flag_invalid, status);
4759         return 0;
4760     }
4761     aSign = extractFloat64Sign( a );
4762     bSign = extractFloat64Sign( b );
4763     av = float64_val(a);
4764     bv = float64_val(b);
4765     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4766     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4767 
4768 }
4769 
4770 /*----------------------------------------------------------------------------
4771 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4772 | be compared, and 0 otherwise.  The invalid exception is raised if either
4773 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4774 | Standard for Binary Floating-Point Arithmetic.
4775 *----------------------------------------------------------------------------*/
4776 
4777 int float64_unordered(float64 a, float64 b, float_status *status)
4778 {
4779     a = float64_squash_input_denormal(a, status);
4780     b = float64_squash_input_denormal(b, status);
4781 
4782     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4783          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4784        ) {
4785         float_raise(float_flag_invalid, status);
4786         return 1;
4787     }
4788     return 0;
4789 }
4790 
4791 /*----------------------------------------------------------------------------
4792 | Returns 1 if the double-precision floating-point value `a' is equal to the
4793 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4794 | exception.The comparison is performed according to the IEC/IEEE Standard
4795 | for Binary Floating-Point Arithmetic.
4796 *----------------------------------------------------------------------------*/
4797 
4798 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4799 {
4800     uint64_t av, bv;
4801     a = float64_squash_input_denormal(a, status);
4802     b = float64_squash_input_denormal(b, status);
4803 
4804     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4805          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4806        ) {
4807         if (float64_is_signaling_nan(a, status)
4808          || float64_is_signaling_nan(b, status)) {
4809             float_raise(float_flag_invalid, status);
4810         }
4811         return 0;
4812     }
4813     av = float64_val(a);
4814     bv = float64_val(b);
4815     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4816 
4817 }
4818 
4819 /*----------------------------------------------------------------------------
4820 | Returns 1 if the double-precision floating-point value `a' is less than or
4821 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4822 | cause an exception.  Otherwise, the comparison is performed according to the
4823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4824 *----------------------------------------------------------------------------*/
4825 
4826 int float64_le_quiet(float64 a, float64 b, float_status *status)
4827 {
4828     flag aSign, bSign;
4829     uint64_t av, bv;
4830     a = float64_squash_input_denormal(a, status);
4831     b = float64_squash_input_denormal(b, status);
4832 
4833     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4834          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4835        ) {
4836         if (float64_is_signaling_nan(a, status)
4837          || float64_is_signaling_nan(b, status)) {
4838             float_raise(float_flag_invalid, status);
4839         }
4840         return 0;
4841     }
4842     aSign = extractFloat64Sign( a );
4843     bSign = extractFloat64Sign( b );
4844     av = float64_val(a);
4845     bv = float64_val(b);
4846     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4847     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4848 
4849 }
4850 
4851 /*----------------------------------------------------------------------------
4852 | Returns 1 if the double-precision floating-point value `a' is less than
4853 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4854 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4855 | Standard for Binary Floating-Point Arithmetic.
4856 *----------------------------------------------------------------------------*/
4857 
4858 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4859 {
4860     flag aSign, bSign;
4861     uint64_t av, bv;
4862     a = float64_squash_input_denormal(a, status);
4863     b = float64_squash_input_denormal(b, status);
4864 
4865     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4866          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4867        ) {
4868         if (float64_is_signaling_nan(a, status)
4869          || float64_is_signaling_nan(b, status)) {
4870             float_raise(float_flag_invalid, status);
4871         }
4872         return 0;
4873     }
4874     aSign = extractFloat64Sign( a );
4875     bSign = extractFloat64Sign( b );
4876     av = float64_val(a);
4877     bv = float64_val(b);
4878     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4879     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4880 
4881 }
4882 
4883 /*----------------------------------------------------------------------------
4884 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4885 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4886 | comparison is performed according to the IEC/IEEE Standard for Binary
4887 | Floating-Point Arithmetic.
4888 *----------------------------------------------------------------------------*/
4889 
4890 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4891 {
4892     a = float64_squash_input_denormal(a, status);
4893     b = float64_squash_input_denormal(b, status);
4894 
4895     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4896          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4897        ) {
4898         if (float64_is_signaling_nan(a, status)
4899          || float64_is_signaling_nan(b, status)) {
4900             float_raise(float_flag_invalid, status);
4901         }
4902         return 1;
4903     }
4904     return 0;
4905 }
4906 
4907 /*----------------------------------------------------------------------------
4908 | Returns the result of converting the extended double-precision floating-
4909 | point value `a' to the 32-bit two's complement integer format.  The
4910 | conversion is performed according to the IEC/IEEE Standard for Binary
4911 | Floating-Point Arithmetic---which means in particular that the conversion
4912 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4913 | largest positive integer is returned.  Otherwise, if the conversion
4914 | overflows, the largest integer with the same sign as `a' is returned.
4915 *----------------------------------------------------------------------------*/
4916 
4917 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4918 {
4919     flag aSign;
4920     int32_t aExp, shiftCount;
4921     uint64_t aSig;
4922 
4923     if (floatx80_invalid_encoding(a)) {
4924         float_raise(float_flag_invalid, status);
4925         return 1 << 31;
4926     }
4927     aSig = extractFloatx80Frac( a );
4928     aExp = extractFloatx80Exp( a );
4929     aSign = extractFloatx80Sign( a );
4930     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4931     shiftCount = 0x4037 - aExp;
4932     if ( shiftCount <= 0 ) shiftCount = 1;
4933     shift64RightJamming( aSig, shiftCount, &aSig );
4934     return roundAndPackInt32(aSign, aSig, status);
4935 
4936 }
4937 
4938 /*----------------------------------------------------------------------------
4939 | Returns the result of converting the extended double-precision floating-
4940 | point value `a' to the 32-bit two's complement integer format.  The
4941 | conversion is performed according to the IEC/IEEE Standard for Binary
4942 | Floating-Point Arithmetic, except that the conversion is always rounded
4943 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4944 | Otherwise, if the conversion overflows, the largest integer with the same
4945 | sign as `a' is returned.
4946 *----------------------------------------------------------------------------*/
4947 
4948 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4949 {
4950     flag aSign;
4951     int32_t aExp, shiftCount;
4952     uint64_t aSig, savedASig;
4953     int32_t z;
4954 
4955     if (floatx80_invalid_encoding(a)) {
4956         float_raise(float_flag_invalid, status);
4957         return 1 << 31;
4958     }
4959     aSig = extractFloatx80Frac( a );
4960     aExp = extractFloatx80Exp( a );
4961     aSign = extractFloatx80Sign( a );
4962     if ( 0x401E < aExp ) {
4963         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4964         goto invalid;
4965     }
4966     else if ( aExp < 0x3FFF ) {
4967         if (aExp || aSig) {
4968             status->float_exception_flags |= float_flag_inexact;
4969         }
4970         return 0;
4971     }
4972     shiftCount = 0x403E - aExp;
4973     savedASig = aSig;
4974     aSig >>= shiftCount;
4975     z = aSig;
4976     if ( aSign ) z = - z;
4977     if ( ( z < 0 ) ^ aSign ) {
4978  invalid:
4979         float_raise(float_flag_invalid, status);
4980         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4981     }
4982     if ( ( aSig<<shiftCount ) != savedASig ) {
4983         status->float_exception_flags |= float_flag_inexact;
4984     }
4985     return z;
4986 
4987 }
4988 
4989 /*----------------------------------------------------------------------------
4990 | Returns the result of converting the extended double-precision floating-
4991 | point value `a' to the 64-bit two's complement integer format.  The
4992 | conversion is performed according to the IEC/IEEE Standard for Binary
4993 | Floating-Point Arithmetic---which means in particular that the conversion
4994 | is rounded according to the current rounding mode.  If `a' is a NaN,
4995 | the largest positive integer is returned.  Otherwise, if the conversion
4996 | overflows, the largest integer with the same sign as `a' is returned.
4997 *----------------------------------------------------------------------------*/
4998 
4999 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5000 {
5001     flag aSign;
5002     int32_t aExp, shiftCount;
5003     uint64_t aSig, aSigExtra;
5004 
5005     if (floatx80_invalid_encoding(a)) {
5006         float_raise(float_flag_invalid, status);
5007         return 1ULL << 63;
5008     }
5009     aSig = extractFloatx80Frac( a );
5010     aExp = extractFloatx80Exp( a );
5011     aSign = extractFloatx80Sign( a );
5012     shiftCount = 0x403E - aExp;
5013     if ( shiftCount <= 0 ) {
5014         if ( shiftCount ) {
5015             float_raise(float_flag_invalid, status);
5016             if (    ! aSign
5017                  || (    ( aExp == 0x7FFF )
5018                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
5019                ) {
5020                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5021             }
5022             return (int64_t) LIT64( 0x8000000000000000 );
5023         }
5024         aSigExtra = 0;
5025     }
5026     else {
5027         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5028     }
5029     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5030 
5031 }
5032 
5033 /*----------------------------------------------------------------------------
5034 | Returns the result of converting the extended double-precision floating-
5035 | point value `a' to the 64-bit two's complement integer format.  The
5036 | conversion is performed according to the IEC/IEEE Standard for Binary
5037 | Floating-Point Arithmetic, except that the conversion is always rounded
5038 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5039 | Otherwise, if the conversion overflows, the largest integer with the same
5040 | sign as `a' is returned.
5041 *----------------------------------------------------------------------------*/
5042 
5043 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5044 {
5045     flag aSign;
5046     int32_t aExp, shiftCount;
5047     uint64_t aSig;
5048     int64_t z;
5049 
5050     if (floatx80_invalid_encoding(a)) {
5051         float_raise(float_flag_invalid, status);
5052         return 1ULL << 63;
5053     }
5054     aSig = extractFloatx80Frac( a );
5055     aExp = extractFloatx80Exp( a );
5056     aSign = extractFloatx80Sign( a );
5057     shiftCount = aExp - 0x403E;
5058     if ( 0 <= shiftCount ) {
5059         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5060         if ( ( a.high != 0xC03E ) || aSig ) {
5061             float_raise(float_flag_invalid, status);
5062             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5063                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5064             }
5065         }
5066         return (int64_t) LIT64( 0x8000000000000000 );
5067     }
5068     else if ( aExp < 0x3FFF ) {
5069         if (aExp | aSig) {
5070             status->float_exception_flags |= float_flag_inexact;
5071         }
5072         return 0;
5073     }
5074     z = aSig>>( - shiftCount );
5075     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5076         status->float_exception_flags |= float_flag_inexact;
5077     }
5078     if ( aSign ) z = - z;
5079     return z;
5080 
5081 }
5082 
5083 /*----------------------------------------------------------------------------
5084 | Returns the result of converting the extended double-precision floating-
5085 | point value `a' to the single-precision floating-point format.  The
5086 | conversion is performed according to the IEC/IEEE Standard for Binary
5087 | Floating-Point Arithmetic.
5088 *----------------------------------------------------------------------------*/
5089 
5090 float32 floatx80_to_float32(floatx80 a, float_status *status)
5091 {
5092     flag aSign;
5093     int32_t aExp;
5094     uint64_t aSig;
5095 
5096     if (floatx80_invalid_encoding(a)) {
5097         float_raise(float_flag_invalid, status);
5098         return float32_default_nan(status);
5099     }
5100     aSig = extractFloatx80Frac( a );
5101     aExp = extractFloatx80Exp( a );
5102     aSign = extractFloatx80Sign( a );
5103     if ( aExp == 0x7FFF ) {
5104         if ( (uint64_t) ( aSig<<1 ) ) {
5105             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5106         }
5107         return packFloat32( aSign, 0xFF, 0 );
5108     }
5109     shift64RightJamming( aSig, 33, &aSig );
5110     if ( aExp || aSig ) aExp -= 0x3F81;
5111     return roundAndPackFloat32(aSign, aExp, aSig, status);
5112 
5113 }
5114 
5115 /*----------------------------------------------------------------------------
5116 | Returns the result of converting the extended double-precision floating-
5117 | point value `a' to the double-precision floating-point format.  The
5118 | conversion is performed according to the IEC/IEEE Standard for Binary
5119 | Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121 
5122 float64 floatx80_to_float64(floatx80 a, float_status *status)
5123 {
5124     flag aSign;
5125     int32_t aExp;
5126     uint64_t aSig, zSig;
5127 
5128     if (floatx80_invalid_encoding(a)) {
5129         float_raise(float_flag_invalid, status);
5130         return float64_default_nan(status);
5131     }
5132     aSig = extractFloatx80Frac( a );
5133     aExp = extractFloatx80Exp( a );
5134     aSign = extractFloatx80Sign( a );
5135     if ( aExp == 0x7FFF ) {
5136         if ( (uint64_t) ( aSig<<1 ) ) {
5137             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5138         }
5139         return packFloat64( aSign, 0x7FF, 0 );
5140     }
5141     shift64RightJamming( aSig, 1, &zSig );
5142     if ( aExp || aSig ) aExp -= 0x3C01;
5143     return roundAndPackFloat64(aSign, aExp, zSig, status);
5144 
5145 }
5146 
5147 /*----------------------------------------------------------------------------
5148 | Returns the result of converting the extended double-precision floating-
5149 | point value `a' to the quadruple-precision floating-point format.  The
5150 | conversion is performed according to the IEC/IEEE Standard for Binary
5151 | Floating-Point Arithmetic.
5152 *----------------------------------------------------------------------------*/
5153 
5154 float128 floatx80_to_float128(floatx80 a, float_status *status)
5155 {
5156     flag aSign;
5157     int aExp;
5158     uint64_t aSig, zSig0, zSig1;
5159 
5160     if (floatx80_invalid_encoding(a)) {
5161         float_raise(float_flag_invalid, status);
5162         return float128_default_nan(status);
5163     }
5164     aSig = extractFloatx80Frac( a );
5165     aExp = extractFloatx80Exp( a );
5166     aSign = extractFloatx80Sign( a );
5167     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5168         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5169     }
5170     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5171     return packFloat128( aSign, aExp, zSig0, zSig1 );
5172 
5173 }
5174 
5175 /*----------------------------------------------------------------------------
5176 | Rounds the extended double-precision floating-point value `a'
5177 | to the precision provided by floatx80_rounding_precision and returns the
5178 | result as an extended double-precision floating-point value.
5179 | The operation is performed according to the IEC/IEEE Standard for Binary
5180 | Floating-Point Arithmetic.
5181 *----------------------------------------------------------------------------*/
5182 
5183 floatx80 floatx80_round(floatx80 a, float_status *status)
5184 {
5185     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5186                                 extractFloatx80Sign(a),
5187                                 extractFloatx80Exp(a),
5188                                 extractFloatx80Frac(a), 0, status);
5189 }
5190 
5191 /*----------------------------------------------------------------------------
5192 | Rounds the extended double-precision floating-point value `a' to an integer,
5193 | and returns the result as an extended quadruple-precision floating-point
5194 | value.  The operation is performed according to the IEC/IEEE Standard for
5195 | Binary Floating-Point Arithmetic.
5196 *----------------------------------------------------------------------------*/
5197 
5198 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5199 {
5200     flag aSign;
5201     int32_t aExp;
5202     uint64_t lastBitMask, roundBitsMask;
5203     floatx80 z;
5204 
5205     if (floatx80_invalid_encoding(a)) {
5206         float_raise(float_flag_invalid, status);
5207         return floatx80_default_nan(status);
5208     }
5209     aExp = extractFloatx80Exp( a );
5210     if ( 0x403E <= aExp ) {
5211         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5212             return propagateFloatx80NaN(a, a, status);
5213         }
5214         return a;
5215     }
5216     if ( aExp < 0x3FFF ) {
5217         if (    ( aExp == 0 )
5218              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5219             return a;
5220         }
5221         status->float_exception_flags |= float_flag_inexact;
5222         aSign = extractFloatx80Sign( a );
5223         switch (status->float_rounding_mode) {
5224          case float_round_nearest_even:
5225             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5226                ) {
5227                 return
5228                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5229             }
5230             break;
5231         case float_round_ties_away:
5232             if (aExp == 0x3FFE) {
5233                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5234             }
5235             break;
5236          case float_round_down:
5237             return
5238                   aSign ?
5239                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5240                 : packFloatx80( 0, 0, 0 );
5241          case float_round_up:
5242             return
5243                   aSign ? packFloatx80( 1, 0, 0 )
5244                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5245         }
5246         return packFloatx80( aSign, 0, 0 );
5247     }
5248     lastBitMask = 1;
5249     lastBitMask <<= 0x403E - aExp;
5250     roundBitsMask = lastBitMask - 1;
5251     z = a;
5252     switch (status->float_rounding_mode) {
5253     case float_round_nearest_even:
5254         z.low += lastBitMask>>1;
5255         if ((z.low & roundBitsMask) == 0) {
5256             z.low &= ~lastBitMask;
5257         }
5258         break;
5259     case float_round_ties_away:
5260         z.low += lastBitMask >> 1;
5261         break;
5262     case float_round_to_zero:
5263         break;
5264     case float_round_up:
5265         if (!extractFloatx80Sign(z)) {
5266             z.low += roundBitsMask;
5267         }
5268         break;
5269     case float_round_down:
5270         if (extractFloatx80Sign(z)) {
5271             z.low += roundBitsMask;
5272         }
5273         break;
5274     default:
5275         abort();
5276     }
5277     z.low &= ~ roundBitsMask;
5278     if ( z.low == 0 ) {
5279         ++z.high;
5280         z.low = LIT64( 0x8000000000000000 );
5281     }
5282     if (z.low != a.low) {
5283         status->float_exception_flags |= float_flag_inexact;
5284     }
5285     return z;
5286 
5287 }
5288 
5289 /*----------------------------------------------------------------------------
5290 | Returns the result of adding the absolute values of the extended double-
5291 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5292 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5293 | The addition is performed according to the IEC/IEEE Standard for Binary
5294 | Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5296 
5297 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5298                                 float_status *status)
5299 {
5300     int32_t aExp, bExp, zExp;
5301     uint64_t aSig, bSig, zSig0, zSig1;
5302     int32_t expDiff;
5303 
5304     aSig = extractFloatx80Frac( a );
5305     aExp = extractFloatx80Exp( a );
5306     bSig = extractFloatx80Frac( b );
5307     bExp = extractFloatx80Exp( b );
5308     expDiff = aExp - bExp;
5309     if ( 0 < expDiff ) {
5310         if ( aExp == 0x7FFF ) {
5311             if ((uint64_t)(aSig << 1)) {
5312                 return propagateFloatx80NaN(a, b, status);
5313             }
5314             return a;
5315         }
5316         if ( bExp == 0 ) --expDiff;
5317         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5318         zExp = aExp;
5319     }
5320     else if ( expDiff < 0 ) {
5321         if ( bExp == 0x7FFF ) {
5322             if ((uint64_t)(bSig << 1)) {
5323                 return propagateFloatx80NaN(a, b, status);
5324             }
5325             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5326         }
5327         if ( aExp == 0 ) ++expDiff;
5328         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5329         zExp = bExp;
5330     }
5331     else {
5332         if ( aExp == 0x7FFF ) {
5333             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5334                 return propagateFloatx80NaN(a, b, status);
5335             }
5336             return a;
5337         }
5338         zSig1 = 0;
5339         zSig0 = aSig + bSig;
5340         if ( aExp == 0 ) {
5341             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5342             goto roundAndPack;
5343         }
5344         zExp = aExp;
5345         goto shiftRight1;
5346     }
5347     zSig0 = aSig + bSig;
5348     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5349  shiftRight1:
5350     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5351     zSig0 |= LIT64( 0x8000000000000000 );
5352     ++zExp;
5353  roundAndPack:
5354     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5355                                 zSign, zExp, zSig0, zSig1, status);
5356 }
5357 
5358 /*----------------------------------------------------------------------------
5359 | Returns the result of subtracting the absolute values of the extended
5360 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5361 | difference is negated before being returned.  `zSign' is ignored if the
5362 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5363 | Standard for Binary Floating-Point Arithmetic.
5364 *----------------------------------------------------------------------------*/
5365 
5366 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5367                                 float_status *status)
5368 {
5369     int32_t aExp, bExp, zExp;
5370     uint64_t aSig, bSig, zSig0, zSig1;
5371     int32_t expDiff;
5372 
5373     aSig = extractFloatx80Frac( a );
5374     aExp = extractFloatx80Exp( a );
5375     bSig = extractFloatx80Frac( b );
5376     bExp = extractFloatx80Exp( b );
5377     expDiff = aExp - bExp;
5378     if ( 0 < expDiff ) goto aExpBigger;
5379     if ( expDiff < 0 ) goto bExpBigger;
5380     if ( aExp == 0x7FFF ) {
5381         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5382             return propagateFloatx80NaN(a, b, status);
5383         }
5384         float_raise(float_flag_invalid, status);
5385         return floatx80_default_nan(status);
5386     }
5387     if ( aExp == 0 ) {
5388         aExp = 1;
5389         bExp = 1;
5390     }
5391     zSig1 = 0;
5392     if ( bSig < aSig ) goto aBigger;
5393     if ( aSig < bSig ) goto bBigger;
5394     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5395  bExpBigger:
5396     if ( bExp == 0x7FFF ) {
5397         if ((uint64_t)(bSig << 1)) {
5398             return propagateFloatx80NaN(a, b, status);
5399         }
5400         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5401     }
5402     if ( aExp == 0 ) ++expDiff;
5403     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5404  bBigger:
5405     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5406     zExp = bExp;
5407     zSign ^= 1;
5408     goto normalizeRoundAndPack;
5409  aExpBigger:
5410     if ( aExp == 0x7FFF ) {
5411         if ((uint64_t)(aSig << 1)) {
5412             return propagateFloatx80NaN(a, b, status);
5413         }
5414         return a;
5415     }
5416     if ( bExp == 0 ) --expDiff;
5417     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5418  aBigger:
5419     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5420     zExp = aExp;
5421  normalizeRoundAndPack:
5422     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5423                                          zSign, zExp, zSig0, zSig1, status);
5424 }
5425 
5426 /*----------------------------------------------------------------------------
5427 | Returns the result of adding the extended double-precision floating-point
5428 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5429 | Standard for Binary Floating-Point Arithmetic.
5430 *----------------------------------------------------------------------------*/
5431 
5432 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5433 {
5434     flag aSign, bSign;
5435 
5436     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5437         float_raise(float_flag_invalid, status);
5438         return floatx80_default_nan(status);
5439     }
5440     aSign = extractFloatx80Sign( a );
5441     bSign = extractFloatx80Sign( b );
5442     if ( aSign == bSign ) {
5443         return addFloatx80Sigs(a, b, aSign, status);
5444     }
5445     else {
5446         return subFloatx80Sigs(a, b, aSign, status);
5447     }
5448 
5449 }
5450 
5451 /*----------------------------------------------------------------------------
5452 | Returns the result of subtracting the extended double-precision floating-
5453 | point values `a' and `b'.  The operation is performed according to the
5454 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5455 *----------------------------------------------------------------------------*/
5456 
5457 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5458 {
5459     flag aSign, bSign;
5460 
5461     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5462         float_raise(float_flag_invalid, status);
5463         return floatx80_default_nan(status);
5464     }
5465     aSign = extractFloatx80Sign( a );
5466     bSign = extractFloatx80Sign( b );
5467     if ( aSign == bSign ) {
5468         return subFloatx80Sigs(a, b, aSign, status);
5469     }
5470     else {
5471         return addFloatx80Sigs(a, b, aSign, status);
5472     }
5473 
5474 }
5475 
5476 /*----------------------------------------------------------------------------
5477 | Returns the result of multiplying the extended double-precision floating-
5478 | point values `a' and `b'.  The operation is performed according to the
5479 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5480 *----------------------------------------------------------------------------*/
5481 
5482 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5483 {
5484     flag aSign, bSign, zSign;
5485     int32_t aExp, bExp, zExp;
5486     uint64_t aSig, bSig, zSig0, zSig1;
5487 
5488     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5489         float_raise(float_flag_invalid, status);
5490         return floatx80_default_nan(status);
5491     }
5492     aSig = extractFloatx80Frac( a );
5493     aExp = extractFloatx80Exp( a );
5494     aSign = extractFloatx80Sign( a );
5495     bSig = extractFloatx80Frac( b );
5496     bExp = extractFloatx80Exp( b );
5497     bSign = extractFloatx80Sign( b );
5498     zSign = aSign ^ bSign;
5499     if ( aExp == 0x7FFF ) {
5500         if (    (uint64_t) ( aSig<<1 )
5501              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5502             return propagateFloatx80NaN(a, b, status);
5503         }
5504         if ( ( bExp | bSig ) == 0 ) goto invalid;
5505         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5506     }
5507     if ( bExp == 0x7FFF ) {
5508         if ((uint64_t)(bSig << 1)) {
5509             return propagateFloatx80NaN(a, b, status);
5510         }
5511         if ( ( aExp | aSig ) == 0 ) {
5512  invalid:
5513             float_raise(float_flag_invalid, status);
5514             return floatx80_default_nan(status);
5515         }
5516         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5517     }
5518     if ( aExp == 0 ) {
5519         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5520         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5521     }
5522     if ( bExp == 0 ) {
5523         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5524         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5525     }
5526     zExp = aExp + bExp - 0x3FFE;
5527     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5528     if ( 0 < (int64_t) zSig0 ) {
5529         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5530         --zExp;
5531     }
5532     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5533                                 zSign, zExp, zSig0, zSig1, status);
5534 }
5535 
5536 /*----------------------------------------------------------------------------
5537 | Returns the result of dividing the extended double-precision floating-point
5538 | value `a' by the corresponding value `b'.  The operation is performed
5539 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5540 *----------------------------------------------------------------------------*/
5541 
5542 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5543 {
5544     flag aSign, bSign, zSign;
5545     int32_t aExp, bExp, zExp;
5546     uint64_t aSig, bSig, zSig0, zSig1;
5547     uint64_t rem0, rem1, rem2, term0, term1, term2;
5548 
5549     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5550         float_raise(float_flag_invalid, status);
5551         return floatx80_default_nan(status);
5552     }
5553     aSig = extractFloatx80Frac( a );
5554     aExp = extractFloatx80Exp( a );
5555     aSign = extractFloatx80Sign( a );
5556     bSig = extractFloatx80Frac( b );
5557     bExp = extractFloatx80Exp( b );
5558     bSign = extractFloatx80Sign( b );
5559     zSign = aSign ^ bSign;
5560     if ( aExp == 0x7FFF ) {
5561         if ((uint64_t)(aSig << 1)) {
5562             return propagateFloatx80NaN(a, b, status);
5563         }
5564         if ( bExp == 0x7FFF ) {
5565             if ((uint64_t)(bSig << 1)) {
5566                 return propagateFloatx80NaN(a, b, status);
5567             }
5568             goto invalid;
5569         }
5570         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5571     }
5572     if ( bExp == 0x7FFF ) {
5573         if ((uint64_t)(bSig << 1)) {
5574             return propagateFloatx80NaN(a, b, status);
5575         }
5576         return packFloatx80( zSign, 0, 0 );
5577     }
5578     if ( bExp == 0 ) {
5579         if ( bSig == 0 ) {
5580             if ( ( aExp | aSig ) == 0 ) {
5581  invalid:
5582                 float_raise(float_flag_invalid, status);
5583                 return floatx80_default_nan(status);
5584             }
5585             float_raise(float_flag_divbyzero, status);
5586             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5587         }
5588         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5589     }
5590     if ( aExp == 0 ) {
5591         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5592         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5593     }
5594     zExp = aExp - bExp + 0x3FFE;
5595     rem1 = 0;
5596     if ( bSig <= aSig ) {
5597         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5598         ++zExp;
5599     }
5600     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5601     mul64To128( bSig, zSig0, &term0, &term1 );
5602     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5603     while ( (int64_t) rem0 < 0 ) {
5604         --zSig0;
5605         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5606     }
5607     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5608     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5609         mul64To128( bSig, zSig1, &term1, &term2 );
5610         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5611         while ( (int64_t) rem1 < 0 ) {
5612             --zSig1;
5613             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5614         }
5615         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5616     }
5617     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5618                                 zSign, zExp, zSig0, zSig1, status);
5619 }
5620 
5621 /*----------------------------------------------------------------------------
5622 | Returns the remainder of the extended double-precision floating-point value
5623 | `a' with respect to the corresponding value `b'.  The operation is performed
5624 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5626 
5627 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5628 {
5629     flag aSign, zSign;
5630     int32_t aExp, bExp, expDiff;
5631     uint64_t aSig0, aSig1, bSig;
5632     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5633 
5634     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5635         float_raise(float_flag_invalid, status);
5636         return floatx80_default_nan(status);
5637     }
5638     aSig0 = extractFloatx80Frac( a );
5639     aExp = extractFloatx80Exp( a );
5640     aSign = extractFloatx80Sign( a );
5641     bSig = extractFloatx80Frac( b );
5642     bExp = extractFloatx80Exp( b );
5643     if ( aExp == 0x7FFF ) {
5644         if (    (uint64_t) ( aSig0<<1 )
5645              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5646             return propagateFloatx80NaN(a, b, status);
5647         }
5648         goto invalid;
5649     }
5650     if ( bExp == 0x7FFF ) {
5651         if ((uint64_t)(bSig << 1)) {
5652             return propagateFloatx80NaN(a, b, status);
5653         }
5654         return a;
5655     }
5656     if ( bExp == 0 ) {
5657         if ( bSig == 0 ) {
5658  invalid:
5659             float_raise(float_flag_invalid, status);
5660             return floatx80_default_nan(status);
5661         }
5662         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5663     }
5664     if ( aExp == 0 ) {
5665         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5666         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5667     }
5668     bSig |= LIT64( 0x8000000000000000 );
5669     zSign = aSign;
5670     expDiff = aExp - bExp;
5671     aSig1 = 0;
5672     if ( expDiff < 0 ) {
5673         if ( expDiff < -1 ) return a;
5674         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5675         expDiff = 0;
5676     }
5677     q = ( bSig <= aSig0 );
5678     if ( q ) aSig0 -= bSig;
5679     expDiff -= 64;
5680     while ( 0 < expDiff ) {
5681         q = estimateDiv128To64( aSig0, aSig1, bSig );
5682         q = ( 2 < q ) ? q - 2 : 0;
5683         mul64To128( bSig, q, &term0, &term1 );
5684         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5685         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5686         expDiff -= 62;
5687     }
5688     expDiff += 64;
5689     if ( 0 < expDiff ) {
5690         q = estimateDiv128To64( aSig0, aSig1, bSig );
5691         q = ( 2 < q ) ? q - 2 : 0;
5692         q >>= 64 - expDiff;
5693         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5694         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5695         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5696         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5697             ++q;
5698             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5699         }
5700     }
5701     else {
5702         term1 = 0;
5703         term0 = bSig;
5704     }
5705     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5706     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5707          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5708               && ( q & 1 ) )
5709        ) {
5710         aSig0 = alternateASig0;
5711         aSig1 = alternateASig1;
5712         zSign = ! zSign;
5713     }
5714     return
5715         normalizeRoundAndPackFloatx80(
5716             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5717 
5718 }
5719 
5720 /*----------------------------------------------------------------------------
5721 | Returns the square root of the extended double-precision floating-point
5722 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5723 | for Binary Floating-Point Arithmetic.
5724 *----------------------------------------------------------------------------*/
5725 
5726 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5727 {
5728     flag aSign;
5729     int32_t aExp, zExp;
5730     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5731     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5732 
5733     if (floatx80_invalid_encoding(a)) {
5734         float_raise(float_flag_invalid, status);
5735         return floatx80_default_nan(status);
5736     }
5737     aSig0 = extractFloatx80Frac( a );
5738     aExp = extractFloatx80Exp( a );
5739     aSign = extractFloatx80Sign( a );
5740     if ( aExp == 0x7FFF ) {
5741         if ((uint64_t)(aSig0 << 1)) {
5742             return propagateFloatx80NaN(a, a, status);
5743         }
5744         if ( ! aSign ) return a;
5745         goto invalid;
5746     }
5747     if ( aSign ) {
5748         if ( ( aExp | aSig0 ) == 0 ) return a;
5749  invalid:
5750         float_raise(float_flag_invalid, status);
5751         return floatx80_default_nan(status);
5752     }
5753     if ( aExp == 0 ) {
5754         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5755         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5756     }
5757     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5758     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5759     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5760     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5761     doubleZSig0 = zSig0<<1;
5762     mul64To128( zSig0, zSig0, &term0, &term1 );
5763     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5764     while ( (int64_t) rem0 < 0 ) {
5765         --zSig0;
5766         doubleZSig0 -= 2;
5767         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5768     }
5769     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5770     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5771         if ( zSig1 == 0 ) zSig1 = 1;
5772         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5773         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5774         mul64To128( zSig1, zSig1, &term2, &term3 );
5775         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5776         while ( (int64_t) rem1 < 0 ) {
5777             --zSig1;
5778             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5779             term3 |= 1;
5780             term2 |= doubleZSig0;
5781             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5782         }
5783         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5784     }
5785     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5786     zSig0 |= doubleZSig0;
5787     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5788                                 0, zExp, zSig0, zSig1, status);
5789 }
5790 
5791 /*----------------------------------------------------------------------------
5792 | Returns 1 if the extended double-precision floating-point value `a' is equal
5793 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5794 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5795 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5796 *----------------------------------------------------------------------------*/
5797 
5798 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5799 {
5800 
5801     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5802         || (extractFloatx80Exp(a) == 0x7FFF
5803             && (uint64_t) (extractFloatx80Frac(a) << 1))
5804         || (extractFloatx80Exp(b) == 0x7FFF
5805             && (uint64_t) (extractFloatx80Frac(b) << 1))
5806        ) {
5807         float_raise(float_flag_invalid, status);
5808         return 0;
5809     }
5810     return
5811            ( a.low == b.low )
5812         && (    ( a.high == b.high )
5813              || (    ( a.low == 0 )
5814                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5815            );
5816 
5817 }
5818 
5819 /*----------------------------------------------------------------------------
5820 | Returns 1 if the extended double-precision floating-point value `a' is
5821 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5822 | invalid exception is raised if either operand is a NaN.  The comparison is
5823 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5824 | Arithmetic.
5825 *----------------------------------------------------------------------------*/
5826 
5827 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5828 {
5829     flag aSign, bSign;
5830 
5831     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5832         || (extractFloatx80Exp(a) == 0x7FFF
5833             && (uint64_t) (extractFloatx80Frac(a) << 1))
5834         || (extractFloatx80Exp(b) == 0x7FFF
5835             && (uint64_t) (extractFloatx80Frac(b) << 1))
5836        ) {
5837         float_raise(float_flag_invalid, status);
5838         return 0;
5839     }
5840     aSign = extractFloatx80Sign( a );
5841     bSign = extractFloatx80Sign( b );
5842     if ( aSign != bSign ) {
5843         return
5844                aSign
5845             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5846                  == 0 );
5847     }
5848     return
5849           aSign ? le128( b.high, b.low, a.high, a.low )
5850         : le128( a.high, a.low, b.high, b.low );
5851 
5852 }
5853 
5854 /*----------------------------------------------------------------------------
5855 | Returns 1 if the extended double-precision floating-point value `a' is
5856 | less than the corresponding value `b', and 0 otherwise.  The invalid
5857 | exception is raised if either operand is a NaN.  The comparison is performed
5858 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5859 *----------------------------------------------------------------------------*/
5860 
5861 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5862 {
5863     flag aSign, bSign;
5864 
5865     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5866         || (extractFloatx80Exp(a) == 0x7FFF
5867             && (uint64_t) (extractFloatx80Frac(a) << 1))
5868         || (extractFloatx80Exp(b) == 0x7FFF
5869             && (uint64_t) (extractFloatx80Frac(b) << 1))
5870        ) {
5871         float_raise(float_flag_invalid, status);
5872         return 0;
5873     }
5874     aSign = extractFloatx80Sign( a );
5875     bSign = extractFloatx80Sign( b );
5876     if ( aSign != bSign ) {
5877         return
5878                aSign
5879             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5880                  != 0 );
5881     }
5882     return
5883           aSign ? lt128( b.high, b.low, a.high, a.low )
5884         : lt128( a.high, a.low, b.high, b.low );
5885 
5886 }
5887 
5888 /*----------------------------------------------------------------------------
5889 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5890 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5891 | either operand is a NaN.   The comparison is performed according to the
5892 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5893 *----------------------------------------------------------------------------*/
5894 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5895 {
5896     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5897         || (extractFloatx80Exp(a) == 0x7FFF
5898             && (uint64_t) (extractFloatx80Frac(a) << 1))
5899         || (extractFloatx80Exp(b) == 0x7FFF
5900             && (uint64_t) (extractFloatx80Frac(b) << 1))
5901        ) {
5902         float_raise(float_flag_invalid, status);
5903         return 1;
5904     }
5905     return 0;
5906 }
5907 
5908 /*----------------------------------------------------------------------------
5909 | Returns 1 if the extended double-precision floating-point value `a' is
5910 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5911 | cause an exception.  The comparison is performed according to the IEC/IEEE
5912 | Standard for Binary Floating-Point Arithmetic.
5913 *----------------------------------------------------------------------------*/
5914 
5915 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5916 {
5917 
5918     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5919         float_raise(float_flag_invalid, status);
5920         return 0;
5921     }
5922     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5923               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5924          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5925               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5926        ) {
5927         if (floatx80_is_signaling_nan(a, status)
5928          || floatx80_is_signaling_nan(b, status)) {
5929             float_raise(float_flag_invalid, status);
5930         }
5931         return 0;
5932     }
5933     return
5934            ( a.low == b.low )
5935         && (    ( a.high == b.high )
5936              || (    ( a.low == 0 )
5937                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5938            );
5939 
5940 }
5941 
5942 /*----------------------------------------------------------------------------
5943 | Returns 1 if the extended double-precision floating-point value `a' is less
5944 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5945 | do not cause an exception.  Otherwise, the comparison is performed according
5946 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5947 *----------------------------------------------------------------------------*/
5948 
5949 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5950 {
5951     flag aSign, bSign;
5952 
5953     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5954         float_raise(float_flag_invalid, status);
5955         return 0;
5956     }
5957     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5958               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5959          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5960               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5961        ) {
5962         if (floatx80_is_signaling_nan(a, status)
5963          || floatx80_is_signaling_nan(b, status)) {
5964             float_raise(float_flag_invalid, status);
5965         }
5966         return 0;
5967     }
5968     aSign = extractFloatx80Sign( a );
5969     bSign = extractFloatx80Sign( b );
5970     if ( aSign != bSign ) {
5971         return
5972                aSign
5973             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5974                  == 0 );
5975     }
5976     return
5977           aSign ? le128( b.high, b.low, a.high, a.low )
5978         : le128( a.high, a.low, b.high, b.low );
5979 
5980 }
5981 
5982 /*----------------------------------------------------------------------------
5983 | Returns 1 if the extended double-precision floating-point value `a' is less
5984 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5985 | an exception.  Otherwise, the comparison is performed according to the
5986 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5987 *----------------------------------------------------------------------------*/
5988 
5989 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5990 {
5991     flag aSign, bSign;
5992 
5993     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5994         float_raise(float_flag_invalid, status);
5995         return 0;
5996     }
5997     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5998               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5999          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6000               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6001        ) {
6002         if (floatx80_is_signaling_nan(a, status)
6003          || floatx80_is_signaling_nan(b, status)) {
6004             float_raise(float_flag_invalid, status);
6005         }
6006         return 0;
6007     }
6008     aSign = extractFloatx80Sign( a );
6009     bSign = extractFloatx80Sign( b );
6010     if ( aSign != bSign ) {
6011         return
6012                aSign
6013             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6014                  != 0 );
6015     }
6016     return
6017           aSign ? lt128( b.high, b.low, a.high, a.low )
6018         : lt128( a.high, a.low, b.high, b.low );
6019 
6020 }
6021 
6022 /*----------------------------------------------------------------------------
6023 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6024 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6025 | The comparison is performed according to the IEC/IEEE Standard for Binary
6026 | Floating-Point Arithmetic.
6027 *----------------------------------------------------------------------------*/
6028 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6029 {
6030     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6031         float_raise(float_flag_invalid, status);
6032         return 1;
6033     }
6034     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6035               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6036          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6037               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6038        ) {
6039         if (floatx80_is_signaling_nan(a, status)
6040          || floatx80_is_signaling_nan(b, status)) {
6041             float_raise(float_flag_invalid, status);
6042         }
6043         return 1;
6044     }
6045     return 0;
6046 }
6047 
6048 /*----------------------------------------------------------------------------
6049 | Returns the result of converting the quadruple-precision floating-point
6050 | value `a' to the 32-bit two's complement integer format.  The conversion
6051 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6052 | Arithmetic---which means in particular that the conversion is rounded
6053 | according to the current rounding mode.  If `a' is a NaN, the largest
6054 | positive integer is returned.  Otherwise, if the conversion overflows, the
6055 | largest integer with the same sign as `a' is returned.
6056 *----------------------------------------------------------------------------*/
6057 
6058 int32_t float128_to_int32(float128 a, float_status *status)
6059 {
6060     flag aSign;
6061     int32_t aExp, shiftCount;
6062     uint64_t aSig0, aSig1;
6063 
6064     aSig1 = extractFloat128Frac1( a );
6065     aSig0 = extractFloat128Frac0( a );
6066     aExp = extractFloat128Exp( a );
6067     aSign = extractFloat128Sign( a );
6068     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6069     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6070     aSig0 |= ( aSig1 != 0 );
6071     shiftCount = 0x4028 - aExp;
6072     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6073     return roundAndPackInt32(aSign, aSig0, status);
6074 
6075 }
6076 
6077 /*----------------------------------------------------------------------------
6078 | Returns the result of converting the quadruple-precision floating-point
6079 | value `a' to the 32-bit two's complement integer format.  The conversion
6080 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6081 | Arithmetic, except that the conversion is always rounded toward zero.  If
6082 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6083 | conversion overflows, the largest integer with the same sign as `a' is
6084 | returned.
6085 *----------------------------------------------------------------------------*/
6086 
6087 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6088 {
6089     flag aSign;
6090     int32_t aExp, shiftCount;
6091     uint64_t aSig0, aSig1, savedASig;
6092     int32_t z;
6093 
6094     aSig1 = extractFloat128Frac1( a );
6095     aSig0 = extractFloat128Frac0( a );
6096     aExp = extractFloat128Exp( a );
6097     aSign = extractFloat128Sign( a );
6098     aSig0 |= ( aSig1 != 0 );
6099     if ( 0x401E < aExp ) {
6100         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6101         goto invalid;
6102     }
6103     else if ( aExp < 0x3FFF ) {
6104         if (aExp || aSig0) {
6105             status->float_exception_flags |= float_flag_inexact;
6106         }
6107         return 0;
6108     }
6109     aSig0 |= LIT64( 0x0001000000000000 );
6110     shiftCount = 0x402F - aExp;
6111     savedASig = aSig0;
6112     aSig0 >>= shiftCount;
6113     z = aSig0;
6114     if ( aSign ) z = - z;
6115     if ( ( z < 0 ) ^ aSign ) {
6116  invalid:
6117         float_raise(float_flag_invalid, status);
6118         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6119     }
6120     if ( ( aSig0<<shiftCount ) != savedASig ) {
6121         status->float_exception_flags |= float_flag_inexact;
6122     }
6123     return z;
6124 
6125 }
6126 
6127 /*----------------------------------------------------------------------------
6128 | Returns the result of converting the quadruple-precision floating-point
6129 | value `a' to the 64-bit two's complement integer format.  The conversion
6130 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6131 | Arithmetic---which means in particular that the conversion is rounded
6132 | according to the current rounding mode.  If `a' is a NaN, the largest
6133 | positive integer is returned.  Otherwise, if the conversion overflows, the
6134 | largest integer with the same sign as `a' is returned.
6135 *----------------------------------------------------------------------------*/
6136 
6137 int64_t float128_to_int64(float128 a, float_status *status)
6138 {
6139     flag aSign;
6140     int32_t aExp, shiftCount;
6141     uint64_t aSig0, aSig1;
6142 
6143     aSig1 = extractFloat128Frac1( a );
6144     aSig0 = extractFloat128Frac0( a );
6145     aExp = extractFloat128Exp( a );
6146     aSign = extractFloat128Sign( a );
6147     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6148     shiftCount = 0x402F - aExp;
6149     if ( shiftCount <= 0 ) {
6150         if ( 0x403E < aExp ) {
6151             float_raise(float_flag_invalid, status);
6152             if (    ! aSign
6153                  || (    ( aExp == 0x7FFF )
6154                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6155                     )
6156                ) {
6157                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6158             }
6159             return (int64_t) LIT64( 0x8000000000000000 );
6160         }
6161         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6162     }
6163     else {
6164         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6165     }
6166     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6167 
6168 }
6169 
6170 /*----------------------------------------------------------------------------
6171 | Returns the result of converting the quadruple-precision floating-point
6172 | value `a' to the 64-bit two's complement integer format.  The conversion
6173 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6174 | Arithmetic, except that the conversion is always rounded toward zero.
6175 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6176 | the conversion overflows, the largest integer with the same sign as `a' is
6177 | returned.
6178 *----------------------------------------------------------------------------*/
6179 
6180 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6181 {
6182     flag aSign;
6183     int32_t aExp, shiftCount;
6184     uint64_t aSig0, aSig1;
6185     int64_t z;
6186 
6187     aSig1 = extractFloat128Frac1( a );
6188     aSig0 = extractFloat128Frac0( a );
6189     aExp = extractFloat128Exp( a );
6190     aSign = extractFloat128Sign( a );
6191     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6192     shiftCount = aExp - 0x402F;
6193     if ( 0 < shiftCount ) {
6194         if ( 0x403E <= aExp ) {
6195             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6196             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6197                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6198                 if (aSig1) {
6199                     status->float_exception_flags |= float_flag_inexact;
6200                 }
6201             }
6202             else {
6203                 float_raise(float_flag_invalid, status);
6204                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6205                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6206                 }
6207             }
6208             return (int64_t) LIT64( 0x8000000000000000 );
6209         }
6210         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6211         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6212             status->float_exception_flags |= float_flag_inexact;
6213         }
6214     }
6215     else {
6216         if ( aExp < 0x3FFF ) {
6217             if ( aExp | aSig0 | aSig1 ) {
6218                 status->float_exception_flags |= float_flag_inexact;
6219             }
6220             return 0;
6221         }
6222         z = aSig0>>( - shiftCount );
6223         if (    aSig1
6224              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6225             status->float_exception_flags |= float_flag_inexact;
6226         }
6227     }
6228     if ( aSign ) z = - z;
6229     return z;
6230 
6231 }
6232 
6233 /*----------------------------------------------------------------------------
6234 | Returns the result of converting the quadruple-precision floating-point value
6235 | `a' to the 64-bit unsigned integer format.  The conversion is
6236 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6237 | Arithmetic---which means in particular that the conversion is rounded
6238 | according to the current rounding mode.  If `a' is a NaN, the largest
6239 | positive integer is returned.  If the conversion overflows, the
6240 | largest unsigned integer is returned.  If 'a' is negative, the value is
6241 | rounded and zero is returned; negative values that do not round to zero
6242 | will raise the inexact exception.
6243 *----------------------------------------------------------------------------*/
6244 
6245 uint64_t float128_to_uint64(float128 a, float_status *status)
6246 {
6247     flag aSign;
6248     int aExp;
6249     int shiftCount;
6250     uint64_t aSig0, aSig1;
6251 
6252     aSig0 = extractFloat128Frac0(a);
6253     aSig1 = extractFloat128Frac1(a);
6254     aExp = extractFloat128Exp(a);
6255     aSign = extractFloat128Sign(a);
6256     if (aSign && (aExp > 0x3FFE)) {
6257         float_raise(float_flag_invalid, status);
6258         if (float128_is_any_nan(a)) {
6259             return LIT64(0xFFFFFFFFFFFFFFFF);
6260         } else {
6261             return 0;
6262         }
6263     }
6264     if (aExp) {
6265         aSig0 |= LIT64(0x0001000000000000);
6266     }
6267     shiftCount = 0x402F - aExp;
6268     if (shiftCount <= 0) {
6269         if (0x403E < aExp) {
6270             float_raise(float_flag_invalid, status);
6271             return LIT64(0xFFFFFFFFFFFFFFFF);
6272         }
6273         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6274     } else {
6275         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6276     }
6277     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6278 }
6279 
6280 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6281 {
6282     uint64_t v;
6283     signed char current_rounding_mode = status->float_rounding_mode;
6284 
6285     set_float_rounding_mode(float_round_to_zero, status);
6286     v = float128_to_uint64(a, status);
6287     set_float_rounding_mode(current_rounding_mode, status);
6288 
6289     return v;
6290 }
6291 
6292 /*----------------------------------------------------------------------------
6293 | Returns the result of converting the quadruple-precision floating-point
6294 | value `a' to the 32-bit unsigned integer format.  The conversion
6295 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6296 | Arithmetic except that the conversion is always rounded toward zero.
6297 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6298 | if the conversion overflows, the largest unsigned integer is returned.
| If `a' is negative, the value is rounded and zero is returned; negative
| values that do not round to zero will raise the invalid exception.
6301 *----------------------------------------------------------------------------*/
6302 
6303 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6304 {
6305     uint64_t v;
6306     uint32_t res;
6307     int old_exc_flags = get_float_exception_flags(status);
6308 
6309     v = float128_to_uint64_round_to_zero(a, status);
6310     if (v > 0xffffffff) {
6311         res = 0xffffffff;
6312     } else {
6313         return v;
6314     }
6315     set_float_exception_flags(old_exc_flags, status);
6316     float_raise(float_flag_invalid, status);
6317     return res;
6318 }
6319 
6320 /*----------------------------------------------------------------------------
6321 | Returns the result of converting the quadruple-precision floating-point
6322 | value `a' to the single-precision floating-point format.  The conversion
6323 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6324 | Arithmetic.
6325 *----------------------------------------------------------------------------*/
6326 
6327 float32 float128_to_float32(float128 a, float_status *status)
6328 {
6329     flag aSign;
6330     int32_t aExp;
6331     uint64_t aSig0, aSig1;
6332     uint32_t zSig;
6333 
6334     aSig1 = extractFloat128Frac1( a );
6335     aSig0 = extractFloat128Frac0( a );
6336     aExp = extractFloat128Exp( a );
6337     aSign = extractFloat128Sign( a );
6338     if ( aExp == 0x7FFF ) {
6339         if ( aSig0 | aSig1 ) {
6340             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6341         }
6342         return packFloat32( aSign, 0xFF, 0 );
6343     }
6344     aSig0 |= ( aSig1 != 0 );
6345     shift64RightJamming( aSig0, 18, &aSig0 );
6346     zSig = aSig0;
6347     if ( aExp || zSig ) {
6348         zSig |= 0x40000000;
6349         aExp -= 0x3F81;
6350     }
6351     return roundAndPackFloat32(aSign, aExp, zSig, status);
6352 
6353 }
6354 
6355 /*----------------------------------------------------------------------------
6356 | Returns the result of converting the quadruple-precision floating-point
6357 | value `a' to the double-precision floating-point format.  The conversion
6358 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6359 | Arithmetic.
6360 *----------------------------------------------------------------------------*/
6361 
6362 float64 float128_to_float64(float128 a, float_status *status)
6363 {
6364     flag aSign;
6365     int32_t aExp;
6366     uint64_t aSig0, aSig1;
6367 
6368     aSig1 = extractFloat128Frac1( a );
6369     aSig0 = extractFloat128Frac0( a );
6370     aExp = extractFloat128Exp( a );
6371     aSign = extractFloat128Sign( a );
6372     if ( aExp == 0x7FFF ) {
6373         if ( aSig0 | aSig1 ) {
6374             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6375         }
6376         return packFloat64( aSign, 0x7FF, 0 );
6377     }
6378     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6379     aSig0 |= ( aSig1 != 0 );
6380     if ( aExp || aSig0 ) {
6381         aSig0 |= LIT64( 0x4000000000000000 );
6382         aExp -= 0x3C01;
6383     }
6384     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6385 
6386 }
6387 
6388 /*----------------------------------------------------------------------------
6389 | Returns the result of converting the quadruple-precision floating-point
6390 | value `a' to the extended double-precision floating-point format.  The
6391 | conversion is performed according to the IEC/IEEE Standard for Binary
6392 | Floating-Point Arithmetic.
6393 *----------------------------------------------------------------------------*/
6394 
6395 floatx80 float128_to_floatx80(float128 a, float_status *status)
6396 {
6397     flag aSign;
6398     int32_t aExp;
6399     uint64_t aSig0, aSig1;
6400 
6401     aSig1 = extractFloat128Frac1( a );
6402     aSig0 = extractFloat128Frac0( a );
6403     aExp = extractFloat128Exp( a );
6404     aSign = extractFloat128Sign( a );
6405     if ( aExp == 0x7FFF ) {
6406         if ( aSig0 | aSig1 ) {
6407             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6408         }
6409         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6410     }
6411     if ( aExp == 0 ) {
6412         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6413         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6414     }
6415     else {
6416         aSig0 |= LIT64( 0x0001000000000000 );
6417     }
6418     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6419     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6420 
6421 }
6422 
6423 /*----------------------------------------------------------------------------
6424 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6425 | returns the result as a quadruple-precision floating-point value.  The
6426 | operation is performed according to the IEC/IEEE Standard for Binary
6427 | Floating-Point Arithmetic.
6428 *----------------------------------------------------------------------------*/
6429 
6430 float128 float128_round_to_int(float128 a, float_status *status)
6431 {
6432     flag aSign;
6433     int32_t aExp;
6434     uint64_t lastBitMask, roundBitsMask;
6435     float128 z;
6436 
6437     aExp = extractFloat128Exp( a );
6438     if ( 0x402F <= aExp ) {
6439         if ( 0x406F <= aExp ) {
6440             if (    ( aExp == 0x7FFF )
6441                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6442                ) {
6443                 return propagateFloat128NaN(a, a, status);
6444             }
6445             return a;
6446         }
6447         lastBitMask = 1;
6448         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6449         roundBitsMask = lastBitMask - 1;
6450         z = a;
6451         switch (status->float_rounding_mode) {
6452         case float_round_nearest_even:
6453             if ( lastBitMask ) {
6454                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6455                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6456             }
6457             else {
6458                 if ( (int64_t) z.low < 0 ) {
6459                     ++z.high;
6460                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6461                 }
6462             }
6463             break;
6464         case float_round_ties_away:
6465             if (lastBitMask) {
6466                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6467             } else {
6468                 if ((int64_t) z.low < 0) {
6469                     ++z.high;
6470                 }
6471             }
6472             break;
6473         case float_round_to_zero:
6474             break;
6475         case float_round_up:
6476             if (!extractFloat128Sign(z)) {
6477                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6478             }
6479             break;
6480         case float_round_down:
6481             if (extractFloat128Sign(z)) {
6482                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6483             }
6484             break;
6485         default:
6486             abort();
6487         }
6488         z.low &= ~ roundBitsMask;
6489     }
6490     else {
6491         if ( aExp < 0x3FFF ) {
6492             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6493             status->float_exception_flags |= float_flag_inexact;
6494             aSign = extractFloat128Sign( a );
6495             switch (status->float_rounding_mode) {
6496              case float_round_nearest_even:
6497                 if (    ( aExp == 0x3FFE )
6498                      && (   extractFloat128Frac0( a )
6499                           | extractFloat128Frac1( a ) )
6500                    ) {
6501                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6502                 }
6503                 break;
6504             case float_round_ties_away:
6505                 if (aExp == 0x3FFE) {
6506                     return packFloat128(aSign, 0x3FFF, 0, 0);
6507                 }
6508                 break;
6509              case float_round_down:
6510                 return
6511                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6512                     : packFloat128( 0, 0, 0, 0 );
6513              case float_round_up:
6514                 return
6515                       aSign ? packFloat128( 1, 0, 0, 0 )
6516                     : packFloat128( 0, 0x3FFF, 0, 0 );
6517             }
6518             return packFloat128( aSign, 0, 0, 0 );
6519         }
6520         lastBitMask = 1;
6521         lastBitMask <<= 0x402F - aExp;
6522         roundBitsMask = lastBitMask - 1;
6523         z.low = 0;
6524         z.high = a.high;
6525         switch (status->float_rounding_mode) {
6526         case float_round_nearest_even:
6527             z.high += lastBitMask>>1;
6528             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6529                 z.high &= ~ lastBitMask;
6530             }
6531             break;
6532         case float_round_ties_away:
6533             z.high += lastBitMask>>1;
6534             break;
6535         case float_round_to_zero:
6536             break;
6537         case float_round_up:
6538             if (!extractFloat128Sign(z)) {
6539                 z.high |= ( a.low != 0 );
6540                 z.high += roundBitsMask;
6541             }
6542             break;
6543         case float_round_down:
6544             if (extractFloat128Sign(z)) {
6545                 z.high |= (a.low != 0);
6546                 z.high += roundBitsMask;
6547             }
6548             break;
6549         default:
6550             abort();
6551         }
6552         z.high &= ~ roundBitsMask;
6553     }
6554     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6555         status->float_exception_flags |= float_flag_inexact;
6556     }
6557     return z;
6558 
6559 }
6560 
6561 /*----------------------------------------------------------------------------
6562 | Returns the result of adding the absolute values of the quadruple-precision
6563 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6564 | before being returned.  `zSign' is ignored if the result is a NaN.
6565 | The addition is performed according to the IEC/IEEE Standard for Binary
6566 | Floating-Point Arithmetic.
6567 *----------------------------------------------------------------------------*/
6568 
6569 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6570                                 float_status *status)
6571 {
6572     int32_t aExp, bExp, zExp;
6573     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6574     int32_t expDiff;
6575 
6576     aSig1 = extractFloat128Frac1( a );
6577     aSig0 = extractFloat128Frac0( a );
6578     aExp = extractFloat128Exp( a );
6579     bSig1 = extractFloat128Frac1( b );
6580     bSig0 = extractFloat128Frac0( b );
6581     bExp = extractFloat128Exp( b );
6582     expDiff = aExp - bExp;
6583     if ( 0 < expDiff ) {
6584         if ( aExp == 0x7FFF ) {
6585             if (aSig0 | aSig1) {
6586                 return propagateFloat128NaN(a, b, status);
6587             }
6588             return a;
6589         }
6590         if ( bExp == 0 ) {
6591             --expDiff;
6592         }
6593         else {
6594             bSig0 |= LIT64( 0x0001000000000000 );
6595         }
6596         shift128ExtraRightJamming(
6597             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6598         zExp = aExp;
6599     }
6600     else if ( expDiff < 0 ) {
6601         if ( bExp == 0x7FFF ) {
6602             if (bSig0 | bSig1) {
6603                 return propagateFloat128NaN(a, b, status);
6604             }
6605             return packFloat128( zSign, 0x7FFF, 0, 0 );
6606         }
6607         if ( aExp == 0 ) {
6608             ++expDiff;
6609         }
6610         else {
6611             aSig0 |= LIT64( 0x0001000000000000 );
6612         }
6613         shift128ExtraRightJamming(
6614             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6615         zExp = bExp;
6616     }
6617     else {
6618         if ( aExp == 0x7FFF ) {
6619             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6620                 return propagateFloat128NaN(a, b, status);
6621             }
6622             return a;
6623         }
6624         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6625         if ( aExp == 0 ) {
6626             if (status->flush_to_zero) {
6627                 if (zSig0 | zSig1) {
6628                     float_raise(float_flag_output_denormal, status);
6629                 }
6630                 return packFloat128(zSign, 0, 0, 0);
6631             }
6632             return packFloat128( zSign, 0, zSig0, zSig1 );
6633         }
6634         zSig2 = 0;
6635         zSig0 |= LIT64( 0x0002000000000000 );
6636         zExp = aExp;
6637         goto shiftRight1;
6638     }
6639     aSig0 |= LIT64( 0x0001000000000000 );
6640     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6641     --zExp;
6642     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6643     ++zExp;
6644  shiftRight1:
6645     shift128ExtraRightJamming(
6646         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6647  roundAndPack:
6648     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6649 
6650 }
6651 
6652 /*----------------------------------------------------------------------------
6653 | Returns the result of subtracting the absolute values of the quadruple-
6654 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6655 | difference is negated before being returned.  `zSign' is ignored if the
6656 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6657 | Standard for Binary Floating-Point Arithmetic.
6658 *----------------------------------------------------------------------------*/
6659 
6660 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6661                                 float_status *status)
6662 {
6663     int32_t aExp, bExp, zExp;
6664     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6665     int32_t expDiff;
6666 
6667     aSig1 = extractFloat128Frac1( a );
6668     aSig0 = extractFloat128Frac0( a );
6669     aExp = extractFloat128Exp( a );
6670     bSig1 = extractFloat128Frac1( b );
6671     bSig0 = extractFloat128Frac0( b );
6672     bExp = extractFloat128Exp( b );
6673     expDiff = aExp - bExp;
6674     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6675     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6676     if ( 0 < expDiff ) goto aExpBigger;
6677     if ( expDiff < 0 ) goto bExpBigger;
6678     if ( aExp == 0x7FFF ) {
6679         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6680             return propagateFloat128NaN(a, b, status);
6681         }
6682         float_raise(float_flag_invalid, status);
6683         return float128_default_nan(status);
6684     }
6685     if ( aExp == 0 ) {
6686         aExp = 1;
6687         bExp = 1;
6688     }
6689     if ( bSig0 < aSig0 ) goto aBigger;
6690     if ( aSig0 < bSig0 ) goto bBigger;
6691     if ( bSig1 < aSig1 ) goto aBigger;
6692     if ( aSig1 < bSig1 ) goto bBigger;
6693     return packFloat128(status->float_rounding_mode == float_round_down,
6694                         0, 0, 0);
6695  bExpBigger:
6696     if ( bExp == 0x7FFF ) {
6697         if (bSig0 | bSig1) {
6698             return propagateFloat128NaN(a, b, status);
6699         }
6700         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6701     }
6702     if ( aExp == 0 ) {
6703         ++expDiff;
6704     }
6705     else {
6706         aSig0 |= LIT64( 0x4000000000000000 );
6707     }
6708     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6709     bSig0 |= LIT64( 0x4000000000000000 );
6710  bBigger:
6711     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6712     zExp = bExp;
6713     zSign ^= 1;
6714     goto normalizeRoundAndPack;
6715  aExpBigger:
6716     if ( aExp == 0x7FFF ) {
6717         if (aSig0 | aSig1) {
6718             return propagateFloat128NaN(a, b, status);
6719         }
6720         return a;
6721     }
6722     if ( bExp == 0 ) {
6723         --expDiff;
6724     }
6725     else {
6726         bSig0 |= LIT64( 0x4000000000000000 );
6727     }
6728     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6729     aSig0 |= LIT64( 0x4000000000000000 );
6730  aBigger:
6731     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6732     zExp = aExp;
6733  normalizeRoundAndPack:
6734     --zExp;
6735     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6736                                          status);
6737 
6738 }
6739 
6740 /*----------------------------------------------------------------------------
6741 | Returns the result of adding the quadruple-precision floating-point values
6742 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6743 | for Binary Floating-Point Arithmetic.
6744 *----------------------------------------------------------------------------*/
6745 
6746 float128 float128_add(float128 a, float128 b, float_status *status)
6747 {
6748     flag aSign, bSign;
6749 
6750     aSign = extractFloat128Sign( a );
6751     bSign = extractFloat128Sign( b );
6752     if ( aSign == bSign ) {
6753         return addFloat128Sigs(a, b, aSign, status);
6754     }
6755     else {
6756         return subFloat128Sigs(a, b, aSign, status);
6757     }
6758 
6759 }
6760 
6761 /*----------------------------------------------------------------------------
6762 | Returns the result of subtracting the quadruple-precision floating-point
6763 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6764 | Standard for Binary Floating-Point Arithmetic.
6765 *----------------------------------------------------------------------------*/
6766 
6767 float128 float128_sub(float128 a, float128 b, float_status *status)
6768 {
6769     flag aSign, bSign;
6770 
6771     aSign = extractFloat128Sign( a );
6772     bSign = extractFloat128Sign( b );
6773     if ( aSign == bSign ) {
6774         return subFloat128Sigs(a, b, aSign, status);
6775     }
6776     else {
6777         return addFloat128Sigs(a, b, aSign, status);
6778     }
6779 
6780 }
6781 
6782 /*----------------------------------------------------------------------------
6783 | Returns the result of multiplying the quadruple-precision floating-point
6784 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6785 | Standard for Binary Floating-Point Arithmetic.
6786 *----------------------------------------------------------------------------*/
6787 
6788 float128 float128_mul(float128 a, float128 b, float_status *status)
6789 {
6790     flag aSign, bSign, zSign;
6791     int32_t aExp, bExp, zExp;
6792     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6793 
6794     aSig1 = extractFloat128Frac1( a );
6795     aSig0 = extractFloat128Frac0( a );
6796     aExp = extractFloat128Exp( a );
6797     aSign = extractFloat128Sign( a );
6798     bSig1 = extractFloat128Frac1( b );
6799     bSig0 = extractFloat128Frac0( b );
6800     bExp = extractFloat128Exp( b );
6801     bSign = extractFloat128Sign( b );
6802     zSign = aSign ^ bSign;
6803     if ( aExp == 0x7FFF ) {
6804         if (    ( aSig0 | aSig1 )
6805              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6806             return propagateFloat128NaN(a, b, status);
6807         }
6808         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6809         return packFloat128( zSign, 0x7FFF, 0, 0 );
6810     }
6811     if ( bExp == 0x7FFF ) {
6812         if (bSig0 | bSig1) {
6813             return propagateFloat128NaN(a, b, status);
6814         }
6815         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6816  invalid:
6817             float_raise(float_flag_invalid, status);
6818             return float128_default_nan(status);
6819         }
6820         return packFloat128( zSign, 0x7FFF, 0, 0 );
6821     }
6822     if ( aExp == 0 ) {
6823         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6824         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6825     }
6826     if ( bExp == 0 ) {
6827         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6828         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6829     }
6830     zExp = aExp + bExp - 0x4000;
6831     aSig0 |= LIT64( 0x0001000000000000 );
6832     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6833     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6834     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6835     zSig2 |= ( zSig3 != 0 );
6836     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6837         shift128ExtraRightJamming(
6838             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6839         ++zExp;
6840     }
6841     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6842 
6843 }
6844 
6845 /*----------------------------------------------------------------------------
6846 | Returns the result of dividing the quadruple-precision floating-point value
6847 | `a' by the corresponding value `b'.  The operation is performed according to
6848 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849 *----------------------------------------------------------------------------*/
6850 
6851 float128 float128_div(float128 a, float128 b, float_status *status)
6852 {
6853     flag aSign, bSign, zSign;
6854     int32_t aExp, bExp, zExp;
6855     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6856     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6857 
6858     aSig1 = extractFloat128Frac1( a );
6859     aSig0 = extractFloat128Frac0( a );
6860     aExp = extractFloat128Exp( a );
6861     aSign = extractFloat128Sign( a );
6862     bSig1 = extractFloat128Frac1( b );
6863     bSig0 = extractFloat128Frac0( b );
6864     bExp = extractFloat128Exp( b );
6865     bSign = extractFloat128Sign( b );
6866     zSign = aSign ^ bSign;
6867     if ( aExp == 0x7FFF ) {
6868         if (aSig0 | aSig1) {
6869             return propagateFloat128NaN(a, b, status);
6870         }
6871         if ( bExp == 0x7FFF ) {
6872             if (bSig0 | bSig1) {
6873                 return propagateFloat128NaN(a, b, status);
6874             }
6875             goto invalid;
6876         }
6877         return packFloat128( zSign, 0x7FFF, 0, 0 );
6878     }
6879     if ( bExp == 0x7FFF ) {
6880         if (bSig0 | bSig1) {
6881             return propagateFloat128NaN(a, b, status);
6882         }
6883         return packFloat128( zSign, 0, 0, 0 );
6884     }
6885     if ( bExp == 0 ) {
6886         if ( ( bSig0 | bSig1 ) == 0 ) {
6887             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6888  invalid:
6889                 float_raise(float_flag_invalid, status);
6890                 return float128_default_nan(status);
6891             }
6892             float_raise(float_flag_divbyzero, status);
6893             return packFloat128( zSign, 0x7FFF, 0, 0 );
6894         }
6895         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6896     }
6897     if ( aExp == 0 ) {
6898         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6899         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6900     }
6901     zExp = aExp - bExp + 0x3FFD;
6902     shortShift128Left(
6903         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6904     shortShift128Left(
6905         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6906     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6907         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6908         ++zExp;
6909     }
6910     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6911     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6912     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6913     while ( (int64_t) rem0 < 0 ) {
6914         --zSig0;
6915         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6916     }
6917     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6918     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6919         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6920         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6921         while ( (int64_t) rem1 < 0 ) {
6922             --zSig1;
6923             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6924         }
6925         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6926     }
6927     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6928     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6929 
6930 }
6931 
6932 /*----------------------------------------------------------------------------
6933 | Returns the remainder of the quadruple-precision floating-point value `a'
6934 | with respect to the corresponding value `b'.  The operation is performed
6935 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6936 *----------------------------------------------------------------------------*/
6937 
6938 float128 float128_rem(float128 a, float128 b, float_status *status)
6939 {
6940     flag aSign, zSign;
6941     int32_t aExp, bExp, expDiff;
6942     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6943     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6944     int64_t sigMean0;
6945 
6946     aSig1 = extractFloat128Frac1( a );
6947     aSig0 = extractFloat128Frac0( a );
6948     aExp = extractFloat128Exp( a );
6949     aSign = extractFloat128Sign( a );
6950     bSig1 = extractFloat128Frac1( b );
6951     bSig0 = extractFloat128Frac0( b );
6952     bExp = extractFloat128Exp( b );
6953     if ( aExp == 0x7FFF ) {
6954         if (    ( aSig0 | aSig1 )
6955              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6956             return propagateFloat128NaN(a, b, status);
6957         }
6958         goto invalid;
6959     }
6960     if ( bExp == 0x7FFF ) {
6961         if (bSig0 | bSig1) {
6962             return propagateFloat128NaN(a, b, status);
6963         }
6964         return a;
6965     }
6966     if ( bExp == 0 ) {
6967         if ( ( bSig0 | bSig1 ) == 0 ) {
6968  invalid:
6969             float_raise(float_flag_invalid, status);
6970             return float128_default_nan(status);
6971         }
6972         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6973     }
6974     if ( aExp == 0 ) {
6975         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6976         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6977     }
6978     expDiff = aExp - bExp;
6979     if ( expDiff < -1 ) return a;
6980     shortShift128Left(
6981         aSig0 | LIT64( 0x0001000000000000 ),
6982         aSig1,
6983         15 - ( expDiff < 0 ),
6984         &aSig0,
6985         &aSig1
6986     );
6987     shortShift128Left(
6988         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6989     q = le128( bSig0, bSig1, aSig0, aSig1 );
6990     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6991     expDiff -= 64;
6992     while ( 0 < expDiff ) {
6993         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6994         q = ( 4 < q ) ? q - 4 : 0;
6995         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6996         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6997         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6998         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6999         expDiff -= 61;
7000     }
7001     if ( -64 < expDiff ) {
7002         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7003         q = ( 4 < q ) ? q - 4 : 0;
7004         q >>= - expDiff;
7005         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7006         expDiff += 52;
7007         if ( expDiff < 0 ) {
7008             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7009         }
7010         else {
7011             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7012         }
7013         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7014         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7015     }
7016     else {
7017         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7018         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7019     }
7020     do {
7021         alternateASig0 = aSig0;
7022         alternateASig1 = aSig1;
7023         ++q;
7024         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7025     } while ( 0 <= (int64_t) aSig0 );
7026     add128(
7027         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7028     if (    ( sigMean0 < 0 )
7029          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7030         aSig0 = alternateASig0;
7031         aSig1 = alternateASig1;
7032     }
7033     zSign = ( (int64_t) aSig0 < 0 );
7034     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7035     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7036                                          status);
7037 }
7038 
7039 /*----------------------------------------------------------------------------
7040 | Returns the square root of the quadruple-precision floating-point value `a'.
7041 | The operation is performed according to the IEC/IEEE Standard for Binary
7042 | Floating-Point Arithmetic.
7043 *----------------------------------------------------------------------------*/
7044 
7045 float128 float128_sqrt(float128 a, float_status *status)
7046 {
7047     flag aSign;
7048     int32_t aExp, zExp;
7049     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7050     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7051 
7052     aSig1 = extractFloat128Frac1( a );
7053     aSig0 = extractFloat128Frac0( a );
7054     aExp = extractFloat128Exp( a );
7055     aSign = extractFloat128Sign( a );
7056     if ( aExp == 0x7FFF ) {
7057         if (aSig0 | aSig1) {
7058             return propagateFloat128NaN(a, a, status);
7059         }
7060         if ( ! aSign ) return a;
7061         goto invalid;
7062     }
7063     if ( aSign ) {
7064         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7065  invalid:
7066         float_raise(float_flag_invalid, status);
7067         return float128_default_nan(status);
7068     }
7069     if ( aExp == 0 ) {
7070         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7071         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7072     }
7073     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7074     aSig0 |= LIT64( 0x0001000000000000 );
7075     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7076     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7077     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7078     doubleZSig0 = zSig0<<1;
7079     mul64To128( zSig0, zSig0, &term0, &term1 );
7080     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7081     while ( (int64_t) rem0 < 0 ) {
7082         --zSig0;
7083         doubleZSig0 -= 2;
7084         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7085     }
7086     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7087     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7088         if ( zSig1 == 0 ) zSig1 = 1;
7089         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7090         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7091         mul64To128( zSig1, zSig1, &term2, &term3 );
7092         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7093         while ( (int64_t) rem1 < 0 ) {
7094             --zSig1;
7095             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7096             term3 |= 1;
7097             term2 |= doubleZSig0;
7098             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7099         }
7100         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7101     }
7102     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7103     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7104 
7105 }
7106 
7107 /*----------------------------------------------------------------------------
7108 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7109 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7110 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7111 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7112 *----------------------------------------------------------------------------*/
7113 
7114 int float128_eq(float128 a, float128 b, float_status *status)
7115 {
7116 
7117     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7118               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7119          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7120               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7121        ) {
7122         float_raise(float_flag_invalid, status);
7123         return 0;
7124     }
7125     return
7126            ( a.low == b.low )
7127         && (    ( a.high == b.high )
7128              || (    ( a.low == 0 )
7129                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7130            );
7131 
7132 }
7133 
7134 /*----------------------------------------------------------------------------
7135 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7136 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7137 | exception is raised if either operand is a NaN.  The comparison is performed
7138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7139 *----------------------------------------------------------------------------*/
7140 
7141 int float128_le(float128 a, float128 b, float_status *status)
7142 {
7143     flag aSign, bSign;
7144 
7145     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7146               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7147          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7148               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7149        ) {
7150         float_raise(float_flag_invalid, status);
7151         return 0;
7152     }
7153     aSign = extractFloat128Sign( a );
7154     bSign = extractFloat128Sign( b );
7155     if ( aSign != bSign ) {
7156         return
7157                aSign
7158             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7159                  == 0 );
7160     }
7161     return
7162           aSign ? le128( b.high, b.low, a.high, a.low )
7163         : le128( a.high, a.low, b.high, b.low );
7164 
7165 }
7166 
7167 /*----------------------------------------------------------------------------
7168 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7169 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7170 | raised if either operand is a NaN.  The comparison is performed according
7171 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7172 *----------------------------------------------------------------------------*/
7173 
7174 int float128_lt(float128 a, float128 b, float_status *status)
7175 {
7176     flag aSign, bSign;
7177 
7178     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7179               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7180          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7181               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7182        ) {
7183         float_raise(float_flag_invalid, status);
7184         return 0;
7185     }
7186     aSign = extractFloat128Sign( a );
7187     bSign = extractFloat128Sign( b );
7188     if ( aSign != bSign ) {
7189         return
7190                aSign
7191             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7192                  != 0 );
7193     }
7194     return
7195           aSign ? lt128( b.high, b.low, a.high, a.low )
7196         : lt128( a.high, a.low, b.high, b.low );
7197 
7198 }
7199 
7200 /*----------------------------------------------------------------------------
7201 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7202 | be compared, and 0 otherwise.  The invalid exception is raised if either
7203 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7204 | Standard for Binary Floating-Point Arithmetic.
7205 *----------------------------------------------------------------------------*/
7206 
7207 int float128_unordered(float128 a, float128 b, float_status *status)
7208 {
7209     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7210               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7211          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7212               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7213        ) {
7214         float_raise(float_flag_invalid, status);
7215         return 1;
7216     }
7217     return 0;
7218 }
7219 
7220 /*----------------------------------------------------------------------------
7221 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7222 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7223 | exception.  The comparison is performed according to the IEC/IEEE Standard
7224 | for Binary Floating-Point Arithmetic.
7225 *----------------------------------------------------------------------------*/
7226 
7227 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7228 {
7229 
7230     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7231               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7232          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7233               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7234        ) {
7235         if (float128_is_signaling_nan(a, status)
7236          || float128_is_signaling_nan(b, status)) {
7237             float_raise(float_flag_invalid, status);
7238         }
7239         return 0;
7240     }
7241     return
7242            ( a.low == b.low )
7243         && (    ( a.high == b.high )
7244              || (    ( a.low == 0 )
7245                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7246            );
7247 
7248 }
7249 
7250 /*----------------------------------------------------------------------------
7251 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7252 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7253 | cause an exception.  Otherwise, the comparison is performed according to the
7254 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7255 *----------------------------------------------------------------------------*/
7256 
7257 int float128_le_quiet(float128 a, float128 b, float_status *status)
7258 {
7259     flag aSign, bSign;
7260 
7261     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7262               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7263          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7264               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7265        ) {
7266         if (float128_is_signaling_nan(a, status)
7267          || float128_is_signaling_nan(b, status)) {
7268             float_raise(float_flag_invalid, status);
7269         }
7270         return 0;
7271     }
7272     aSign = extractFloat128Sign( a );
7273     bSign = extractFloat128Sign( b );
7274     if ( aSign != bSign ) {
7275         return
7276                aSign
7277             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7278                  == 0 );
7279     }
7280     return
7281           aSign ? le128( b.high, b.low, a.high, a.low )
7282         : le128( a.high, a.low, b.high, b.low );
7283 
7284 }
7285 
7286 /*----------------------------------------------------------------------------
7287 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7288 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7289 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7290 | Standard for Binary Floating-Point Arithmetic.
7291 *----------------------------------------------------------------------------*/
7292 
7293 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7294 {
7295     flag aSign, bSign;
7296 
7297     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7298               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7299          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7300               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7301        ) {
7302         if (float128_is_signaling_nan(a, status)
7303          || float128_is_signaling_nan(b, status)) {
7304             float_raise(float_flag_invalid, status);
7305         }
7306         return 0;
7307     }
7308     aSign = extractFloat128Sign( a );
7309     bSign = extractFloat128Sign( b );
7310     if ( aSign != bSign ) {
7311         return
7312                aSign
7313             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7314                  != 0 );
7315     }
7316     return
7317           aSign ? lt128( b.high, b.low, a.high, a.low )
7318         : lt128( a.high, a.low, b.high, b.low );
7319 
7320 }
7321 
7322 /*----------------------------------------------------------------------------
7323 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7324 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7325 | comparison is performed according to the IEC/IEEE Standard for Binary
7326 | Floating-Point Arithmetic.
7327 *----------------------------------------------------------------------------*/
7328 
7329 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7330 {
7331     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7332               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7333          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7334               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7335        ) {
7336         if (float128_is_signaling_nan(a, status)
7337          || float128_is_signaling_nan(b, status)) {
7338             float_raise(float_flag_invalid, status);
7339         }
7340         return 1;
7341     }
7342     return 0;
7343 }
7344 
7345 /* misc functions */
7346 float32 uint32_to_float32(uint32_t a, float_status *status)
7347 {
7348     return int64_to_float32(a, status);
7349 }
7350 
7351 float64 uint32_to_float64(uint32_t a, float_status *status)
7352 {
7353     return int64_to_float64(a, status);
7354 }
7355 
7356 uint32_t float32_to_uint32(float32 a, float_status *status)
7357 {
7358     int64_t v;
7359     uint32_t res;
7360     int old_exc_flags = get_float_exception_flags(status);
7361 
7362     v = float32_to_int64(a, status);
7363     if (v < 0) {
7364         res = 0;
7365     } else if (v > 0xffffffff) {
7366         res = 0xffffffff;
7367     } else {
7368         return v;
7369     }
7370     set_float_exception_flags(old_exc_flags, status);
7371     float_raise(float_flag_invalid, status);
7372     return res;
7373 }
7374 
7375 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7376 {
7377     int64_t v;
7378     uint32_t res;
7379     int old_exc_flags = get_float_exception_flags(status);
7380 
7381     v = float32_to_int64_round_to_zero(a, status);
7382     if (v < 0) {
7383         res = 0;
7384     } else if (v > 0xffffffff) {
7385         res = 0xffffffff;
7386     } else {
7387         return v;
7388     }
7389     set_float_exception_flags(old_exc_flags, status);
7390     float_raise(float_flag_invalid, status);
7391     return res;
7392 }
7393 
7394 int16_t float32_to_int16(float32 a, float_status *status)
7395 {
7396     int32_t v;
7397     int16_t res;
7398     int old_exc_flags = get_float_exception_flags(status);
7399 
7400     v = float32_to_int32(a, status);
7401     if (v < -0x8000) {
7402         res = -0x8000;
7403     } else if (v > 0x7fff) {
7404         res = 0x7fff;
7405     } else {
7406         return v;
7407     }
7408 
7409     set_float_exception_flags(old_exc_flags, status);
7410     float_raise(float_flag_invalid, status);
7411     return res;
7412 }
7413 
7414 uint16_t float32_to_uint16(float32 a, float_status *status)
7415 {
7416     int32_t v;
7417     uint16_t res;
7418     int old_exc_flags = get_float_exception_flags(status);
7419 
7420     v = float32_to_int32(a, status);
7421     if (v < 0) {
7422         res = 0;
7423     } else if (v > 0xffff) {
7424         res = 0xffff;
7425     } else {
7426         return v;
7427     }
7428 
7429     set_float_exception_flags(old_exc_flags, status);
7430     float_raise(float_flag_invalid, status);
7431     return res;
7432 }
7433 
7434 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7435 {
7436     int64_t v;
7437     uint16_t res;
7438     int old_exc_flags = get_float_exception_flags(status);
7439 
7440     v = float32_to_int64_round_to_zero(a, status);
7441     if (v < 0) {
7442         res = 0;
7443     } else if (v > 0xffff) {
7444         res = 0xffff;
7445     } else {
7446         return v;
7447     }
7448     set_float_exception_flags(old_exc_flags, status);
7449     float_raise(float_flag_invalid, status);
7450     return res;
7451 }
7452 
7453 uint32_t float64_to_uint32(float64 a, float_status *status)
7454 {
7455     uint64_t v;
7456     uint32_t res;
7457     int old_exc_flags = get_float_exception_flags(status);
7458 
7459     v = float64_to_uint64(a, status);
7460     if (v > 0xffffffff) {
7461         res = 0xffffffff;
7462     } else {
7463         return v;
7464     }
7465     set_float_exception_flags(old_exc_flags, status);
7466     float_raise(float_flag_invalid, status);
7467     return res;
7468 }
7469 
7470 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7471 {
7472     uint64_t v;
7473     uint32_t res;
7474     int old_exc_flags = get_float_exception_flags(status);
7475 
7476     v = float64_to_uint64_round_to_zero(a, status);
7477     if (v > 0xffffffff) {
7478         res = 0xffffffff;
7479     } else {
7480         return v;
7481     }
7482     set_float_exception_flags(old_exc_flags, status);
7483     float_raise(float_flag_invalid, status);
7484     return res;
7485 }
7486 
7487 int16_t float64_to_int16(float64 a, float_status *status)
7488 {
7489     int64_t v;
7490     int16_t res;
7491     int old_exc_flags = get_float_exception_flags(status);
7492 
7493     v = float64_to_int32(a, status);
7494     if (v < -0x8000) {
7495         res = -0x8000;
7496     } else if (v > 0x7fff) {
7497         res = 0x7fff;
7498     } else {
7499         return v;
7500     }
7501 
7502     set_float_exception_flags(old_exc_flags, status);
7503     float_raise(float_flag_invalid, status);
7504     return res;
7505 }
7506 
7507 uint16_t float64_to_uint16(float64 a, float_status *status)
7508 {
7509     int64_t v;
7510     uint16_t res;
7511     int old_exc_flags = get_float_exception_flags(status);
7512 
7513     v = float64_to_int32(a, status);
7514     if (v < 0) {
7515         res = 0;
7516     } else if (v > 0xffff) {
7517         res = 0xffff;
7518     } else {
7519         return v;
7520     }
7521 
7522     set_float_exception_flags(old_exc_flags, status);
7523     float_raise(float_flag_invalid, status);
7524     return res;
7525 }
7526 
7527 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7528 {
7529     int64_t v;
7530     uint16_t res;
7531     int old_exc_flags = get_float_exception_flags(status);
7532 
7533     v = float64_to_int64_round_to_zero(a, status);
7534     if (v < 0) {
7535         res = 0;
7536     } else if (v > 0xffff) {
7537         res = 0xffff;
7538     } else {
7539         return v;
7540     }
7541     set_float_exception_flags(old_exc_flags, status);
7542     float_raise(float_flag_invalid, status);
7543     return res;
7544 }
7545 
7546 /*----------------------------------------------------------------------------
7547 | Returns the result of converting the double-precision floating-point value
7548 | `a' to the 64-bit unsigned integer format.  The conversion is
7549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7550 | Arithmetic---which means in particular that the conversion is rounded
7551 | according to the current rounding mode.  If `a' is a NaN, the largest
7552 | positive integer is returned.  If the conversion overflows, the
7553 | largest unsigned integer is returned.  If 'a' is negative, the value is
7554 | rounded and zero is returned; negative values that do not round to zero
7555 | will raise the inexact exception.
7556 *----------------------------------------------------------------------------*/
7557 
7558 uint64_t float64_to_uint64(float64 a, float_status *status)
7559 {
7560     flag aSign;
7561     int aExp;
7562     int shiftCount;
7563     uint64_t aSig, aSigExtra;
7564     a = float64_squash_input_denormal(a, status);
7565 
7566     aSig = extractFloat64Frac(a);
7567     aExp = extractFloat64Exp(a);
7568     aSign = extractFloat64Sign(a);
7569     if (aSign && (aExp > 1022)) {
7570         float_raise(float_flag_invalid, status);
7571         if (float64_is_any_nan(a)) {
7572             return LIT64(0xFFFFFFFFFFFFFFFF);
7573         } else {
7574             return 0;
7575         }
7576     }
7577     if (aExp) {
7578         aSig |= LIT64(0x0010000000000000);
7579     }
7580     shiftCount = 0x433 - aExp;
7581     if (shiftCount <= 0) {
7582         if (0x43E < aExp) {
7583             float_raise(float_flag_invalid, status);
7584             return LIT64(0xFFFFFFFFFFFFFFFF);
7585         }
7586         aSigExtra = 0;
7587         aSig <<= -shiftCount;
7588     } else {
7589         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7590     }
7591     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7592 }
7593 
7594 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7595 {
7596     signed char current_rounding_mode = status->float_rounding_mode;
7597     set_float_rounding_mode(float_round_to_zero, status);
7598     uint64_t v = float64_to_uint64(a, status);
7599     set_float_rounding_mode(current_rounding_mode, status);
7600     return v;
7601 }
7602 
7603 #define COMPARE(s, nan_exp)                                                  \
7604 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7605                                       int is_quiet, float_status *status)    \
7606 {                                                                            \
7607     flag aSign, bSign;                                                       \
7608     uint ## s ## _t av, bv;                                                  \
7609     a = float ## s ## _squash_input_denormal(a, status);                     \
7610     b = float ## s ## _squash_input_denormal(b, status);                     \
7611                                                                              \
7612     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7613          extractFloat ## s ## Frac( a ) ) ||                                 \
7614         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7615           extractFloat ## s ## Frac( b ) )) {                                \
7616         if (!is_quiet ||                                                     \
7617             float ## s ## _is_signaling_nan(a, status) ||                  \
7618             float ## s ## _is_signaling_nan(b, status)) {                 \
7619             float_raise(float_flag_invalid, status);                         \
7620         }                                                                    \
7621         return float_relation_unordered;                                     \
7622     }                                                                        \
7623     aSign = extractFloat ## s ## Sign( a );                                  \
7624     bSign = extractFloat ## s ## Sign( b );                                  \
7625     av = float ## s ## _val(a);                                              \
7626     bv = float ## s ## _val(b);                                              \
7627     if ( aSign != bSign ) {                                                  \
7628         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7629             /* zero case */                                                  \
7630             return float_relation_equal;                                     \
7631         } else {                                                             \
7632             return 1 - (2 * aSign);                                          \
7633         }                                                                    \
7634     } else {                                                                 \
7635         if (av == bv) {                                                      \
7636             return float_relation_equal;                                     \
7637         } else {                                                             \
7638             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7639         }                                                                    \
7640     }                                                                        \
7641 }                                                                            \
7642                                                                              \
7643 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7644 {                                                                            \
7645     return float ## s ## _compare_internal(a, b, 0, status);                 \
7646 }                                                                            \
7647                                                                              \
7648 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7649                                  float_status *status)                       \
7650 {                                                                            \
7651     return float ## s ## _compare_internal(a, b, 1, status);                 \
7652 }
7653 
7654 COMPARE(32, 0xff)
7655 COMPARE(64, 0x7ff)
7656 
7657 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7658                                             int is_quiet, float_status *status)
7659 {
7660     flag aSign, bSign;
7661 
7662     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7663         float_raise(float_flag_invalid, status);
7664         return float_relation_unordered;
7665     }
7666     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7667           ( extractFloatx80Frac( a )<<1 ) ) ||
7668         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7669           ( extractFloatx80Frac( b )<<1 ) )) {
7670         if (!is_quiet ||
7671             floatx80_is_signaling_nan(a, status) ||
7672             floatx80_is_signaling_nan(b, status)) {
7673             float_raise(float_flag_invalid, status);
7674         }
7675         return float_relation_unordered;
7676     }
7677     aSign = extractFloatx80Sign( a );
7678     bSign = extractFloatx80Sign( b );
7679     if ( aSign != bSign ) {
7680 
7681         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7682              ( ( a.low | b.low ) == 0 ) ) {
7683             /* zero case */
7684             return float_relation_equal;
7685         } else {
7686             return 1 - (2 * aSign);
7687         }
7688     } else {
7689         if (a.low == b.low && a.high == b.high) {
7690             return float_relation_equal;
7691         } else {
7692             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7693         }
7694     }
7695 }
7696 
7697 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7698 {
7699     return floatx80_compare_internal(a, b, 0, status);
7700 }
7701 
7702 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7703 {
7704     return floatx80_compare_internal(a, b, 1, status);
7705 }
7706 
7707 static inline int float128_compare_internal(float128 a, float128 b,
7708                                             int is_quiet, float_status *status)
7709 {
7710     flag aSign, bSign;
7711 
7712     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7713           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7714         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7715           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7716         if (!is_quiet ||
7717             float128_is_signaling_nan(a, status) ||
7718             float128_is_signaling_nan(b, status)) {
7719             float_raise(float_flag_invalid, status);
7720         }
7721         return float_relation_unordered;
7722     }
7723     aSign = extractFloat128Sign( a );
7724     bSign = extractFloat128Sign( b );
7725     if ( aSign != bSign ) {
7726         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7727             /* zero case */
7728             return float_relation_equal;
7729         } else {
7730             return 1 - (2 * aSign);
7731         }
7732     } else {
7733         if (a.low == b.low && a.high == b.high) {
7734             return float_relation_equal;
7735         } else {
7736             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7737         }
7738     }
7739 }
7740 
7741 int float128_compare(float128 a, float128 b, float_status *status)
7742 {
7743     return float128_compare_internal(a, b, 0, status);
7744 }
7745 
7746 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7747 {
7748     return float128_compare_internal(a, b, 1, status);
7749 }
7750 
7751 /* min() and max() functions. These can't be implemented as
7752  * 'compare and pick one input' because that would mishandle
7753  * NaNs and +0 vs -0.
7754  *
7755  * minnum() and maxnum() functions. These are similar to the min()
7756  * and max() functions but if one of the arguments is a QNaN and
7757  * the other is numerical then the numerical argument is returned.
7758  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7759  * and maxNum() operations. min() and max() are the typical min/max
7760  * semantics provided by many CPUs which predate that specification.
7761  *
7762  * minnummag() and maxnummag() functions correspond to minNumMag()
7763  * and maxNumMag() from the IEEE-754 2008.
7764  */
7765 #define MINMAX(s)                                                       \
7766 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7767                                                int ismin, int isieee,   \
7768                                                int ismag,               \
7769                                                float_status *status)    \
7770 {                                                                       \
7771     flag aSign, bSign;                                                  \
7772     uint ## s ## _t av, bv, aav, abv;                                   \
7773     a = float ## s ## _squash_input_denormal(a, status);                \
7774     b = float ## s ## _squash_input_denormal(b, status);                \
7775     if (float ## s ## _is_any_nan(a) ||                                 \
7776         float ## s ## _is_any_nan(b)) {                                 \
7777         if (isieee) {                                                   \
7778             if (float ## s ## _is_quiet_nan(a, status) &&               \
7779                 !float ## s ##_is_any_nan(b)) {                         \
7780                 return b;                                               \
7781             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7782                        !float ## s ## _is_any_nan(a)) {                \
7783                 return a;                                               \
7784             }                                                           \
7785         }                                                               \
7786         return propagateFloat ## s ## NaN(a, b, status);                \
7787     }                                                                   \
7788     aSign = extractFloat ## s ## Sign(a);                               \
7789     bSign = extractFloat ## s ## Sign(b);                               \
7790     av = float ## s ## _val(a);                                         \
7791     bv = float ## s ## _val(b);                                         \
7792     if (ismag) {                                                        \
7793         aav = float ## s ## _abs(av);                                   \
7794         abv = float ## s ## _abs(bv);                                   \
7795         if (aav != abv) {                                               \
7796             if (ismin) {                                                \
7797                 return (aav < abv) ? a : b;                             \
7798             } else {                                                    \
7799                 return (aav < abv) ? b : a;                             \
7800             }                                                           \
7801         }                                                               \
7802     }                                                                   \
7803     if (aSign != bSign) {                                               \
7804         if (ismin) {                                                    \
7805             return aSign ? a : b;                                       \
7806         } else {                                                        \
7807             return aSign ? b : a;                                       \
7808         }                                                               \
7809     } else {                                                            \
7810         if (ismin) {                                                    \
7811             return (aSign ^ (av < bv)) ? a : b;                         \
7812         } else {                                                        \
7813             return (aSign ^ (av < bv)) ? b : a;                         \
7814         }                                                               \
7815     }                                                                   \
7816 }                                                                       \
7817                                                                         \
7818 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7819                               float_status *status)                     \
7820 {                                                                       \
7821     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7822 }                                                                       \
7823                                                                         \
7824 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7825                               float_status *status)                     \
7826 {                                                                       \
7827     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7828 }                                                                       \
7829                                                                         \
7830 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7831                                  float_status *status)                  \
7832 {                                                                       \
7833     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7834 }                                                                       \
7835                                                                         \
7836 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7837                                  float_status *status)                  \
7838 {                                                                       \
7839     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7840 }                                                                       \
7841                                                                         \
7842 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7843                                     float_status *status)               \
7844 {                                                                       \
7845     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7846 }                                                                       \
7847                                                                         \
7848 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7849                                     float_status *status)               \
7850 {                                                                       \
7851     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7852 }
7853 
7854 MINMAX(32)
7855 MINMAX(64)
7856 
7857 
7858 /* Multiply A by 2 raised to the power N.  */
7859 float32 float32_scalbn(float32 a, int n, float_status *status)
7860 {
7861     flag aSign;
7862     int16_t aExp;
7863     uint32_t aSig;
7864 
7865     a = float32_squash_input_denormal(a, status);
7866     aSig = extractFloat32Frac( a );
7867     aExp = extractFloat32Exp( a );
7868     aSign = extractFloat32Sign( a );
7869 
7870     if ( aExp == 0xFF ) {
7871         if ( aSig ) {
7872             return propagateFloat32NaN(a, a, status);
7873         }
7874         return a;
7875     }
7876     if (aExp != 0) {
7877         aSig |= 0x00800000;
7878     } else if (aSig == 0) {
7879         return a;
7880     } else {
7881         aExp++;
7882     }
7883 
7884     if (n > 0x200) {
7885         n = 0x200;
7886     } else if (n < -0x200) {
7887         n = -0x200;
7888     }
7889 
7890     aExp += n - 1;
7891     aSig <<= 7;
7892     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7893 }
7894 
7895 float64 float64_scalbn(float64 a, int n, float_status *status)
7896 {
7897     flag aSign;
7898     int16_t aExp;
7899     uint64_t aSig;
7900 
7901     a = float64_squash_input_denormal(a, status);
7902     aSig = extractFloat64Frac( a );
7903     aExp = extractFloat64Exp( a );
7904     aSign = extractFloat64Sign( a );
7905 
7906     if ( aExp == 0x7FF ) {
7907         if ( aSig ) {
7908             return propagateFloat64NaN(a, a, status);
7909         }
7910         return a;
7911     }
7912     if (aExp != 0) {
7913         aSig |= LIT64( 0x0010000000000000 );
7914     } else if (aSig == 0) {
7915         return a;
7916     } else {
7917         aExp++;
7918     }
7919 
7920     if (n > 0x1000) {
7921         n = 0x1000;
7922     } else if (n < -0x1000) {
7923         n = -0x1000;
7924     }
7925 
7926     aExp += n - 1;
7927     aSig <<= 10;
7928     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7929 }
7930 
7931 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7932 {
7933     flag aSign;
7934     int32_t aExp;
7935     uint64_t aSig;
7936 
7937     if (floatx80_invalid_encoding(a)) {
7938         float_raise(float_flag_invalid, status);
7939         return floatx80_default_nan(status);
7940     }
7941     aSig = extractFloatx80Frac( a );
7942     aExp = extractFloatx80Exp( a );
7943     aSign = extractFloatx80Sign( a );
7944 
7945     if ( aExp == 0x7FFF ) {
7946         if ( aSig<<1 ) {
7947             return propagateFloatx80NaN(a, a, status);
7948         }
7949         return a;
7950     }
7951 
7952     if (aExp == 0) {
7953         if (aSig == 0) {
7954             return a;
7955         }
7956         aExp++;
7957     }
7958 
7959     if (n > 0x10000) {
7960         n = 0x10000;
7961     } else if (n < -0x10000) {
7962         n = -0x10000;
7963     }
7964 
7965     aExp += n;
7966     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7967                                          aSign, aExp, aSig, 0, status);
7968 }
7969 
7970 float128 float128_scalbn(float128 a, int n, float_status *status)
7971 {
7972     flag aSign;
7973     int32_t aExp;
7974     uint64_t aSig0, aSig1;
7975 
7976     aSig1 = extractFloat128Frac1( a );
7977     aSig0 = extractFloat128Frac0( a );
7978     aExp = extractFloat128Exp( a );
7979     aSign = extractFloat128Sign( a );
7980     if ( aExp == 0x7FFF ) {
7981         if ( aSig0 | aSig1 ) {
7982             return propagateFloat128NaN(a, a, status);
7983         }
7984         return a;
7985     }
7986     if (aExp != 0) {
7987         aSig0 |= LIT64( 0x0001000000000000 );
7988     } else if (aSig0 == 0 && aSig1 == 0) {
7989         return a;
7990     } else {
7991         aExp++;
7992     }
7993 
7994     if (n > 0x10000) {
7995         n = 0x10000;
7996     } else if (n < -0x10000) {
7997         n = -0x10000;
7998     }
7999 
8000     aExp += n - 1;
8001     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8002                                          , status);
8003 
8004 }
8005