xref: /qemu/fpu/softfloat.c (revision 210cbd4910ae9e41e0a1785b96890ea2c291b381)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input.  If `zSign' is 1, the input is negated before being converted to an
139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer.  However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145 
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148     int8_t roundingMode;
149     flag roundNearestEven;
150     int8_t roundIncrement, roundBits;
151     int32_t z;
152 
153     roundingMode = status->float_rounding_mode;
154     roundNearestEven = ( roundingMode == float_round_nearest_even );
155     switch (roundingMode) {
156     case float_round_nearest_even:
157     case float_round_ties_away:
158         roundIncrement = 0x40;
159         break;
160     case float_round_to_zero:
161         roundIncrement = 0;
162         break;
163     case float_round_up:
164         roundIncrement = zSign ? 0 : 0x7f;
165         break;
166     case float_round_down:
167         roundIncrement = zSign ? 0x7f : 0;
168         break;
169     default:
170         abort();
171     }
172     roundBits = absZ & 0x7F;
173     absZ = ( absZ + roundIncrement )>>7;
174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175     z = absZ;
176     if ( zSign ) z = - z;
177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178         float_raise(float_flag_invalid, status);
179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180     }
181     if (roundBits) {
182         status->float_exception_flags |= float_flag_inexact;
183     }
184     return z;
185 
186 }
187 
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer.  However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199 
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201                                float_status *status)
202 {
203     int8_t roundingMode;
204     flag roundNearestEven, increment;
205     int64_t z;
206 
207     roundingMode = status->float_rounding_mode;
208     roundNearestEven = ( roundingMode == float_round_nearest_even );
209     switch (roundingMode) {
210     case float_round_nearest_even:
211     case float_round_ties_away:
212         increment = ((int64_t) absZ1 < 0);
213         break;
214     case float_round_to_zero:
215         increment = 0;
216         break;
217     case float_round_up:
218         increment = !zSign && absZ1;
219         break;
220     case float_round_down:
221         increment = zSign && absZ1;
222         break;
223     default:
224         abort();
225     }
226     if ( increment ) {
227         ++absZ0;
228         if ( absZ0 == 0 ) goto overflow;
229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230     }
231     z = absZ0;
232     if ( zSign ) z = - z;
233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
234  overflow:
235         float_raise(float_flag_invalid, status);
236         return
237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
238             : LIT64( 0x7FFFFFFFFFFFFFFF );
239     }
240     if (absZ1) {
241         status->float_exception_flags |= float_flag_inexact;
242     }
243     return z;
244 
245 }
246 
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer.  However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256 
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258                                 uint64_t absZ1, float_status *status)
259 {
260     int8_t roundingMode;
261     flag roundNearestEven, increment;
262 
263     roundingMode = status->float_rounding_mode;
264     roundNearestEven = (roundingMode == float_round_nearest_even);
265     switch (roundingMode) {
266     case float_round_nearest_even:
267     case float_round_ties_away:
268         increment = ((int64_t)absZ1 < 0);
269         break;
270     case float_round_to_zero:
271         increment = 0;
272         break;
273     case float_round_up:
274         increment = !zSign && absZ1;
275         break;
276     case float_round_down:
277         increment = zSign && absZ1;
278         break;
279     default:
280         abort();
281     }
282     if (increment) {
283         ++absZ0;
284         if (absZ0 == 0) {
285             float_raise(float_flag_invalid, status);
286             return LIT64(0xFFFFFFFFFFFFFFFF);
287         }
288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289     }
290 
291     if (zSign && absZ0) {
292         float_raise(float_flag_invalid, status);
293         return 0;
294     }
295 
296     if (absZ1) {
297         status->float_exception_flags |= float_flag_inexact;
298     }
299     return absZ0;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305 
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308 
309     return float32_val(a) & 0x007FFFFF;
310 
311 }
312 
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316 
317 static inline int extractFloat32Exp(float32 a)
318 {
319 
320     return ( float32_val(a)>>23 ) & 0xFF;
321 
322 }
323 
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327 
328 static inline flag extractFloat32Sign( float32 a )
329 {
330 
331     return float32_val(a)>>31;
332 
333 }
334 
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341     if (status->flush_inputs_to_zero) {
342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343             float_raise(float_flag_input_denormal, status);
344             return make_float32(float32_val(a) & 0x80000000);
345         }
346     }
347     return a;
348 }
349 
/*----------------------------------------------------------------------------
| Normalizes the denormalized single-precision significand `aSig' by shifting
| it left until its leading one sits 8 bits below the top of the word.  The
| shifted significand is written through `zSigPtr' and the matching exponent
| (1 minus the shift distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
367 
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result.  After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result.  This means that any integer portion of `zSig'
373 | will be added into the exponent.  Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378 
379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
380 {
381 
382     return make_float32(
383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384 
385 }
386 
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input.  Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly.  However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned.  If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 |     The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location.  This shifted
401 | significand must be normalized or smaller.  If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding.  In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408 
409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
410                                    float_status *status)
411 {
412     int8_t roundingMode;
413     flag roundNearestEven;
414     int8_t roundIncrement, roundBits;
415     flag isTiny;
416 
417     roundingMode = status->float_rounding_mode;
418     roundNearestEven = ( roundingMode == float_round_nearest_even );
419     switch (roundingMode) {
420     case float_round_nearest_even:
421     case float_round_ties_away:
422         roundIncrement = 0x40;
423         break;
424     case float_round_to_zero:
425         roundIncrement = 0;
426         break;
427     case float_round_up:
428         roundIncrement = zSign ? 0 : 0x7f;
429         break;
430     case float_round_down:
431         roundIncrement = zSign ? 0x7f : 0;
432         break;
433     default:
434         abort();
435         break;
436     }
437     roundBits = zSig & 0x7F;
438     if ( 0xFD <= (uint16_t) zExp ) {
439         if (    ( 0xFD < zExp )
440              || (    ( zExp == 0xFD )
441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442            ) {
443             float_raise(float_flag_overflow | float_flag_inexact, status);
444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445         }
446         if ( zExp < 0 ) {
447             if (status->flush_to_zero) {
448                 float_raise(float_flag_output_denormal, status);
449                 return packFloat32(zSign, 0, 0);
450             }
451             isTiny =
452                 (status->float_detect_tininess
453                  == float_tininess_before_rounding)
454                 || ( zExp < -1 )
455                 || ( zSig + roundIncrement < 0x80000000 );
456             shift32RightJamming( zSig, - zExp, &zSig );
457             zExp = 0;
458             roundBits = zSig & 0x7F;
459             if (isTiny && roundBits) {
460                 float_raise(float_flag_underflow, status);
461             }
462         }
463     }
464     if (roundBits) {
465         status->float_exception_flags |= float_flag_inexact;
466     }
467     zSig = ( zSig + roundIncrement )>>7;
468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469     if ( zSig == 0 ) zExp = 0;
470     return packFloat32( zSign, zExp, zSig );
471 
472 }
473 
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input.  This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482 
483 static float32
484  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
485                               float_status *status)
486 {
487     int8_t shiftCount;
488 
489     shiftCount = countLeadingZeros32( zSig ) - 1;
490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491                                status);
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498 
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501 
502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503 
504 }
505 
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509 
510 static inline int extractFloat64Exp(float64 a)
511 {
512 
513     return ( float64_val(a)>>52 ) & 0x7FF;
514 
515 }
516 
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520 
521 static inline flag extractFloat64Sign( float64 a )
522 {
523 
524     return float64_val(a)>>63;
525 
526 }
527 
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534     if (status->flush_inputs_to_zero) {
535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536             float_raise(float_flag_input_denormal, status);
537             return make_float64(float64_val(a) & (1ULL << 63));
538         }
539     }
540     return a;
541 }
542 
/*----------------------------------------------------------------------------
| Normalizes the denormalized double-precision significand `aSig' by shifting
| it left until its leading one sits 11 bits below the top of the word.  The
| shifted significand is written through `zSigPtr' and the matching exponent
| (1 minus the shift distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
560 
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result.  After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result.  This means that any integer portion of `zSig'
566 | will be added into the exponent.  Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571 
572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
573 {
574 
575     return make_float64(
576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577 
578 }
579 
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input.  Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly.  However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned.  If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 |     The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location.  This shifted
594 | significand must be normalized or smaller.  If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding.  In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601 
602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
603                                    float_status *status)
604 {
605     int8_t roundingMode;
606     flag roundNearestEven;
607     int roundIncrement, roundBits;
608     flag isTiny;
609 
610     roundingMode = status->float_rounding_mode;
611     roundNearestEven = ( roundingMode == float_round_nearest_even );
612     switch (roundingMode) {
613     case float_round_nearest_even:
614     case float_round_ties_away:
615         roundIncrement = 0x200;
616         break;
617     case float_round_to_zero:
618         roundIncrement = 0;
619         break;
620     case float_round_up:
621         roundIncrement = zSign ? 0 : 0x3ff;
622         break;
623     case float_round_down:
624         roundIncrement = zSign ? 0x3ff : 0;
625         break;
626     case float_round_to_odd:
627         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
628         break;
629     default:
630         abort();
631     }
632     roundBits = zSig & 0x3FF;
633     if ( 0x7FD <= (uint16_t) zExp ) {
634         if (    ( 0x7FD < zExp )
635              || (    ( zExp == 0x7FD )
636                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
637            ) {
638             bool overflow_to_inf = roundingMode != float_round_to_odd &&
639                                    roundIncrement != 0;
640             float_raise(float_flag_overflow | float_flag_inexact, status);
641             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
642         }
643         if ( zExp < 0 ) {
644             if (status->flush_to_zero) {
645                 float_raise(float_flag_output_denormal, status);
646                 return packFloat64(zSign, 0, 0);
647             }
648             isTiny =
649                    (status->float_detect_tininess
650                     == float_tininess_before_rounding)
651                 || ( zExp < -1 )
652                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
653             shift64RightJamming( zSig, - zExp, &zSig );
654             zExp = 0;
655             roundBits = zSig & 0x3FF;
656             if (isTiny && roundBits) {
657                 float_raise(float_flag_underflow, status);
658             }
659             if (roundingMode == float_round_to_odd) {
660                 /*
661                  * For round-to-odd case, the roundIncrement depends on
662                  * zSig which just changed.
663                  */
664                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
665             }
666         }
667     }
668     if (roundBits) {
669         status->float_exception_flags |= float_flag_inexact;
670     }
671     zSig = ( zSig + roundIncrement )>>10;
672     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
673     if ( zSig == 0 ) zExp = 0;
674     return packFloat64( zSign, zExp, zSig );
675 
676 }
677 
678 /*----------------------------------------------------------------------------
679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680 | and significand `zSig', and returns the proper double-precision floating-
681 | point value corresponding to the abstract input.  This routine is just like
682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
684 | floating-point exponent.
685 *----------------------------------------------------------------------------*/
686 
687 static float64
688  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
689                               float_status *status)
690 {
691     int8_t shiftCount;
692 
693     shiftCount = countLeadingZeros64( zSig ) - 1;
694     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
695                                status);
696 
697 }
698 
699 /*----------------------------------------------------------------------------
700 | Returns the fraction bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703 
704 static inline uint64_t extractFloatx80Frac( floatx80 a )
705 {
706 
707     return a.low;
708 
709 }
710 
711 /*----------------------------------------------------------------------------
712 | Returns the exponent bits of the extended double-precision floating-point
713 | value `a'.
714 *----------------------------------------------------------------------------*/
715 
716 static inline int32_t extractFloatx80Exp( floatx80 a )
717 {
718 
719     return a.high & 0x7FFF;
720 
721 }
722 
723 /*----------------------------------------------------------------------------
724 | Returns the sign bit of the extended double-precision floating-point value
725 | `a'.
726 *----------------------------------------------------------------------------*/
727 
728 static inline flag extractFloatx80Sign( floatx80 a )
729 {
730 
731     return a.high>>15;
732 
733 }
734 
/*----------------------------------------------------------------------------
| Normalizes the denormalized extended double-precision significand `aSig'
| by shifting it left until its leading one reaches the top bit of the word.
| The shifted significand is written through `zSigPtr' and the matching
| exponent (1 minus the shift distance) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    int8_t leftShift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << leftShift;
    *zExpPtr = 1 - leftShift;
}
752 
753 /*----------------------------------------------------------------------------
754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
755 | extended double-precision floating-point value, returning the result.
756 *----------------------------------------------------------------------------*/
757 
758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
759 {
760     floatx80 z;
761 
762     z.low = zSig;
763     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
764     return z;
765 
766 }
767 
768 /*----------------------------------------------------------------------------
769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
770 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
771 | and returns the proper extended double-precision floating-point value
772 | corresponding to the abstract input.  Ordinarily, the abstract value is
773 | rounded and packed into the extended double-precision format, with the
774 | inexact exception raised if the abstract input cannot be represented
775 | exactly.  However, if the abstract value is too large, the overflow and
776 | inexact exceptions are raised and an infinity or maximal finite value is
777 | returned.  If the abstract value is too small, the input value is rounded to
778 | a subnormal number, and the underflow and inexact exceptions are raised if
779 | the abstract input cannot be represented exactly as a subnormal extended
780 | double-precision floating-point number.
781 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
782 | number of bits as single or double precision, respectively.  Otherwise, the
783 | result is rounded to the full precision of the extended double-precision
784 | format.
785 |     The input significand must be normalized or smaller.  If the input
786 | significand is not normalized, `zExp' must be 0; in that case, the result
787 | returned is a subnormal number, and it must not require rounding.  The
788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
789 | Floating-Point Arithmetic.
790 *----------------------------------------------------------------------------*/
791 
792 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
793                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
794                                      float_status *status)
795 {
796     int8_t roundingMode;
797     flag roundNearestEven, increment, isTiny;
798     int64_t roundIncrement, roundMask, roundBits;
799 
800     roundingMode = status->float_rounding_mode;
801     roundNearestEven = ( roundingMode == float_round_nearest_even );
802     if ( roundingPrecision == 80 ) goto precision80;
803     if ( roundingPrecision == 64 ) {
804         roundIncrement = LIT64( 0x0000000000000400 );
805         roundMask = LIT64( 0x00000000000007FF );
806     }
807     else if ( roundingPrecision == 32 ) {
808         roundIncrement = LIT64( 0x0000008000000000 );
809         roundMask = LIT64( 0x000000FFFFFFFFFF );
810     }
811     else {
812         goto precision80;
813     }
814     zSig0 |= ( zSig1 != 0 );
815     switch (roundingMode) {
816     case float_round_nearest_even:
817     case float_round_ties_away:
818         break;
819     case float_round_to_zero:
820         roundIncrement = 0;
821         break;
822     case float_round_up:
823         roundIncrement = zSign ? 0 : roundMask;
824         break;
825     case float_round_down:
826         roundIncrement = zSign ? roundMask : 0;
827         break;
828     default:
829         abort();
830     }
831     roundBits = zSig0 & roundMask;
832     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
833         if (    ( 0x7FFE < zExp )
834              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
835            ) {
836             goto overflow;
837         }
838         if ( zExp <= 0 ) {
839             if (status->flush_to_zero) {
840                 float_raise(float_flag_output_denormal, status);
841                 return packFloatx80(zSign, 0, 0);
842             }
843             isTiny =
844                    (status->float_detect_tininess
845                     == float_tininess_before_rounding)
846                 || ( zExp < 0 )
847                 || ( zSig0 <= zSig0 + roundIncrement );
848             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
849             zExp = 0;
850             roundBits = zSig0 & roundMask;
851             if (isTiny && roundBits) {
852                 float_raise(float_flag_underflow, status);
853             }
854             if (roundBits) {
855                 status->float_exception_flags |= float_flag_inexact;
856             }
857             zSig0 += roundIncrement;
858             if ( (int64_t) zSig0 < 0 ) zExp = 1;
859             roundIncrement = roundMask + 1;
860             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
861                 roundMask |= roundIncrement;
862             }
863             zSig0 &= ~ roundMask;
864             return packFloatx80( zSign, zExp, zSig0 );
865         }
866     }
867     if (roundBits) {
868         status->float_exception_flags |= float_flag_inexact;
869     }
870     zSig0 += roundIncrement;
871     if ( zSig0 < roundIncrement ) {
872         ++zExp;
873         zSig0 = LIT64( 0x8000000000000000 );
874     }
875     roundIncrement = roundMask + 1;
876     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
877         roundMask |= roundIncrement;
878     }
879     zSig0 &= ~ roundMask;
880     if ( zSig0 == 0 ) zExp = 0;
881     return packFloatx80( zSign, zExp, zSig0 );
882  precision80:
883     switch (roundingMode) {
884     case float_round_nearest_even:
885     case float_round_ties_away:
886         increment = ((int64_t)zSig1 < 0);
887         break;
888     case float_round_to_zero:
889         increment = 0;
890         break;
891     case float_round_up:
892         increment = !zSign && zSig1;
893         break;
894     case float_round_down:
895         increment = zSign && zSig1;
896         break;
897     default:
898         abort();
899     }
900     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
901         if (    ( 0x7FFE < zExp )
902              || (    ( zExp == 0x7FFE )
903                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
904                   && increment
905                 )
906            ) {
907             roundMask = 0;
908  overflow:
909             float_raise(float_flag_overflow | float_flag_inexact, status);
910             if (    ( roundingMode == float_round_to_zero )
911                  || ( zSign && ( roundingMode == float_round_up ) )
912                  || ( ! zSign && ( roundingMode == float_round_down ) )
913                ) {
914                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
915             }
916             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
917         }
918         if ( zExp <= 0 ) {
919             isTiny =
920                    (status->float_detect_tininess
921                     == float_tininess_before_rounding)
922                 || ( zExp < 0 )
923                 || ! increment
924                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
925             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
926             zExp = 0;
927             if (isTiny && zSig1) {
928                 float_raise(float_flag_underflow, status);
929             }
930             if (zSig1) {
931                 status->float_exception_flags |= float_flag_inexact;
932             }
933             switch (roundingMode) {
934             case float_round_nearest_even:
935             case float_round_ties_away:
936                 increment = ((int64_t)zSig1 < 0);
937                 break;
938             case float_round_to_zero:
939                 increment = 0;
940                 break;
941             case float_round_up:
942                 increment = !zSign && zSig1;
943                 break;
944             case float_round_down:
945                 increment = zSign && zSig1;
946                 break;
947             default:
948                 abort();
949             }
950             if ( increment ) {
951                 ++zSig0;
952                 zSig0 &=
953                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
954                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
955             }
956             return packFloatx80( zSign, zExp, zSig0 );
957         }
958     }
959     if (zSig1) {
960         status->float_exception_flags |= float_flag_inexact;
961     }
962     if ( increment ) {
963         ++zSig0;
964         if ( zSig0 == 0 ) {
965             ++zExp;
966             zSig0 = LIT64( 0x8000000000000000 );
967         }
968         else {
969             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
970         }
971     }
972     else {
973         if ( zSig0 == 0 ) zExp = 0;
974     }
975     return packFloatx80( zSign, zExp, zSig0 );
976 
977 }
978 
979 /*----------------------------------------------------------------------------
980 | Takes an abstract floating-point value having sign `zSign', exponent
981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
982 | and returns the proper extended double-precision floating-point value
983 | corresponding to the abstract input.  This routine is just like
984 | `roundAndPackFloatx80' except that the input significand does not have to be
985 | normalized.
986 *----------------------------------------------------------------------------*/
987 
988 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
989                                               flag zSign, int32_t zExp,
990                                               uint64_t zSig0, uint64_t zSig1,
991                                               float_status *status)
992 {
993     int8_t shiftCount;
994 
995     if ( zSig0 == 0 ) {
996         zSig0 = zSig1;
997         zSig1 = 0;
998         zExp -= 64;
999     }
1000     shiftCount = countLeadingZeros64( zSig0 );
1001     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002     zExp -= shiftCount;
1003     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004                                 zSig0, zSig1, status);
1005 
1006 }
1007 
1008 /*----------------------------------------------------------------------------
1009 | Returns the least-significant 64 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012 
1013 static inline uint64_t extractFloat128Frac1( float128 a )
1014 {
1015 
1016     return a.low;
1017 
1018 }
1019 
1020 /*----------------------------------------------------------------------------
1021 | Returns the most-significant 48 fraction bits of the quadruple-precision
1022 | floating-point value `a'.
1023 *----------------------------------------------------------------------------*/
1024 
1025 static inline uint64_t extractFloat128Frac0( float128 a )
1026 {
1027 
1028     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029 
1030 }
1031 
1032 /*----------------------------------------------------------------------------
1033 | Returns the exponent bits of the quadruple-precision floating-point value
1034 | `a'.
1035 *----------------------------------------------------------------------------*/
1036 
1037 static inline int32_t extractFloat128Exp( float128 a )
1038 {
1039 
1040     return ( a.high>>48 ) & 0x7FFF;
1041 
1042 }
1043 
1044 /*----------------------------------------------------------------------------
1045 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1046 *----------------------------------------------------------------------------*/
1047 
1048 static inline flag extractFloat128Sign( float128 a )
1049 {
1050 
1051     return a.high>>63;
1052 
1053 }
1054 
/*----------------------------------------------------------------------------
| Normalizes a subnormal quadruple-precision value whose denormalized
| significand is `aSig0' followed by `aSig1'.  The normalized exponent is
| written through `zExpPtr'.  The most significant 49 bits of the normalized
| significand are written through `zSig0Ptr', and the least significant 64
| bits through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        /* The leading bit is in the high word: one short 128-bit shift. */
        shift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* The high word is empty, so the low word supplies every bit. */
    shift = countLeadingZeros64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Split aSig1 across both output words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
1095 
1096 /*----------------------------------------------------------------------------
1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099 | floating-point value, returning the result.  After being shifted into the
1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101 | added together to form the most significant 32 bits of the result.  This
1102 | means that any integer portion of `zSig0' will be added into the exponent.
1103 | Since a properly normalized significand will have an integer portion equal
1104 | to 1, the `zExp' input should be 1 less than the desired result exponent
1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106 | significand.
1107 *----------------------------------------------------------------------------*/
1108 
1109 static inline float128
1110  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1111 {
1112     float128 z;
1113 
1114     z.low = zSig1;
1115     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1116     return z;
1117 
1118 }
1119 
1120 /*----------------------------------------------------------------------------
1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1123 | and `zSig2', and returns the proper quadruple-precision floating-point value
1124 | corresponding to the abstract input.  Ordinarily, the abstract value is
1125 | simply rounded and packed into the quadruple-precision format, with the
1126 | inexact exception raised if the abstract input cannot be represented
1127 | exactly.  However, if the abstract value is too large, the overflow and
1128 | inexact exceptions are raised and an infinity or maximal finite value is
1129 | returned.  If the abstract value is too small, the input value is rounded to
1130 | a subnormal number, and the underflow and inexact exceptions are raised if
1131 | the abstract input cannot be represented exactly as a subnormal quadruple-
1132 | precision floating-point number.
1133 |     The input significand must be normalized or smaller.  If the input
1134 | significand is not normalized, `zExp' must be 0; in that case, the result
1135 | returned is a subnormal number, and it must not require rounding.  In the
1136 | usual case that the input significand is normalized, `zExp' must be 1 less
1137 | than the ``true'' floating-point exponent.  The handling of underflow and
1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140 
1141 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1142                                      uint64_t zSig0, uint64_t zSig1,
1143                                      uint64_t zSig2, float_status *status)
1144 {
1145     int8_t roundingMode;
1146     flag roundNearestEven, increment, isTiny;
1147 
1148     roundingMode = status->float_rounding_mode;
1149     roundNearestEven = ( roundingMode == float_round_nearest_even );
1150     switch (roundingMode) {
1151     case float_round_nearest_even:
1152     case float_round_ties_away:
1153         increment = ((int64_t)zSig2 < 0);
1154         break;
1155     case float_round_to_zero:
1156         increment = 0;
1157         break;
1158     case float_round_up:
1159         increment = !zSign && zSig2;
1160         break;
1161     case float_round_down:
1162         increment = zSign && zSig2;
1163         break;
1164     case float_round_to_odd:
1165         increment = !(zSig1 & 0x1) && zSig2;
1166         break;
1167     default:
1168         abort();
1169     }
1170     if ( 0x7FFD <= (uint32_t) zExp ) {
1171         if (    ( 0x7FFD < zExp )
1172              || (    ( zExp == 0x7FFD )
1173                   && eq128(
1174                          LIT64( 0x0001FFFFFFFFFFFF ),
1175                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1176                          zSig0,
1177                          zSig1
1178                      )
1179                   && increment
1180                 )
1181            ) {
1182             float_raise(float_flag_overflow | float_flag_inexact, status);
1183             if (    ( roundingMode == float_round_to_zero )
1184                  || ( zSign && ( roundingMode == float_round_up ) )
1185                  || ( ! zSign && ( roundingMode == float_round_down ) )
1186                  || (roundingMode == float_round_to_odd)
1187                ) {
1188                 return
1189                     packFloat128(
1190                         zSign,
1191                         0x7FFE,
1192                         LIT64( 0x0000FFFFFFFFFFFF ),
1193                         LIT64( 0xFFFFFFFFFFFFFFFF )
1194                     );
1195             }
1196             return packFloat128( zSign, 0x7FFF, 0, 0 );
1197         }
1198         if ( zExp < 0 ) {
1199             if (status->flush_to_zero) {
1200                 float_raise(float_flag_output_denormal, status);
1201                 return packFloat128(zSign, 0, 0, 0);
1202             }
1203             isTiny =
1204                    (status->float_detect_tininess
1205                     == float_tininess_before_rounding)
1206                 || ( zExp < -1 )
1207                 || ! increment
1208                 || lt128(
1209                        zSig0,
1210                        zSig1,
1211                        LIT64( 0x0001FFFFFFFFFFFF ),
1212                        LIT64( 0xFFFFFFFFFFFFFFFF )
1213                    );
1214             shift128ExtraRightJamming(
1215                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1216             zExp = 0;
1217             if (isTiny && zSig2) {
1218                 float_raise(float_flag_underflow, status);
1219             }
1220             switch (roundingMode) {
1221             case float_round_nearest_even:
1222             case float_round_ties_away:
1223                 increment = ((int64_t)zSig2 < 0);
1224                 break;
1225             case float_round_to_zero:
1226                 increment = 0;
1227                 break;
1228             case float_round_up:
1229                 increment = !zSign && zSig2;
1230                 break;
1231             case float_round_down:
1232                 increment = zSign && zSig2;
1233                 break;
1234             case float_round_to_odd:
1235                 increment = !(zSig1 & 0x1) && zSig2;
1236                 break;
1237             default:
1238                 abort();
1239             }
1240         }
1241     }
1242     if (zSig2) {
1243         status->float_exception_flags |= float_flag_inexact;
1244     }
1245     if ( increment ) {
1246         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1247         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1248     }
1249     else {
1250         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1251     }
1252     return packFloat128( zSign, zExp, zSig0, zSig1 );
1253 
1254 }
1255 
1256 /*----------------------------------------------------------------------------
1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1259 | returns the proper quadruple-precision floating-point value corresponding
1260 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1261 | except that the input significand has fewer bits and does not have to be
1262 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1263 | point exponent.
1264 *----------------------------------------------------------------------------*/
1265 
1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1267                                               uint64_t zSig0, uint64_t zSig1,
1268                                               float_status *status)
1269 {
1270     int8_t shiftCount;
1271     uint64_t zSig2;
1272 
1273     if ( zSig0 == 0 ) {
1274         zSig0 = zSig1;
1275         zSig1 = 0;
1276         zExp -= 64;
1277     }
1278     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279     if ( 0 <= shiftCount ) {
1280         zSig2 = 0;
1281         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282     }
1283     else {
1284         shift128ExtraRightJamming(
1285             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286     }
1287     zExp -= shiftCount;
1288     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1289 
1290 }
1291 
1292 /*----------------------------------------------------------------------------
1293 | Returns the result of converting the 32-bit two's complement integer `a'
1294 | to the single-precision floating-point format.  The conversion is performed
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 *----------------------------------------------------------------------------*/
1297 
1298 float32 int32_to_float32(int32_t a, float_status *status)
1299 {
1300     flag zSign;
1301 
1302     if ( a == 0 ) return float32_zero;
1303     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1304     zSign = ( a < 0 );
1305     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1306 }
1307 
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the 32-bit two's complement integer `a'
1310 | to the double-precision floating-point format.  The conversion is performed
1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312 *----------------------------------------------------------------------------*/
1313 
1314 float64 int32_to_float64(int32_t a, float_status *status)
1315 {
1316     flag zSign;
1317     uint32_t absA;
1318     int8_t shiftCount;
1319     uint64_t zSig;
1320 
1321     if ( a == 0 ) return float64_zero;
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros32( absA ) + 21;
1325     zSig = absA;
1326     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327 
1328 }
1329 
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the 32-bit two's complement integer `a'
1332 | to the extended double-precision floating-point format.  The conversion
1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic.
1335 *----------------------------------------------------------------------------*/
1336 
1337 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1338 {
1339     flag zSign;
1340     uint32_t absA;
1341     int8_t shiftCount;
1342     uint64_t zSig;
1343 
1344     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345     zSign = ( a < 0 );
1346     absA = zSign ? - a : a;
1347     shiftCount = countLeadingZeros32( absA ) + 32;
1348     zSig = absA;
1349     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350 
1351 }
1352 
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of converting the 32-bit two's complement integer `a' to
1355 | the quadruple-precision floating-point format.  The conversion is performed
1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357 *----------------------------------------------------------------------------*/
1358 
1359 float128 int32_to_float128(int32_t a, float_status *status)
1360 {
1361     flag zSign;
1362     uint32_t absA;
1363     int8_t shiftCount;
1364     uint64_t zSig0;
1365 
1366     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367     zSign = ( a < 0 );
1368     absA = zSign ? - a : a;
1369     shiftCount = countLeadingZeros32( absA ) + 17;
1370     zSig0 = absA;
1371     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372 
1373 }
1374 
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the 64-bit two's complement integer `a'
1377 | to the single-precision floating-point format.  The conversion is performed
1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379 *----------------------------------------------------------------------------*/
1380 
1381 float32 int64_to_float32(int64_t a, float_status *status)
1382 {
1383     flag zSign;
1384     uint64_t absA;
1385     int8_t shiftCount;
1386 
1387     if ( a == 0 ) return float32_zero;
1388     zSign = ( a < 0 );
1389     absA = zSign ? - a : a;
1390     shiftCount = countLeadingZeros64( absA ) - 40;
1391     if ( 0 <= shiftCount ) {
1392         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393     }
1394     else {
1395         shiftCount += 7;
1396         if ( shiftCount < 0 ) {
1397             shift64RightJamming( absA, - shiftCount, &absA );
1398         }
1399         else {
1400             absA <<= shiftCount;
1401         }
1402         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1403     }
1404 
1405 }
1406 
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the double-precision floating-point format.  The conversion is performed
1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412 
1413 float64 int64_to_float64(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416 
1417     if ( a == 0 ) return float64_zero;
1418     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1419         return packFloat64( 1, 0x43E, 0 );
1420     }
1421     zSign = ( a < 0 );
1422     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1423 }
1424 
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 64-bit two's complement integer `a'
1427 | to the extended double-precision floating-point format.  The conversion
1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429 | Arithmetic.
1430 *----------------------------------------------------------------------------*/
1431 
1432 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1433 {
1434     flag zSign;
1435     uint64_t absA;
1436     int8_t shiftCount;
1437 
1438     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439     zSign = ( a < 0 );
1440     absA = zSign ? - a : a;
1441     shiftCount = countLeadingZeros64( absA );
1442     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443 
1444 }
1445 
1446 /*----------------------------------------------------------------------------
1447 | Returns the result of converting the 64-bit two's complement integer `a' to
1448 | the quadruple-precision floating-point format.  The conversion is performed
1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450 *----------------------------------------------------------------------------*/
1451 
1452 float128 int64_to_float128(int64_t a, float_status *status)
1453 {
1454     flag zSign;
1455     uint64_t absA;
1456     int8_t shiftCount;
1457     int32_t zExp;
1458     uint64_t zSig0, zSig1;
1459 
1460     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461     zSign = ( a < 0 );
1462     absA = zSign ? - a : a;
1463     shiftCount = countLeadingZeros64( absA ) + 49;
1464     zExp = 0x406E - shiftCount;
1465     if ( 64 <= shiftCount ) {
1466         zSig1 = 0;
1467         zSig0 = absA;
1468         shiftCount -= 64;
1469     }
1470     else {
1471         zSig1 = absA;
1472         zSig0 = 0;
1473     }
1474     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475     return packFloat128( zSign, zExp, zSig0, zSig1 );
1476 
1477 }
1478 
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit unsigned integer `a'
1481 | to the single-precision floating-point format.  The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484 
1485 float32 uint64_to_float32(uint64_t a, float_status *status)
1486 {
1487     int shiftcount;
1488 
1489     if (a == 0) {
1490         return float32_zero;
1491     }
1492 
1493     /* Determine (left) shift needed to put first set bit into bit posn 23
1494      * (since packFloat32() expects the binary point between bits 23 and 22);
1495      * this is the fast case for smallish numbers.
1496      */
1497     shiftcount = countLeadingZeros64(a) - 40;
1498     if (shiftcount >= 0) {
1499         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500     }
1501     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502      * expects the binary point between bits 30 and 29, hence the + 7.
1503      */
1504     shiftcount += 7;
1505     if (shiftcount < 0) {
1506         shift64RightJamming(a, -shiftcount, &a);
1507     } else {
1508         a <<= shiftcount;
1509     }
1510 
1511     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1512 }
1513 
1514 /*----------------------------------------------------------------------------
1515 | Returns the result of converting the 64-bit unsigned integer `a'
1516 | to the double-precision floating-point format.  The conversion is performed
1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 *----------------------------------------------------------------------------*/
1519 
1520 float64 uint64_to_float64(uint64_t a, float_status *status)
1521 {
1522     int exp = 0x43C;
1523     int shiftcount;
1524 
1525     if (a == 0) {
1526         return float64_zero;
1527     }
1528 
1529     shiftcount = countLeadingZeros64(a) - 1;
1530     if (shiftcount < 0) {
1531         shift64RightJamming(a, -shiftcount, &a);
1532     } else {
1533         a <<= shiftcount;
1534     }
1535     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1536 }
1537 
1538 /*----------------------------------------------------------------------------
1539 | Returns the result of converting the 64-bit unsigned integer `a'
1540 | to the quadruple-precision floating-point format.  The conversion is performed
1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542 *----------------------------------------------------------------------------*/
1543 
1544 float128 uint64_to_float128(uint64_t a, float_status *status)
1545 {
1546     if (a == 0) {
1547         return float128_zero;
1548     }
1549     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1550 }
1551 
1552 /*----------------------------------------------------------------------------
1553 | Returns the result of converting the single-precision floating-point value
1554 | `a' to the 32-bit two's complement integer format.  The conversion is
1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1556 | Arithmetic---which means in particular that the conversion is rounded
1557 | according to the current rounding mode.  If `a' is a NaN, the largest
1558 | positive integer is returned.  Otherwise, if the conversion overflows, the
1559 | largest integer with the same sign as `a' is returned.
1560 *----------------------------------------------------------------------------*/
1561 
1562 int32_t float32_to_int32(float32 a, float_status *status)
1563 {
1564     flag aSign;
1565     int aExp;
1566     int shiftCount;
1567     uint32_t aSig;
1568     uint64_t aSig64;
1569 
1570     a = float32_squash_input_denormal(a, status);
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575     if ( aExp ) aSig |= 0x00800000;
1576     shiftCount = 0xAF - aExp;
1577     aSig64 = aSig;
1578     aSig64 <<= 32;
1579     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1580     return roundAndPackInt32(aSign, aSig64, status);
1581 
1582 }
1583 
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of converting the single-precision floating-point value
1586 | `a' to the 32-bit two's complement integer format.  The conversion is
1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1588 | Arithmetic, except that the conversion is always rounded toward zero.
1589 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1590 | the conversion overflows, the largest integer with the same sign as `a' is
1591 | returned.
1592 *----------------------------------------------------------------------------*/
1593 
1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1595 {
1596     flag aSign;
1597     int aExp;
1598     int shiftCount;
1599     uint32_t aSig;
1600     int32_t z;
1601     a = float32_squash_input_denormal(a, status);
1602 
1603     aSig = extractFloat32Frac( a );
1604     aExp = extractFloat32Exp( a );
1605     aSign = extractFloat32Sign( a );
1606     shiftCount = aExp - 0x9E;
1607     if ( 0 <= shiftCount ) {
1608         if ( float32_val(a) != 0xCF000000 ) {
1609             float_raise(float_flag_invalid, status);
1610             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611         }
1612         return (int32_t) 0x80000000;
1613     }
1614     else if ( aExp <= 0x7E ) {
1615         if (aExp | aSig) {
1616             status->float_exception_flags |= float_flag_inexact;
1617         }
1618         return 0;
1619     }
1620     aSig = ( aSig | 0x00800000 )<<8;
1621     z = aSig>>( - shiftCount );
1622     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1623         status->float_exception_flags |= float_flag_inexact;
1624     }
1625     if ( aSign ) z = - z;
1626     return z;
1627 
1628 }
1629 
1630 /*----------------------------------------------------------------------------
1631 | Returns the result of converting the single-precision floating-point value
1632 | `a' to the 16-bit two's complement integer format.  The conversion is
1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 | Arithmetic, except that the conversion is always rounded toward zero.
1635 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1636 | the conversion overflows, the largest integer with the same sign as `a' is
1637 | returned.
1638 *----------------------------------------------------------------------------*/
1639 
1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1641 {
1642     flag aSign;
1643     int aExp;
1644     int shiftCount;
1645     uint32_t aSig;
1646     int32_t z;
1647 
1648     aSig = extractFloat32Frac( a );
1649     aExp = extractFloat32Exp( a );
1650     aSign = extractFloat32Sign( a );
1651     shiftCount = aExp - 0x8E;
1652     if ( 0 <= shiftCount ) {
1653         if ( float32_val(a) != 0xC7000000 ) {
1654             float_raise(float_flag_invalid, status);
1655             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                 return 0x7FFF;
1657             }
1658         }
1659         return (int32_t) 0xffff8000;
1660     }
1661     else if ( aExp <= 0x7E ) {
1662         if ( aExp | aSig ) {
1663             status->float_exception_flags |= float_flag_inexact;
1664         }
1665         return 0;
1666     }
1667     shiftCount -= 0x10;
1668     aSig = ( aSig | 0x00800000 )<<8;
1669     z = aSig>>( - shiftCount );
1670     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1671         status->float_exception_flags |= float_flag_inexact;
1672     }
1673     if ( aSign ) {
1674         z = - z;
1675     }
1676     return z;
1677 
1678 }
1679 
1680 /*----------------------------------------------------------------------------
1681 | Returns the result of converting the single-precision floating-point value
1682 | `a' to the 64-bit two's complement integer format.  The conversion is
1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1684 | Arithmetic---which means in particular that the conversion is rounded
1685 | according to the current rounding mode.  If `a' is a NaN, the largest
1686 | positive integer is returned.  Otherwise, if the conversion overflows, the
1687 | largest integer with the same sign as `a' is returned.
1688 *----------------------------------------------------------------------------*/
1689 
1690 int64_t float32_to_int64(float32 a, float_status *status)
1691 {
1692     flag aSign;
1693     int aExp;
1694     int shiftCount;
1695     uint32_t aSig;
1696     uint64_t aSig64, aSigExtra;
1697     a = float32_squash_input_denormal(a, status);
1698 
1699     aSig = extractFloat32Frac( a );
1700     aExp = extractFloat32Exp( a );
1701     aSign = extractFloat32Sign( a );
1702     shiftCount = 0xBE - aExp;
1703     if ( shiftCount < 0 ) {
1704         float_raise(float_flag_invalid, status);
1705         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706             return LIT64( 0x7FFFFFFFFFFFFFFF );
1707         }
1708         return (int64_t) LIT64( 0x8000000000000000 );
1709     }
1710     if ( aExp ) aSig |= 0x00800000;
1711     aSig64 = aSig;
1712     aSig64 <<= 40;
1713     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1714     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1715 
1716 }
1717 
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of converting the single-precision floating-point value
1720 | `a' to the 64-bit unsigned integer format.  The conversion is
1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1722 | Arithmetic---which means in particular that the conversion is rounded
1723 | according to the current rounding mode.  If `a' is a NaN, the largest
1724 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1725 | largest unsigned integer is returned.  If the 'a' is negative, the result
1726 | is rounded and zero is returned; values that do not round to zero will
1727 | raise the inexact exception flag.
1728 *----------------------------------------------------------------------------*/
1729 
1730 uint64_t float32_to_uint64(float32 a, float_status *status)
1731 {
1732     flag aSign;
1733     int aExp;
1734     int shiftCount;
1735     uint32_t aSig;
1736     uint64_t aSig64, aSigExtra;
1737     a = float32_squash_input_denormal(a, status);
1738 
1739     aSig = extractFloat32Frac(a);
1740     aExp = extractFloat32Exp(a);
1741     aSign = extractFloat32Sign(a);
1742     if ((aSign) && (aExp > 126)) {
1743         float_raise(float_flag_invalid, status);
1744         if (float32_is_any_nan(a)) {
1745             return LIT64(0xFFFFFFFFFFFFFFFF);
1746         } else {
1747             return 0;
1748         }
1749     }
1750     shiftCount = 0xBE - aExp;
1751     if (aExp) {
1752         aSig |= 0x00800000;
1753     }
1754     if (shiftCount < 0) {
1755         float_raise(float_flag_invalid, status);
1756         return LIT64(0xFFFFFFFFFFFFFFFF);
1757     }
1758 
1759     aSig64 = aSig;
1760     aSig64 <<= 40;
1761     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1762     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1763 }
1764 
1765 /*----------------------------------------------------------------------------
1766 | Returns the result of converting the single-precision floating-point value
1767 | `a' to the 64-bit unsigned integer format.  The conversion is
1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1769 | Arithmetic, except that the conversion is always rounded toward zero.  If
1770 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1771 | conversion overflows, the largest unsigned integer is returned.  If the
1772 | 'a' is negative, the result is rounded and zero is returned; values that do
1773 | not round to zero will raise the inexact flag.
1774 *----------------------------------------------------------------------------*/
1775 
1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1777 {
1778     signed char current_rounding_mode = status->float_rounding_mode;
1779     set_float_rounding_mode(float_round_to_zero, status);
1780     int64_t v = float32_to_uint64(a, status);
1781     set_float_rounding_mode(current_rounding_mode, status);
1782     return v;
1783 }
1784 
1785 /*----------------------------------------------------------------------------
1786 | Returns the result of converting the single-precision floating-point value
1787 | `a' to the 64-bit two's complement integer format.  The conversion is
1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1789 | Arithmetic, except that the conversion is always rounded toward zero.  If
1790 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1791 | conversion overflows, the largest integer with the same sign as `a' is
1792 | returned.
1793 *----------------------------------------------------------------------------*/
1794 
1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1796 {
1797     flag aSign;
1798     int aExp;
1799     int shiftCount;
1800     uint32_t aSig;
1801     uint64_t aSig64;
1802     int64_t z;
1803     a = float32_squash_input_denormal(a, status);
1804 
1805     aSig = extractFloat32Frac( a );
1806     aExp = extractFloat32Exp( a );
1807     aSign = extractFloat32Sign( a );
1808     shiftCount = aExp - 0xBE;
1809     if ( 0 <= shiftCount ) {
1810         if ( float32_val(a) != 0xDF000000 ) {
1811             float_raise(float_flag_invalid, status);
1812             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1814             }
1815         }
1816         return (int64_t) LIT64( 0x8000000000000000 );
1817     }
1818     else if ( aExp <= 0x7E ) {
1819         if (aExp | aSig) {
1820             status->float_exception_flags |= float_flag_inexact;
1821         }
1822         return 0;
1823     }
1824     aSig64 = aSig | 0x00800000;
1825     aSig64 <<= 40;
1826     z = aSig64>>( - shiftCount );
1827     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1828         status->float_exception_flags |= float_flag_inexact;
1829     }
1830     if ( aSign ) z = - z;
1831     return z;
1832 
1833 }
1834 
1835 /*----------------------------------------------------------------------------
1836 | Returns the result of converting the single-precision floating-point value
1837 | `a' to the double-precision floating-point format.  The conversion is
1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1839 | Arithmetic.
1840 *----------------------------------------------------------------------------*/
1841 
1842 float64 float32_to_float64(float32 a, float_status *status)
1843 {
1844     flag aSign;
1845     int aExp;
1846     uint32_t aSig;
1847     a = float32_squash_input_denormal(a, status);
1848 
1849     aSig = extractFloat32Frac( a );
1850     aExp = extractFloat32Exp( a );
1851     aSign = extractFloat32Sign( a );
1852     if ( aExp == 0xFF ) {
1853         if (aSig) {
1854             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855         }
1856         return packFloat64( aSign, 0x7FF, 0 );
1857     }
1858     if ( aExp == 0 ) {
1859         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861         --aExp;
1862     }
1863     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1864 
1865 }
1866 
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the single-precision floating-point value
1869 | `a' to the extended double-precision floating-point format.  The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871 | Arithmetic.
1872 *----------------------------------------------------------------------------*/
1873 
1874 floatx80 float32_to_floatx80(float32 a, float_status *status)
1875 {
1876     flag aSign;
1877     int aExp;
1878     uint32_t aSig;
1879 
1880     a = float32_squash_input_denormal(a, status);
1881     aSig = extractFloat32Frac( a );
1882     aExp = extractFloat32Exp( a );
1883     aSign = extractFloat32Sign( a );
1884     if ( aExp == 0xFF ) {
1885         if (aSig) {
1886             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887         }
1888         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889     }
1890     if ( aExp == 0 ) {
1891         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893     }
1894     aSig |= 0x00800000;
1895     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1896 
1897 }
1898 
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of converting the single-precision floating-point value
1901 | `a' to the double-precision floating-point format.  The conversion is
1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1903 | Arithmetic.
1904 *----------------------------------------------------------------------------*/
1905 
1906 float128 float32_to_float128(float32 a, float_status *status)
1907 {
1908     flag aSign;
1909     int aExp;
1910     uint32_t aSig;
1911 
1912     a = float32_squash_input_denormal(a, status);
1913     aSig = extractFloat32Frac( a );
1914     aExp = extractFloat32Exp( a );
1915     aSign = extractFloat32Sign( a );
1916     if ( aExp == 0xFF ) {
1917         if (aSig) {
1918             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919         }
1920         return packFloat128( aSign, 0x7FFF, 0, 0 );
1921     }
1922     if ( aExp == 0 ) {
1923         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925         --aExp;
1926     }
1927     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1928 
1929 }
1930 
1931 /*----------------------------------------------------------------------------
1932 | Rounds the single-precision floating-point value `a' to an integer, and
1933 | returns the result as a single-precision floating-point value.  The
1934 | operation is performed according to the IEC/IEEE Standard for Binary
1935 | Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937 
1938 float32 float32_round_to_int(float32 a, float_status *status)
1939 {
1940     flag aSign;
1941     int aExp;
1942     uint32_t lastBitMask, roundBitsMask;
1943     uint32_t z;
1944     a = float32_squash_input_denormal(a, status);
1945 
1946     aExp = extractFloat32Exp( a );
1947     if ( 0x96 <= aExp ) {
1948         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1949             return propagateFloat32NaN(a, a, status);
1950         }
1951         return a;
1952     }
1953     if ( aExp <= 0x7E ) {
1954         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1955         status->float_exception_flags |= float_flag_inexact;
1956         aSign = extractFloat32Sign( a );
1957         switch (status->float_rounding_mode) {
1958          case float_round_nearest_even:
1959             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960                 return packFloat32( aSign, 0x7F, 0 );
1961             }
1962             break;
1963         case float_round_ties_away:
1964             if (aExp == 0x7E) {
1965                 return packFloat32(aSign, 0x7F, 0);
1966             }
1967             break;
1968          case float_round_down:
1969             return make_float32(aSign ? 0xBF800000 : 0);
1970          case float_round_up:
1971             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1972         }
1973         return packFloat32( aSign, 0, 0 );
1974     }
1975     lastBitMask = 1;
1976     lastBitMask <<= 0x96 - aExp;
1977     roundBitsMask = lastBitMask - 1;
1978     z = float32_val(a);
1979     switch (status->float_rounding_mode) {
1980     case float_round_nearest_even:
1981         z += lastBitMask>>1;
1982         if ((z & roundBitsMask) == 0) {
1983             z &= ~lastBitMask;
1984         }
1985         break;
1986     case float_round_ties_away:
1987         z += lastBitMask >> 1;
1988         break;
1989     case float_round_to_zero:
1990         break;
1991     case float_round_up:
1992         if (!extractFloat32Sign(make_float32(z))) {
1993             z += roundBitsMask;
1994         }
1995         break;
1996     case float_round_down:
1997         if (extractFloat32Sign(make_float32(z))) {
1998             z += roundBitsMask;
1999         }
2000         break;
2001     default:
2002         abort();
2003     }
2004     z &= ~ roundBitsMask;
2005     if (z != float32_val(a)) {
2006         status->float_exception_flags |= float_flag_inexact;
2007     }
2008     return make_float32(z);
2009 
2010 }
2011 
2012 /*----------------------------------------------------------------------------
2013 | Returns the result of adding the absolute values of the single-precision
2014 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2015 | before being returned.  `zSign' is ignored if the result is a NaN.
2016 | The addition is performed according to the IEC/IEEE Standard for Binary
2017 | Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019 
2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021                               float_status *status)
2022 {
2023     int aExp, bExp, zExp;
2024     uint32_t aSig, bSig, zSig;
2025     int expDiff;
2026 
2027     aSig = extractFloat32Frac( a );
2028     aExp = extractFloat32Exp( a );
2029     bSig = extractFloat32Frac( b );
2030     bExp = extractFloat32Exp( b );
2031     expDiff = aExp - bExp;
2032     aSig <<= 6;
2033     bSig <<= 6;
2034     if ( 0 < expDiff ) {
2035         if ( aExp == 0xFF ) {
2036             if (aSig) {
2037                 return propagateFloat32NaN(a, b, status);
2038             }
2039             return a;
2040         }
2041         if ( bExp == 0 ) {
2042             --expDiff;
2043         }
2044         else {
2045             bSig |= 0x20000000;
2046         }
2047         shift32RightJamming( bSig, expDiff, &bSig );
2048         zExp = aExp;
2049     }
2050     else if ( expDiff < 0 ) {
2051         if ( bExp == 0xFF ) {
2052             if (bSig) {
2053                 return propagateFloat32NaN(a, b, status);
2054             }
2055             return packFloat32( zSign, 0xFF, 0 );
2056         }
2057         if ( aExp == 0 ) {
2058             ++expDiff;
2059         }
2060         else {
2061             aSig |= 0x20000000;
2062         }
2063         shift32RightJamming( aSig, - expDiff, &aSig );
2064         zExp = bExp;
2065     }
2066     else {
2067         if ( aExp == 0xFF ) {
2068             if (aSig | bSig) {
2069                 return propagateFloat32NaN(a, b, status);
2070             }
2071             return a;
2072         }
2073         if ( aExp == 0 ) {
2074             if (status->flush_to_zero) {
2075                 if (aSig | bSig) {
2076                     float_raise(float_flag_output_denormal, status);
2077                 }
2078                 return packFloat32(zSign, 0, 0);
2079             }
2080             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081         }
2082         zSig = 0x40000000 + aSig + bSig;
2083         zExp = aExp;
2084         goto roundAndPack;
2085     }
2086     aSig |= 0x20000000;
2087     zSig = ( aSig + bSig )<<1;
2088     --zExp;
2089     if ( (int32_t) zSig < 0 ) {
2090         zSig = aSig + bSig;
2091         ++zExp;
2092     }
2093  roundAndPack:
2094     return roundAndPackFloat32(zSign, zExp, zSig, status);
2095 
2096 }
2097 
2098 /*----------------------------------------------------------------------------
2099 | Returns the result of subtracting the absolute values of the single-
2100 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2101 | difference is negated before being returned.  `zSign' is ignored if the
2102 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2103 | Standard for Binary Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2105 
2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107                               float_status *status)
2108 {
2109     int aExp, bExp, zExp;
2110     uint32_t aSig, bSig, zSig;
2111     int expDiff;
2112 
2113     aSig = extractFloat32Frac( a );
2114     aExp = extractFloat32Exp( a );
2115     bSig = extractFloat32Frac( b );
2116     bExp = extractFloat32Exp( b );
2117     expDiff = aExp - bExp;
2118     aSig <<= 7;
2119     bSig <<= 7;
2120     if ( 0 < expDiff ) goto aExpBigger;
2121     if ( expDiff < 0 ) goto bExpBigger;
2122     if ( aExp == 0xFF ) {
2123         if (aSig | bSig) {
2124             return propagateFloat32NaN(a, b, status);
2125         }
2126         float_raise(float_flag_invalid, status);
2127         return float32_default_nan(status);
2128     }
2129     if ( aExp == 0 ) {
2130         aExp = 1;
2131         bExp = 1;
2132     }
2133     if ( bSig < aSig ) goto aBigger;
2134     if ( aSig < bSig ) goto bBigger;
2135     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2136  bExpBigger:
2137     if ( bExp == 0xFF ) {
2138         if (bSig) {
2139             return propagateFloat32NaN(a, b, status);
2140         }
2141         return packFloat32( zSign ^ 1, 0xFF, 0 );
2142     }
2143     if ( aExp == 0 ) {
2144         ++expDiff;
2145     }
2146     else {
2147         aSig |= 0x40000000;
2148     }
2149     shift32RightJamming( aSig, - expDiff, &aSig );
2150     bSig |= 0x40000000;
2151  bBigger:
2152     zSig = bSig - aSig;
2153     zExp = bExp;
2154     zSign ^= 1;
2155     goto normalizeRoundAndPack;
2156  aExpBigger:
2157     if ( aExp == 0xFF ) {
2158         if (aSig) {
2159             return propagateFloat32NaN(a, b, status);
2160         }
2161         return a;
2162     }
2163     if ( bExp == 0 ) {
2164         --expDiff;
2165     }
2166     else {
2167         bSig |= 0x40000000;
2168     }
2169     shift32RightJamming( bSig, expDiff, &bSig );
2170     aSig |= 0x40000000;
2171  aBigger:
2172     zSig = aSig - bSig;
2173     zExp = aExp;
2174  normalizeRoundAndPack:
2175     --zExp;
2176     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2177 
2178 }
2179 
2180 /*----------------------------------------------------------------------------
2181 | Returns the result of adding the single-precision floating-point values `a'
2182 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2183 | Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185 
2186 float32 float32_add(float32 a, float32 b, float_status *status)
2187 {
2188     flag aSign, bSign;
2189     a = float32_squash_input_denormal(a, status);
2190     b = float32_squash_input_denormal(b, status);
2191 
2192     aSign = extractFloat32Sign( a );
2193     bSign = extractFloat32Sign( b );
2194     if ( aSign == bSign ) {
2195         return addFloat32Sigs(a, b, aSign, status);
2196     }
2197     else {
2198         return subFloat32Sigs(a, b, aSign, status);
2199     }
2200 
2201 }
2202 
2203 /*----------------------------------------------------------------------------
2204 | Returns the result of subtracting the single-precision floating-point values
2205 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2206 | for Binary Floating-Point Arithmetic.
2207 *----------------------------------------------------------------------------*/
2208 
2209 float32 float32_sub(float32 a, float32 b, float_status *status)
2210 {
2211     flag aSign, bSign;
2212     a = float32_squash_input_denormal(a, status);
2213     b = float32_squash_input_denormal(b, status);
2214 
2215     aSign = extractFloat32Sign( a );
2216     bSign = extractFloat32Sign( b );
2217     if ( aSign == bSign ) {
2218         return subFloat32Sigs(a, b, aSign, status);
2219     }
2220     else {
2221         return addFloat32Sigs(a, b, aSign, status);
2222     }
2223 
2224 }
2225 
2226 /*----------------------------------------------------------------------------
2227 | Returns the result of multiplying the single-precision floating-point values
2228 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2229 | for Binary Floating-Point Arithmetic.
2230 *----------------------------------------------------------------------------*/
2231 
2232 float32 float32_mul(float32 a, float32 b, float_status *status)
2233 {
2234     flag aSign, bSign, zSign;
2235     int aExp, bExp, zExp;
2236     uint32_t aSig, bSig;
2237     uint64_t zSig64;
2238     uint32_t zSig;
2239 
2240     a = float32_squash_input_denormal(a, status);
2241     b = float32_squash_input_denormal(b, status);
2242 
2243     aSig = extractFloat32Frac( a );
2244     aExp = extractFloat32Exp( a );
2245     aSign = extractFloat32Sign( a );
2246     bSig = extractFloat32Frac( b );
2247     bExp = extractFloat32Exp( b );
2248     bSign = extractFloat32Sign( b );
2249     zSign = aSign ^ bSign;
2250     if ( aExp == 0xFF ) {
2251         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2252             return propagateFloat32NaN(a, b, status);
2253         }
2254         if ( ( bExp | bSig ) == 0 ) {
2255             float_raise(float_flag_invalid, status);
2256             return float32_default_nan(status);
2257         }
2258         return packFloat32( zSign, 0xFF, 0 );
2259     }
2260     if ( bExp == 0xFF ) {
2261         if (bSig) {
2262             return propagateFloat32NaN(a, b, status);
2263         }
2264         if ( ( aExp | aSig ) == 0 ) {
2265             float_raise(float_flag_invalid, status);
2266             return float32_default_nan(status);
2267         }
2268         return packFloat32( zSign, 0xFF, 0 );
2269     }
2270     if ( aExp == 0 ) {
2271         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273     }
2274     if ( bExp == 0 ) {
2275         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277     }
2278     zExp = aExp + bExp - 0x7F;
2279     aSig = ( aSig | 0x00800000 )<<7;
2280     bSig = ( bSig | 0x00800000 )<<8;
2281     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2282     zSig = zSig64;
2283     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2284         zSig <<= 1;
2285         --zExp;
2286     }
2287     return roundAndPackFloat32(zSign, zExp, zSig, status);
2288 
2289 }
2290 
2291 /*----------------------------------------------------------------------------
2292 | Returns the result of dividing the single-precision floating-point value `a'
2293 | by the corresponding value `b'.  The operation is performed according to the
2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 *----------------------------------------------------------------------------*/
2296 
2297 float32 float32_div(float32 a, float32 b, float_status *status)
2298 {
2299     flag aSign, bSign, zSign;
2300     int aExp, bExp, zExp;
2301     uint32_t aSig, bSig, zSig;
2302     a = float32_squash_input_denormal(a, status);
2303     b = float32_squash_input_denormal(b, status);
2304 
2305     aSig = extractFloat32Frac( a );
2306     aExp = extractFloat32Exp( a );
2307     aSign = extractFloat32Sign( a );
2308     bSig = extractFloat32Frac( b );
2309     bExp = extractFloat32Exp( b );
2310     bSign = extractFloat32Sign( b );
2311     zSign = aSign ^ bSign;
2312     if ( aExp == 0xFF ) {
2313         if (aSig) {
2314             return propagateFloat32NaN(a, b, status);
2315         }
2316         if ( bExp == 0xFF ) {
2317             if (bSig) {
2318                 return propagateFloat32NaN(a, b, status);
2319             }
2320             float_raise(float_flag_invalid, status);
2321             return float32_default_nan(status);
2322         }
2323         return packFloat32( zSign, 0xFF, 0 );
2324     }
2325     if ( bExp == 0xFF ) {
2326         if (bSig) {
2327             return propagateFloat32NaN(a, b, status);
2328         }
2329         return packFloat32( zSign, 0, 0 );
2330     }
2331     if ( bExp == 0 ) {
2332         if ( bSig == 0 ) {
2333             if ( ( aExp | aSig ) == 0 ) {
2334                 float_raise(float_flag_invalid, status);
2335                 return float32_default_nan(status);
2336             }
2337             float_raise(float_flag_divbyzero, status);
2338             return packFloat32( zSign, 0xFF, 0 );
2339         }
2340         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341     }
2342     if ( aExp == 0 ) {
2343         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345     }
2346     zExp = aExp - bExp + 0x7D;
2347     aSig = ( aSig | 0x00800000 )<<7;
2348     bSig = ( bSig | 0x00800000 )<<8;
2349     if ( bSig <= ( aSig + aSig ) ) {
2350         aSig >>= 1;
2351         ++zExp;
2352     }
2353     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2354     if ( ( zSig & 0x3F ) == 0 ) {
2355         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2356     }
2357     return roundAndPackFloat32(zSign, zExp, zSig, status);
2358 
2359 }
2360 
2361 /*----------------------------------------------------------------------------
2362 | Returns the remainder of the single-precision floating-point value `a'
2363 | with respect to the corresponding value `b'.  The operation is performed
2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366 
2367 float32 float32_rem(float32 a, float32 b, float_status *status)
2368 {
2369     flag aSign, zSign;
2370     int aExp, bExp, expDiff;
2371     uint32_t aSig, bSig;
2372     uint32_t q;
2373     uint64_t aSig64, bSig64, q64;
2374     uint32_t alternateASig;
2375     int32_t sigMean;
2376     a = float32_squash_input_denormal(a, status);
2377     b = float32_squash_input_denormal(b, status);
2378 
2379     aSig = extractFloat32Frac( a );
2380     aExp = extractFloat32Exp( a );
2381     aSign = extractFloat32Sign( a );
2382     bSig = extractFloat32Frac( b );
2383     bExp = extractFloat32Exp( b );
2384     if ( aExp == 0xFF ) {
2385         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2386             return propagateFloat32NaN(a, b, status);
2387         }
2388         float_raise(float_flag_invalid, status);
2389         return float32_default_nan(status);
2390     }
2391     if ( bExp == 0xFF ) {
2392         if (bSig) {
2393             return propagateFloat32NaN(a, b, status);
2394         }
2395         return a;
2396     }
2397     if ( bExp == 0 ) {
2398         if ( bSig == 0 ) {
2399             float_raise(float_flag_invalid, status);
2400             return float32_default_nan(status);
2401         }
2402         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403     }
2404     if ( aExp == 0 ) {
2405         if ( aSig == 0 ) return a;
2406         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407     }
2408     expDiff = aExp - bExp;
2409     aSig |= 0x00800000;
2410     bSig |= 0x00800000;
2411     if ( expDiff < 32 ) {
2412         aSig <<= 8;
2413         bSig <<= 8;
2414         if ( expDiff < 0 ) {
2415             if ( expDiff < -1 ) return a;
2416             aSig >>= 1;
2417         }
2418         q = ( bSig <= aSig );
2419         if ( q ) aSig -= bSig;
2420         if ( 0 < expDiff ) {
2421             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2422             q >>= 32 - expDiff;
2423             bSig >>= 2;
2424             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425         }
2426         else {
2427             aSig >>= 2;
2428             bSig >>= 2;
2429         }
2430     }
2431     else {
2432         if ( bSig <= aSig ) aSig -= bSig;
2433         aSig64 = ( (uint64_t) aSig )<<40;
2434         bSig64 = ( (uint64_t) bSig )<<40;
2435         expDiff -= 64;
2436         while ( 0 < expDiff ) {
2437             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439             aSig64 = - ( ( bSig * q64 )<<38 );
2440             expDiff -= 62;
2441         }
2442         expDiff += 64;
2443         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445         q = q64>>( 64 - expDiff );
2446         bSig <<= 6;
2447         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448     }
2449     do {
2450         alternateASig = aSig;
2451         ++q;
2452         aSig -= bSig;
2453     } while ( 0 <= (int32_t) aSig );
2454     sigMean = aSig + alternateASig;
2455     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456         aSig = alternateASig;
2457     }
2458     zSign = ( (int32_t) aSig < 0 );
2459     if ( zSign ) aSig = - aSig;
2460     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2461 }
2462 
2463 /*----------------------------------------------------------------------------
2464 | Returns the result of multiplying the single-precision floating-point values
2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2466 | multiplication.  The operation is performed according to the IEC/IEEE
2467 | Standard for Binary Floating-Point Arithmetic 754-2008.
2468 | The flags argument allows the caller to select negation of the
2469 | addend, the intermediate product, or the final result. (The difference
2470 | between this and having the caller do a separate negation is that negating
2471 | externally will flip the sign bit on NaNs.)
2472 *----------------------------------------------------------------------------*/
2473 
2474 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475                        float_status *status)
2476 {
2477     flag aSign, bSign, cSign, zSign;
2478     int aExp, bExp, cExp, pExp, zExp, expDiff;
2479     uint32_t aSig, bSig, cSig;
2480     flag pInf, pZero, pSign;
2481     uint64_t pSig64, cSig64, zSig64;
2482     uint32_t pSig;
2483     int shiftcount;
2484     flag signflip, infzero;
2485 
2486     a = float32_squash_input_denormal(a, status);
2487     b = float32_squash_input_denormal(b, status);
2488     c = float32_squash_input_denormal(c, status);
2489     aSig = extractFloat32Frac(a);
2490     aExp = extractFloat32Exp(a);
2491     aSign = extractFloat32Sign(a);
2492     bSig = extractFloat32Frac(b);
2493     bExp = extractFloat32Exp(b);
2494     bSign = extractFloat32Sign(b);
2495     cSig = extractFloat32Frac(c);
2496     cExp = extractFloat32Exp(c);
2497     cSign = extractFloat32Sign(c);
2498 
2499     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501 
2502     /* It is implementation-defined whether the cases of (0,inf,qnan)
2503      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504      * they return if they do), so we have to hand this information
2505      * off to the target-specific pick-a-NaN routine.
2506      */
2507     if (((aExp == 0xff) && aSig) ||
2508         ((bExp == 0xff) && bSig) ||
2509         ((cExp == 0xff) && cSig)) {
2510         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2511     }
2512 
2513     if (infzero) {
2514         float_raise(float_flag_invalid, status);
2515         return float32_default_nan(status);
2516     }
2517 
2518     if (flags & float_muladd_negate_c) {
2519         cSign ^= 1;
2520     }
2521 
2522     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523 
2524     /* Work out the sign and type of the product */
2525     pSign = aSign ^ bSign;
2526     if (flags & float_muladd_negate_product) {
2527         pSign ^= 1;
2528     }
2529     pInf = (aExp == 0xff) || (bExp == 0xff);
2530     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531 
2532     if (cExp == 0xff) {
2533         if (pInf && (pSign ^ cSign)) {
2534             /* addition of opposite-signed infinities => InvalidOperation */
2535             float_raise(float_flag_invalid, status);
2536             return float32_default_nan(status);
2537         }
2538         /* Otherwise generate an infinity of the same sign */
2539         return packFloat32(cSign ^ signflip, 0xff, 0);
2540     }
2541 
2542     if (pInf) {
2543         return packFloat32(pSign ^ signflip, 0xff, 0);
2544     }
2545 
2546     if (pZero) {
2547         if (cExp == 0) {
2548             if (cSig == 0) {
2549                 /* Adding two exact zeroes */
2550                 if (pSign == cSign) {
2551                     zSign = pSign;
2552                 } else if (status->float_rounding_mode == float_round_down) {
2553                     zSign = 1;
2554                 } else {
2555                     zSign = 0;
2556                 }
2557                 return packFloat32(zSign ^ signflip, 0, 0);
2558             }
2559             /* Exact zero plus a denorm */
2560             if (status->flush_to_zero) {
2561                 float_raise(float_flag_output_denormal, status);
2562                 return packFloat32(cSign ^ signflip, 0, 0);
2563             }
2564         }
2565         /* Zero plus something non-zero : just return the something */
2566         if (flags & float_muladd_halve_result) {
2567             if (cExp == 0) {
2568                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569             }
2570             /* Subtract one to halve, and one again because roundAndPackFloat32
2571              * wants one less than the true exponent.
2572              */
2573             cExp -= 2;
2574             cSig = (cSig | 0x00800000) << 7;
2575             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2576         }
2577         return packFloat32(cSign ^ signflip, cExp, cSig);
2578     }
2579 
2580     if (aExp == 0) {
2581         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582     }
2583     if (bExp == 0) {
2584         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585     }
2586 
2587     /* Calculate the actual result a * b + c */
2588 
2589     /* Multiply first; this is easy. */
2590     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591      * because we want the true exponent, not the "one-less-than"
2592      * flavour that roundAndPackFloat32() takes.
2593      */
2594     pExp = aExp + bExp - 0x7e;
2595     aSig = (aSig | 0x00800000) << 7;
2596     bSig = (bSig | 0x00800000) << 8;
2597     pSig64 = (uint64_t)aSig * bSig;
2598     if ((int64_t)(pSig64 << 1) >= 0) {
2599         pSig64 <<= 1;
2600         pExp--;
2601     }
2602 
2603     zSign = pSign ^ signflip;
2604 
2605     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606      * position 62.
2607      */
2608     if (cExp == 0) {
2609         if (!cSig) {
2610             /* Throw out the special case of c being an exact zero now */
2611             shift64RightJamming(pSig64, 32, &pSig64);
2612             pSig = pSig64;
2613             if (flags & float_muladd_halve_result) {
2614                 pExp--;
2615             }
2616             return roundAndPackFloat32(zSign, pExp - 1,
2617                                        pSig, status);
2618         }
2619         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620     }
2621 
2622     cSig64 = (uint64_t)cSig << (62 - 23);
2623     cSig64 |= LIT64(0x4000000000000000);
2624     expDiff = pExp - cExp;
2625 
2626     if (pSign == cSign) {
2627         /* Addition */
2628         if (expDiff > 0) {
2629             /* scale c to match p */
2630             shift64RightJamming(cSig64, expDiff, &cSig64);
2631             zExp = pExp;
2632         } else if (expDiff < 0) {
2633             /* scale p to match c */
2634             shift64RightJamming(pSig64, -expDiff, &pSig64);
2635             zExp = cExp;
2636         } else {
2637             /* no scaling needed */
2638             zExp = cExp;
2639         }
2640         /* Add significands and make sure explicit bit ends up in posn 62 */
2641         zSig64 = pSig64 + cSig64;
2642         if ((int64_t)zSig64 < 0) {
2643             shift64RightJamming(zSig64, 1, &zSig64);
2644         } else {
2645             zExp--;
2646         }
2647     } else {
2648         /* Subtraction */
2649         if (expDiff > 0) {
2650             shift64RightJamming(cSig64, expDiff, &cSig64);
2651             zSig64 = pSig64 - cSig64;
2652             zExp = pExp;
2653         } else if (expDiff < 0) {
2654             shift64RightJamming(pSig64, -expDiff, &pSig64);
2655             zSig64 = cSig64 - pSig64;
2656             zExp = cExp;
2657             zSign ^= 1;
2658         } else {
2659             zExp = pExp;
2660             if (cSig64 < pSig64) {
2661                 zSig64 = pSig64 - cSig64;
2662             } else if (pSig64 < cSig64) {
2663                 zSig64 = cSig64 - pSig64;
2664                 zSign ^= 1;
2665             } else {
2666                 /* Exact zero */
2667                 zSign = signflip;
2668                 if (status->float_rounding_mode == float_round_down) {
2669                     zSign ^= 1;
2670                 }
2671                 return packFloat32(zSign, 0, 0);
2672             }
2673         }
2674         --zExp;
2675         /* Normalize to put the explicit bit back into bit 62. */
2676         shiftcount = countLeadingZeros64(zSig64) - 1;
2677         zSig64 <<= shiftcount;
2678         zExp -= shiftcount;
2679     }
2680     if (flags & float_muladd_halve_result) {
2681         zExp--;
2682     }
2683 
2684     shift64RightJamming(zSig64, 32, &zSig64);
2685     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2686 }
2687 
2688 
2689 /*----------------------------------------------------------------------------
2690 | Returns the square root of the single-precision floating-point value `a'.
2691 | The operation is performed according to the IEC/IEEE Standard for Binary
2692 | Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694 
2695 float32 float32_sqrt(float32 a, float_status *status)
2696 {
2697     flag aSign;
2698     int aExp, zExp;
2699     uint32_t aSig, zSig;
2700     uint64_t rem, term;
2701     a = float32_squash_input_denormal(a, status);
2702 
2703     aSig = extractFloat32Frac( a );
2704     aExp = extractFloat32Exp( a );
2705     aSign = extractFloat32Sign( a );
2706     if ( aExp == 0xFF ) {
2707         if (aSig) {
2708             return propagateFloat32NaN(a, float32_zero, status);
2709         }
2710         if ( ! aSign ) return a;
2711         float_raise(float_flag_invalid, status);
2712         return float32_default_nan(status);
2713     }
2714     if ( aSign ) {
2715         if ( ( aExp | aSig ) == 0 ) return a;
2716         float_raise(float_flag_invalid, status);
2717         return float32_default_nan(status);
2718     }
2719     if ( aExp == 0 ) {
2720         if ( aSig == 0 ) return float32_zero;
2721         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722     }
2723     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724     aSig = ( aSig | 0x00800000 )<<8;
2725     zSig = estimateSqrt32( aExp, aSig ) + 2;
2726     if ( ( zSig & 0x7F ) <= 5 ) {
2727         if ( zSig < 2 ) {
2728             zSig = 0x7FFFFFFF;
2729             goto roundAndPack;
2730         }
2731         aSig >>= aExp & 1;
2732         term = ( (uint64_t) zSig ) * zSig;
2733         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734         while ( (int64_t) rem < 0 ) {
2735             --zSig;
2736             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2737         }
2738         zSig |= ( rem != 0 );
2739     }
2740     shift32RightJamming( zSig, 1, &zSig );
2741  roundAndPack:
2742     return roundAndPackFloat32(0, zExp, zSig, status);
2743 
2744 }
2745 
2746 /*----------------------------------------------------------------------------
2747 | Returns the binary exponential of the single-precision floating-point value
2748 | `a'. The operation is performed according to the IEC/IEEE Standard for
2749 | Binary Floating-Point Arithmetic.
2750 |
2751 | Uses the following identities:
2752 |
2753 | 1. -------------------------------------------------------------------------
2754 |      x    x*ln(2)
2755 |     2  = e
2756 |
2757 | 2. -------------------------------------------------------------------------
2758 |                      2     3     4     5           n
2759 |      x        x     x     x     x     x           x
2760 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761 |               1!    2!    3!    4!    5!          n!
2762 *----------------------------------------------------------------------------*/
2763 
2764 static const float64 float32_exp2_coefficients[15] =
2765 {
2766     const_float64( 0x3ff0000000000000ll ), /*  1 */
2767     const_float64( 0x3fe0000000000000ll ), /*  2 */
2768     const_float64( 0x3fc5555555555555ll ), /*  3 */
2769     const_float64( 0x3fa5555555555555ll ), /*  4 */
2770     const_float64( 0x3f81111111111111ll ), /*  5 */
2771     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2772     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2773     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2774     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2775     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2781 };
2782 
2783 float32 float32_exp2(float32 a, float_status *status)
2784 {
2785     flag aSign;
2786     int aExp;
2787     uint32_t aSig;
2788     float64 r, x, xn;
2789     int i;
2790     a = float32_squash_input_denormal(a, status);
2791 
2792     aSig = extractFloat32Frac( a );
2793     aExp = extractFloat32Exp( a );
2794     aSign = extractFloat32Sign( a );
2795 
2796     if ( aExp == 0xFF) {
2797         if (aSig) {
2798             return propagateFloat32NaN(a, float32_zero, status);
2799         }
2800         return (aSign) ? float32_zero : a;
2801     }
2802     if (aExp == 0) {
2803         if (aSig == 0) return float32_one;
2804     }
2805 
2806     float_raise(float_flag_inexact, status);
2807 
2808     /* ******************************* */
2809     /* using float64 for approximation */
2810     /* ******************************* */
2811     x = float32_to_float64(a, status);
2812     x = float64_mul(x, float64_ln2, status);
2813 
2814     xn = x;
2815     r = float64_one;
2816     for (i = 0 ; i < 15 ; i++) {
2817         float64 f;
2818 
2819         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820         r = float64_add(r, f, status);
2821 
2822         xn = float64_mul(xn, x, status);
2823     }
2824 
2825     return float64_to_float32(r, status);
2826 }
2827 
2828 /*----------------------------------------------------------------------------
2829 | Returns the binary log of the single-precision floating-point value `a'.
2830 | The operation is performed according to the IEC/IEEE Standard for Binary
2831 | Floating-Point Arithmetic.
2832 *----------------------------------------------------------------------------*/
2833 float32 float32_log2(float32 a, float_status *status)
2834 {
2835     flag aSign, zSign;
2836     int aExp;
2837     uint32_t aSig, zSig, i;
2838 
2839     a = float32_squash_input_denormal(a, status);
2840     aSig = extractFloat32Frac( a );
2841     aExp = extractFloat32Exp( a );
2842     aSign = extractFloat32Sign( a );
2843 
2844     if ( aExp == 0 ) {
2845         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847     }
2848     if ( aSign ) {
2849         float_raise(float_flag_invalid, status);
2850         return float32_default_nan(status);
2851     }
2852     if ( aExp == 0xFF ) {
2853         if (aSig) {
2854             return propagateFloat32NaN(a, float32_zero, status);
2855         }
2856         return a;
2857     }
2858 
2859     aExp -= 0x7F;
2860     aSig |= 0x00800000;
2861     zSign = aExp < 0;
2862     zSig = aExp << 23;
2863 
2864     for (i = 1 << 22; i > 0; i >>= 1) {
2865         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2866         if ( aSig & 0x01000000 ) {
2867             aSig >>= 1;
2868             zSig |= i;
2869         }
2870     }
2871 
2872     if ( zSign )
2873         zSig = -zSig;
2874 
2875     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2876 }
2877 
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2881 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884 
2885 int float32_eq(float32 a, float32 b, float_status *status)
2886 {
2887     uint32_t av, bv;
2888     a = float32_squash_input_denormal(a, status);
2889     b = float32_squash_input_denormal(b, status);
2890 
2891     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893        ) {
2894         float_raise(float_flag_invalid, status);
2895         return 0;
2896     }
2897     av = float32_val(a);
2898     bv = float32_val(b);
2899     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2900 }
2901 
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than
2904 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2905 | exception is raised if either operand is a NaN.  The comparison is performed
2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2908 
2909 int float32_le(float32 a, float32 b, float_status *status)
2910 {
2911     flag aSign, bSign;
2912     uint32_t av, bv;
2913     a = float32_squash_input_denormal(a, status);
2914     b = float32_squash_input_denormal(b, status);
2915 
2916     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918        ) {
2919         float_raise(float_flag_invalid, status);
2920         return 0;
2921     }
2922     aSign = extractFloat32Sign( a );
2923     bSign = extractFloat32Sign( b );
2924     av = float32_val(a);
2925     bv = float32_val(b);
2926     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2927     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2928 
2929 }
2930 
2931 /*----------------------------------------------------------------------------
2932 | Returns 1 if the single-precision floating-point value `a' is less than
2933 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2934 | raised if either operand is a NaN.  The comparison is performed according
2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936 *----------------------------------------------------------------------------*/
2937 
2938 int float32_lt(float32 a, float32 b, float_status *status)
2939 {
2940     flag aSign, bSign;
2941     uint32_t av, bv;
2942     a = float32_squash_input_denormal(a, status);
2943     b = float32_squash_input_denormal(b, status);
2944 
2945     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947        ) {
2948         float_raise(float_flag_invalid, status);
2949         return 0;
2950     }
2951     aSign = extractFloat32Sign( a );
2952     bSign = extractFloat32Sign( b );
2953     av = float32_val(a);
2954     bv = float32_val(b);
2955     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2956     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2957 
2958 }
2959 
2960 /*----------------------------------------------------------------------------
2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962 | be compared, and 0 otherwise.  The invalid exception is raised if either
2963 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2964 | Standard for Binary Floating-Point Arithmetic.
2965 *----------------------------------------------------------------------------*/
2966 
2967 int float32_unordered(float32 a, float32 b, float_status *status)
2968 {
2969     a = float32_squash_input_denormal(a, status);
2970     b = float32_squash_input_denormal(b, status);
2971 
2972     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974        ) {
2975         float_raise(float_flag_invalid, status);
2976         return 1;
2977     }
2978     return 0;
2979 }
2980 
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is equal to
2983 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2984 | exception.  The comparison is performed according to the IEC/IEEE Standard
2985 | for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2987 
2988 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2989 {
2990     a = float32_squash_input_denormal(a, status);
2991     b = float32_squash_input_denormal(b, status);
2992 
2993     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995        ) {
2996         if (float32_is_signaling_nan(a, status)
2997          || float32_is_signaling_nan(b, status)) {
2998             float_raise(float_flag_invalid, status);
2999         }
3000         return 0;
3001     }
3002     return ( float32_val(a) == float32_val(b) ) ||
3003             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3004 }
3005 
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the single-precision floating-point value `a' is less than or
3008 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3009 | cause an exception.  Otherwise, the comparison is performed according to the
3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012 
3013 int float32_le_quiet(float32 a, float32 b, float_status *status)
3014 {
3015     flag aSign, bSign;
3016     uint32_t av, bv;
3017     a = float32_squash_input_denormal(a, status);
3018     b = float32_squash_input_denormal(b, status);
3019 
3020     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022        ) {
3023         if (float32_is_signaling_nan(a, status)
3024          || float32_is_signaling_nan(b, status)) {
3025             float_raise(float_flag_invalid, status);
3026         }
3027         return 0;
3028     }
3029     aSign = extractFloat32Sign( a );
3030     bSign = extractFloat32Sign( b );
3031     av = float32_val(a);
3032     bv = float32_val(b);
3033     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3034     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3035 
3036 }
3037 
3038 /*----------------------------------------------------------------------------
3039 | Returns 1 if the single-precision floating-point value `a' is less than
3040 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3041 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3042 | Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3044 
3045 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3046 {
3047     flag aSign, bSign;
3048     uint32_t av, bv;
3049     a = float32_squash_input_denormal(a, status);
3050     b = float32_squash_input_denormal(b, status);
3051 
3052     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054        ) {
3055         if (float32_is_signaling_nan(a, status)
3056          || float32_is_signaling_nan(b, status)) {
3057             float_raise(float_flag_invalid, status);
3058         }
3059         return 0;
3060     }
3061     aSign = extractFloat32Sign( a );
3062     bSign = extractFloat32Sign( b );
3063     av = float32_val(a);
3064     bv = float32_val(b);
3065     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3066     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3067 
3068 }
3069 
3070 /*----------------------------------------------------------------------------
3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3073 | comparison is performed according to the IEC/IEEE Standard for Binary
3074 | Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076 
3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3078 {
3079     a = float32_squash_input_denormal(a, status);
3080     b = float32_squash_input_denormal(b, status);
3081 
3082     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084        ) {
3085         if (float32_is_signaling_nan(a, status)
3086          || float32_is_signaling_nan(b, status)) {
3087             float_raise(float_flag_invalid, status);
3088         }
3089         return 1;
3090     }
3091     return 0;
3092 }
3093 
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format.  The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic---which means in particular that the conversion is rounded
3099 | according to the current rounding mode.  If `a' is a NaN, the largest
3100 | positive integer is returned.  Otherwise, if the conversion overflows, the
3101 | largest integer with the same sign as `a' is returned.
3102 *----------------------------------------------------------------------------*/
3103 
3104 int32_t float64_to_int32(float64 a, float_status *status)
3105 {
3106     flag aSign;
3107     int aExp;
3108     int shiftCount;
3109     uint64_t aSig;
3110     a = float64_squash_input_denormal(a, status);
3111 
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     aSign = extractFloat64Sign( a );
3115     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117     shiftCount = 0x42C - aExp;
3118     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3119     return roundAndPackInt32(aSign, aSig, status);
3120 
3121 }
3122 
3123 /*----------------------------------------------------------------------------
3124 | Returns the result of converting the double-precision floating-point value
3125 | `a' to the 32-bit two's complement integer format.  The conversion is
3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3127 | Arithmetic, except that the conversion is always rounded toward zero.
3128 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3129 | the conversion overflows, the largest integer with the same sign as `a' is
3130 | returned.
3131 *----------------------------------------------------------------------------*/
3132 
3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3134 {
3135     flag aSign;
3136     int aExp;
3137     int shiftCount;
3138     uint64_t aSig, savedASig;
3139     int32_t z;
3140     a = float64_squash_input_denormal(a, status);
3141 
3142     aSig = extractFloat64Frac( a );
3143     aExp = extractFloat64Exp( a );
3144     aSign = extractFloat64Sign( a );
3145     if ( 0x41E < aExp ) {
3146         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147         goto invalid;
3148     }
3149     else if ( aExp < 0x3FF ) {
3150         if (aExp || aSig) {
3151             status->float_exception_flags |= float_flag_inexact;
3152         }
3153         return 0;
3154     }
3155     aSig |= LIT64( 0x0010000000000000 );
3156     shiftCount = 0x433 - aExp;
3157     savedASig = aSig;
3158     aSig >>= shiftCount;
3159     z = aSig;
3160     if ( aSign ) z = - z;
3161     if ( ( z < 0 ) ^ aSign ) {
3162  invalid:
3163         float_raise(float_flag_invalid, status);
3164         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3165     }
3166     if ( ( aSig<<shiftCount ) != savedASig ) {
3167         status->float_exception_flags |= float_flag_inexact;
3168     }
3169     return z;
3170 
3171 }
3172 
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of converting the double-precision floating-point value
3175 | `a' to the 16-bit two's complement integer format.  The conversion is
3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3177 | Arithmetic, except that the conversion is always rounded toward zero.
3178 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3179 | the conversion overflows, the largest integer with the same sign as `a' is
3180 | returned.
3181 *----------------------------------------------------------------------------*/
3182 
3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3184 {
3185     flag aSign;
3186     int aExp;
3187     int shiftCount;
3188     uint64_t aSig, savedASig;
3189     int32_t z;
3190 
3191     aSig = extractFloat64Frac( a );
3192     aExp = extractFloat64Exp( a );
3193     aSign = extractFloat64Sign( a );
3194     if ( 0x40E < aExp ) {
3195         if ( ( aExp == 0x7FF ) && aSig ) {
3196             aSign = 0;
3197         }
3198         goto invalid;
3199     }
3200     else if ( aExp < 0x3FF ) {
3201         if ( aExp || aSig ) {
3202             status->float_exception_flags |= float_flag_inexact;
3203         }
3204         return 0;
3205     }
3206     aSig |= LIT64( 0x0010000000000000 );
3207     shiftCount = 0x433 - aExp;
3208     savedASig = aSig;
3209     aSig >>= shiftCount;
3210     z = aSig;
3211     if ( aSign ) {
3212         z = - z;
3213     }
3214     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215  invalid:
3216         float_raise(float_flag_invalid, status);
3217         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3218     }
3219     if ( ( aSig<<shiftCount ) != savedASig ) {
3220         status->float_exception_flags |= float_flag_inexact;
3221     }
3222     return z;
3223 }
3224 
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of converting the double-precision floating-point value
3227 | `a' to the 64-bit two's complement integer format.  The conversion is
3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3229 | Arithmetic---which means in particular that the conversion is rounded
3230 | according to the current rounding mode.  If `a' is a NaN, the largest
3231 | positive integer is returned.  Otherwise, if the conversion overflows, the
3232 | largest integer with the same sign as `a' is returned.
3233 *----------------------------------------------------------------------------*/
3234 
3235 int64_t float64_to_int64(float64 a, float_status *status)
3236 {
3237     flag aSign;
3238     int aExp;
3239     int shiftCount;
3240     uint64_t aSig, aSigExtra;
3241     a = float64_squash_input_denormal(a, status);
3242 
3243     aSig = extractFloat64Frac( a );
3244     aExp = extractFloat64Exp( a );
3245     aSign = extractFloat64Sign( a );
3246     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247     shiftCount = 0x433 - aExp;
3248     if ( shiftCount <= 0 ) {
3249         if ( 0x43E < aExp ) {
3250             float_raise(float_flag_invalid, status);
3251             if (    ! aSign
3252                  || (    ( aExp == 0x7FF )
3253                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254                ) {
3255                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256             }
3257             return (int64_t) LIT64( 0x8000000000000000 );
3258         }
3259         aSigExtra = 0;
3260         aSig <<= - shiftCount;
3261     }
3262     else {
3263         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264     }
3265     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3266 
3267 }
3268 
3269 /*----------------------------------------------------------------------------
3270 | Returns the result of converting the double-precision floating-point value
3271 | `a' to the 64-bit two's complement integer format.  The conversion is
3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3273 | Arithmetic, except that the conversion is always rounded toward zero.
3274 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3275 | the conversion overflows, the largest integer with the same sign as `a' is
3276 | returned.
3277 *----------------------------------------------------------------------------*/
3278 
3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3280 {
3281     flag aSign;
3282     int aExp;
3283     int shiftCount;
3284     uint64_t aSig;
3285     int64_t z;
3286     a = float64_squash_input_denormal(a, status);
3287 
3288     aSig = extractFloat64Frac( a );
3289     aExp = extractFloat64Exp( a );
3290     aSign = extractFloat64Sign( a );
3291     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292     shiftCount = aExp - 0x433;
3293     if ( 0 <= shiftCount ) {
3294         if ( 0x43E <= aExp ) {
3295             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3296                 float_raise(float_flag_invalid, status);
3297                 if (    ! aSign
3298                      || (    ( aExp == 0x7FF )
3299                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300                    ) {
3301                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3302                 }
3303             }
3304             return (int64_t) LIT64( 0x8000000000000000 );
3305         }
3306         z = aSig<<shiftCount;
3307     }
3308     else {
3309         if ( aExp < 0x3FE ) {
3310             if (aExp | aSig) {
3311                 status->float_exception_flags |= float_flag_inexact;
3312             }
3313             return 0;
3314         }
3315         z = aSig>>( - shiftCount );
3316         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3317             status->float_exception_flags |= float_flag_inexact;
3318         }
3319     }
3320     if ( aSign ) z = - z;
3321     return z;
3322 
3323 }
3324 
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of converting the double-precision floating-point value
3327 | `a' to the single-precision floating-point format.  The conversion is
3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3329 | Arithmetic.
3330 *----------------------------------------------------------------------------*/
3331 
3332 float32 float64_to_float32(float64 a, float_status *status)
3333 {
3334     flag aSign;
3335     int aExp;
3336     uint64_t aSig;
3337     uint32_t zSig;
3338     a = float64_squash_input_denormal(a, status);
3339 
3340     aSig = extractFloat64Frac( a );
3341     aExp = extractFloat64Exp( a );
3342     aSign = extractFloat64Sign( a );
3343     if ( aExp == 0x7FF ) {
3344         if (aSig) {
3345             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346         }
3347         return packFloat32( aSign, 0xFF, 0 );
3348     }
3349     shift64RightJamming( aSig, 22, &aSig );
3350     zSig = aSig;
3351     if ( aExp || zSig ) {
3352         zSig |= 0x40000000;
3353         aExp -= 0x381;
3354     }
3355     return roundAndPackFloat32(aSign, aExp, zSig, status);
3356 
3357 }
3358 
3359 
3360 /*----------------------------------------------------------------------------
3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362 | half-precision floating-point value, returning the result.  After being
3363 | shifted into the proper positions, the three fields are simply added
3364 | together to form the result.  This means that any integer portion of `zSig'
3365 | will be added into the exponent.  Since a properly normalized significand
3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3367 | than the desired result exponent whenever `zSig' is a complete, normalized
3368 | significand.
3369 *----------------------------------------------------------------------------*/
3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3371 {
3372     return make_float16(
3373         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3374 }
3375 
3376 /*----------------------------------------------------------------------------
3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378 | and significand `zSig', and returns the proper half-precision floating-
3379 | point value corresponding to the abstract input.  Ordinarily, the abstract
3380 | value is simply rounded and packed into the half-precision format, with
3381 | the inexact exception raised if the abstract input cannot be represented
3382 | exactly.  However, if the abstract value is too large, the overflow and
3383 | inexact exceptions are raised and an infinity or maximal finite value is
3384 | returned.  If the abstract value is too small, the input value is rounded to
3385 | a subnormal number, and the underflow and inexact exceptions are raised if
3386 | the abstract input cannot be represented exactly as a subnormal half-
3387 | precision floating-point number.
3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3389 | ARM-style "alternative representation", which omits the NaN and Inf
3390 | encodings in order to raise the maximum representable exponent by one.
3391 |     The input significand `zSig' has its binary point between bits 22
3392 | and 23, which is 13 bits to the left of the usual location.  This shifted
3393 | significand must be normalized or smaller.  If `zSig' is not normalized,
3394 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3395 | and it must not require rounding.  In the usual case that `zSig' is
3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397 | Note the slightly odd position of the binary point in zSig compared with the
3398 | other roundAndPackFloat functions. This should probably be fixed if we
3399 | need to implement more float16 routines than just conversion.
3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3401 | Binary Floating-Point Arithmetic.
3402 *----------------------------------------------------------------------------*/
3403 
3404 static float16 roundAndPackFloat16(flag zSign, int zExp,
3405                                    uint32_t zSig, flag ieee,
3406                                    float_status *status)
3407 {
3408     int maxexp = ieee ? 29 : 30;
3409     uint32_t mask;
3410     uint32_t increment;
3411     bool rounding_bumps_exp;
3412     bool is_tiny = false;
3413 
3414     /* Calculate the mask of bits of the mantissa which are not
3415      * representable in half-precision and will be lost.
3416      */
3417     if (zExp < 1) {
3418         /* Will be denormal in halfprec */
3419         mask = 0x00ffffff;
3420         if (zExp >= -11) {
3421             mask >>= 11 + zExp;
3422         }
3423     } else {
3424         /* Normal number in halfprec */
3425         mask = 0x00001fff;
3426     }
3427 
3428     switch (status->float_rounding_mode) {
3429     case float_round_nearest_even:
3430         increment = (mask + 1) >> 1;
3431         if ((zSig & mask) == increment) {
3432             increment = zSig & (increment << 1);
3433         }
3434         break;
3435     case float_round_ties_away:
3436         increment = (mask + 1) >> 1;
3437         break;
3438     case float_round_up:
3439         increment = zSign ? 0 : mask;
3440         break;
3441     case float_round_down:
3442         increment = zSign ? mask : 0;
3443         break;
3444     default: /* round_to_zero */
3445         increment = 0;
3446         break;
3447     }
3448 
3449     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3450 
3451     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3452         if (ieee) {
3453             float_raise(float_flag_overflow | float_flag_inexact, status);
3454             return packFloat16(zSign, 0x1f, 0);
3455         } else {
3456             float_raise(float_flag_invalid, status);
3457             return packFloat16(zSign, 0x1f, 0x3ff);
3458         }
3459     }
3460 
3461     if (zExp < 0) {
3462         /* Note that flush-to-zero does not affect half-precision results */
3463         is_tiny =
3464             (status->float_detect_tininess == float_tininess_before_rounding)
3465             || (zExp < -1)
3466             || (!rounding_bumps_exp);
3467     }
3468     if (zSig & mask) {
3469         float_raise(float_flag_inexact, status);
3470         if (is_tiny) {
3471             float_raise(float_flag_underflow, status);
3472         }
3473     }
3474 
3475     zSig += increment;
3476     if (rounding_bumps_exp) {
3477         zSig >>= 1;
3478         zExp++;
3479     }
3480 
3481     if (zExp < -10) {
3482         return packFloat16(zSign, 0, 0);
3483     }
3484     if (zExp < 0) {
3485         zSig >>= -zExp;
3486         zExp = 0;
3487     }
3488     return packFloat16(zSign, zExp, zSig >> 13);
3489 }
3490 
3491 /*----------------------------------------------------------------------------
3492 | If `a' is denormal and we are in flush-to-zero mode then set the
3493 | input-denormal exception and return zero. Otherwise just return the value.
3494 *----------------------------------------------------------------------------*/
3495 float16 float16_squash_input_denormal(float16 a, float_status *status)
3496 {
3497     if (status->flush_inputs_to_zero) {
3498         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3499             float_raise(float_flag_input_denormal, status);
3500             return make_float16(float16_val(a) & 0x8000);
3501         }
3502     }
3503     return a;
3504 }
3505 
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Left shift that moves the most significant set bit of aSig up
     * to bit 10.
     */
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3513 
3514 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3515    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3516 
3517 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3518 {
3519     flag aSign;
3520     int aExp;
3521     uint32_t aSig;
3522 
3523     aSign = extractFloat16Sign(a);
3524     aExp = extractFloat16Exp(a);
3525     aSig = extractFloat16Frac(a);
3526 
3527     if (aExp == 0x1f && ieee) {
3528         if (aSig) {
3529             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3530         }
3531         return packFloat32(aSign, 0xff, 0);
3532     }
3533     if (aExp == 0) {
3534         if (aSig == 0) {
3535             return packFloat32(aSign, 0, 0);
3536         }
3537 
3538         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3539         aExp--;
3540     }
3541     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3542 }
3543 
3544 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3545 {
3546     flag aSign;
3547     int aExp;
3548     uint32_t aSig;
3549 
3550     a = float32_squash_input_denormal(a, status);
3551 
3552     aSig = extractFloat32Frac( a );
3553     aExp = extractFloat32Exp( a );
3554     aSign = extractFloat32Sign( a );
3555     if ( aExp == 0xFF ) {
3556         if (aSig) {
3557             /* Input is a NaN */
3558             if (!ieee) {
3559                 float_raise(float_flag_invalid, status);
3560                 return packFloat16(aSign, 0, 0);
3561             }
3562             return commonNaNToFloat16(
3563                 float32ToCommonNaN(a, status), status);
3564         }
3565         /* Infinity */
3566         if (!ieee) {
3567             float_raise(float_flag_invalid, status);
3568             return packFloat16(aSign, 0x1f, 0x3ff);
3569         }
3570         return packFloat16(aSign, 0x1f, 0);
3571     }
3572     if (aExp == 0 && aSig == 0) {
3573         return packFloat16(aSign, 0, 0);
3574     }
3575     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3576      * even if the input is denormal; however this is harmless because
3577      * the largest possible single-precision denormal is still smaller
3578      * than the smallest representable half-precision denormal, and so we
3579      * will end up ignoring aSig and returning via the "always return zero"
3580      * codepath.
3581      */
3582     aSig |= 0x00800000;
3583     aExp -= 0x71;
3584 
3585     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3586 }
3587 
3588 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3589 {
3590     flag aSign;
3591     int aExp;
3592     uint32_t aSig;
3593 
3594     aSign = extractFloat16Sign(a);
3595     aExp = extractFloat16Exp(a);
3596     aSig = extractFloat16Frac(a);
3597 
3598     if (aExp == 0x1f && ieee) {
3599         if (aSig) {
3600             return commonNaNToFloat64(
3601                 float16ToCommonNaN(a, status), status);
3602         }
3603         return packFloat64(aSign, 0x7ff, 0);
3604     }
3605     if (aExp == 0) {
3606         if (aSig == 0) {
3607             return packFloat64(aSign, 0, 0);
3608         }
3609 
3610         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3611         aExp--;
3612     }
3613     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3614 }
3615 
3616 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3617 {
3618     flag aSign;
3619     int aExp;
3620     uint64_t aSig;
3621     uint32_t zSig;
3622 
3623     a = float64_squash_input_denormal(a, status);
3624 
3625     aSig = extractFloat64Frac(a);
3626     aExp = extractFloat64Exp(a);
3627     aSign = extractFloat64Sign(a);
3628     if (aExp == 0x7FF) {
3629         if (aSig) {
3630             /* Input is a NaN */
3631             if (!ieee) {
3632                 float_raise(float_flag_invalid, status);
3633                 return packFloat16(aSign, 0, 0);
3634             }
3635             return commonNaNToFloat16(
3636                 float64ToCommonNaN(a, status), status);
3637         }
3638         /* Infinity */
3639         if (!ieee) {
3640             float_raise(float_flag_invalid, status);
3641             return packFloat16(aSign, 0x1f, 0x3ff);
3642         }
3643         return packFloat16(aSign, 0x1f, 0);
3644     }
3645     shift64RightJamming(aSig, 29, &aSig);
3646     zSig = aSig;
3647     if (aExp == 0 && zSig == 0) {
3648         return packFloat16(aSign, 0, 0);
3649     }
3650     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3651      * even if the input is denormal; however this is harmless because
3652      * the largest possible single-precision denormal is still smaller
3653      * than the smallest representable half-precision denormal, and so we
3654      * will end up ignoring aSig and returning via the "always return zero"
3655      * codepath.
3656      */
3657     zSig |= 0x00800000;
3658     aExp -= 0x3F1;
3659 
3660     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3661 }
3662 
3663 /*----------------------------------------------------------------------------
3664 | Returns the result of converting the double-precision floating-point value
3665 | `a' to the extended double-precision floating-point format.  The conversion
3666 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3667 | Arithmetic.
3668 *----------------------------------------------------------------------------*/
3669 
3670 floatx80 float64_to_floatx80(float64 a, float_status *status)
3671 {
3672     flag aSign;
3673     int aExp;
3674     uint64_t aSig;
3675 
3676     a = float64_squash_input_denormal(a, status);
3677     aSig = extractFloat64Frac( a );
3678     aExp = extractFloat64Exp( a );
3679     aSign = extractFloat64Sign( a );
3680     if ( aExp == 0x7FF ) {
3681         if (aSig) {
3682             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3683         }
3684         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3685     }
3686     if ( aExp == 0 ) {
3687         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3688         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3689     }
3690     return
3691         packFloatx80(
3692             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3693 
3694 }
3695 
3696 /*----------------------------------------------------------------------------
3697 | Returns the result of converting the double-precision floating-point value
3698 | `a' to the quadruple-precision floating-point format.  The conversion is
3699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3700 | Arithmetic.
3701 *----------------------------------------------------------------------------*/
3702 
3703 float128 float64_to_float128(float64 a, float_status *status)
3704 {
3705     flag aSign;
3706     int aExp;
3707     uint64_t aSig, zSig0, zSig1;
3708 
3709     a = float64_squash_input_denormal(a, status);
3710     aSig = extractFloat64Frac( a );
3711     aExp = extractFloat64Exp( a );
3712     aSign = extractFloat64Sign( a );
3713     if ( aExp == 0x7FF ) {
3714         if (aSig) {
3715             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3716         }
3717         return packFloat128( aSign, 0x7FFF, 0, 0 );
3718     }
3719     if ( aExp == 0 ) {
3720         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3721         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3722         --aExp;
3723     }
3724     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3725     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3726 
3727 }
3728 
3729 /*----------------------------------------------------------------------------
3730 | Rounds the double-precision floating-point value `a' to an integer, and
3731 | returns the result as a double-precision floating-point value.  The
3732 | operation is performed according to the IEC/IEEE Standard for Binary
3733 | Floating-Point Arithmetic.
3734 *----------------------------------------------------------------------------*/
3735 
3736 float64 float64_round_to_int(float64 a, float_status *status)
3737 {
3738     flag aSign;
3739     int aExp;
3740     uint64_t lastBitMask, roundBitsMask;
3741     uint64_t z;
3742     a = float64_squash_input_denormal(a, status);
3743 
3744     aExp = extractFloat64Exp( a );
3745     if ( 0x433 <= aExp ) {
3746         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3747             return propagateFloat64NaN(a, a, status);
3748         }
3749         return a;
3750     }
3751     if ( aExp < 0x3FF ) {
3752         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3753         status->float_exception_flags |= float_flag_inexact;
3754         aSign = extractFloat64Sign( a );
3755         switch (status->float_rounding_mode) {
3756          case float_round_nearest_even:
3757             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3758                 return packFloat64( aSign, 0x3FF, 0 );
3759             }
3760             break;
3761         case float_round_ties_away:
3762             if (aExp == 0x3FE) {
3763                 return packFloat64(aSign, 0x3ff, 0);
3764             }
3765             break;
3766          case float_round_down:
3767             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3768          case float_round_up:
3769             return make_float64(
3770             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3771         }
3772         return packFloat64( aSign, 0, 0 );
3773     }
3774     lastBitMask = 1;
3775     lastBitMask <<= 0x433 - aExp;
3776     roundBitsMask = lastBitMask - 1;
3777     z = float64_val(a);
3778     switch (status->float_rounding_mode) {
3779     case float_round_nearest_even:
3780         z += lastBitMask >> 1;
3781         if ((z & roundBitsMask) == 0) {
3782             z &= ~lastBitMask;
3783         }
3784         break;
3785     case float_round_ties_away:
3786         z += lastBitMask >> 1;
3787         break;
3788     case float_round_to_zero:
3789         break;
3790     case float_round_up:
3791         if (!extractFloat64Sign(make_float64(z))) {
3792             z += roundBitsMask;
3793         }
3794         break;
3795     case float_round_down:
3796         if (extractFloat64Sign(make_float64(z))) {
3797             z += roundBitsMask;
3798         }
3799         break;
3800     default:
3801         abort();
3802     }
3803     z &= ~ roundBitsMask;
3804     if (z != float64_val(a)) {
3805         status->float_exception_flags |= float_flag_inexact;
3806     }
3807     return make_float64(z);
3808 
3809 }
3810 
3811 float64 float64_trunc_to_int(float64 a, float_status *status)
3812 {
3813     int oldmode;
3814     float64 res;
3815     oldmode = status->float_rounding_mode;
3816     status->float_rounding_mode = float_round_to_zero;
3817     res = float64_round_to_int(a, status);
3818     status->float_rounding_mode = oldmode;
3819     return res;
3820 }
3821 
3822 /*----------------------------------------------------------------------------
3823 | Returns the result of adding the absolute values of the double-precision
3824 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3825 | before being returned.  `zSign' is ignored if the result is a NaN.
3826 | The addition is performed according to the IEC/IEEE Standard for Binary
3827 | Floating-Point Arithmetic.
3828 *----------------------------------------------------------------------------*/
3829 
3830 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3831                               float_status *status)
3832 {
3833     int aExp, bExp, zExp;
3834     uint64_t aSig, bSig, zSig;
3835     int expDiff;
3836 
3837     aSig = extractFloat64Frac( a );
3838     aExp = extractFloat64Exp( a );
3839     bSig = extractFloat64Frac( b );
3840     bExp = extractFloat64Exp( b );
3841     expDiff = aExp - bExp;
3842     aSig <<= 9;
3843     bSig <<= 9;
3844     if ( 0 < expDiff ) {
3845         if ( aExp == 0x7FF ) {
3846             if (aSig) {
3847                 return propagateFloat64NaN(a, b, status);
3848             }
3849             return a;
3850         }
3851         if ( bExp == 0 ) {
3852             --expDiff;
3853         }
3854         else {
3855             bSig |= LIT64( 0x2000000000000000 );
3856         }
3857         shift64RightJamming( bSig, expDiff, &bSig );
3858         zExp = aExp;
3859     }
3860     else if ( expDiff < 0 ) {
3861         if ( bExp == 0x7FF ) {
3862             if (bSig) {
3863                 return propagateFloat64NaN(a, b, status);
3864             }
3865             return packFloat64( zSign, 0x7FF, 0 );
3866         }
3867         if ( aExp == 0 ) {
3868             ++expDiff;
3869         }
3870         else {
3871             aSig |= LIT64( 0x2000000000000000 );
3872         }
3873         shift64RightJamming( aSig, - expDiff, &aSig );
3874         zExp = bExp;
3875     }
3876     else {
3877         if ( aExp == 0x7FF ) {
3878             if (aSig | bSig) {
3879                 return propagateFloat64NaN(a, b, status);
3880             }
3881             return a;
3882         }
3883         if ( aExp == 0 ) {
3884             if (status->flush_to_zero) {
3885                 if (aSig | bSig) {
3886                     float_raise(float_flag_output_denormal, status);
3887                 }
3888                 return packFloat64(zSign, 0, 0);
3889             }
3890             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3891         }
3892         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3893         zExp = aExp;
3894         goto roundAndPack;
3895     }
3896     aSig |= LIT64( 0x2000000000000000 );
3897     zSig = ( aSig + bSig )<<1;
3898     --zExp;
3899     if ( (int64_t) zSig < 0 ) {
3900         zSig = aSig + bSig;
3901         ++zExp;
3902     }
3903  roundAndPack:
3904     return roundAndPackFloat64(zSign, zExp, zSig, status);
3905 
3906 }
3907 
3908 /*----------------------------------------------------------------------------
3909 | Returns the result of subtracting the absolute values of the double-
3910 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3911 | difference is negated before being returned.  `zSign' is ignored if the
3912 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3913 | Standard for Binary Floating-Point Arithmetic.
3914 *----------------------------------------------------------------------------*/
3915 
3916 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3917                               float_status *status)
3918 {
3919     int aExp, bExp, zExp;
3920     uint64_t aSig, bSig, zSig;
3921     int expDiff;
3922 
3923     aSig = extractFloat64Frac( a );
3924     aExp = extractFloat64Exp( a );
3925     bSig = extractFloat64Frac( b );
3926     bExp = extractFloat64Exp( b );
3927     expDiff = aExp - bExp;
3928     aSig <<= 10;
3929     bSig <<= 10;
3930     if ( 0 < expDiff ) goto aExpBigger;
3931     if ( expDiff < 0 ) goto bExpBigger;
3932     if ( aExp == 0x7FF ) {
3933         if (aSig | bSig) {
3934             return propagateFloat64NaN(a, b, status);
3935         }
3936         float_raise(float_flag_invalid, status);
3937         return float64_default_nan(status);
3938     }
3939     if ( aExp == 0 ) {
3940         aExp = 1;
3941         bExp = 1;
3942     }
3943     if ( bSig < aSig ) goto aBigger;
3944     if ( aSig < bSig ) goto bBigger;
3945     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3946  bExpBigger:
3947     if ( bExp == 0x7FF ) {
3948         if (bSig) {
3949             return propagateFloat64NaN(a, b, status);
3950         }
3951         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3952     }
3953     if ( aExp == 0 ) {
3954         ++expDiff;
3955     }
3956     else {
3957         aSig |= LIT64( 0x4000000000000000 );
3958     }
3959     shift64RightJamming( aSig, - expDiff, &aSig );
3960     bSig |= LIT64( 0x4000000000000000 );
3961  bBigger:
3962     zSig = bSig - aSig;
3963     zExp = bExp;
3964     zSign ^= 1;
3965     goto normalizeRoundAndPack;
3966  aExpBigger:
3967     if ( aExp == 0x7FF ) {
3968         if (aSig) {
3969             return propagateFloat64NaN(a, b, status);
3970         }
3971         return a;
3972     }
3973     if ( bExp == 0 ) {
3974         --expDiff;
3975     }
3976     else {
3977         bSig |= LIT64( 0x4000000000000000 );
3978     }
3979     shift64RightJamming( bSig, expDiff, &bSig );
3980     aSig |= LIT64( 0x4000000000000000 );
3981  aBigger:
3982     zSig = aSig - bSig;
3983     zExp = aExp;
3984  normalizeRoundAndPack:
3985     --zExp;
3986     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3987 
3988 }
3989 
3990 /*----------------------------------------------------------------------------
3991 | Returns the result of adding the double-precision floating-point values `a'
3992 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3993 | Binary Floating-Point Arithmetic.
3994 *----------------------------------------------------------------------------*/
3995 
3996 float64 float64_add(float64 a, float64 b, float_status *status)
3997 {
3998     flag aSign, bSign;
3999     a = float64_squash_input_denormal(a, status);
4000     b = float64_squash_input_denormal(b, status);
4001 
4002     aSign = extractFloat64Sign( a );
4003     bSign = extractFloat64Sign( b );
4004     if ( aSign == bSign ) {
4005         return addFloat64Sigs(a, b, aSign, status);
4006     }
4007     else {
4008         return subFloat64Sigs(a, b, aSign, status);
4009     }
4010 
4011 }
4012 
4013 /*----------------------------------------------------------------------------
4014 | Returns the result of subtracting the double-precision floating-point values
4015 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4016 | for Binary Floating-Point Arithmetic.
4017 *----------------------------------------------------------------------------*/
4018 
4019 float64 float64_sub(float64 a, float64 b, float_status *status)
4020 {
4021     flag aSign, bSign;
4022     a = float64_squash_input_denormal(a, status);
4023     b = float64_squash_input_denormal(b, status);
4024 
4025     aSign = extractFloat64Sign( a );
4026     bSign = extractFloat64Sign( b );
4027     if ( aSign == bSign ) {
4028         return subFloat64Sigs(a, b, aSign, status);
4029     }
4030     else {
4031         return addFloat64Sigs(a, b, aSign, status);
4032     }
4033 
4034 }
4035 
4036 /*----------------------------------------------------------------------------
4037 | Returns the result of multiplying the double-precision floating-point values
4038 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4039 | for Binary Floating-Point Arithmetic.
4040 *----------------------------------------------------------------------------*/
4041 
4042 float64 float64_mul(float64 a, float64 b, float_status *status)
4043 {
4044     flag aSign, bSign, zSign;
4045     int aExp, bExp, zExp;
4046     uint64_t aSig, bSig, zSig0, zSig1;
4047 
4048     a = float64_squash_input_denormal(a, status);
4049     b = float64_squash_input_denormal(b, status);
4050 
4051     aSig = extractFloat64Frac( a );
4052     aExp = extractFloat64Exp( a );
4053     aSign = extractFloat64Sign( a );
4054     bSig = extractFloat64Frac( b );
4055     bExp = extractFloat64Exp( b );
4056     bSign = extractFloat64Sign( b );
4057     zSign = aSign ^ bSign;
4058     if ( aExp == 0x7FF ) {
4059         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4060             return propagateFloat64NaN(a, b, status);
4061         }
4062         if ( ( bExp | bSig ) == 0 ) {
4063             float_raise(float_flag_invalid, status);
4064             return float64_default_nan(status);
4065         }
4066         return packFloat64( zSign, 0x7FF, 0 );
4067     }
4068     if ( bExp == 0x7FF ) {
4069         if (bSig) {
4070             return propagateFloat64NaN(a, b, status);
4071         }
4072         if ( ( aExp | aSig ) == 0 ) {
4073             float_raise(float_flag_invalid, status);
4074             return float64_default_nan(status);
4075         }
4076         return packFloat64( zSign, 0x7FF, 0 );
4077     }
4078     if ( aExp == 0 ) {
4079         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4080         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4081     }
4082     if ( bExp == 0 ) {
4083         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4084         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4085     }
4086     zExp = aExp + bExp - 0x3FF;
4087     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4088     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4089     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4090     zSig0 |= ( zSig1 != 0 );
4091     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4092         zSig0 <<= 1;
4093         --zExp;
4094     }
4095     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4096 
4097 }
4098 
4099 /*----------------------------------------------------------------------------
4100 | Returns the result of dividing the double-precision floating-point value `a'
4101 | by the corresponding value `b'.  The operation is performed according to
4102 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4103 *----------------------------------------------------------------------------*/
4104 
4105 float64 float64_div(float64 a, float64 b, float_status *status)
4106 {
4107     flag aSign, bSign, zSign;
4108     int aExp, bExp, zExp;
4109     uint64_t aSig, bSig, zSig;
4110     uint64_t rem0, rem1;
4111     uint64_t term0, term1;
4112     a = float64_squash_input_denormal(a, status);
4113     b = float64_squash_input_denormal(b, status);
4114 
4115     aSig = extractFloat64Frac( a );
4116     aExp = extractFloat64Exp( a );
4117     aSign = extractFloat64Sign( a );
4118     bSig = extractFloat64Frac( b );
4119     bExp = extractFloat64Exp( b );
4120     bSign = extractFloat64Sign( b );
4121     zSign = aSign ^ bSign;
4122     if ( aExp == 0x7FF ) {
4123         if (aSig) {
4124             return propagateFloat64NaN(a, b, status);
4125         }
4126         if ( bExp == 0x7FF ) {
4127             if (bSig) {
4128                 return propagateFloat64NaN(a, b, status);
4129             }
4130             float_raise(float_flag_invalid, status);
4131             return float64_default_nan(status);
4132         }
4133         return packFloat64( zSign, 0x7FF, 0 );
4134     }
4135     if ( bExp == 0x7FF ) {
4136         if (bSig) {
4137             return propagateFloat64NaN(a, b, status);
4138         }
4139         return packFloat64( zSign, 0, 0 );
4140     }
4141     if ( bExp == 0 ) {
4142         if ( bSig == 0 ) {
4143             if ( ( aExp | aSig ) == 0 ) {
4144                 float_raise(float_flag_invalid, status);
4145                 return float64_default_nan(status);
4146             }
4147             float_raise(float_flag_divbyzero, status);
4148             return packFloat64( zSign, 0x7FF, 0 );
4149         }
4150         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4151     }
4152     if ( aExp == 0 ) {
4153         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4154         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4155     }
4156     zExp = aExp - bExp + 0x3FD;
4157     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4158     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4159     if ( bSig <= ( aSig + aSig ) ) {
4160         aSig >>= 1;
4161         ++zExp;
4162     }
4163     zSig = estimateDiv128To64( aSig, 0, bSig );
4164     if ( ( zSig & 0x1FF ) <= 2 ) {
4165         mul64To128( bSig, zSig, &term0, &term1 );
4166         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4167         while ( (int64_t) rem0 < 0 ) {
4168             --zSig;
4169             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4170         }
4171         zSig |= ( rem1 != 0 );
4172     }
4173     return roundAndPackFloat64(zSign, zExp, zSig, status);
4174 
4175 }
4176 
4177 /*----------------------------------------------------------------------------
4178 | Returns the remainder of the double-precision floating-point value `a'
4179 | with respect to the corresponding value `b'.  The operation is performed
4180 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 *----------------------------------------------------------------------------*/
4182 
4183 float64 float64_rem(float64 a, float64 b, float_status *status)
4184 {
4185     flag aSign, zSign;
4186     int aExp, bExp, expDiff;
4187     uint64_t aSig, bSig;
4188     uint64_t q, alternateASig;
4189     int64_t sigMean;
4190 
4191     a = float64_squash_input_denormal(a, status);
4192     b = float64_squash_input_denormal(b, status);
4193     aSig = extractFloat64Frac( a );
4194     aExp = extractFloat64Exp( a );
4195     aSign = extractFloat64Sign( a );
4196     bSig = extractFloat64Frac( b );
4197     bExp = extractFloat64Exp( b );
         /* `a' is a NaN or an infinity: NaN operands are propagated,
          * otherwise (infinite `a') the operation is invalid.
          */
4198     if ( aExp == 0x7FF ) {
4199         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4200             return propagateFloat64NaN(a, b, status);
4201         }
4202         float_raise(float_flag_invalid, status);
4203         return float64_default_nan(status);
4204     }
         /* `b' is a NaN (propagate) or an infinity (result is `a' itself). */
4205     if ( bExp == 0x7FF ) {
4206         if (bSig) {
4207             return propagateFloat64NaN(a, b, status);
4208         }
4209         return a;
4210     }
         /* `b' is zero (invalid) or subnormal (normalize it). */
4211     if ( bExp == 0 ) {
4212         if ( bSig == 0 ) {
4213             float_raise(float_flag_invalid, status);
4214             return float64_default_nan(status);
4215         }
4216         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4217     }
         /* `a' is zero (returned unchanged) or subnormal (normalize it). */
4218     if ( aExp == 0 ) {
4219         if ( aSig == 0 ) return a;
4220         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4221     }
4222     expDiff = aExp - bExp;
         /* Set the implicit bit and move it up to bit 63 of each significand. */
4223     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4224     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4225     if ( expDiff < 0 ) {
             /* An exponent gap of more than one returns `a' unchanged. */
4226         if ( expDiff < -1 ) return a;
4227         aSig >>= 1;
4228     }
         /* First quotient bit: subtract bSig once if it fits. */
4229     q = ( bSig <= aSig );
4230     if ( q ) aSig -= bSig;
4231     expDiff -= 64;
         /* Each pass takes a quotient estimate, lowers it by 2 (clamped at 0;
          * presumably so that it never exceeds the true quotient), and retires
          * 62 bits of exponent difference.
          */
4232     while ( 0 < expDiff ) {
4233         q = estimateDiv128To64( aSig, 0, bSig );
4234         q = ( 2 < q ) ? q - 2 : 0;
4235         aSig = - ( ( bSig>>2 ) * q );
4236         expDiff -= 62;
4237     }
4238     expDiff += 64;
4239     if ( 0 < expDiff ) {
             /* Final partial step: keep only the top expDiff bits of the
              * lowered estimate.
              */
4240         q = estimateDiv128To64( aSig, 0, bSig );
4241         q = ( 2 < q ) ? q - 2 : 0;
4242         q >>= 64 - expDiff;
4243         bSig >>= 2;
4244         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4245     }
4246     else {
4247         aSig >>= 2;
4248         bSig >>= 2;
4249     }
         /* Subtract bSig until aSig (viewed as signed) goes negative.
          * alternateASig holds the value just before the last subtraction and
          * q counts the subtractions.
          */
4250     do {
4251         alternateASig = aSig;
4252         ++q;
4253         aSig -= bSig;
4254     } while ( 0 <= (int64_t) aSig );
         /* Choose between the negative candidate (aSig) and the non-negative
          * one (alternateASig): a negative sum selects alternateASig, and on
          * an exact tie (sum == 0) alternateASig is selected when q is odd.
          */
4255     sigMean = aSig + alternateASig;
4256     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4257         aSig = alternateASig;
4258     }
         /* A negative remainder flips the result sign relative to `a'. */
4259     zSign = ( (int64_t) aSig < 0 );
4260     if ( zSign ) aSig = - aSig;
4261     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4262 
4263 }
4264 
4265 /*----------------------------------------------------------------------------
4266 | Returns the result of multiplying the double-precision floating-point values
4267 | `a' and `b' then adding `c', with no intermediate rounding step after the
4268 | multiplication.  The operation is performed according to the IEC/IEEE
4269 | Standard for Binary Floating-Point Arithmetic 754-2008.
4270 | The flags argument allows the caller to select negation of the
4271 | addend, the intermediate product, or the final result. (The difference
4272 | between this and having the caller do a separate negation is that negating
4273 | externally will flip the sign bit on NaNs.)
4274 *----------------------------------------------------------------------------*/
4275 
4276 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4277                        float_status *status)
4278 {
         /* Flag bits examined below:
          *   float_muladd_negate_c       - flip the sign of the addend
          *   float_muladd_negate_product - flip the sign of the product
          *   float_muladd_negate_result  - flip the sign of the final result
          *   float_muladd_halve_result   - lower the result exponent by one
          *                                 before rounding
          */
4279     flag aSign, bSign, cSign, zSign;
4280     int aExp, bExp, cExp, pExp, zExp, expDiff;
4281     uint64_t aSig, bSig, cSig;
4282     flag pInf, pZero, pSign;
4283     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4284     int shiftcount;
4285     flag signflip, infzero;
4286 
4287     a = float64_squash_input_denormal(a, status);
4288     b = float64_squash_input_denormal(b, status);
4289     c = float64_squash_input_denormal(c, status);
4290     aSig = extractFloat64Frac(a);
4291     aExp = extractFloat64Exp(a);
4292     aSign = extractFloat64Sign(a);
4293     bSig = extractFloat64Frac(b);
4294     bExp = extractFloat64Exp(b);
4295     bSign = extractFloat64Sign(b);
4296     cSig = extractFloat64Frac(c);
4297     cExp = extractFloat64Exp(c);
4298     cSign = extractFloat64Sign(c);
4299 
         /* True when the product is (zero * infinity) in either order. */
4300     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4301                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4302 
4303     /* It is implementation-defined whether the cases of (0,inf,qnan)
4304      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4305      * they return if they do), so we have to hand this information
4306      * off to the target-specific pick-a-NaN routine.
4307      */
4308     if (((aExp == 0x7ff) && aSig) ||
4309         ((bExp == 0x7ff) && bSig) ||
4310         ((cExp == 0x7ff) && cSig)) {
4311         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4312     }
4313 
         /* No NaN operand, but the product itself is zero * infinity. */
4314     if (infzero) {
4315         float_raise(float_flag_invalid, status);
4316         return float64_default_nan(status);
4317     }
4318 
4319     if (flags & float_muladd_negate_c) {
4320         cSign ^= 1;
4321     }
4322 
4323     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4324 
4325     /* Work out the sign and type of the product */
4326     pSign = aSign ^ bSign;
4327     if (flags & float_muladd_negate_product) {
4328         pSign ^= 1;
4329     }
4330     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4331     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4332 
         /* Addend is an infinity (NaNs were dealt with above). */
4333     if (cExp == 0x7ff) {
4334         if (pInf && (pSign ^ cSign)) {
4335             /* addition of opposite-signed infinities => InvalidOperation */
4336             float_raise(float_flag_invalid, status);
4337             return float64_default_nan(status);
4338         }
4339         /* Otherwise generate an infinity of the same sign */
4340         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4341     }
4342 
         /* Infinite product plus finite addend. */
4343     if (pInf) {
4344         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4345     }
4346 
4347     if (pZero) {
4348         if (cExp == 0) {
4349             if (cSig == 0) {
4350                 /* Adding two exact zeroes */
4351                 if (pSign == cSign) {
4352                     zSign = pSign;
4353                 } else if (status->float_rounding_mode == float_round_down) {
4354                     zSign = 1;
4355                 } else {
4356                     zSign = 0;
4357                 }
4358                 return packFloat64(zSign ^ signflip, 0, 0);
4359             }
4360             /* Exact zero plus a denorm */
4361             if (status->flush_to_zero) {
4362                 float_raise(float_flag_output_denormal, status);
4363                 return packFloat64(cSign ^ signflip, 0, 0);
4364             }
4365         }
4366         /* Zero plus something non-zero : just return the something */
4367         if (flags & float_muladd_halve_result) {
4368             if (cExp == 0) {
4369                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4370             }
4371             /* Subtract one to halve, and one again because roundAndPackFloat64
4372              * wants one less than the true exponent.
4373              */
4374             cExp -= 2;
4375             cSig = (cSig | 0x0010000000000000ULL) << 10;
4376             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4377         }
4378         return packFloat64(cSign ^ signflip, cExp, cSig);
4379     }
4380 
         /* Both factors are non-zero and finite here; normalize subnormals. */
4381     if (aExp == 0) {
4382         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4383     }
4384     if (bExp == 0) {
4385         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4386     }
4387 
4388     /* Calculate the actual result a * b + c */
4389 
4390     /* Multiply first; this is easy. */
4391     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4392      * because we want the true exponent, not the "one-less-than"
4393      * flavour that roundAndPackFloat64() takes.
4394      */
4395     pExp = aExp + bExp - 0x3fe;
4396     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4397     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4398     mul64To128(aSig, bSig, &pSig0, &pSig1);
         /* If bit 62 of the high word is clear, shift the 128-bit product
          * left by one and lower the exponent to match.
          */
4399     if ((int64_t)(pSig0 << 1) >= 0) {
4400         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4401         pExp--;
4402     }
4403 
4404     zSign = pSign ^ signflip;
4405 
4406     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4407      * bit in position 126.
4408      */
4409     if (cExp == 0) {
4410         if (!cSig) {
4411             /* Throw out the special case of c being an exact zero now */
4412             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4413             if (flags & float_muladd_halve_result) {
4414                 pExp--;
4415             }
4416             return roundAndPackFloat64(zSign, pExp - 1,
4417                                        pSig1, status);
4418         }
4419         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4420     }
4421 
4422     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4423      * significand of the addend, with the explicit bit in position 126.
4424      */
4425     cSig0 = cSig << (126 - 64 - 52);
4426     cSig1 = 0;
4427     cSig0 |= LIT64(0x4000000000000000);
4428     expDiff = pExp - cExp;
4429 
4430     if (pSign == cSign) {
4431         /* Addition */
4432         if (expDiff > 0) {
4433             /* scale c to match p */
4434             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4435             zExp = pExp;
4436         } else if (expDiff < 0) {
4437             /* scale p to match c */
4438             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4439             zExp = cExp;
4440         } else {
4441             /* no scaling needed */
4442             zExp = cExp;
4443         }
4444         /* Add significands and make sure explicit bit ends up in posn 126 */
4445         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4446         if ((int64_t)zSig0 < 0) {
4447             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4448         } else {
4449             zExp--;
4450         }
             /* Move the high word down into zSig1, which is what gets rounded. */
4451         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4452         if (flags & float_muladd_halve_result) {
4453             zExp--;
4454         }
4455         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4456     } else {
4457         /* Subtraction */
4458         if (expDiff > 0) {
                 /* p has the larger exponent: scale c down and compute p - c */
4459             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4460             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4461             zExp = pExp;
4462         } else if (expDiff < 0) {
                 /* c has the larger exponent: scale p down, compute c - p and
                  * flip the result sign away from the product's.
                  */
4463             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4464             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4465             zExp = cExp;
4466             zSign ^= 1;
4467         } else {
                 /* Equal exponents: subtract the smaller significand from
                  * the larger one.
                  */
4468             zExp = pExp;
4469             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4470                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4471             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4472                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4473                 zSign ^= 1;
4474             } else {
4475                 /* Exact zero */
4476                 zSign = signflip;
4477                 if (status->float_rounding_mode == float_round_down) {
4478                     zSign ^= 1;
4479                 }
4480                 return packFloat64(zSign, 0, 0);
4481             }
4482         }
4483         --zExp;
4484         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4485          * starting with the significand in a pair of uint64_t.
4486          */
4487         if (zSig0) {
                 /* High word non-zero: shift left by (leading zeros - 1) and
                  * fold any bits left in the low word into the lowest bit of
                  * the high word.
                  */
4488             shiftcount = countLeadingZeros64(zSig0) - 1;
4489             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4490             if (zSig1) {
4491                 zSig0 |= 1;
4492             }
4493             zExp -= shiftcount;
4494         } else {
                 /* High word is zero: the result is taken from zSig1. */
4495             shiftcount = countLeadingZeros64(zSig1);
4496             if (shiftcount == 0) {
                     /* Bit 63 already set: shift right by one, keeping the
                      * dropped bit in the lowest position.
                      */
4497                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4498                 zExp -= 63;
4499             } else {
4500                 shiftcount--;
4501                 zSig0 = zSig1 << shiftcount;
4502                 zExp -= (shiftcount + 64);
4503             }
4504         }
4505         if (flags & float_muladd_halve_result) {
4506             zExp--;
4507         }
4508         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4509     }
4510 }
4511 
4512 /*----------------------------------------------------------------------------
4513 | Returns the square root of the double-precision floating-point value `a'.
4514 | The operation is performed according to the IEC/IEEE Standard for Binary
4515 | Floating-Point Arithmetic.
4516 *----------------------------------------------------------------------------*/
4517 
4518 float64 float64_sqrt(float64 a, float_status *status)
4519 {
4520     flag aSign;
4521     int aExp, zExp;
4522     uint64_t aSig, zSig, doubleZSig;
4523     uint64_t rem0, rem1, term0, term1;
4524     a = float64_squash_input_denormal(a, status);
4525 
4526     aSig = extractFloat64Frac( a );
4527     aExp = extractFloat64Exp( a );
4528     aSign = extractFloat64Sign( a );
4529     if ( aExp == 0x7FF ) {
4530         if (aSig) {
4531             return propagateFloat64NaN(a, a, status);
4532         }
4533         if ( ! aSign ) return a;
4534         float_raise(float_flag_invalid, status);
4535         return float64_default_nan(status);
4536     }
4537     if ( aSign ) {
4538         if ( ( aExp | aSig ) == 0 ) return a;
4539         float_raise(float_flag_invalid, status);
4540         return float64_default_nan(status);
4541     }
4542     if ( aExp == 0 ) {
4543         if ( aSig == 0 ) return float64_zero;
4544         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4545     }
4546     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4547     aSig |= LIT64( 0x0010000000000000 );
4548     zSig = estimateSqrt32( aExp, aSig>>21 );
4549     aSig <<= 9 - ( aExp & 1 );
4550     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4551     if ( ( zSig & 0x1FF ) <= 5 ) {
4552         doubleZSig = zSig<<1;
4553         mul64To128( zSig, zSig, &term0, &term1 );
4554         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4555         while ( (int64_t) rem0 < 0 ) {
4556             --zSig;
4557             doubleZSig -= 2;
4558             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4559         }
4560         zSig |= ( ( rem0 | rem1 ) != 0 );
4561     }
4562     return roundAndPackFloat64(0, zExp, zSig, status);
4563 
4564 }
4565 
4566 /*----------------------------------------------------------------------------
4567 | Returns the binary log of the double-precision floating-point value `a'.
4568 | The operation is performed according to the IEC/IEEE Standard for Binary
4569 | Floating-Point Arithmetic.
4570 *----------------------------------------------------------------------------*/
4571 float64 float64_log2(float64 a, float_status *status)
4572 {
4573     flag aSign, zSign;
4574     int aExp;
4575     uint64_t aSig, aSig0, aSig1, zSig, i;
4576     a = float64_squash_input_denormal(a, status);
4577 
4578     aSig = extractFloat64Frac( a );
4579     aExp = extractFloat64Exp( a );
4580     aSign = extractFloat64Sign( a );
4581 
4582     if ( aExp == 0 ) {
4583         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4584         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4585     }
4586     if ( aSign ) {
4587         float_raise(float_flag_invalid, status);
4588         return float64_default_nan(status);
4589     }
4590     if ( aExp == 0x7FF ) {
4591         if (aSig) {
4592             return propagateFloat64NaN(a, float64_zero, status);
4593         }
4594         return a;
4595     }
4596 
4597     aExp -= 0x3FF;
4598     aSig |= LIT64( 0x0010000000000000 );
4599     zSign = aExp < 0;
4600     zSig = (uint64_t)aExp << 52;
4601     for (i = 1LL << 51; i > 0; i >>= 1) {
4602         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4603         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4604         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4605             aSig >>= 1;
4606             zSig |= i;
4607         }
4608     }
4609 
4610     if ( zSign )
4611         zSig = -zSig;
4612     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4613 }
4614 
4615 /*----------------------------------------------------------------------------
4616 | Returns 1 if the double-precision floating-point value `a' is equal to the
4617 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4618 | if either operand is a NaN.  Otherwise, the comparison is performed
4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4620 *----------------------------------------------------------------------------*/
4621 
4622 int float64_eq(float64 a, float64 b, float_status *status)
4623 {
4624     uint64_t av, bv;
4625     a = float64_squash_input_denormal(a, status);
4626     b = float64_squash_input_denormal(b, status);
4627 
4628     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4629          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4630        ) {
4631         float_raise(float_flag_invalid, status);
4632         return 0;
4633     }
4634     av = float64_val(a);
4635     bv = float64_val(b);
4636     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4637 
4638 }
4639 
4640 /*----------------------------------------------------------------------------
4641 | Returns 1 if the double-precision floating-point value `a' is less than or
4642 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4643 | exception is raised if either operand is a NaN.  The comparison is performed
4644 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4645 *----------------------------------------------------------------------------*/
4646 
4647 int float64_le(float64 a, float64 b, float_status *status)
4648 {
4649     flag aSign, bSign;
4650     uint64_t av, bv;
4651     a = float64_squash_input_denormal(a, status);
4652     b = float64_squash_input_denormal(b, status);
4653 
4654     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4655          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4656        ) {
4657         float_raise(float_flag_invalid, status);
4658         return 0;
4659     }
4660     aSign = extractFloat64Sign( a );
4661     bSign = extractFloat64Sign( b );
4662     av = float64_val(a);
4663     bv = float64_val(b);
4664     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4665     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4666 
4667 }
4668 
4669 /*----------------------------------------------------------------------------
4670 | Returns 1 if the double-precision floating-point value `a' is less than
4671 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4672 | raised if either operand is a NaN.  The comparison is performed according
4673 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4674 *----------------------------------------------------------------------------*/
4675 
4676 int float64_lt(float64 a, float64 b, float_status *status)
4677 {
4678     flag aSign, bSign;
4679     uint64_t av, bv;
4680 
4681     a = float64_squash_input_denormal(a, status);
4682     b = float64_squash_input_denormal(b, status);
4683     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4684          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4685        ) {
4686         float_raise(float_flag_invalid, status);
4687         return 0;
4688     }
4689     aSign = extractFloat64Sign( a );
4690     bSign = extractFloat64Sign( b );
4691     av = float64_val(a);
4692     bv = float64_val(b);
4693     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4694     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4695 
4696 }
4697 
4698 /*----------------------------------------------------------------------------
4699 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4700 | be compared, and 0 otherwise.  The invalid exception is raised if either
4701 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4702 | Standard for Binary Floating-Point Arithmetic.
4703 *----------------------------------------------------------------------------*/
4704 
4705 int float64_unordered(float64 a, float64 b, float_status *status)
4706 {
4707     a = float64_squash_input_denormal(a, status);
4708     b = float64_squash_input_denormal(b, status);
4709 
4710     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4711          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4712        ) {
4713         float_raise(float_flag_invalid, status);
4714         return 1;
4715     }
4716     return 0;
4717 }
4718 
4719 /*----------------------------------------------------------------------------
4720 | Returns 1 if the double-precision floating-point value `a' is equal to the
4721 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4722 | exception.  The comparison is performed according to the IEC/IEEE Standard
4723 | for Binary Floating-Point Arithmetic.
4724 *----------------------------------------------------------------------------*/
4725 
4726 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4727 {
4728     uint64_t av, bv;
4729     a = float64_squash_input_denormal(a, status);
4730     b = float64_squash_input_denormal(b, status);
4731 
4732     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4733          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4734        ) {
4735         if (float64_is_signaling_nan(a, status)
4736          || float64_is_signaling_nan(b, status)) {
4737             float_raise(float_flag_invalid, status);
4738         }
4739         return 0;
4740     }
4741     av = float64_val(a);
4742     bv = float64_val(b);
4743     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4744 
4745 }
4746 
4747 /*----------------------------------------------------------------------------
4748 | Returns 1 if the double-precision floating-point value `a' is less than or
4749 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4750 | cause an exception.  Otherwise, the comparison is performed according to the
4751 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4752 *----------------------------------------------------------------------------*/
4753 
4754 int float64_le_quiet(float64 a, float64 b, float_status *status)
4755 {
4756     flag aSign, bSign;
4757     uint64_t av, bv;
4758     a = float64_squash_input_denormal(a, status);
4759     b = float64_squash_input_denormal(b, status);
4760 
4761     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4762          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4763        ) {
4764         if (float64_is_signaling_nan(a, status)
4765          || float64_is_signaling_nan(b, status)) {
4766             float_raise(float_flag_invalid, status);
4767         }
4768         return 0;
4769     }
4770     aSign = extractFloat64Sign( a );
4771     bSign = extractFloat64Sign( b );
4772     av = float64_val(a);
4773     bv = float64_val(b);
4774     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4775     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4776 
4777 }
4778 
4779 /*----------------------------------------------------------------------------
4780 | Returns 1 if the double-precision floating-point value `a' is less than
4781 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4782 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4783 | Standard for Binary Floating-Point Arithmetic.
4784 *----------------------------------------------------------------------------*/
4785 
4786 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4787 {
4788     flag aSign, bSign;
4789     uint64_t av, bv;
4790     a = float64_squash_input_denormal(a, status);
4791     b = float64_squash_input_denormal(b, status);
4792 
4793     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4794          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4795        ) {
4796         if (float64_is_signaling_nan(a, status)
4797          || float64_is_signaling_nan(b, status)) {
4798             float_raise(float_flag_invalid, status);
4799         }
4800         return 0;
4801     }
4802     aSign = extractFloat64Sign( a );
4803     bSign = extractFloat64Sign( b );
4804     av = float64_val(a);
4805     bv = float64_val(b);
4806     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4807     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4808 
4809 }
4810 
4811 /*----------------------------------------------------------------------------
4812 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4813 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4814 | comparison is performed according to the IEC/IEEE Standard for Binary
4815 | Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4817 
4818 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4819 {
4820     a = float64_squash_input_denormal(a, status);
4821     b = float64_squash_input_denormal(b, status);
4822 
4823     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4824          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4825        ) {
4826         if (float64_is_signaling_nan(a, status)
4827          || float64_is_signaling_nan(b, status)) {
4828             float_raise(float_flag_invalid, status);
4829         }
4830         return 1;
4831     }
4832     return 0;
4833 }
4834 
4835 /*----------------------------------------------------------------------------
4836 | Returns the result of converting the extended double-precision floating-
4837 | point value `a' to the 32-bit two's complement integer format.  The
4838 | conversion is performed according to the IEC/IEEE Standard for Binary
4839 | Floating-Point Arithmetic---which means in particular that the conversion
4840 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4841 | largest positive integer is returned.  Otherwise, if the conversion
4842 | overflows, the largest integer with the same sign as `a' is returned.
4843 *----------------------------------------------------------------------------*/
4844 
4845 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4846 {
4847     flag aSign;
4848     int32_t aExp, shiftCount;
4849     uint64_t aSig;
4850 
4851     if (floatx80_invalid_encoding(a)) {
4852         float_raise(float_flag_invalid, status);
4853         return 1 << 31;
4854     }
4855     aSig = extractFloatx80Frac( a );
4856     aExp = extractFloatx80Exp( a );
4857     aSign = extractFloatx80Sign( a );
4858     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4859     shiftCount = 0x4037 - aExp;
4860     if ( shiftCount <= 0 ) shiftCount = 1;
4861     shift64RightJamming( aSig, shiftCount, &aSig );
4862     return roundAndPackInt32(aSign, aSig, status);
4863 
4864 }
4865 
4866 /*----------------------------------------------------------------------------
4867 | Returns the result of converting the extended double-precision floating-
4868 | point value `a' to the 32-bit two's complement integer format.  The
4869 | conversion is performed according to the IEC/IEEE Standard for Binary
4870 | Floating-Point Arithmetic, except that the conversion is always rounded
4871 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4872 | Otherwise, if the conversion overflows, the largest integer with the same
4873 | sign as `a' is returned.
4874 *----------------------------------------------------------------------------*/
4875 
4876 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4877 {
4878     flag aSign;
4879     int32_t aExp, shiftCount;
4880     uint64_t aSig, savedASig;
4881     int32_t z;
4882 
4883     if (floatx80_invalid_encoding(a)) {
4884         float_raise(float_flag_invalid, status);
4885         return 1 << 31;
4886     }
4887     aSig = extractFloatx80Frac( a );
4888     aExp = extractFloatx80Exp( a );
4889     aSign = extractFloatx80Sign( a );
4890     if ( 0x401E < aExp ) {
4891         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4892         goto invalid;
4893     }
4894     else if ( aExp < 0x3FFF ) {
4895         if (aExp || aSig) {
4896             status->float_exception_flags |= float_flag_inexact;
4897         }
4898         return 0;
4899     }
4900     shiftCount = 0x403E - aExp;
4901     savedASig = aSig;
4902     aSig >>= shiftCount;
4903     z = aSig;
4904     if ( aSign ) z = - z;
4905     if ( ( z < 0 ) ^ aSign ) {
4906  invalid:
4907         float_raise(float_flag_invalid, status);
4908         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4909     }
4910     if ( ( aSig<<shiftCount ) != savedASig ) {
4911         status->float_exception_flags |= float_flag_inexact;
4912     }
4913     return z;
4914 
4915 }
4916 
4917 /*----------------------------------------------------------------------------
4918 | Returns the result of converting the extended double-precision floating-
4919 | point value `a' to the 64-bit two's complement integer format.  The
4920 | conversion is performed according to the IEC/IEEE Standard for Binary
4921 | Floating-Point Arithmetic---which means in particular that the conversion
4922 | is rounded according to the current rounding mode.  If `a' is a NaN,
4923 | the largest positive integer is returned.  Otherwise, if the conversion
4924 | overflows, the largest integer with the same sign as `a' is returned.
4925 *----------------------------------------------------------------------------*/
4926 
4927 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4928 {
4929     flag aSign;
4930     int32_t aExp, shiftCount;
4931     uint64_t aSig, aSigExtra;
4932 
4933     if (floatx80_invalid_encoding(a)) {
4934         float_raise(float_flag_invalid, status);
4935         return 1ULL << 63;
4936     }
4937     aSig = extractFloatx80Frac( a );
4938     aExp = extractFloatx80Exp( a );
4939     aSign = extractFloatx80Sign( a );
4940     shiftCount = 0x403E - aExp;
4941     if ( shiftCount <= 0 ) {
4942         if ( shiftCount ) {
4943             float_raise(float_flag_invalid, status);
4944             if (    ! aSign
4945                  || (    ( aExp == 0x7FFF )
4946                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4947                ) {
4948                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4949             }
4950             return (int64_t) LIT64( 0x8000000000000000 );
4951         }
4952         aSigExtra = 0;
4953     }
4954     else {
4955         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4956     }
4957     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4958 
4959 }
4960 
4961 /*----------------------------------------------------------------------------
4962 | Returns the result of converting the extended double-precision floating-
4963 | point value `a' to the 64-bit two's complement integer format.  The
4964 | conversion is performed according to the IEC/IEEE Standard for Binary
4965 | Floating-Point Arithmetic, except that the conversion is always rounded
4966 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4967 | Otherwise, if the conversion overflows, the largest integer with the same
4968 | sign as `a' is returned.
4969 *----------------------------------------------------------------------------*/
4970 
4971 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4972 {
4973     flag aSign;
4974     int32_t aExp, shiftCount;
4975     uint64_t aSig;
4976     int64_t z;
4977 
4978     if (floatx80_invalid_encoding(a)) {
4979         float_raise(float_flag_invalid, status);
4980         return 1ULL << 63;
4981     }
4982     aSig = extractFloatx80Frac( a );
4983     aExp = extractFloatx80Exp( a );
4984     aSign = extractFloatx80Sign( a );
4985     shiftCount = aExp - 0x403E;
4986     if ( 0 <= shiftCount ) {
4987         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4988         if ( ( a.high != 0xC03E ) || aSig ) {
4989             float_raise(float_flag_invalid, status);
4990             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4991                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4992             }
4993         }
4994         return (int64_t) LIT64( 0x8000000000000000 );
4995     }
4996     else if ( aExp < 0x3FFF ) {
4997         if (aExp | aSig) {
4998             status->float_exception_flags |= float_flag_inexact;
4999         }
5000         return 0;
5001     }
5002     z = aSig>>( - shiftCount );
5003     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5004         status->float_exception_flags |= float_flag_inexact;
5005     }
5006     if ( aSign ) z = - z;
5007     return z;
5008 
5009 }
5010 
5011 /*----------------------------------------------------------------------------
5012 | Returns the result of converting the extended double-precision floating-
5013 | point value `a' to the single-precision floating-point format.  The
5014 | conversion is performed according to the IEC/IEEE Standard for Binary
5015 | Floating-Point Arithmetic.
5016 *----------------------------------------------------------------------------*/
5017 
5018 float32 floatx80_to_float32(floatx80 a, float_status *status)
5019 {
5020     flag aSign;
5021     int32_t aExp;
5022     uint64_t aSig;
5023 
5024     if (floatx80_invalid_encoding(a)) {
5025         float_raise(float_flag_invalid, status);
5026         return float32_default_nan(status);
5027     }
5028     aSig = extractFloatx80Frac( a );
5029     aExp = extractFloatx80Exp( a );
5030     aSign = extractFloatx80Sign( a );
5031     if ( aExp == 0x7FFF ) {
5032         if ( (uint64_t) ( aSig<<1 ) ) {
5033             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5034         }
5035         return packFloat32( aSign, 0xFF, 0 );
5036     }
5037     shift64RightJamming( aSig, 33, &aSig );
5038     if ( aExp || aSig ) aExp -= 0x3F81;
5039     return roundAndPackFloat32(aSign, aExp, aSig, status);
5040 
5041 }
5042 
5043 /*----------------------------------------------------------------------------
5044 | Returns the result of converting the extended double-precision floating-
5045 | point value `a' to the double-precision floating-point format.  The
5046 | conversion is performed according to the IEC/IEEE Standard for Binary
5047 | Floating-Point Arithmetic.
5048 *----------------------------------------------------------------------------*/
5049 
5050 float64 floatx80_to_float64(floatx80 a, float_status *status)
5051 {
5052     flag aSign;
5053     int32_t aExp;
5054     uint64_t aSig, zSig;
5055 
5056     if (floatx80_invalid_encoding(a)) {
5057         float_raise(float_flag_invalid, status);
5058         return float64_default_nan(status);
5059     }
5060     aSig = extractFloatx80Frac( a );
5061     aExp = extractFloatx80Exp( a );
5062     aSign = extractFloatx80Sign( a );
5063     if ( aExp == 0x7FFF ) {
5064         if ( (uint64_t) ( aSig<<1 ) ) {
5065             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5066         }
5067         return packFloat64( aSign, 0x7FF, 0 );
5068     }
5069     shift64RightJamming( aSig, 1, &zSig );
5070     if ( aExp || aSig ) aExp -= 0x3C01;
5071     return roundAndPackFloat64(aSign, aExp, zSig, status);
5072 
5073 }
5074 
5075 /*----------------------------------------------------------------------------
5076 | Returns the result of converting the extended double-precision floating-
5077 | point value `a' to the quadruple-precision floating-point format.  The
5078 | conversion is performed according to the IEC/IEEE Standard for Binary
5079 | Floating-Point Arithmetic.
5080 *----------------------------------------------------------------------------*/
5081 
5082 float128 floatx80_to_float128(floatx80 a, float_status *status)
5083 {
5084     flag aSign;
5085     int aExp;
5086     uint64_t aSig, zSig0, zSig1;
5087 
5088     if (floatx80_invalid_encoding(a)) {
5089         float_raise(float_flag_invalid, status);
5090         return float128_default_nan(status);
5091     }
5092     aSig = extractFloatx80Frac( a );
5093     aExp = extractFloatx80Exp( a );
5094     aSign = extractFloatx80Sign( a );
5095     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5096         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5097     }
5098     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5099     return packFloat128( aSign, aExp, zSig0, zSig1 );
5100 
5101 }
5102 
5103 /*----------------------------------------------------------------------------
5104 | Rounds the extended double-precision floating-point value `a'
5105 | to the precision provided by floatx80_rounding_precision and returns the
5106 | result as an extended double-precision floating-point value.
5107 | The operation is performed according to the IEC/IEEE Standard for Binary
5108 | Floating-Point Arithmetic.
5109 *----------------------------------------------------------------------------*/
5110 
5111 floatx80 floatx80_round(floatx80 a, float_status *status)
5112 {
5113     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5114                                 extractFloatx80Sign(a),
5115                                 extractFloatx80Exp(a),
5116                                 extractFloatx80Frac(a), 0, status);
5117 }
5118 
5119 /*----------------------------------------------------------------------------
5120 | Rounds the extended double-precision floating-point value `a' to an integer,
5121 | and returns the result as an extended quadruple-precision floating-point
5122 | value.  The operation is performed according to the IEC/IEEE Standard for
5123 | Binary Floating-Point Arithmetic.
5124 *----------------------------------------------------------------------------*/
5125 
5126 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5127 {
5128     flag aSign;
5129     int32_t aExp;
5130     uint64_t lastBitMask, roundBitsMask;
5131     floatx80 z;
5132 
5133     if (floatx80_invalid_encoding(a)) {
5134         float_raise(float_flag_invalid, status);
5135         return floatx80_default_nan(status);
5136     }
5137     aExp = extractFloatx80Exp( a );
5138     if ( 0x403E <= aExp ) {
5139         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5140             return propagateFloatx80NaN(a, a, status);
5141         }
5142         return a;
5143     }
5144     if ( aExp < 0x3FFF ) {
5145         if (    ( aExp == 0 )
5146              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5147             return a;
5148         }
5149         status->float_exception_flags |= float_flag_inexact;
5150         aSign = extractFloatx80Sign( a );
5151         switch (status->float_rounding_mode) {
5152          case float_round_nearest_even:
5153             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5154                ) {
5155                 return
5156                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5157             }
5158             break;
5159         case float_round_ties_away:
5160             if (aExp == 0x3FFE) {
5161                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5162             }
5163             break;
5164          case float_round_down:
5165             return
5166                   aSign ?
5167                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5168                 : packFloatx80( 0, 0, 0 );
5169          case float_round_up:
5170             return
5171                   aSign ? packFloatx80( 1, 0, 0 )
5172                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5173         }
5174         return packFloatx80( aSign, 0, 0 );
5175     }
5176     lastBitMask = 1;
5177     lastBitMask <<= 0x403E - aExp;
5178     roundBitsMask = lastBitMask - 1;
5179     z = a;
5180     switch (status->float_rounding_mode) {
5181     case float_round_nearest_even:
5182         z.low += lastBitMask>>1;
5183         if ((z.low & roundBitsMask) == 0) {
5184             z.low &= ~lastBitMask;
5185         }
5186         break;
5187     case float_round_ties_away:
5188         z.low += lastBitMask >> 1;
5189         break;
5190     case float_round_to_zero:
5191         break;
5192     case float_round_up:
5193         if (!extractFloatx80Sign(z)) {
5194             z.low += roundBitsMask;
5195         }
5196         break;
5197     case float_round_down:
5198         if (extractFloatx80Sign(z)) {
5199             z.low += roundBitsMask;
5200         }
5201         break;
5202     default:
5203         abort();
5204     }
5205     z.low &= ~ roundBitsMask;
5206     if ( z.low == 0 ) {
5207         ++z.high;
5208         z.low = LIT64( 0x8000000000000000 );
5209     }
5210     if (z.low != a.low) {
5211         status->float_exception_flags |= float_flag_inexact;
5212     }
5213     return z;
5214 
5215 }
5216 
5217 /*----------------------------------------------------------------------------
5218 | Returns the result of adding the absolute values of the extended double-
5219 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5220 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5221 | The addition is performed according to the IEC/IEEE Standard for Binary
5222 | Floating-Point Arithmetic.
5223 *----------------------------------------------------------------------------*/
5224 
5225 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5226                                 float_status *status)
5227 {
5228     int32_t aExp, bExp, zExp;
5229     uint64_t aSig, bSig, zSig0, zSig1;
5230     int32_t expDiff;
5231 
5232     aSig = extractFloatx80Frac( a );
5233     aExp = extractFloatx80Exp( a );
5234     bSig = extractFloatx80Frac( b );
5235     bExp = extractFloatx80Exp( b );
5236     expDiff = aExp - bExp;
5237     if ( 0 < expDiff ) {
5238         if ( aExp == 0x7FFF ) {
5239             if ((uint64_t)(aSig << 1)) {
5240                 return propagateFloatx80NaN(a, b, status);
5241             }
5242             return a;
5243         }
5244         if ( bExp == 0 ) --expDiff;
5245         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5246         zExp = aExp;
5247     }
5248     else if ( expDiff < 0 ) {
5249         if ( bExp == 0x7FFF ) {
5250             if ((uint64_t)(bSig << 1)) {
5251                 return propagateFloatx80NaN(a, b, status);
5252             }
5253             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5254         }
5255         if ( aExp == 0 ) ++expDiff;
5256         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5257         zExp = bExp;
5258     }
5259     else {
5260         if ( aExp == 0x7FFF ) {
5261             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5262                 return propagateFloatx80NaN(a, b, status);
5263             }
5264             return a;
5265         }
5266         zSig1 = 0;
5267         zSig0 = aSig + bSig;
5268         if ( aExp == 0 ) {
5269             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5270             goto roundAndPack;
5271         }
5272         zExp = aExp;
5273         goto shiftRight1;
5274     }
5275     zSig0 = aSig + bSig;
5276     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5277  shiftRight1:
5278     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5279     zSig0 |= LIT64( 0x8000000000000000 );
5280     ++zExp;
5281  roundAndPack:
5282     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5283                                 zSign, zExp, zSig0, zSig1, status);
5284 }
5285 
5286 /*----------------------------------------------------------------------------
5287 | Returns the result of subtracting the absolute values of the extended
5288 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5289 | difference is negated before being returned.  `zSign' is ignored if the
5290 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5291 | Standard for Binary Floating-Point Arithmetic.
5292 *----------------------------------------------------------------------------*/
5293 
5294 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5295                                 float_status *status)
5296 {
5297     int32_t aExp, bExp, zExp;
5298     uint64_t aSig, bSig, zSig0, zSig1;
5299     int32_t expDiff;
5300 
5301     aSig = extractFloatx80Frac( a );
5302     aExp = extractFloatx80Exp( a );
5303     bSig = extractFloatx80Frac( b );
5304     bExp = extractFloatx80Exp( b );
5305     expDiff = aExp - bExp;
5306     if ( 0 < expDiff ) goto aExpBigger;
5307     if ( expDiff < 0 ) goto bExpBigger;
5308     if ( aExp == 0x7FFF ) {
5309         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5310             return propagateFloatx80NaN(a, b, status);
5311         }
5312         float_raise(float_flag_invalid, status);
5313         return floatx80_default_nan(status);
5314     }
5315     if ( aExp == 0 ) {
5316         aExp = 1;
5317         bExp = 1;
5318     }
5319     zSig1 = 0;
5320     if ( bSig < aSig ) goto aBigger;
5321     if ( aSig < bSig ) goto bBigger;
5322     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5323  bExpBigger:
5324     if ( bExp == 0x7FFF ) {
5325         if ((uint64_t)(bSig << 1)) {
5326             return propagateFloatx80NaN(a, b, status);
5327         }
5328         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5329     }
5330     if ( aExp == 0 ) ++expDiff;
5331     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5332  bBigger:
5333     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5334     zExp = bExp;
5335     zSign ^= 1;
5336     goto normalizeRoundAndPack;
5337  aExpBigger:
5338     if ( aExp == 0x7FFF ) {
5339         if ((uint64_t)(aSig << 1)) {
5340             return propagateFloatx80NaN(a, b, status);
5341         }
5342         return a;
5343     }
5344     if ( bExp == 0 ) --expDiff;
5345     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5346  aBigger:
5347     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5348     zExp = aExp;
5349  normalizeRoundAndPack:
5350     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5351                                          zSign, zExp, zSig0, zSig1, status);
5352 }
5353 
5354 /*----------------------------------------------------------------------------
5355 | Returns the result of adding the extended double-precision floating-point
5356 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5357 | Standard for Binary Floating-Point Arithmetic.
5358 *----------------------------------------------------------------------------*/
5359 
5360 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5361 {
5362     flag aSign, bSign;
5363 
5364     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5365         float_raise(float_flag_invalid, status);
5366         return floatx80_default_nan(status);
5367     }
5368     aSign = extractFloatx80Sign( a );
5369     bSign = extractFloatx80Sign( b );
5370     if ( aSign == bSign ) {
5371         return addFloatx80Sigs(a, b, aSign, status);
5372     }
5373     else {
5374         return subFloatx80Sigs(a, b, aSign, status);
5375     }
5376 
5377 }
5378 
5379 /*----------------------------------------------------------------------------
5380 | Returns the result of subtracting the extended double-precision floating-
5381 | point values `a' and `b'.  The operation is performed according to the
5382 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5383 *----------------------------------------------------------------------------*/
5384 
5385 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5386 {
5387     flag aSign, bSign;
5388 
5389     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5390         float_raise(float_flag_invalid, status);
5391         return floatx80_default_nan(status);
5392     }
5393     aSign = extractFloatx80Sign( a );
5394     bSign = extractFloatx80Sign( b );
5395     if ( aSign == bSign ) {
5396         return subFloatx80Sigs(a, b, aSign, status);
5397     }
5398     else {
5399         return addFloatx80Sigs(a, b, aSign, status);
5400     }
5401 
5402 }
5403 
5404 /*----------------------------------------------------------------------------
5405 | Returns the result of multiplying the extended double-precision floating-
5406 | point values `a' and `b'.  The operation is performed according to the
5407 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5409 
5410 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5411 {
5412     flag aSign, bSign, zSign;
5413     int32_t aExp, bExp, zExp;
5414     uint64_t aSig, bSig, zSig0, zSig1;
5415 
5416     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5417         float_raise(float_flag_invalid, status);
5418         return floatx80_default_nan(status);
5419     }
5420     aSig = extractFloatx80Frac( a );
5421     aExp = extractFloatx80Exp( a );
5422     aSign = extractFloatx80Sign( a );
5423     bSig = extractFloatx80Frac( b );
5424     bExp = extractFloatx80Exp( b );
5425     bSign = extractFloatx80Sign( b );
5426     zSign = aSign ^ bSign;
5427     if ( aExp == 0x7FFF ) {
5428         if (    (uint64_t) ( aSig<<1 )
5429              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5430             return propagateFloatx80NaN(a, b, status);
5431         }
5432         if ( ( bExp | bSig ) == 0 ) goto invalid;
5433         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5434     }
5435     if ( bExp == 0x7FFF ) {
5436         if ((uint64_t)(bSig << 1)) {
5437             return propagateFloatx80NaN(a, b, status);
5438         }
5439         if ( ( aExp | aSig ) == 0 ) {
5440  invalid:
5441             float_raise(float_flag_invalid, status);
5442             return floatx80_default_nan(status);
5443         }
5444         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5445     }
5446     if ( aExp == 0 ) {
5447         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5448         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5449     }
5450     if ( bExp == 0 ) {
5451         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5452         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5453     }
5454     zExp = aExp + bExp - 0x3FFE;
5455     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5456     if ( 0 < (int64_t) zSig0 ) {
5457         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5458         --zExp;
5459     }
5460     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5461                                 zSign, zExp, zSig0, zSig1, status);
5462 }
5463 
5464 /*----------------------------------------------------------------------------
5465 | Returns the result of dividing the extended double-precision floating-point
5466 | value `a' by the corresponding value `b'.  The operation is performed
5467 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5468 *----------------------------------------------------------------------------*/
5469 
5470 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5471 {
5472     flag aSign, bSign, zSign;
5473     int32_t aExp, bExp, zExp;
5474     uint64_t aSig, bSig, zSig0, zSig1;
5475     uint64_t rem0, rem1, rem2, term0, term1, term2;
5476 
5477     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5478         float_raise(float_flag_invalid, status);
5479         return floatx80_default_nan(status);
5480     }
5481     aSig = extractFloatx80Frac( a );
5482     aExp = extractFloatx80Exp( a );
5483     aSign = extractFloatx80Sign( a );
5484     bSig = extractFloatx80Frac( b );
5485     bExp = extractFloatx80Exp( b );
5486     bSign = extractFloatx80Sign( b );
5487     zSign = aSign ^ bSign;
5488     if ( aExp == 0x7FFF ) {
5489         if ((uint64_t)(aSig << 1)) {
5490             return propagateFloatx80NaN(a, b, status);
5491         }
5492         if ( bExp == 0x7FFF ) {
5493             if ((uint64_t)(bSig << 1)) {
5494                 return propagateFloatx80NaN(a, b, status);
5495             }
5496             goto invalid;
5497         }
5498         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5499     }
5500     if ( bExp == 0x7FFF ) {
5501         if ((uint64_t)(bSig << 1)) {
5502             return propagateFloatx80NaN(a, b, status);
5503         }
5504         return packFloatx80( zSign, 0, 0 );
5505     }
5506     if ( bExp == 0 ) {
5507         if ( bSig == 0 ) {
5508             if ( ( aExp | aSig ) == 0 ) {
5509  invalid:
5510                 float_raise(float_flag_invalid, status);
5511                 return floatx80_default_nan(status);
5512             }
5513             float_raise(float_flag_divbyzero, status);
5514             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5515         }
5516         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5517     }
5518     if ( aExp == 0 ) {
5519         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5520         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5521     }
5522     zExp = aExp - bExp + 0x3FFE;
5523     rem1 = 0;
5524     if ( bSig <= aSig ) {
5525         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5526         ++zExp;
5527     }
5528     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5529     mul64To128( bSig, zSig0, &term0, &term1 );
5530     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5531     while ( (int64_t) rem0 < 0 ) {
5532         --zSig0;
5533         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5534     }
5535     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5536     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5537         mul64To128( bSig, zSig1, &term1, &term2 );
5538         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5539         while ( (int64_t) rem1 < 0 ) {
5540             --zSig1;
5541             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5542         }
5543         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5544     }
5545     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5546                                 zSign, zExp, zSig0, zSig1, status);
5547 }
5548 
5549 /*----------------------------------------------------------------------------
5550 | Returns the remainder of the extended double-precision floating-point value
5551 | `a' with respect to the corresponding value `b'.  The operation is performed
5552 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5553 *----------------------------------------------------------------------------*/
5554 
5555 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5556 {
5557     flag aSign, zSign;
5558     int32_t aExp, bExp, expDiff;
5559     uint64_t aSig0, aSig1, bSig;
5560     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5561 
5562     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5563         float_raise(float_flag_invalid, status);
5564         return floatx80_default_nan(status);
5565     }
5566     aSig0 = extractFloatx80Frac( a );
5567     aExp = extractFloatx80Exp( a );
5568     aSign = extractFloatx80Sign( a );
5569     bSig = extractFloatx80Frac( b );
5570     bExp = extractFloatx80Exp( b );
5571     if ( aExp == 0x7FFF ) {
5572         if (    (uint64_t) ( aSig0<<1 )
5573              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5574             return propagateFloatx80NaN(a, b, status);
5575         }
5576         goto invalid;
5577     }
5578     if ( bExp == 0x7FFF ) {
5579         if ((uint64_t)(bSig << 1)) {
5580             return propagateFloatx80NaN(a, b, status);
5581         }
5582         return a;
5583     }
5584     if ( bExp == 0 ) {
5585         if ( bSig == 0 ) {
5586  invalid:
5587             float_raise(float_flag_invalid, status);
5588             return floatx80_default_nan(status);
5589         }
5590         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5591     }
5592     if ( aExp == 0 ) {
5593         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5594         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5595     }
5596     bSig |= LIT64( 0x8000000000000000 );
5597     zSign = aSign;
5598     expDiff = aExp - bExp;
5599     aSig1 = 0;
5600     if ( expDiff < 0 ) {
5601         if ( expDiff < -1 ) return a;
5602         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5603         expDiff = 0;
5604     }
5605     q = ( bSig <= aSig0 );
5606     if ( q ) aSig0 -= bSig;
5607     expDiff -= 64;
5608     while ( 0 < expDiff ) {
5609         q = estimateDiv128To64( aSig0, aSig1, bSig );
5610         q = ( 2 < q ) ? q - 2 : 0;
5611         mul64To128( bSig, q, &term0, &term1 );
5612         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5613         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5614         expDiff -= 62;
5615     }
5616     expDiff += 64;
5617     if ( 0 < expDiff ) {
5618         q = estimateDiv128To64( aSig0, aSig1, bSig );
5619         q = ( 2 < q ) ? q - 2 : 0;
5620         q >>= 64 - expDiff;
5621         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5622         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5623         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5624         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5625             ++q;
5626             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5627         }
5628     }
5629     else {
5630         term1 = 0;
5631         term0 = bSig;
5632     }
5633     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5634     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5635          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5636               && ( q & 1 ) )
5637        ) {
5638         aSig0 = alternateASig0;
5639         aSig1 = alternateASig1;
5640         zSign = ! zSign;
5641     }
5642     return
5643         normalizeRoundAndPackFloatx80(
5644             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5645 
5646 }
5647 
5648 /*----------------------------------------------------------------------------
5649 | Returns the square root of the extended double-precision floating-point
5650 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5651 | for Binary Floating-Point Arithmetic.
5652 *----------------------------------------------------------------------------*/
5653 
5654 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5655 {
5656     flag aSign;
5657     int32_t aExp, zExp;
5658     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5659     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5660 
5661     if (floatx80_invalid_encoding(a)) {
5662         float_raise(float_flag_invalid, status);
5663         return floatx80_default_nan(status);
5664     }
5665     aSig0 = extractFloatx80Frac( a );
5666     aExp = extractFloatx80Exp( a );
5667     aSign = extractFloatx80Sign( a );
5668     if ( aExp == 0x7FFF ) {
5669         if ((uint64_t)(aSig0 << 1)) {
5670             return propagateFloatx80NaN(a, a, status);
5671         }
5672         if ( ! aSign ) return a;
5673         goto invalid;
5674     }
5675     if ( aSign ) {
5676         if ( ( aExp | aSig0 ) == 0 ) return a;
5677  invalid:
5678         float_raise(float_flag_invalid, status);
5679         return floatx80_default_nan(status);
5680     }
5681     if ( aExp == 0 ) {
5682         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5683         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5684     }
5685     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5686     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5687     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5688     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5689     doubleZSig0 = zSig0<<1;
5690     mul64To128( zSig0, zSig0, &term0, &term1 );
5691     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5692     while ( (int64_t) rem0 < 0 ) {
5693         --zSig0;
5694         doubleZSig0 -= 2;
5695         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5696     }
5697     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5698     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5699         if ( zSig1 == 0 ) zSig1 = 1;
5700         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5701         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5702         mul64To128( zSig1, zSig1, &term2, &term3 );
5703         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5704         while ( (int64_t) rem1 < 0 ) {
5705             --zSig1;
5706             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5707             term3 |= 1;
5708             term2 |= doubleZSig0;
5709             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5710         }
5711         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5712     }
5713     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5714     zSig0 |= doubleZSig0;
5715     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5716                                 0, zExp, zSig0, zSig1, status);
5717 }
5718 
5719 /*----------------------------------------------------------------------------
5720 | Returns 1 if the extended double-precision floating-point value `a' is equal
5721 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5722 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5723 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5724 *----------------------------------------------------------------------------*/
5725 
5726 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5727 {
5728 
5729     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5730         || (extractFloatx80Exp(a) == 0x7FFF
5731             && (uint64_t) (extractFloatx80Frac(a) << 1))
5732         || (extractFloatx80Exp(b) == 0x7FFF
5733             && (uint64_t) (extractFloatx80Frac(b) << 1))
5734        ) {
5735         float_raise(float_flag_invalid, status);
5736         return 0;
5737     }
5738     return
5739            ( a.low == b.low )
5740         && (    ( a.high == b.high )
5741              || (    ( a.low == 0 )
5742                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5743            );
5744 
5745 }
5746 
5747 /*----------------------------------------------------------------------------
5748 | Returns 1 if the extended double-precision floating-point value `a' is
5749 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5750 | invalid exception is raised if either operand is a NaN.  The comparison is
5751 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5752 | Arithmetic.
5753 *----------------------------------------------------------------------------*/
5754 
5755 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5756 {
5757     flag aSign, bSign;
5758 
5759     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5760         || (extractFloatx80Exp(a) == 0x7FFF
5761             && (uint64_t) (extractFloatx80Frac(a) << 1))
5762         || (extractFloatx80Exp(b) == 0x7FFF
5763             && (uint64_t) (extractFloatx80Frac(b) << 1))
5764        ) {
5765         float_raise(float_flag_invalid, status);
5766         return 0;
5767     }
5768     aSign = extractFloatx80Sign( a );
5769     bSign = extractFloatx80Sign( b );
5770     if ( aSign != bSign ) {
5771         return
5772                aSign
5773             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5774                  == 0 );
5775     }
5776     return
5777           aSign ? le128( b.high, b.low, a.high, a.low )
5778         : le128( a.high, a.low, b.high, b.low );
5779 
5780 }
5781 
5782 /*----------------------------------------------------------------------------
5783 | Returns 1 if the extended double-precision floating-point value `a' is
5784 | less than the corresponding value `b', and 0 otherwise.  The invalid
5785 | exception is raised if either operand is a NaN.  The comparison is performed
5786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5787 *----------------------------------------------------------------------------*/
5788 
5789 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5790 {
5791     flag aSign, bSign;
5792 
5793     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5794         || (extractFloatx80Exp(a) == 0x7FFF
5795             && (uint64_t) (extractFloatx80Frac(a) << 1))
5796         || (extractFloatx80Exp(b) == 0x7FFF
5797             && (uint64_t) (extractFloatx80Frac(b) << 1))
5798        ) {
5799         float_raise(float_flag_invalid, status);
5800         return 0;
5801     }
5802     aSign = extractFloatx80Sign( a );
5803     bSign = extractFloatx80Sign( b );
5804     if ( aSign != bSign ) {
5805         return
5806                aSign
5807             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5808                  != 0 );
5809     }
5810     return
5811           aSign ? lt128( b.high, b.low, a.high, a.low )
5812         : lt128( a.high, a.low, b.high, b.low );
5813 
5814 }
5815 
5816 /*----------------------------------------------------------------------------
5817 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5818 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5819 | either operand is a NaN.   The comparison is performed according to the
5820 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5821 *----------------------------------------------------------------------------*/
5822 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5823 {
5824     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5825         || (extractFloatx80Exp(a) == 0x7FFF
5826             && (uint64_t) (extractFloatx80Frac(a) << 1))
5827         || (extractFloatx80Exp(b) == 0x7FFF
5828             && (uint64_t) (extractFloatx80Frac(b) << 1))
5829        ) {
5830         float_raise(float_flag_invalid, status);
5831         return 1;
5832     }
5833     return 0;
5834 }
5835 
5836 /*----------------------------------------------------------------------------
5837 | Returns 1 if the extended double-precision floating-point value `a' is
5838 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5839 | cause an exception.  The comparison is performed according to the IEC/IEEE
5840 | Standard for Binary Floating-Point Arithmetic.
5841 *----------------------------------------------------------------------------*/
5842 
5843 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5844 {
5845 
5846     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5847         float_raise(float_flag_invalid, status);
5848         return 0;
5849     }
5850     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5851               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5852          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5853               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5854        ) {
5855         if (floatx80_is_signaling_nan(a, status)
5856          || floatx80_is_signaling_nan(b, status)) {
5857             float_raise(float_flag_invalid, status);
5858         }
5859         return 0;
5860     }
5861     return
5862            ( a.low == b.low )
5863         && (    ( a.high == b.high )
5864              || (    ( a.low == 0 )
5865                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5866            );
5867 
5868 }
5869 
5870 /*----------------------------------------------------------------------------
5871 | Returns 1 if the extended double-precision floating-point value `a' is less
5872 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5873 | do not cause an exception.  Otherwise, the comparison is performed according
5874 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5875 *----------------------------------------------------------------------------*/
5876 
5877 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5878 {
5879     flag aSign, bSign;
5880 
5881     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5882         float_raise(float_flag_invalid, status);
5883         return 0;
5884     }
5885     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5886               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5887          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5888               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5889        ) {
5890         if (floatx80_is_signaling_nan(a, status)
5891          || floatx80_is_signaling_nan(b, status)) {
5892             float_raise(float_flag_invalid, status);
5893         }
5894         return 0;
5895     }
5896     aSign = extractFloatx80Sign( a );
5897     bSign = extractFloatx80Sign( b );
5898     if ( aSign != bSign ) {
5899         return
5900                aSign
5901             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5902                  == 0 );
5903     }
5904     return
5905           aSign ? le128( b.high, b.low, a.high, a.low )
5906         : le128( a.high, a.low, b.high, b.low );
5907 
5908 }
5909 
5910 /*----------------------------------------------------------------------------
5911 | Returns 1 if the extended double-precision floating-point value `a' is less
5912 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5913 | an exception.  Otherwise, the comparison is performed according to the
5914 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5915 *----------------------------------------------------------------------------*/
5916 
5917 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5918 {
5919     flag aSign, bSign;
5920 
5921     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5922         float_raise(float_flag_invalid, status);
5923         return 0;
5924     }
5925     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5926               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5927          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5928               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5929        ) {
5930         if (floatx80_is_signaling_nan(a, status)
5931          || floatx80_is_signaling_nan(b, status)) {
5932             float_raise(float_flag_invalid, status);
5933         }
5934         return 0;
5935     }
5936     aSign = extractFloatx80Sign( a );
5937     bSign = extractFloatx80Sign( b );
5938     if ( aSign != bSign ) {
5939         return
5940                aSign
5941             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5942                  != 0 );
5943     }
5944     return
5945           aSign ? lt128( b.high, b.low, a.high, a.low )
5946         : lt128( a.high, a.low, b.high, b.low );
5947 
5948 }
5949 
5950 /*----------------------------------------------------------------------------
5951 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5952 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5953 | The comparison is performed according to the IEC/IEEE Standard for Binary
5954 | Floating-Point Arithmetic.
5955 *----------------------------------------------------------------------------*/
5956 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5957 {
5958     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5959         float_raise(float_flag_invalid, status);
5960         return 1;
5961     }
5962     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5963               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5964          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5965               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5966        ) {
5967         if (floatx80_is_signaling_nan(a, status)
5968          || floatx80_is_signaling_nan(b, status)) {
5969             float_raise(float_flag_invalid, status);
5970         }
5971         return 1;
5972     }
5973     return 0;
5974 }
5975 
5976 /*----------------------------------------------------------------------------
5977 | Returns the result of converting the quadruple-precision floating-point
5978 | value `a' to the 32-bit two's complement integer format.  The conversion
5979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5980 | Arithmetic---which means in particular that the conversion is rounded
5981 | according to the current rounding mode.  If `a' is a NaN, the largest
5982 | positive integer is returned.  Otherwise, if the conversion overflows, the
5983 | largest integer with the same sign as `a' is returned.
5984 *----------------------------------------------------------------------------*/
5985 
5986 int32_t float128_to_int32(float128 a, float_status *status)
5987 {
5988     flag aSign;
5989     int32_t aExp, shiftCount;
5990     uint64_t aSig0, aSig1;
5991 
5992     aSig1 = extractFloat128Frac1( a );
5993     aSig0 = extractFloat128Frac0( a );
5994     aExp = extractFloat128Exp( a );
5995     aSign = extractFloat128Sign( a );
5996     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5997     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5998     aSig0 |= ( aSig1 != 0 );
5999     shiftCount = 0x4028 - aExp;
6000     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6001     return roundAndPackInt32(aSign, aSig0, status);
6002 
6003 }
6004 
6005 /*----------------------------------------------------------------------------
6006 | Returns the result of converting the quadruple-precision floating-point
6007 | value `a' to the 32-bit two's complement integer format.  The conversion
6008 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6009 | Arithmetic, except that the conversion is always rounded toward zero.  If
6010 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6011 | conversion overflows, the largest integer with the same sign as `a' is
6012 | returned.
6013 *----------------------------------------------------------------------------*/
6014 
6015 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6016 {
6017     flag aSign;
6018     int32_t aExp, shiftCount;
6019     uint64_t aSig0, aSig1, savedASig;
6020     int32_t z;
6021 
6022     aSig1 = extractFloat128Frac1( a );
6023     aSig0 = extractFloat128Frac0( a );
6024     aExp = extractFloat128Exp( a );
6025     aSign = extractFloat128Sign( a );
6026     aSig0 |= ( aSig1 != 0 );
6027     if ( 0x401E < aExp ) {
6028         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6029         goto invalid;
6030     }
6031     else if ( aExp < 0x3FFF ) {
6032         if (aExp || aSig0) {
6033             status->float_exception_flags |= float_flag_inexact;
6034         }
6035         return 0;
6036     }
6037     aSig0 |= LIT64( 0x0001000000000000 );
6038     shiftCount = 0x402F - aExp;
6039     savedASig = aSig0;
6040     aSig0 >>= shiftCount;
6041     z = aSig0;
6042     if ( aSign ) z = - z;
6043     if ( ( z < 0 ) ^ aSign ) {
6044  invalid:
6045         float_raise(float_flag_invalid, status);
6046         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6047     }
6048     if ( ( aSig0<<shiftCount ) != savedASig ) {
6049         status->float_exception_flags |= float_flag_inexact;
6050     }
6051     return z;
6052 
6053 }
6054 
6055 /*----------------------------------------------------------------------------
6056 | Returns the result of converting the quadruple-precision floating-point
6057 | value `a' to the 64-bit two's complement integer format.  The conversion
6058 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6059 | Arithmetic---which means in particular that the conversion is rounded
6060 | according to the current rounding mode.  If `a' is a NaN, the largest
6061 | positive integer is returned.  Otherwise, if the conversion overflows, the
6062 | largest integer with the same sign as `a' is returned.
6063 *----------------------------------------------------------------------------*/
6064 
6065 int64_t float128_to_int64(float128 a, float_status *status)
6066 {
6067     flag aSign;
6068     int32_t aExp, shiftCount;
6069     uint64_t aSig0, aSig1;
6070 
6071     aSig1 = extractFloat128Frac1( a );
6072     aSig0 = extractFloat128Frac0( a );
6073     aExp = extractFloat128Exp( a );
6074     aSign = extractFloat128Sign( a );
6075     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6076     shiftCount = 0x402F - aExp;
6077     if ( shiftCount <= 0 ) {
6078         if ( 0x403E < aExp ) {
6079             float_raise(float_flag_invalid, status);
6080             if (    ! aSign
6081                  || (    ( aExp == 0x7FFF )
6082                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6083                     )
6084                ) {
6085                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6086             }
6087             return (int64_t) LIT64( 0x8000000000000000 );
6088         }
6089         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6090     }
6091     else {
6092         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6093     }
6094     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6095 
6096 }
6097 
6098 /*----------------------------------------------------------------------------
6099 | Returns the result of converting the quadruple-precision floating-point
6100 | value `a' to the 64-bit two's complement integer format.  The conversion
6101 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6102 | Arithmetic, except that the conversion is always rounded toward zero.
6103 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6104 | the conversion overflows, the largest integer with the same sign as `a' is
6105 | returned.
6106 *----------------------------------------------------------------------------*/
6107 
6108 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6109 {
6110     flag aSign;
6111     int32_t aExp, shiftCount;
6112     uint64_t aSig0, aSig1;
6113     int64_t z;
6114 
6115     aSig1 = extractFloat128Frac1( a );
6116     aSig0 = extractFloat128Frac0( a );
6117     aExp = extractFloat128Exp( a );
6118     aSign = extractFloat128Sign( a );
6119     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6120     shiftCount = aExp - 0x402F;
6121     if ( 0 < shiftCount ) {
6122         if ( 0x403E <= aExp ) {
6123             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6124             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6125                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6126                 if (aSig1) {
6127                     status->float_exception_flags |= float_flag_inexact;
6128                 }
6129             }
6130             else {
6131                 float_raise(float_flag_invalid, status);
6132                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6133                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6134                 }
6135             }
6136             return (int64_t) LIT64( 0x8000000000000000 );
6137         }
6138         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6139         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6140             status->float_exception_flags |= float_flag_inexact;
6141         }
6142     }
6143     else {
6144         if ( aExp < 0x3FFF ) {
6145             if ( aExp | aSig0 | aSig1 ) {
6146                 status->float_exception_flags |= float_flag_inexact;
6147             }
6148             return 0;
6149         }
6150         z = aSig0>>( - shiftCount );
6151         if (    aSig1
6152              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6153             status->float_exception_flags |= float_flag_inexact;
6154         }
6155     }
6156     if ( aSign ) z = - z;
6157     return z;
6158 
6159 }
6160 
6161 /*----------------------------------------------------------------------------
6162 | Returns the result of converting the quadruple-precision floating-point value
6163 | `a' to the 64-bit unsigned integer format.  The conversion is
6164 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6165 | Arithmetic---which means in particular that the conversion is rounded
6166 | according to the current rounding mode.  If `a' is a NaN, the largest
6167 | positive integer is returned.  If the conversion overflows, the
6168 | largest unsigned integer is returned.  If 'a' is negative, the value is
6169 | rounded and zero is returned; negative values that do not round to zero
6170 | will raise the inexact exception.
6171 *----------------------------------------------------------------------------*/
6172 
6173 uint64_t float128_to_uint64(float128 a, float_status *status)
6174 {
6175     flag aSign;
6176     int aExp;
6177     int shiftCount;
6178     uint64_t aSig0, aSig1;
6179 
6180     aSig0 = extractFloat128Frac0(a);
6181     aSig1 = extractFloat128Frac1(a);
6182     aExp = extractFloat128Exp(a);
6183     aSign = extractFloat128Sign(a);
6184     if (aSign && (aExp > 0x3FFE)) {
6185         float_raise(float_flag_invalid, status);
6186         if (float128_is_any_nan(a)) {
6187             return LIT64(0xFFFFFFFFFFFFFFFF);
6188         } else {
6189             return 0;
6190         }
6191     }
6192     if (aExp) {
6193         aSig0 |= LIT64(0x0001000000000000);
6194     }
6195     shiftCount = 0x402F - aExp;
6196     if (shiftCount <= 0) {
6197         if (0x403E < aExp) {
6198             float_raise(float_flag_invalid, status);
6199             return LIT64(0xFFFFFFFFFFFFFFFF);
6200         }
6201         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6202     } else {
6203         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6204     }
6205     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6206 }
6207 
6208 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6209 {
6210     uint64_t v;
6211     signed char current_rounding_mode = status->float_rounding_mode;
6212 
6213     set_float_rounding_mode(float_round_to_zero, status);
6214     v = float128_to_uint64(a, status);
6215     set_float_rounding_mode(current_rounding_mode, status);
6216 
6217     return v;
6218 }
6219 
6220 /*----------------------------------------------------------------------------
6221 | Returns the result of converting the quadruple-precision floating-point
6222 | value `a' to the 32-bit unsigned integer format.  The conversion
6223 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6224 | Arithmetic except that the conversion is always rounded toward zero.
6225 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6226 | if the conversion overflows, the largest unsigned integer is returned.
6227 | If 'a' is negative, the value is rounded and zero is returned; negative
6228 | values that do not round to zero will raise the inexact exception.
6229 *----------------------------------------------------------------------------*/
6230 
6231 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6232 {
6233     uint64_t v;
6234     uint32_t res;
6235     int old_exc_flags = get_float_exception_flags(status);
6236 
6237     v = float128_to_uint64_round_to_zero(a, status);
6238     if (v > 0xffffffff) {
6239         res = 0xffffffff;
6240     } else {
6241         return v;
6242     }
6243     set_float_exception_flags(old_exc_flags, status);
6244     float_raise(float_flag_invalid, status);
6245     return res;
6246 }
6247 
6248 /*----------------------------------------------------------------------------
6249 | Returns the result of converting the quadruple-precision floating-point
6250 | value `a' to the single-precision floating-point format.  The conversion
6251 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6252 | Arithmetic.
6253 *----------------------------------------------------------------------------*/
6254 
6255 float32 float128_to_float32(float128 a, float_status *status)
6256 {
6257     flag aSign;
6258     int32_t aExp;
6259     uint64_t aSig0, aSig1;
6260     uint32_t zSig;
6261 
6262     aSig1 = extractFloat128Frac1( a );
6263     aSig0 = extractFloat128Frac0( a );
6264     aExp = extractFloat128Exp( a );
6265     aSign = extractFloat128Sign( a );
6266     if ( aExp == 0x7FFF ) {
6267         if ( aSig0 | aSig1 ) {
6268             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6269         }
6270         return packFloat32( aSign, 0xFF, 0 );
6271     }
6272     aSig0 |= ( aSig1 != 0 );
6273     shift64RightJamming( aSig0, 18, &aSig0 );
6274     zSig = aSig0;
6275     if ( aExp || zSig ) {
6276         zSig |= 0x40000000;
6277         aExp -= 0x3F81;
6278     }
6279     return roundAndPackFloat32(aSign, aExp, zSig, status);
6280 
6281 }
6282 
6283 /*----------------------------------------------------------------------------
6284 | Returns the result of converting the quadruple-precision floating-point
6285 | value `a' to the double-precision floating-point format.  The conversion
6286 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6287 | Arithmetic.
6288 *----------------------------------------------------------------------------*/
6289 
6290 float64 float128_to_float64(float128 a, float_status *status)
6291 {
6292     flag aSign;
6293     int32_t aExp;
6294     uint64_t aSig0, aSig1;
6295 
6296     aSig1 = extractFloat128Frac1( a );
6297     aSig0 = extractFloat128Frac0( a );
6298     aExp = extractFloat128Exp( a );
6299     aSign = extractFloat128Sign( a );
6300     if ( aExp == 0x7FFF ) {
6301         if ( aSig0 | aSig1 ) {
6302             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6303         }
6304         return packFloat64( aSign, 0x7FF, 0 );
6305     }
6306     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6307     aSig0 |= ( aSig1 != 0 );
6308     if ( aExp || aSig0 ) {
6309         aSig0 |= LIT64( 0x4000000000000000 );
6310         aExp -= 0x3C01;
6311     }
6312     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6313 
6314 }
6315 
6316 /*----------------------------------------------------------------------------
6317 | Returns the result of converting the quadruple-precision floating-point
6318 | value `a' to the extended double-precision floating-point format.  The
6319 | conversion is performed according to the IEC/IEEE Standard for Binary
6320 | Floating-Point Arithmetic.
6321 *----------------------------------------------------------------------------*/
6322 
6323 floatx80 float128_to_floatx80(float128 a, float_status *status)
6324 {
6325     flag aSign;
6326     int32_t aExp;
6327     uint64_t aSig0, aSig1;
6328 
6329     aSig1 = extractFloat128Frac1( a );
6330     aSig0 = extractFloat128Frac0( a );
6331     aExp = extractFloat128Exp( a );
6332     aSign = extractFloat128Sign( a );
6333     if ( aExp == 0x7FFF ) {
6334         if ( aSig0 | aSig1 ) {
6335             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6336         }
6337         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6338     }
6339     if ( aExp == 0 ) {
6340         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6341         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6342     }
6343     else {
6344         aSig0 |= LIT64( 0x0001000000000000 );
6345     }
6346     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6347     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6348 
6349 }
6350 
6351 /*----------------------------------------------------------------------------
6352 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6353 | returns the result as a quadruple-precision floating-point value.  The
6354 | operation is performed according to the IEC/IEEE Standard for Binary
6355 | Floating-Point Arithmetic.
6356 *----------------------------------------------------------------------------*/
6357 
6358 float128 float128_round_to_int(float128 a, float_status *status)
6359 {
6360     flag aSign;
6361     int32_t aExp;
6362     uint64_t lastBitMask, roundBitsMask;
6363     float128 z;
6364 
6365     aExp = extractFloat128Exp( a );
6366     if ( 0x402F <= aExp ) {
6367         if ( 0x406F <= aExp ) {
6368             if (    ( aExp == 0x7FFF )
6369                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6370                ) {
6371                 return propagateFloat128NaN(a, a, status);
6372             }
6373             return a;
6374         }
6375         lastBitMask = 1;
6376         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6377         roundBitsMask = lastBitMask - 1;
6378         z = a;
6379         switch (status->float_rounding_mode) {
6380         case float_round_nearest_even:
6381             if ( lastBitMask ) {
6382                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6383                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6384             }
6385             else {
6386                 if ( (int64_t) z.low < 0 ) {
6387                     ++z.high;
6388                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6389                 }
6390             }
6391             break;
6392         case float_round_ties_away:
6393             if (lastBitMask) {
6394                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6395             } else {
6396                 if ((int64_t) z.low < 0) {
6397                     ++z.high;
6398                 }
6399             }
6400             break;
6401         case float_round_to_zero:
6402             break;
6403         case float_round_up:
6404             if (!extractFloat128Sign(z)) {
6405                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6406             }
6407             break;
6408         case float_round_down:
6409             if (extractFloat128Sign(z)) {
6410                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6411             }
6412             break;
6413         default:
6414             abort();
6415         }
6416         z.low &= ~ roundBitsMask;
6417     }
6418     else {
6419         if ( aExp < 0x3FFF ) {
6420             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6421             status->float_exception_flags |= float_flag_inexact;
6422             aSign = extractFloat128Sign( a );
6423             switch (status->float_rounding_mode) {
6424              case float_round_nearest_even:
6425                 if (    ( aExp == 0x3FFE )
6426                      && (   extractFloat128Frac0( a )
6427                           | extractFloat128Frac1( a ) )
6428                    ) {
6429                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6430                 }
6431                 break;
6432             case float_round_ties_away:
6433                 if (aExp == 0x3FFE) {
6434                     return packFloat128(aSign, 0x3FFF, 0, 0);
6435                 }
6436                 break;
6437              case float_round_down:
6438                 return
6439                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6440                     : packFloat128( 0, 0, 0, 0 );
6441              case float_round_up:
6442                 return
6443                       aSign ? packFloat128( 1, 0, 0, 0 )
6444                     : packFloat128( 0, 0x3FFF, 0, 0 );
6445             }
6446             return packFloat128( aSign, 0, 0, 0 );
6447         }
6448         lastBitMask = 1;
6449         lastBitMask <<= 0x402F - aExp;
6450         roundBitsMask = lastBitMask - 1;
6451         z.low = 0;
6452         z.high = a.high;
6453         switch (status->float_rounding_mode) {
6454         case float_round_nearest_even:
6455             z.high += lastBitMask>>1;
6456             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6457                 z.high &= ~ lastBitMask;
6458             }
6459             break;
6460         case float_round_ties_away:
6461             z.high += lastBitMask>>1;
6462             break;
6463         case float_round_to_zero:
6464             break;
6465         case float_round_up:
6466             if (!extractFloat128Sign(z)) {
6467                 z.high |= ( a.low != 0 );
6468                 z.high += roundBitsMask;
6469             }
6470             break;
6471         case float_round_down:
6472             if (extractFloat128Sign(z)) {
6473                 z.high |= (a.low != 0);
6474                 z.high += roundBitsMask;
6475             }
6476             break;
6477         default:
6478             abort();
6479         }
6480         z.high &= ~ roundBitsMask;
6481     }
6482     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6483         status->float_exception_flags |= float_flag_inexact;
6484     }
6485     return z;
6486 
6487 }
6488 
6489 /*----------------------------------------------------------------------------
6490 | Returns the result of adding the absolute values of the quadruple-precision
6491 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6492 | before being returned.  `zSign' is ignored if the result is a NaN.
6493 | The addition is performed according to the IEC/IEEE Standard for Binary
6494 | Floating-Point Arithmetic.
6495 *----------------------------------------------------------------------------*/
6496 
6497 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6498                                 float_status *status)
6499 {
6500     int32_t aExp, bExp, zExp;
6501     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6502     int32_t expDiff;
6503 
6504     aSig1 = extractFloat128Frac1( a );
6505     aSig0 = extractFloat128Frac0( a );
6506     aExp = extractFloat128Exp( a );
6507     bSig1 = extractFloat128Frac1( b );
6508     bSig0 = extractFloat128Frac0( b );
6509     bExp = extractFloat128Exp( b );
6510     expDiff = aExp - bExp;
6511     if ( 0 < expDiff ) {
6512         if ( aExp == 0x7FFF ) {
6513             if (aSig0 | aSig1) {
6514                 return propagateFloat128NaN(a, b, status);
6515             }
6516             return a;
6517         }
6518         if ( bExp == 0 ) {
6519             --expDiff;
6520         }
6521         else {
6522             bSig0 |= LIT64( 0x0001000000000000 );
6523         }
6524         shift128ExtraRightJamming(
6525             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6526         zExp = aExp;
6527     }
6528     else if ( expDiff < 0 ) {
6529         if ( bExp == 0x7FFF ) {
6530             if (bSig0 | bSig1) {
6531                 return propagateFloat128NaN(a, b, status);
6532             }
6533             return packFloat128( zSign, 0x7FFF, 0, 0 );
6534         }
6535         if ( aExp == 0 ) {
6536             ++expDiff;
6537         }
6538         else {
6539             aSig0 |= LIT64( 0x0001000000000000 );
6540         }
6541         shift128ExtraRightJamming(
6542             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6543         zExp = bExp;
6544     }
6545     else {
6546         if ( aExp == 0x7FFF ) {
6547             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6548                 return propagateFloat128NaN(a, b, status);
6549             }
6550             return a;
6551         }
6552         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6553         if ( aExp == 0 ) {
6554             if (status->flush_to_zero) {
6555                 if (zSig0 | zSig1) {
6556                     float_raise(float_flag_output_denormal, status);
6557                 }
6558                 return packFloat128(zSign, 0, 0, 0);
6559             }
6560             return packFloat128( zSign, 0, zSig0, zSig1 );
6561         }
6562         zSig2 = 0;
6563         zSig0 |= LIT64( 0x0002000000000000 );
6564         zExp = aExp;
6565         goto shiftRight1;
6566     }
6567     aSig0 |= LIT64( 0x0001000000000000 );
6568     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6569     --zExp;
6570     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6571     ++zExp;
6572  shiftRight1:
6573     shift128ExtraRightJamming(
6574         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6575  roundAndPack:
6576     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6577 
6578 }
6579 
6580 /*----------------------------------------------------------------------------
6581 | Returns the result of subtracting the absolute values of the quadruple-
6582 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6583 | difference is negated before being returned.  `zSign' is ignored if the
6584 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6585 | Standard for Binary Floating-Point Arithmetic.
6586 *----------------------------------------------------------------------------*/
6587 
6588 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6589                                 float_status *status)
6590 {
6591     int32_t aExp, bExp, zExp;
6592     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6593     int32_t expDiff;
6594 
6595     aSig1 = extractFloat128Frac1( a );
6596     aSig0 = extractFloat128Frac0( a );
6597     aExp = extractFloat128Exp( a );
6598     bSig1 = extractFloat128Frac1( b );
6599     bSig0 = extractFloat128Frac0( b );
6600     bExp = extractFloat128Exp( b );
6601     expDiff = aExp - bExp;
6602     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6603     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6604     if ( 0 < expDiff ) goto aExpBigger;
6605     if ( expDiff < 0 ) goto bExpBigger;
6606     if ( aExp == 0x7FFF ) {
6607         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6608             return propagateFloat128NaN(a, b, status);
6609         }
6610         float_raise(float_flag_invalid, status);
6611         return float128_default_nan(status);
6612     }
6613     if ( aExp == 0 ) {
6614         aExp = 1;
6615         bExp = 1;
6616     }
6617     if ( bSig0 < aSig0 ) goto aBigger;
6618     if ( aSig0 < bSig0 ) goto bBigger;
6619     if ( bSig1 < aSig1 ) goto aBigger;
6620     if ( aSig1 < bSig1 ) goto bBigger;
6621     return packFloat128(status->float_rounding_mode == float_round_down,
6622                         0, 0, 0);
6623  bExpBigger:
6624     if ( bExp == 0x7FFF ) {
6625         if (bSig0 | bSig1) {
6626             return propagateFloat128NaN(a, b, status);
6627         }
6628         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6629     }
6630     if ( aExp == 0 ) {
6631         ++expDiff;
6632     }
6633     else {
6634         aSig0 |= LIT64( 0x4000000000000000 );
6635     }
6636     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6637     bSig0 |= LIT64( 0x4000000000000000 );
6638  bBigger:
6639     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6640     zExp = bExp;
6641     zSign ^= 1;
6642     goto normalizeRoundAndPack;
6643  aExpBigger:
6644     if ( aExp == 0x7FFF ) {
6645         if (aSig0 | aSig1) {
6646             return propagateFloat128NaN(a, b, status);
6647         }
6648         return a;
6649     }
6650     if ( bExp == 0 ) {
6651         --expDiff;
6652     }
6653     else {
6654         bSig0 |= LIT64( 0x4000000000000000 );
6655     }
6656     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6657     aSig0 |= LIT64( 0x4000000000000000 );
6658  aBigger:
6659     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6660     zExp = aExp;
6661  normalizeRoundAndPack:
6662     --zExp;
6663     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6664                                          status);
6665 
6666 }
6667 
6668 /*----------------------------------------------------------------------------
6669 | Returns the result of adding the quadruple-precision floating-point values
6670 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6671 | for Binary Floating-Point Arithmetic.
6672 *----------------------------------------------------------------------------*/
6673 
6674 float128 float128_add(float128 a, float128 b, float_status *status)
6675 {
6676     flag aSign, bSign;
6677 
6678     aSign = extractFloat128Sign( a );
6679     bSign = extractFloat128Sign( b );
6680     if ( aSign == bSign ) {
6681         return addFloat128Sigs(a, b, aSign, status);
6682     }
6683     else {
6684         return subFloat128Sigs(a, b, aSign, status);
6685     }
6686 
6687 }
6688 
6689 /*----------------------------------------------------------------------------
6690 | Returns the result of subtracting the quadruple-precision floating-point
6691 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6692 | Standard for Binary Floating-Point Arithmetic.
6693 *----------------------------------------------------------------------------*/
6694 
6695 float128 float128_sub(float128 a, float128 b, float_status *status)
6696 {
6697     flag aSign, bSign;
6698 
6699     aSign = extractFloat128Sign( a );
6700     bSign = extractFloat128Sign( b );
6701     if ( aSign == bSign ) {
6702         return subFloat128Sigs(a, b, aSign, status);
6703     }
6704     else {
6705         return addFloat128Sigs(a, b, aSign, status);
6706     }
6707 
6708 }
6709 
6710 /*----------------------------------------------------------------------------
6711 | Returns the result of multiplying the quadruple-precision floating-point
6712 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6713 | Standard for Binary Floating-Point Arithmetic.
6714 *----------------------------------------------------------------------------*/
6715 
6716 float128 float128_mul(float128 a, float128 b, float_status *status)
6717 {
6718     flag aSign, bSign, zSign;
6719     int32_t aExp, bExp, zExp;
6720     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6721 
6722     aSig1 = extractFloat128Frac1( a );
6723     aSig0 = extractFloat128Frac0( a );
6724     aExp = extractFloat128Exp( a );
6725     aSign = extractFloat128Sign( a );
6726     bSig1 = extractFloat128Frac1( b );
6727     bSig0 = extractFloat128Frac0( b );
6728     bExp = extractFloat128Exp( b );
6729     bSign = extractFloat128Sign( b );
6730     zSign = aSign ^ bSign;
6731     if ( aExp == 0x7FFF ) {
6732         if (    ( aSig0 | aSig1 )
6733              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6734             return propagateFloat128NaN(a, b, status);
6735         }
6736         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6737         return packFloat128( zSign, 0x7FFF, 0, 0 );
6738     }
6739     if ( bExp == 0x7FFF ) {
6740         if (bSig0 | bSig1) {
6741             return propagateFloat128NaN(a, b, status);
6742         }
6743         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6744  invalid:
6745             float_raise(float_flag_invalid, status);
6746             return float128_default_nan(status);
6747         }
6748         return packFloat128( zSign, 0x7FFF, 0, 0 );
6749     }
6750     if ( aExp == 0 ) {
6751         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6752         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6753     }
6754     if ( bExp == 0 ) {
6755         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6756         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6757     }
6758     zExp = aExp + bExp - 0x4000;
6759     aSig0 |= LIT64( 0x0001000000000000 );
6760     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6761     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6762     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6763     zSig2 |= ( zSig3 != 0 );
6764     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6765         shift128ExtraRightJamming(
6766             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6767         ++zExp;
6768     }
6769     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6770 
6771 }
6772 
6773 /*----------------------------------------------------------------------------
6774 | Returns the result of dividing the quadruple-precision floating-point value
6775 | `a' by the corresponding value `b'.  The operation is performed according to
6776 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6777 *----------------------------------------------------------------------------*/
6778 
6779 float128 float128_div(float128 a, float128 b, float_status *status)
6780 {
6781     flag aSign, bSign, zSign;
6782     int32_t aExp, bExp, zExp;
6783     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6784     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6785 
6786     aSig1 = extractFloat128Frac1( a );
6787     aSig0 = extractFloat128Frac0( a );
6788     aExp = extractFloat128Exp( a );
6789     aSign = extractFloat128Sign( a );
6790     bSig1 = extractFloat128Frac1( b );
6791     bSig0 = extractFloat128Frac0( b );
6792     bExp = extractFloat128Exp( b );
6793     bSign = extractFloat128Sign( b );
6794     zSign = aSign ^ bSign;
6795     if ( aExp == 0x7FFF ) {
6796         if (aSig0 | aSig1) {
6797             return propagateFloat128NaN(a, b, status);
6798         }
6799         if ( bExp == 0x7FFF ) {
6800             if (bSig0 | bSig1) {
6801                 return propagateFloat128NaN(a, b, status);
6802             }
6803             goto invalid;
6804         }
6805         return packFloat128( zSign, 0x7FFF, 0, 0 );
6806     }
6807     if ( bExp == 0x7FFF ) {
6808         if (bSig0 | bSig1) {
6809             return propagateFloat128NaN(a, b, status);
6810         }
6811         return packFloat128( zSign, 0, 0, 0 );
6812     }
6813     if ( bExp == 0 ) {
6814         if ( ( bSig0 | bSig1 ) == 0 ) {
6815             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6816  invalid:
6817                 float_raise(float_flag_invalid, status);
6818                 return float128_default_nan(status);
6819             }
6820             float_raise(float_flag_divbyzero, status);
6821             return packFloat128( zSign, 0x7FFF, 0, 0 );
6822         }
6823         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6824     }
6825     if ( aExp == 0 ) {
6826         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6827         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6828     }
6829     zExp = aExp - bExp + 0x3FFD;
6830     shortShift128Left(
6831         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6832     shortShift128Left(
6833         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6834     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6835         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6836         ++zExp;
6837     }
6838     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6839     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6840     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6841     while ( (int64_t) rem0 < 0 ) {
6842         --zSig0;
6843         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6844     }
6845     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6846     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6847         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6848         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6849         while ( (int64_t) rem1 < 0 ) {
6850             --zSig1;
6851             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6852         }
6853         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6854     }
6855     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6856     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6857 
6858 }
6859 
6860 /*----------------------------------------------------------------------------
6861 | Returns the remainder of the quadruple-precision floating-point value `a'
6862 | with respect to the corresponding value `b'.  The operation is performed
6863 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6864 *----------------------------------------------------------------------------*/
6865 
6866 float128 float128_rem(float128 a, float128 b, float_status *status)
6867 {
6868     flag aSign, zSign;
6869     int32_t aExp, bExp, expDiff;
6870     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6871     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6872     int64_t sigMean0;
6873 
6874     aSig1 = extractFloat128Frac1( a );
6875     aSig0 = extractFloat128Frac0( a );
6876     aExp = extractFloat128Exp( a );
6877     aSign = extractFloat128Sign( a );
6878     bSig1 = extractFloat128Frac1( b );
6879     bSig0 = extractFloat128Frac0( b );
6880     bExp = extractFloat128Exp( b );
6881     if ( aExp == 0x7FFF ) {
6882         if (    ( aSig0 | aSig1 )
6883              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6884             return propagateFloat128NaN(a, b, status);
6885         }
6886         goto invalid;
6887     }
6888     if ( bExp == 0x7FFF ) {
6889         if (bSig0 | bSig1) {
6890             return propagateFloat128NaN(a, b, status);
6891         }
6892         return a;
6893     }
6894     if ( bExp == 0 ) {
6895         if ( ( bSig0 | bSig1 ) == 0 ) {
6896  invalid:
6897             float_raise(float_flag_invalid, status);
6898             return float128_default_nan(status);
6899         }
6900         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6901     }
6902     if ( aExp == 0 ) {
6903         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6904         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6905     }
6906     expDiff = aExp - bExp;
6907     if ( expDiff < -1 ) return a;
6908     shortShift128Left(
6909         aSig0 | LIT64( 0x0001000000000000 ),
6910         aSig1,
6911         15 - ( expDiff < 0 ),
6912         &aSig0,
6913         &aSig1
6914     );
6915     shortShift128Left(
6916         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6917     q = le128( bSig0, bSig1, aSig0, aSig1 );
6918     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6919     expDiff -= 64;
6920     while ( 0 < expDiff ) {
6921         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6922         q = ( 4 < q ) ? q - 4 : 0;
6923         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6924         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6925         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6926         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6927         expDiff -= 61;
6928     }
6929     if ( -64 < expDiff ) {
6930         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6931         q = ( 4 < q ) ? q - 4 : 0;
6932         q >>= - expDiff;
6933         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6934         expDiff += 52;
6935         if ( expDiff < 0 ) {
6936             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6937         }
6938         else {
6939             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6940         }
6941         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6942         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6943     }
6944     else {
6945         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6946         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6947     }
6948     do {
6949         alternateASig0 = aSig0;
6950         alternateASig1 = aSig1;
6951         ++q;
6952         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6953     } while ( 0 <= (int64_t) aSig0 );
6954     add128(
6955         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6956     if (    ( sigMean0 < 0 )
6957          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6958         aSig0 = alternateASig0;
6959         aSig1 = alternateASig1;
6960     }
6961     zSign = ( (int64_t) aSig0 < 0 );
6962     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6963     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6964                                          status);
6965 }
6966 
6967 /*----------------------------------------------------------------------------
6968 | Returns the square root of the quadruple-precision floating-point value `a'.
6969 | The operation is performed according to the IEC/IEEE Standard for Binary
6970 | Floating-Point Arithmetic.
6971 *----------------------------------------------------------------------------*/
6972 
6973 float128 float128_sqrt(float128 a, float_status *status)
6974 {
6975     flag aSign;
6976     int32_t aExp, zExp;
6977     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6978     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6979 
6980     aSig1 = extractFloat128Frac1( a );
6981     aSig0 = extractFloat128Frac0( a );
6982     aExp = extractFloat128Exp( a );
6983     aSign = extractFloat128Sign( a );
6984     if ( aExp == 0x7FFF ) {
6985         if (aSig0 | aSig1) {
6986             return propagateFloat128NaN(a, a, status);
6987         }
6988         if ( ! aSign ) return a;
6989         goto invalid;
6990     }
6991     if ( aSign ) {
6992         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6993  invalid:
6994         float_raise(float_flag_invalid, status);
6995         return float128_default_nan(status);
6996     }
6997     if ( aExp == 0 ) {
6998         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6999         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7000     }
7001     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7002     aSig0 |= LIT64( 0x0001000000000000 );
7003     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7004     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7005     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7006     doubleZSig0 = zSig0<<1;
7007     mul64To128( zSig0, zSig0, &term0, &term1 );
7008     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7009     while ( (int64_t) rem0 < 0 ) {
7010         --zSig0;
7011         doubleZSig0 -= 2;
7012         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7013     }
7014     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7015     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7016         if ( zSig1 == 0 ) zSig1 = 1;
7017         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7018         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7019         mul64To128( zSig1, zSig1, &term2, &term3 );
7020         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7021         while ( (int64_t) rem1 < 0 ) {
7022             --zSig1;
7023             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7024             term3 |= 1;
7025             term2 |= doubleZSig0;
7026             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7027         }
7028         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7029     }
7030     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7031     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7032 
7033 }
7034 
7035 /*----------------------------------------------------------------------------
7036 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7037 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7038 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7039 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7040 *----------------------------------------------------------------------------*/
7041 
7042 int float128_eq(float128 a, float128 b, float_status *status)
7043 {
7044 
7045     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7046               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7047          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7048               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7049        ) {
7050         float_raise(float_flag_invalid, status);
7051         return 0;
7052     }
7053     return
7054            ( a.low == b.low )
7055         && (    ( a.high == b.high )
7056              || (    ( a.low == 0 )
7057                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7058            );
7059 
7060 }
7061 
7062 /*----------------------------------------------------------------------------
7063 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7064 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7065 | exception is raised if either operand is a NaN.  The comparison is performed
7066 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7067 *----------------------------------------------------------------------------*/
7068 
7069 int float128_le(float128 a, float128 b, float_status *status)
7070 {
7071     flag aSign, bSign;
7072 
7073     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7074               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7075          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7076               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7077        ) {
7078         float_raise(float_flag_invalid, status);
7079         return 0;
7080     }
7081     aSign = extractFloat128Sign( a );
7082     bSign = extractFloat128Sign( b );
7083     if ( aSign != bSign ) {
7084         return
7085                aSign
7086             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7087                  == 0 );
7088     }
7089     return
7090           aSign ? le128( b.high, b.low, a.high, a.low )
7091         : le128( a.high, a.low, b.high, b.low );
7092 
7093 }
7094 
7095 /*----------------------------------------------------------------------------
7096 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7097 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7098 | raised if either operand is a NaN.  The comparison is performed according
7099 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7100 *----------------------------------------------------------------------------*/
7101 
7102 int float128_lt(float128 a, float128 b, float_status *status)
7103 {
7104     flag aSign, bSign;
7105 
7106     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7107               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7108          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7109               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7110        ) {
7111         float_raise(float_flag_invalid, status);
7112         return 0;
7113     }
7114     aSign = extractFloat128Sign( a );
7115     bSign = extractFloat128Sign( b );
7116     if ( aSign != bSign ) {
7117         return
7118                aSign
7119             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7120                  != 0 );
7121     }
7122     return
7123           aSign ? lt128( b.high, b.low, a.high, a.low )
7124         : lt128( a.high, a.low, b.high, b.low );
7125 
7126 }
7127 
7128 /*----------------------------------------------------------------------------
7129 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7130 | be compared, and 0 otherwise.  The invalid exception is raised if either
7131 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7132 | Standard for Binary Floating-Point Arithmetic.
7133 *----------------------------------------------------------------------------*/
7134 
7135 int float128_unordered(float128 a, float128 b, float_status *status)
7136 {
7137     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7138               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7139          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7140               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7141        ) {
7142         float_raise(float_flag_invalid, status);
7143         return 1;
7144     }
7145     return 0;
7146 }
7147 
7148 /*----------------------------------------------------------------------------
7149 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7150 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7151 | exception.  The comparison is performed according to the IEC/IEEE Standard
7152 | for Binary Floating-Point Arithmetic.
7153 *----------------------------------------------------------------------------*/
7154 
7155 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7156 {
7157 
7158     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7159               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7160          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7161               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7162        ) {
7163         if (float128_is_signaling_nan(a, status)
7164          || float128_is_signaling_nan(b, status)) {
7165             float_raise(float_flag_invalid, status);
7166         }
7167         return 0;
7168     }
7169     return
7170            ( a.low == b.low )
7171         && (    ( a.high == b.high )
7172              || (    ( a.low == 0 )
7173                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7174            );
7175 
7176 }
7177 
7178 /*----------------------------------------------------------------------------
7179 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7180 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7181 | cause an exception.  Otherwise, the comparison is performed according to the
7182 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7183 *----------------------------------------------------------------------------*/
7184 
7185 int float128_le_quiet(float128 a, float128 b, float_status *status)
7186 {
7187     flag aSign, bSign;
7188 
7189     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7190               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7191          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7192               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7193        ) {
7194         if (float128_is_signaling_nan(a, status)
7195          || float128_is_signaling_nan(b, status)) {
7196             float_raise(float_flag_invalid, status);
7197         }
7198         return 0;
7199     }
7200     aSign = extractFloat128Sign( a );
7201     bSign = extractFloat128Sign( b );
7202     if ( aSign != bSign ) {
7203         return
7204                aSign
7205             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7206                  == 0 );
7207     }
7208     return
7209           aSign ? le128( b.high, b.low, a.high, a.low )
7210         : le128( a.high, a.low, b.high, b.low );
7211 
7212 }
7213 
7214 /*----------------------------------------------------------------------------
7215 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7216 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7217 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7218 | Standard for Binary Floating-Point Arithmetic.
7219 *----------------------------------------------------------------------------*/
7220 
7221 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7222 {
7223     flag aSign, bSign;
7224 
7225     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7226               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7227          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7228               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7229        ) {
7230         if (float128_is_signaling_nan(a, status)
7231          || float128_is_signaling_nan(b, status)) {
7232             float_raise(float_flag_invalid, status);
7233         }
7234         return 0;
7235     }
7236     aSign = extractFloat128Sign( a );
7237     bSign = extractFloat128Sign( b );
7238     if ( aSign != bSign ) {
7239         return
7240                aSign
7241             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7242                  != 0 );
7243     }
7244     return
7245           aSign ? lt128( b.high, b.low, a.high, a.low )
7246         : lt128( a.high, a.low, b.high, b.low );
7247 
7248 }
7249 
7250 /*----------------------------------------------------------------------------
7251 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7252 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7253 | comparison is performed according to the IEC/IEEE Standard for Binary
7254 | Floating-Point Arithmetic.
7255 *----------------------------------------------------------------------------*/
7256 
7257 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7258 {
7259     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7260               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7261          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7262               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7263        ) {
7264         if (float128_is_signaling_nan(a, status)
7265          || float128_is_signaling_nan(b, status)) {
7266             float_raise(float_flag_invalid, status);
7267         }
7268         return 1;
7269     }
7270     return 0;
7271 }
7272 
7273 /* misc functions */
7274 float32 uint32_to_float32(uint32_t a, float_status *status)
7275 {
7276     return int64_to_float32(a, status);
7277 }
7278 
7279 float64 uint32_to_float64(uint32_t a, float_status *status)
7280 {
7281     return int64_to_float64(a, status);
7282 }
7283 
7284 uint32_t float32_to_uint32(float32 a, float_status *status)
7285 {
7286     int64_t v;
7287     uint32_t res;
7288     int old_exc_flags = get_float_exception_flags(status);
7289 
7290     v = float32_to_int64(a, status);
7291     if (v < 0) {
7292         res = 0;
7293     } else if (v > 0xffffffff) {
7294         res = 0xffffffff;
7295     } else {
7296         return v;
7297     }
7298     set_float_exception_flags(old_exc_flags, status);
7299     float_raise(float_flag_invalid, status);
7300     return res;
7301 }
7302 
7303 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7304 {
7305     int64_t v;
7306     uint32_t res;
7307     int old_exc_flags = get_float_exception_flags(status);
7308 
7309     v = float32_to_int64_round_to_zero(a, status);
7310     if (v < 0) {
7311         res = 0;
7312     } else if (v > 0xffffffff) {
7313         res = 0xffffffff;
7314     } else {
7315         return v;
7316     }
7317     set_float_exception_flags(old_exc_flags, status);
7318     float_raise(float_flag_invalid, status);
7319     return res;
7320 }
7321 
7322 int16_t float32_to_int16(float32 a, float_status *status)
7323 {
7324     int32_t v;
7325     int16_t res;
7326     int old_exc_flags = get_float_exception_flags(status);
7327 
7328     v = float32_to_int32(a, status);
7329     if (v < -0x8000) {
7330         res = -0x8000;
7331     } else if (v > 0x7fff) {
7332         res = 0x7fff;
7333     } else {
7334         return v;
7335     }
7336 
7337     set_float_exception_flags(old_exc_flags, status);
7338     float_raise(float_flag_invalid, status);
7339     return res;
7340 }
7341 
7342 uint16_t float32_to_uint16(float32 a, float_status *status)
7343 {
7344     int32_t v;
7345     uint16_t res;
7346     int old_exc_flags = get_float_exception_flags(status);
7347 
7348     v = float32_to_int32(a, status);
7349     if (v < 0) {
7350         res = 0;
7351     } else if (v > 0xffff) {
7352         res = 0xffff;
7353     } else {
7354         return v;
7355     }
7356 
7357     set_float_exception_flags(old_exc_flags, status);
7358     float_raise(float_flag_invalid, status);
7359     return res;
7360 }
7361 
7362 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7363 {
7364     int64_t v;
7365     uint16_t res;
7366     int old_exc_flags = get_float_exception_flags(status);
7367 
7368     v = float32_to_int64_round_to_zero(a, status);
7369     if (v < 0) {
7370         res = 0;
7371     } else if (v > 0xffff) {
7372         res = 0xffff;
7373     } else {
7374         return v;
7375     }
7376     set_float_exception_flags(old_exc_flags, status);
7377     float_raise(float_flag_invalid, status);
7378     return res;
7379 }
7380 
7381 uint32_t float64_to_uint32(float64 a, float_status *status)
7382 {
7383     uint64_t v;
7384     uint32_t res;
7385     int old_exc_flags = get_float_exception_flags(status);
7386 
7387     v = float64_to_uint64(a, status);
7388     if (v > 0xffffffff) {
7389         res = 0xffffffff;
7390     } else {
7391         return v;
7392     }
7393     set_float_exception_flags(old_exc_flags, status);
7394     float_raise(float_flag_invalid, status);
7395     return res;
7396 }
7397 
7398 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7399 {
7400     uint64_t v;
7401     uint32_t res;
7402     int old_exc_flags = get_float_exception_flags(status);
7403 
7404     v = float64_to_uint64_round_to_zero(a, status);
7405     if (v > 0xffffffff) {
7406         res = 0xffffffff;
7407     } else {
7408         return v;
7409     }
7410     set_float_exception_flags(old_exc_flags, status);
7411     float_raise(float_flag_invalid, status);
7412     return res;
7413 }
7414 
7415 int16_t float64_to_int16(float64 a, float_status *status)
7416 {
7417     int64_t v;
7418     int16_t res;
7419     int old_exc_flags = get_float_exception_flags(status);
7420 
7421     v = float64_to_int32(a, status);
7422     if (v < -0x8000) {
7423         res = -0x8000;
7424     } else if (v > 0x7fff) {
7425         res = 0x7fff;
7426     } else {
7427         return v;
7428     }
7429 
7430     set_float_exception_flags(old_exc_flags, status);
7431     float_raise(float_flag_invalid, status);
7432     return res;
7433 }
7434 
7435 uint16_t float64_to_uint16(float64 a, float_status *status)
7436 {
7437     int64_t v;
7438     uint16_t res;
7439     int old_exc_flags = get_float_exception_flags(status);
7440 
7441     v = float64_to_int32(a, status);
7442     if (v < 0) {
7443         res = 0;
7444     } else if (v > 0xffff) {
7445         res = 0xffff;
7446     } else {
7447         return v;
7448     }
7449 
7450     set_float_exception_flags(old_exc_flags, status);
7451     float_raise(float_flag_invalid, status);
7452     return res;
7453 }
7454 
7455 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7456 {
7457     int64_t v;
7458     uint16_t res;
7459     int old_exc_flags = get_float_exception_flags(status);
7460 
7461     v = float64_to_int64_round_to_zero(a, status);
7462     if (v < 0) {
7463         res = 0;
7464     } else if (v > 0xffff) {
7465         res = 0xffff;
7466     } else {
7467         return v;
7468     }
7469     set_float_exception_flags(old_exc_flags, status);
7470     float_raise(float_flag_invalid, status);
7471     return res;
7472 }
7473 
7474 /*----------------------------------------------------------------------------
7475 | Returns the result of converting the double-precision floating-point value
7476 | `a' to the 64-bit unsigned integer format.  The conversion is
7477 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7478 | Arithmetic---which means in particular that the conversion is rounded
7479 | according to the current rounding mode.  If `a' is a NaN, the largest
7480 | positive integer is returned.  If the conversion overflows, the
7481 | largest unsigned integer is returned.  If 'a' is negative, the value is
7482 | rounded and zero is returned; negative values that do not round to zero
7483 | will raise the inexact exception.
7484 *----------------------------------------------------------------------------*/
7485 
7486 uint64_t float64_to_uint64(float64 a, float_status *status)
7487 {
7488     flag aSign;
7489     int aExp;
7490     int shiftCount;
7491     uint64_t aSig, aSigExtra;
7492     a = float64_squash_input_denormal(a, status);
7493 
7494     aSig = extractFloat64Frac(a);
7495     aExp = extractFloat64Exp(a);
7496     aSign = extractFloat64Sign(a);
7497     if (aSign && (aExp > 1022)) {
7498         float_raise(float_flag_invalid, status);
7499         if (float64_is_any_nan(a)) {
7500             return LIT64(0xFFFFFFFFFFFFFFFF);
7501         } else {
7502             return 0;
7503         }
7504     }
7505     if (aExp) {
7506         aSig |= LIT64(0x0010000000000000);
7507     }
7508     shiftCount = 0x433 - aExp;
7509     if (shiftCount <= 0) {
7510         if (0x43E < aExp) {
7511             float_raise(float_flag_invalid, status);
7512             return LIT64(0xFFFFFFFFFFFFFFFF);
7513         }
7514         aSigExtra = 0;
7515         aSig <<= -shiftCount;
7516     } else {
7517         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7518     }
7519     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7520 }
7521 
7522 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7523 {
7524     signed char current_rounding_mode = status->float_rounding_mode;
7525     set_float_rounding_mode(float_round_to_zero, status);
7526     uint64_t v = float64_to_uint64(a, status);
7527     set_float_rounding_mode(current_rounding_mode, status);
7528     return v;
7529 }
7530 
/*
 * COMPARE(s, nan_exp) generates the comparison functions for the s-bit
 * format, where nan_exp is the exponent field value used by NaNs:
 *
 *   float<s>_compare_internal(a, b, is_quiet, status)
 *   float<s>_compare(a, b, status)        -- is_quiet = 0
 *   float<s>_compare_quiet(a, b, status)  -- is_quiet = 1
 *
 * Both inputs first go through float<s>_squash_input_denormal().  If
 * either operand has exponent nan_exp and a non-zero fraction the result
 * is float_relation_unordered; invalid is raised unless is_quiet is set
 * and neither operand is a signaling NaN.
 *
 * Otherwise the raw bit patterns are compared.  Operands of different
 * sign are equal only when both are zero (all bits below the sign bit
 * clear); operands of the same sign are equal when bit-identical.  For
 * unequal operands the result is computed arithmetically as -1 when a is
 * below b and +1 when a is above b (this assumes the sign extractor
 * yields 0 or 1).
 */
#define COMPARE(s, nan_exp)                                                  \
static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
                                      int is_quiet, float_status *status)    \
{                                                                            \
    flag aSign, bSign;                                                       \
    uint ## s ## _t av, bv;                                                  \
    a = float ## s ## _squash_input_denormal(a, status);                     \
    b = float ## s ## _squash_input_denormal(b, status);                     \
                                                                             \
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
         extractFloat ## s ## Frac( a ) ) ||                                 \
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
          extractFloat ## s ## Frac( b ) )) {                                \
        if (!is_quiet ||                                                     \
            float ## s ## _is_signaling_nan(a, status) ||                  \
            float ## s ## _is_signaling_nan(b, status)) {                 \
            float_raise(float_flag_invalid, status);                         \
        }                                                                    \
        return float_relation_unordered;                                     \
    }                                                                        \
    aSign = extractFloat ## s ## Sign( a );                                  \
    bSign = extractFloat ## s ## Sign( b );                                  \
    av = float ## s ## _val(a);                                              \
    bv = float ## s ## _val(b);                                              \
    if ( aSign != bSign ) {                                                  \
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
            /* zero case */                                                  \
            return float_relation_equal;                                     \
        } else {                                                             \
            return 1 - (2 * aSign);                                          \
        }                                                                    \
    } else {                                                                 \
        if (av == bv) {                                                      \
            return float_relation_equal;                                     \
        } else {                                                             \
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
        }                                                                    \
    }                                                                        \
}                                                                            \
                                                                             \
int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
{                                                                            \
    return float ## s ## _compare_internal(a, b, 0, status);                 \
}                                                                            \
                                                                             \
int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
                                 float_status *status)                       \
{                                                                            \
    return float ## s ## _compare_internal(a, b, 1, status);                 \
}

/* Instantiate for single (NaN exponent 0xff) and double (0x7ff). */
COMPARE(32, 0xff)
COMPARE(64, 0x7ff)
7584 
7585 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7586                                             int is_quiet, float_status *status)
7587 {
7588     flag aSign, bSign;
7589 
7590     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7591         float_raise(float_flag_invalid, status);
7592         return float_relation_unordered;
7593     }
7594     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7595           ( extractFloatx80Frac( a )<<1 ) ) ||
7596         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7597           ( extractFloatx80Frac( b )<<1 ) )) {
7598         if (!is_quiet ||
7599             floatx80_is_signaling_nan(a, status) ||
7600             floatx80_is_signaling_nan(b, status)) {
7601             float_raise(float_flag_invalid, status);
7602         }
7603         return float_relation_unordered;
7604     }
7605     aSign = extractFloatx80Sign( a );
7606     bSign = extractFloatx80Sign( b );
7607     if ( aSign != bSign ) {
7608 
7609         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7610              ( ( a.low | b.low ) == 0 ) ) {
7611             /* zero case */
7612             return float_relation_equal;
7613         } else {
7614             return 1 - (2 * aSign);
7615         }
7616     } else {
7617         if (a.low == b.low && a.high == b.high) {
7618             return float_relation_equal;
7619         } else {
7620             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7621         }
7622     }
7623 }
7624 
7625 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7626 {
7627     return floatx80_compare_internal(a, b, 0, status);
7628 }
7629 
7630 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7631 {
7632     return floatx80_compare_internal(a, b, 1, status);
7633 }
7634 
7635 static inline int float128_compare_internal(float128 a, float128 b,
7636                                             int is_quiet, float_status *status)
7637 {
7638     flag aSign, bSign;
7639 
7640     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7641           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7642         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7643           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7644         if (!is_quiet ||
7645             float128_is_signaling_nan(a, status) ||
7646             float128_is_signaling_nan(b, status)) {
7647             float_raise(float_flag_invalid, status);
7648         }
7649         return float_relation_unordered;
7650     }
7651     aSign = extractFloat128Sign( a );
7652     bSign = extractFloat128Sign( b );
7653     if ( aSign != bSign ) {
7654         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7655             /* zero case */
7656             return float_relation_equal;
7657         } else {
7658             return 1 - (2 * aSign);
7659         }
7660     } else {
7661         if (a.low == b.low && a.high == b.high) {
7662             return float_relation_equal;
7663         } else {
7664             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7665         }
7666     }
7667 }
7668 
7669 int float128_compare(float128 a, float128 b, float_status *status)
7670 {
7671     return float128_compare_internal(a, b, 0, status);
7672 }
7673 
7674 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7675 {
7676     return float128_compare_internal(a, b, 1, status);
7677 }
7678 
7679 /* min() and max() functions. These can't be implemented as
7680  * 'compare and pick one input' because that would mishandle
7681  * NaNs and +0 vs -0.
7682  *
7683  * minnum() and maxnum() functions. These are similar to the min()
7684  * and max() functions but if one of the arguments is a QNaN and
7685  * the other is numerical then the numerical argument is returned.
7686  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7687  * and maxNum() operations. min() and max() are the typical min/max
7688  * semantics provided by many CPUs which predate that specification.
7689  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
7692  */
/*
 * MINMAX(s) generates float<s>_minmax() and its six public wrappers for
 * the s-bit format.  The worker takes three selectors:
 *
 *   ismin  - non-zero to return the smaller operand, zero for the larger
 *   isieee - non-zero to return the other operand when exactly one input
 *            is a quiet NaN and the other is not a NaN at all
 *   ismag  - non-zero to compare absolute values first; when they are
 *            equal the ordinary signed comparison below decides
 *
 * Both inputs first go through float<s>_squash_input_denormal().  Any NaN
 * input not covered by the isieee rule is handed to
 * propagateFloat<s>NaN().  For operands of different sign the negative
 * one is the minimum; for operands of the same sign the raw bit patterns
 * are compared, with the order flipped for negative values.
 *
 * Wrapper arguments (ismin, isieee, ismag):
 *   _min (1,0,0)  _max (0,0,0)  _minnum (1,1,0)  _maxnum (0,1,0)
 *   _minnummag (1,1,1)  _maxnummag (0,1,1)
 */
#define MINMAX(s)                                                       \
static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
                                               int ismin, int isieee,   \
                                               int ismag,               \
                                               float_status *status)    \
{                                                                       \
    flag aSign, bSign;                                                  \
    uint ## s ## _t av, bv, aav, abv;                                   \
    a = float ## s ## _squash_input_denormal(a, status);                \
    b = float ## s ## _squash_input_denormal(b, status);                \
    if (float ## s ## _is_any_nan(a) ||                                 \
        float ## s ## _is_any_nan(b)) {                                 \
        if (isieee) {                                                   \
            if (float ## s ## _is_quiet_nan(a, status) &&               \
                !float ## s ##_is_any_nan(b)) {                         \
                return b;                                               \
            } else if (float ## s ## _is_quiet_nan(b, status) &&        \
                       !float ## s ## _is_any_nan(a)) {                \
                return a;                                               \
            }                                                           \
        }                                                               \
        return propagateFloat ## s ## NaN(a, b, status);                \
    }                                                                   \
    aSign = extractFloat ## s ## Sign(a);                               \
    bSign = extractFloat ## s ## Sign(b);                               \
    av = float ## s ## _val(a);                                         \
    bv = float ## s ## _val(b);                                         \
    if (ismag) {                                                        \
        aav = float ## s ## _abs(av);                                   \
        abv = float ## s ## _abs(bv);                                   \
        if (aav != abv) {                                               \
            if (ismin) {                                                \
                return (aav < abv) ? a : b;                             \
            } else {                                                    \
                return (aav < abv) ? b : a;                             \
            }                                                           \
        }                                                               \
    }                                                                   \
    if (aSign != bSign) {                                               \
        if (ismin) {                                                    \
            return aSign ? a : b;                                       \
        } else {                                                        \
            return aSign ? b : a;                                       \
        }                                                               \
    } else {                                                            \
        if (ismin) {                                                    \
            return (aSign ^ (av < bv)) ? a : b;                         \
        } else {                                                        \
            return (aSign ^ (av < bv)) ? b : a;                         \
        }                                                               \
    }                                                                   \
}                                                                       \
                                                                        \
float ## s float ## s ## _min(float ## s a, float ## s b,               \
                              float_status *status)                     \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _max(float ## s a, float ## s b,               \
                              float_status *status)                     \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
                                 float_status *status)                  \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
                                 float_status *status)                  \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
                                    float_status *status)               \
{                                                                       \
    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
}                                                                       \
                                                                        \
float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
                                    float_status *status)               \
{                                                                       \
    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
}

/* Instantiate the min/max family for single and double precision. */
MINMAX(32)
MINMAX(64)
7784 
7785 
7786 /* Multiply A by 2 raised to the power N.  */
7787 float32 float32_scalbn(float32 a, int n, float_status *status)
7788 {
7789     flag aSign;
7790     int16_t aExp;
7791     uint32_t aSig;
7792 
7793     a = float32_squash_input_denormal(a, status);
7794     aSig = extractFloat32Frac( a );
7795     aExp = extractFloat32Exp( a );
7796     aSign = extractFloat32Sign( a );
7797 
7798     if ( aExp == 0xFF ) {
7799         if ( aSig ) {
7800             return propagateFloat32NaN(a, a, status);
7801         }
7802         return a;
7803     }
7804     if (aExp != 0) {
7805         aSig |= 0x00800000;
7806     } else if (aSig == 0) {
7807         return a;
7808     } else {
7809         aExp++;
7810     }
7811 
7812     if (n > 0x200) {
7813         n = 0x200;
7814     } else if (n < -0x200) {
7815         n = -0x200;
7816     }
7817 
7818     aExp += n - 1;
7819     aSig <<= 7;
7820     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7821 }
7822 
7823 float64 float64_scalbn(float64 a, int n, float_status *status)
7824 {
7825     flag aSign;
7826     int16_t aExp;
7827     uint64_t aSig;
7828 
7829     a = float64_squash_input_denormal(a, status);
7830     aSig = extractFloat64Frac( a );
7831     aExp = extractFloat64Exp( a );
7832     aSign = extractFloat64Sign( a );
7833 
7834     if ( aExp == 0x7FF ) {
7835         if ( aSig ) {
7836             return propagateFloat64NaN(a, a, status);
7837         }
7838         return a;
7839     }
7840     if (aExp != 0) {
7841         aSig |= LIT64( 0x0010000000000000 );
7842     } else if (aSig == 0) {
7843         return a;
7844     } else {
7845         aExp++;
7846     }
7847 
7848     if (n > 0x1000) {
7849         n = 0x1000;
7850     } else if (n < -0x1000) {
7851         n = -0x1000;
7852     }
7853 
7854     aExp += n - 1;
7855     aSig <<= 10;
7856     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7857 }
7858 
7859 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7860 {
7861     flag aSign;
7862     int32_t aExp;
7863     uint64_t aSig;
7864 
7865     if (floatx80_invalid_encoding(a)) {
7866         float_raise(float_flag_invalid, status);
7867         return floatx80_default_nan(status);
7868     }
7869     aSig = extractFloatx80Frac( a );
7870     aExp = extractFloatx80Exp( a );
7871     aSign = extractFloatx80Sign( a );
7872 
7873     if ( aExp == 0x7FFF ) {
7874         if ( aSig<<1 ) {
7875             return propagateFloatx80NaN(a, a, status);
7876         }
7877         return a;
7878     }
7879 
7880     if (aExp == 0) {
7881         if (aSig == 0) {
7882             return a;
7883         }
7884         aExp++;
7885     }
7886 
7887     if (n > 0x10000) {
7888         n = 0x10000;
7889     } else if (n < -0x10000) {
7890         n = -0x10000;
7891     }
7892 
7893     aExp += n;
7894     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7895                                          aSign, aExp, aSig, 0, status);
7896 }
7897 
7898 float128 float128_scalbn(float128 a, int n, float_status *status)
7899 {
7900     flag aSign;
7901     int32_t aExp;
7902     uint64_t aSig0, aSig1;
7903 
7904     aSig1 = extractFloat128Frac1( a );
7905     aSig0 = extractFloat128Frac0( a );
7906     aExp = extractFloat128Exp( a );
7907     aSign = extractFloat128Sign( a );
7908     if ( aExp == 0x7FFF ) {
7909         if ( aSig0 | aSig1 ) {
7910             return propagateFloat128NaN(a, a, status);
7911         }
7912         return a;
7913     }
7914     if (aExp != 0) {
7915         aSig0 |= LIT64( 0x0001000000000000 );
7916     } else if (aSig0 == 0 && aSig1 == 0) {
7917         return a;
7918     } else {
7919         aExp++;
7920     }
7921 
7922     if (n > 0x10000) {
7923         n = 0x10000;
7924     } else if (n < -0x10000) {
7925         n = -0x10000;
7926     }
7927 
7928     aExp += n - 1;
7929     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7930                                          , status);
7931 
7932 }
7933