xref: /qemu/fpu/softfloat.c (revision d97544c94a37371347402bcbee19dd3748d70e48)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
/*----------------------------------------------------------------------------
| Returns the sign bit of the half-precision floating-point value `a'.
*----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*----------------------------------------------------------------------------
190 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
191 | and 7, and returns the properly rounded 32-bit integer corresponding to the
192 | input.  If `zSign' is 1, the input is negated before being converted to an
193 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
194 | is simply rounded to an integer, with the inexact exception raised if the
195 | input cannot be represented exactly as an integer.  However, if the fixed-
196 | point input is too large, the invalid exception is raised and the largest
197 | positive or negative integer is returned.
198 *----------------------------------------------------------------------------*/
199 
200 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
201 {
202     int8_t roundingMode;
203     flag roundNearestEven;
204     int8_t roundIncrement, roundBits;
205     int32_t z;
206 
207     roundingMode = status->float_rounding_mode;
208     roundNearestEven = ( roundingMode == float_round_nearest_even );
209     switch (roundingMode) {
210     case float_round_nearest_even:
211     case float_round_ties_away:
212         roundIncrement = 0x40;
213         break;
214     case float_round_to_zero:
215         roundIncrement = 0;
216         break;
217     case float_round_up:
218         roundIncrement = zSign ? 0 : 0x7f;
219         break;
220     case float_round_down:
221         roundIncrement = zSign ? 0x7f : 0;
222         break;
223     default:
224         abort();
225     }
226     roundBits = absZ & 0x7F;
227     absZ = ( absZ + roundIncrement )>>7;
228     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
229     z = absZ;
230     if ( zSign ) z = - z;
231     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
232         float_raise(float_flag_invalid, status);
233         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
234     }
235     if (roundBits) {
236         status->float_exception_flags |= float_flag_inexact;
237     }
238     return z;
239 
240 }
241 
242 /*----------------------------------------------------------------------------
243 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
244 | `absZ1', with binary point between bits 63 and 64 (between the input words),
245 | and returns the properly rounded 64-bit integer corresponding to the input.
246 | If `zSign' is 1, the input is negated before being converted to an integer.
247 | Ordinarily, the fixed-point input is simply rounded to an integer, with
248 | the inexact exception raised if the input cannot be represented exactly as
249 | an integer.  However, if the fixed-point input is too large, the invalid
250 | exception is raised and the largest positive or negative integer is
251 | returned.
252 *----------------------------------------------------------------------------*/
253 
254 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
255                                float_status *status)
256 {
257     int8_t roundingMode;
258     flag roundNearestEven, increment;
259     int64_t z;
260 
261     roundingMode = status->float_rounding_mode;
262     roundNearestEven = ( roundingMode == float_round_nearest_even );
263     switch (roundingMode) {
264     case float_round_nearest_even:
265     case float_round_ties_away:
266         increment = ((int64_t) absZ1 < 0);
267         break;
268     case float_round_to_zero:
269         increment = 0;
270         break;
271     case float_round_up:
272         increment = !zSign && absZ1;
273         break;
274     case float_round_down:
275         increment = zSign && absZ1;
276         break;
277     default:
278         abort();
279     }
280     if ( increment ) {
281         ++absZ0;
282         if ( absZ0 == 0 ) goto overflow;
283         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
284     }
285     z = absZ0;
286     if ( zSign ) z = - z;
287     if ( z && ( ( z < 0 ) ^ zSign ) ) {
288  overflow:
289         float_raise(float_flag_invalid, status);
290         return
291               zSign ? (int64_t) LIT64( 0x8000000000000000 )
292             : LIT64( 0x7FFFFFFFFFFFFFFF );
293     }
294     if (absZ1) {
295         status->float_exception_flags |= float_flag_inexact;
296     }
297     return z;
298 
299 }
300 
301 /*----------------------------------------------------------------------------
302 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
303 | `absZ1', with binary point between bits 63 and 64 (between the input words),
304 | and returns the properly rounded 64-bit unsigned integer corresponding to the
305 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
306 | with the inexact exception raised if the input cannot be represented exactly
307 | as an integer.  However, if the fixed-point input is too large, the invalid
308 | exception is raised and the largest unsigned integer is returned.
309 *----------------------------------------------------------------------------*/
310 
311 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
312                                 uint64_t absZ1, float_status *status)
313 {
314     int8_t roundingMode;
315     flag roundNearestEven, increment;
316 
317     roundingMode = status->float_rounding_mode;
318     roundNearestEven = (roundingMode == float_round_nearest_even);
319     switch (roundingMode) {
320     case float_round_nearest_even:
321     case float_round_ties_away:
322         increment = ((int64_t)absZ1 < 0);
323         break;
324     case float_round_to_zero:
325         increment = 0;
326         break;
327     case float_round_up:
328         increment = !zSign && absZ1;
329         break;
330     case float_round_down:
331         increment = zSign && absZ1;
332         break;
333     default:
334         abort();
335     }
336     if (increment) {
337         ++absZ0;
338         if (absZ0 == 0) {
339             float_raise(float_flag_invalid, status);
340             return LIT64(0xFFFFFFFFFFFFFFFF);
341         }
342         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
343     }
344 
345     if (zSign && absZ0) {
346         float_raise(float_flag_invalid, status);
347         return 0;
348     }
349 
350     if (absZ1) {
351         status->float_exception_flags |= float_flag_inexact;
352     }
353     return absZ0;
354 }
355 
356 /*----------------------------------------------------------------------------
357 | If `a' is denormal and we are in flush-to-zero mode then set the
358 | input-denormal exception and return zero. Otherwise just return the value.
359 *----------------------------------------------------------------------------*/
360 float32 float32_squash_input_denormal(float32 a, float_status *status)
361 {
362     if (status->flush_inputs_to_zero) {
363         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
364             float_raise(float_flag_input_denormal, status);
365             return make_float32(float32_val(a) & 0x80000000);
366         }
367     }
368     return a;
369 }
370 
371 /*----------------------------------------------------------------------------
372 | Normalizes the subnormal single-precision floating-point value represented
373 | by the denormalized significand `aSig'.  The normalized exponent and
374 | significand are stored at the locations pointed to by `zExpPtr' and
375 | `zSigPtr', respectively.
376 *----------------------------------------------------------------------------*/
377 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Shift so that the leading one of aSig ends up in bit 23. */
    const int8_t leftShift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << leftShift;
    /* Each bit shifted left lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - leftShift;
}
388 
389 /*----------------------------------------------------------------------------
390 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
391 | single-precision floating-point value, returning the result.  After being
392 | shifted into the proper positions, the three fields are simply added
393 | together to form the result.  This means that any integer portion of `zSig'
394 | will be added into the exponent.  Since a properly normalized significand
395 | will have an integer portion equal to 1, the `zExp' input should be 1 less
396 | than the desired result exponent whenever `zSig' is a complete, normalized
397 | significand.
398 *----------------------------------------------------------------------------*/
399 
400 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
401 {
402 
403     return make_float32(
404           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
405 
406 }
407 
408 /*----------------------------------------------------------------------------
409 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
410 | and significand `zSig', and returns the proper single-precision floating-
411 | point value corresponding to the abstract input.  Ordinarily, the abstract
412 | value is simply rounded and packed into the single-precision format, with
413 | the inexact exception raised if the abstract input cannot be represented
414 | exactly.  However, if the abstract value is too large, the overflow and
415 | inexact exceptions are raised and an infinity or maximal finite value is
416 | returned.  If the abstract value is too small, the input value is rounded to
417 | a subnormal number, and the underflow and inexact exceptions are raised if
418 | the abstract input cannot be represented exactly as a subnormal single-
419 | precision floating-point number.
420 |     The input significand `zSig' has its binary point between bits 30
421 | and 29, which is 7 bits to the left of the usual location.  This shifted
422 | significand must be normalized or smaller.  If `zSig' is not normalized,
423 | `zExp' must be 0; in that case, the result returned is a subnormal number,
424 | and it must not require rounding.  In the usual case that `zSig' is
425 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
426 | The handling of underflow and overflow follows the IEC/IEEE Standard for
427 | Binary Floating-Point Arithmetic.
428 *----------------------------------------------------------------------------*/
429 
430 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
431                                    float_status *status)
432 {
433     int8_t roundingMode;
434     flag roundNearestEven;
435     int8_t roundIncrement, roundBits;
436     flag isTiny;
437 
438     roundingMode = status->float_rounding_mode;
439     roundNearestEven = ( roundingMode == float_round_nearest_even );
440     switch (roundingMode) {
441     case float_round_nearest_even:
442     case float_round_ties_away:
443         roundIncrement = 0x40;
444         break;
445     case float_round_to_zero:
446         roundIncrement = 0;
447         break;
448     case float_round_up:
449         roundIncrement = zSign ? 0 : 0x7f;
450         break;
451     case float_round_down:
452         roundIncrement = zSign ? 0x7f : 0;
453         break;
454     default:
455         abort();
456         break;
457     }
458     roundBits = zSig & 0x7F;
459     if ( 0xFD <= (uint16_t) zExp ) {
460         if (    ( 0xFD < zExp )
461              || (    ( zExp == 0xFD )
462                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
463            ) {
464             float_raise(float_flag_overflow | float_flag_inexact, status);
465             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
466         }
467         if ( zExp < 0 ) {
468             if (status->flush_to_zero) {
469                 float_raise(float_flag_output_denormal, status);
470                 return packFloat32(zSign, 0, 0);
471             }
472             isTiny =
473                 (status->float_detect_tininess
474                  == float_tininess_before_rounding)
475                 || ( zExp < -1 )
476                 || ( zSig + roundIncrement < 0x80000000 );
477             shift32RightJamming( zSig, - zExp, &zSig );
478             zExp = 0;
479             roundBits = zSig & 0x7F;
480             if (isTiny && roundBits) {
481                 float_raise(float_flag_underflow, status);
482             }
483         }
484     }
485     if (roundBits) {
486         status->float_exception_flags |= float_flag_inexact;
487     }
488     zSig = ( zSig + roundIncrement )>>7;
489     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
490     if ( zSig == 0 ) zExp = 0;
491     return packFloat32( zSign, zExp, zSig );
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
497 | and significand `zSig', and returns the proper single-precision floating-
498 | point value corresponding to the abstract input.  This routine is just like
499 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
500 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
501 | floating-point exponent.
502 *----------------------------------------------------------------------------*/
503 
504 static float32
505  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
506                               float_status *status)
507 {
508     int8_t shiftCount;
509 
510     shiftCount = countLeadingZeros32( zSig ) - 1;
511     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
512                                status);
513 
514 }
515 
516 /*----------------------------------------------------------------------------
517 | If `a' is denormal and we are in flush-to-zero mode then set the
518 | input-denormal exception and return zero. Otherwise just return the value.
519 *----------------------------------------------------------------------------*/
520 float64 float64_squash_input_denormal(float64 a, float_status *status)
521 {
522     if (status->flush_inputs_to_zero) {
523         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
524             float_raise(float_flag_input_denormal, status);
525             return make_float64(float64_val(a) & (1ULL << 63));
526         }
527     }
528     return a;
529 }
530 
531 /*----------------------------------------------------------------------------
532 | Normalizes the subnormal double-precision floating-point value represented
533 | by the denormalized significand `aSig'.  The normalized exponent and
534 | significand are stored at the locations pointed to by `zExpPtr' and
535 | `zSigPtr', respectively.
536 *----------------------------------------------------------------------------*/
537 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift so that the leading one of aSig ends up in bit 52. */
    const int8_t leftShift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << leftShift;
    /* Each bit shifted left lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - leftShift;
}
548 
549 /*----------------------------------------------------------------------------
550 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
551 | double-precision floating-point value, returning the result.  After being
552 | shifted into the proper positions, the three fields are simply added
553 | together to form the result.  This means that any integer portion of `zSig'
554 | will be added into the exponent.  Since a properly normalized significand
555 | will have an integer portion equal to 1, the `zExp' input should be 1 less
556 | than the desired result exponent whenever `zSig' is a complete, normalized
557 | significand.
558 *----------------------------------------------------------------------------*/
559 
560 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
561 {
562 
563     return make_float64(
564         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
565 
566 }
567 
568 /*----------------------------------------------------------------------------
569 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570 | and significand `zSig', and returns the proper double-precision floating-
571 | point value corresponding to the abstract input.  Ordinarily, the abstract
572 | value is simply rounded and packed into the double-precision format, with
573 | the inexact exception raised if the abstract input cannot be represented
574 | exactly.  However, if the abstract value is too large, the overflow and
575 | inexact exceptions are raised and an infinity or maximal finite value is
576 | returned.  If the abstract value is too small, the input value is rounded to
577 | a subnormal number, and the underflow and inexact exceptions are raised if
578 | the abstract input cannot be represented exactly as a subnormal double-
579 | precision floating-point number.
580 |     The input significand `zSig' has its binary point between bits 62
581 | and 61, which is 10 bits to the left of the usual location.  This shifted
582 | significand must be normalized or smaller.  If `zSig' is not normalized,
583 | `zExp' must be 0; in that case, the result returned is a subnormal number,
584 | and it must not require rounding.  In the usual case that `zSig' is
585 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
586 | The handling of underflow and overflow follows the IEC/IEEE Standard for
587 | Binary Floating-Point Arithmetic.
588 *----------------------------------------------------------------------------*/
589 
590 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
591                                    float_status *status)
592 {
593     int8_t roundingMode;
594     flag roundNearestEven;
595     int roundIncrement, roundBits;
596     flag isTiny;
597 
598     roundingMode = status->float_rounding_mode;
599     roundNearestEven = ( roundingMode == float_round_nearest_even );
600     switch (roundingMode) {
601     case float_round_nearest_even:
602     case float_round_ties_away:
603         roundIncrement = 0x200;
604         break;
605     case float_round_to_zero:
606         roundIncrement = 0;
607         break;
608     case float_round_up:
609         roundIncrement = zSign ? 0 : 0x3ff;
610         break;
611     case float_round_down:
612         roundIncrement = zSign ? 0x3ff : 0;
613         break;
614     case float_round_to_odd:
615         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
616         break;
617     default:
618         abort();
619     }
620     roundBits = zSig & 0x3FF;
621     if ( 0x7FD <= (uint16_t) zExp ) {
622         if (    ( 0x7FD < zExp )
623              || (    ( zExp == 0x7FD )
624                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
625            ) {
626             bool overflow_to_inf = roundingMode != float_round_to_odd &&
627                                    roundIncrement != 0;
628             float_raise(float_flag_overflow | float_flag_inexact, status);
629             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
630         }
631         if ( zExp < 0 ) {
632             if (status->flush_to_zero) {
633                 float_raise(float_flag_output_denormal, status);
634                 return packFloat64(zSign, 0, 0);
635             }
636             isTiny =
637                    (status->float_detect_tininess
638                     == float_tininess_before_rounding)
639                 || ( zExp < -1 )
640                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
641             shift64RightJamming( zSig, - zExp, &zSig );
642             zExp = 0;
643             roundBits = zSig & 0x3FF;
644             if (isTiny && roundBits) {
645                 float_raise(float_flag_underflow, status);
646             }
647             if (roundingMode == float_round_to_odd) {
648                 /*
649                  * For round-to-odd case, the roundIncrement depends on
650                  * zSig which just changed.
651                  */
652                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
653             }
654         }
655     }
656     if (roundBits) {
657         status->float_exception_flags |= float_flag_inexact;
658     }
659     zSig = ( zSig + roundIncrement )>>10;
660     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
661     if ( zSig == 0 ) zExp = 0;
662     return packFloat64( zSign, zExp, zSig );
663 
664 }
665 
666 /*----------------------------------------------------------------------------
667 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
668 | and significand `zSig', and returns the proper double-precision floating-
669 | point value corresponding to the abstract input.  This routine is just like
670 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
671 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
672 | floating-point exponent.
673 *----------------------------------------------------------------------------*/
674 
675 static float64
676  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
677                               float_status *status)
678 {
679     int8_t shiftCount;
680 
681     shiftCount = countLeadingZeros64( zSig ) - 1;
682     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
683                                status);
684 
685 }
686 
687 /*----------------------------------------------------------------------------
688 | Returns the fraction bits of the extended double-precision floating-point
689 | value `a'.
690 *----------------------------------------------------------------------------*/
691 
692 static inline uint64_t extractFloatx80Frac( floatx80 a )
693 {
694 
695     return a.low;
696 
697 }
698 
699 /*----------------------------------------------------------------------------
700 | Returns the exponent bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703 
704 static inline int32_t extractFloatx80Exp( floatx80 a )
705 {
706 
707     return a.high & 0x7FFF;
708 
709 }
710 
711 /*----------------------------------------------------------------------------
712 | Returns the sign bit of the extended double-precision floating-point value
713 | `a'.
714 *----------------------------------------------------------------------------*/
715 
716 static inline flag extractFloatx80Sign( floatx80 a )
717 {
718 
719     return a.high>>15;
720 
721 }
722 
723 /*----------------------------------------------------------------------------
724 | Normalizes the subnormal extended double-precision floating-point value
725 | represented by the denormalized significand `aSig'.  The normalized exponent
726 | and significand are stored at the locations pointed to by `zExpPtr' and
727 | `zSigPtr', respectively.
728 *----------------------------------------------------------------------------*/
729 
static void
 normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    /*
     * Shift so that the leading one of aSig ends up in bit 63.
     * NOTE(review): presumably callers never pass aSig == 0; verify, since
     * a shift by the full width would be undefined.
     */
    const int8_t leftShift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << leftShift;
    /* Each bit shifted left lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - leftShift;
}
740 
741 /*----------------------------------------------------------------------------
742 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
743 | extended double-precision floating-point value, returning the result.
744 *----------------------------------------------------------------------------*/
745 
746 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
747 {
748     floatx80 z;
749 
750     z.low = zSig;
751     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
752     return z;
753 
754 }
755 
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
758 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input.  Ordinarily, the abstract value is
761 | rounded and packed into the extended double-precision format, with the
762 | inexact exception raised if the abstract input cannot be represented
763 | exactly.  However, if the abstract value is too large, the overflow and
764 | inexact exceptions are raised and an infinity or maximal finite value is
765 | returned.  If the abstract value is too small, the input value is rounded to
766 | a subnormal number, and the underflow and inexact exceptions are raised if
767 | the abstract input cannot be represented exactly as a subnormal extended
768 | double-precision floating-point number.
769 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
770 | number of bits as single or double precision, respectively.  Otherwise, the
771 | result is rounded to the full precision of the extended double-precision
772 | format.
773 |     The input significand must be normalized or smaller.  If the input
774 | significand is not normalized, `zExp' must be 0; in that case, the result
775 | returned is a subnormal number, and it must not require rounding.  The
776 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
777 | Floating-Point Arithmetic.
778 *----------------------------------------------------------------------------*/
779 
/* Rounds the significand zSig0:zSig1 either to a reduced precision
 * (roundingPrecision 32 or 64) or to the full 64-bit significand (any other
 * value), using the rounding mode from `status', handles exponents at or
 * beyond the ends of the representable range, and packs the result. */
780 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
781                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
782                                      float_status *status)
783 {
784     int8_t roundingMode;
785     flag roundNearestEven, increment, isTiny;
786     int64_t roundIncrement, roundMask, roundBits;
787 
788     roundingMode = status->float_rounding_mode;
789     roundNearestEven = ( roundingMode == float_round_nearest_even );
        /* Anything other than 32 or 64 takes the full-precision path. */
790     if ( roundingPrecision == 80 ) goto precision80;
        /* roundMask selects the low bits of zSig0 that are dropped at the
         * requested precision; roundIncrement starts as half of the lowest
         * kept bit. */
791     if ( roundingPrecision == 64 ) {
792         roundIncrement = LIT64( 0x0000000000000400 );
793         roundMask = LIT64( 0x00000000000007FF );
794     }
795     else if ( roundingPrecision == 32 ) {
796         roundIncrement = LIT64( 0x0000008000000000 );
797         roundMask = LIT64( 0x000000FFFFFFFFFF );
798     }
799     else {
800         goto precision80;
801     }
        /* Collapse a nonzero zSig1 into bit 0 of zSig0 as a sticky bit;
         * zSig1 is not read again on the reduced-precision path. */
802     zSig0 |= ( zSig1 != 0 );
        /* The nearest modes keep the half increment.  Directed modes add
         * either nothing or the whole mask, so that any nonzero dropped bits
         * carry into the kept part when rounding away from zero. */
803     switch (roundingMode) {
804     case float_round_nearest_even:
805     case float_round_ties_away:
806         break;
807     case float_round_to_zero:
808         roundIncrement = 0;
809         break;
810     case float_round_up:
811         roundIncrement = zSign ? 0 : roundMask;
812         break;
813     case float_round_down:
814         roundIncrement = zSign ? roundMask : 0;
815         break;
816     default:
817         abort();
818     }
819     roundBits = zSig0 & roundMask;
        /* As an unsigned comparison this is true exactly when zExp <= 0 or
         * zExp >= 0x7FFE, i.e. for the overflow and subnormal candidates
         * handled inside. */
820     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
            /* `zSig0 + roundIncrement < zSig0' detects that rounding would
             * carry out of bit 63. */
821         if (    ( 0x7FFE < zExp )
822              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
823            ) {
                /* Jumps into the full-precision section below; roundMask
                 * keeps its reduced-precision value there. */
824             goto overflow;
825         }
826         if ( zExp <= 0 ) {
                /* With flush-to-zero enabled a signed zero is returned and
                 * the output-denormal flag is raised. */
827             if (status->flush_to_zero) {
828                 float_raise(float_flag_output_denormal, status);
829                 return packFloatx80(zSign, 0, 0);
830             }
                /* Tiny when tininess is detected before rounding, when
                 * zExp < 0, or when adding roundIncrement does not wrap
                 * zSig0. */
831             isTiny =
832                    (status->float_detect_tininess
833                     == float_tininess_before_rounding)
834                 || ( zExp < 0 )
835                 || ( zSig0 <= zSig0 + roundIncrement );
                /* Denormalize by shifting right 1 - zExp places.
                 * NOTE(review): shift64RightJamming is defined elsewhere;
                 * presumably it folds shifted-out bits into bit 0. */
836             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
837             zExp = 0;
838             roundBits = zSig0 & roundMask;
839             if (isTiny && roundBits) {
840                 float_raise(float_flag_underflow, status);
841             }
842             if (roundBits) {
843                 status->float_exception_flags |= float_flag_inexact;
844             }
845             zSig0 += roundIncrement;
                /* Rounding set bit 63: pack with exponent 1 instead of 0. */
846             if ( (int64_t) zSig0 < 0 ) zExp = 1;
                /* roundIncrement now becomes the lowest kept bit.  On an
                 * exact tie in round-to-nearest-even that bit joins the mask
                 * so that it is cleared as well. */
847             roundIncrement = roundMask + 1;
848             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
849                 roundMask |= roundIncrement;
850             }
851             zSig0 &= ~ roundMask;
852             return packFloatx80( zSign, zExp, zSig0 );
853         }
854     }
855     if (roundBits) {
856         status->float_exception_flags |= float_flag_inexact;
857     }
858     zSig0 += roundIncrement;
        /* A sum smaller than the addend means the addition wrapped: bump the
         * exponent and restart the significand at its top bit. */
859     if ( zSig0 < roundIncrement ) {
860         ++zExp;
861         zSig0 = LIT64( 0x8000000000000000 );
862     }
        /* Same tie handling as in the subnormal branch above. */
863     roundIncrement = roundMask + 1;
864     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
865         roundMask |= roundIncrement;
866     }
867     zSig0 &= ~ roundMask;
        /* A zero significand is packed with a zero exponent. */
868     if ( zSig0 == 0 ) zExp = 0;
869     return packFloatx80( zSign, zExp, zSig0 );
        /* Full precision: zSig0 is kept whole and zSig1 only decides whether
         * to add one.  Nearest modes increment when the top bit of zSig1 is
         * set; directed modes increment on any nonzero zSig1 when the mode
         * rounds away from zero for this sign. */
870  precision80:
871     switch (roundingMode) {
872     case float_round_nearest_even:
873     case float_round_ties_away:
874         increment = ((int64_t)zSig1 < 0);
875         break;
876     case float_round_to_zero:
877         increment = 0;
878         break;
879     case float_round_up:
880         increment = !zSign && zSig1;
881         break;
882     case float_round_down:
883         increment = zSign && zSig1;
884         break;
885     default:
886         abort();
887     }
        /* Same range test as on the reduced-precision path: true when
         * zExp <= 0 or zExp >= 0x7FFE. */
888     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
889         if (    ( 0x7FFE < zExp )
890              || (    ( zExp == 0x7FFE )
891                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
892                   && increment
893                 )
894            ) {
                /* Full-precision overflow: with roundMask cleared,
                 * ~roundMask below is all ones. */
895             roundMask = 0;
                /* Also reached by goto from the reduced-precision path.
                 * Modes that round toward zero for this sign return exponent
                 * 0x7FFE with significand ~roundMask; all others return
                 * exponent 0x7FFF with significand 0x8000000000000000. */
896  overflow:
897             float_raise(float_flag_overflow | float_flag_inexact, status);
898             if (    ( roundingMode == float_round_to_zero )
899                  || ( zSign && ( roundingMode == float_round_up ) )
900                  || ( ! zSign && ( roundingMode == float_round_down ) )
901                ) {
902                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
903             }
904             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
905         }
            /* NOTE(review): unlike the reduced-precision path, this branch
             * does not consult status->flush_to_zero. */
906         if ( zExp <= 0 ) {
907             isTiny =
908                    (status->float_detect_tininess
909                     == float_tininess_before_rounding)
910                 || ( zExp < 0 )
911                 || ! increment
912                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
                /* Denormalize the zSig0:zSig1 pair by 1 - zExp places
                 * (helper defined elsewhere). */
913             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
914             zExp = 0;
915             if (isTiny && zSig1) {
916                 float_raise(float_flag_underflow, status);
917             }
918             if (zSig1) {
919                 status->float_exception_flags |= float_flag_inexact;
920             }
                /* Recompute the rounding decision from the shifted zSig1. */
921             switch (roundingMode) {
922             case float_round_nearest_even:
923             case float_round_ties_away:
924                 increment = ((int64_t)zSig1 < 0);
925                 break;
926             case float_round_to_zero:
927                 increment = 0;
928                 break;
929             case float_round_up:
930                 increment = !zSign && zSig1;
931                 break;
932             case float_round_down:
933                 increment = zSign && zSig1;
934                 break;
935             default:
936                 abort();
937             }
938             if ( increment ) {
939                 ++zSig0;
                    /* (zSig1<<1) == 0 means nothing below the top bit of
                     * zSig1 is set; in round-to-nearest-even, where the
                     * increment requires that top bit, this is an exact tie
                     * and bit 0 of the result is cleared. */
940                 zSig0 &=
941                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                    /* Rounding set bit 63: pack with exponent 1. */
942                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
943             }
944             return packFloatx80( zSign, zExp, zSig0 );
945         }
946     }
947     if (zSig1) {
948         status->float_exception_flags |= float_flag_inexact;
949     }
950     if ( increment ) {
951         ++zSig0;
            /* Wrapped to zero: bump the exponent and restart the significand
             * at its top bit; otherwise apply the tie-to-even adjustment. */
952         if ( zSig0 == 0 ) {
953             ++zExp;
954             zSig0 = LIT64( 0x8000000000000000 );
955         }
956         else {
957             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
958         }
959     }
960     else {
            /* A zero significand is packed with a zero exponent. */
961         if ( zSig0 == 0 ) zExp = 0;
962     }
963     return packFloatx80( zSign, zExp, zSig0 );
964 
965 }
966 
967 /*----------------------------------------------------------------------------
968 | Takes an abstract floating-point value having sign `zSign', exponent
969 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
970 | and returns the proper extended double-precision floating-point value
971 | corresponding to the abstract input.  This routine is just like
972 | `roundAndPackFloatx80' except that the input significand does not have to be
973 | normalized.
974 *----------------------------------------------------------------------------*/
975 
976 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
977                                               flag zSign, int32_t zExp,
978                                               uint64_t zSig0, uint64_t zSig1,
979                                               float_status *status)
980 {
981     int8_t shiftCount;
982 
983     if ( zSig0 == 0 ) {
984         zSig0 = zSig1;
985         zSig1 = 0;
986         zExp -= 64;
987     }
988     shiftCount = countLeadingZeros64( zSig0 );
989     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
990     zExp -= shiftCount;
991     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
992                                 zSig0, zSig1, status);
993 
994 }
995 
996 /*----------------------------------------------------------------------------
997 | Returns the least-significant 64 fraction bits of the quadruple-precision
998 | floating-point value `a'.
999 *----------------------------------------------------------------------------*/
1000 
1001 static inline uint64_t extractFloat128Frac1( float128 a )
1002 {
1003 
1004     return a.low;
1005 
1006 }
1007 
1008 /*----------------------------------------------------------------------------
1009 | Returns the most-significant 48 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012 
1013 static inline uint64_t extractFloat128Frac0( float128 a )
1014 {
1015 
1016     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1017 
1018 }
1019 
1020 /*----------------------------------------------------------------------------
1021 | Returns the exponent bits of the quadruple-precision floating-point value
1022 | `a'.
1023 *----------------------------------------------------------------------------*/
1024 
1025 static inline int32_t extractFloat128Exp( float128 a )
1026 {
1027 
1028     return ( a.high>>48 ) & 0x7FFF;
1029 
1030 }
1031 
1032 /*----------------------------------------------------------------------------
1033 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1034 *----------------------------------------------------------------------------*/
1035 
1036 static inline flag extractFloat128Sign( float128 a )
1037 {
1038 
1039     return a.high>>63;
1040 
1041 }
1042 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand, formed by the concatenation of
| `aSig0' and `aSig1', of a subnormal quadruple-precision floating-point
| value.  The normalized exponent is written to `*zExpPtr'.  The most
| significant 49 bits of the normalized significand are written to
| `*zSig0Ptr' and the least significant 64 bits to `*zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    /* High word occupied: one short 128-bit left shift does the job. */
    if ( aSig0 != 0 ) {
        shift = countLeadingZeros64( aSig0 ) - 15;
        shortShift128Left( aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word empty: the low word supplies all significant bits.  The
     * leading one has to end up in bit 48 of the high result word, which
     * means either moving the low word up further (shift >= 0) or splitting
     * it across both result words (shift < 0).
     */
    shift = countLeadingZeros64( aSig1 ) - 15;
    if ( 0 <= shift ) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    }
    else {
        *zSig0Ptr = aSig1 >> ( - shift );
        *zSig1Ptr = aSig1 << ( shift & 63 );
    }
    *zExpPtr = - shift - 63;
}
1083 
1084 /*----------------------------------------------------------------------------
1085 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1086 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1087 | floating-point value, returning the result.  After being shifted into the
1088 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1089 | added together to form the most significant 32 bits of the result.  This
1090 | means that any integer portion of `zSig0' will be added into the exponent.
1091 | Since a properly normalized significand will have an integer portion equal
1092 | to 1, the `zExp' input should be 1 less than the desired result exponent
1093 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1094 | significand.
1095 *----------------------------------------------------------------------------*/
1096 
1097 static inline float128
1098  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1099 {
1100     float128 z;
1101 
1102     z.low = zSig1;
1103     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1104     return z;
1105 
1106 }
1107 
1108 /*----------------------------------------------------------------------------
1109 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1110 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1111 | and `zSig2', and returns the proper quadruple-precision floating-point value
1112 | corresponding to the abstract input.  Ordinarily, the abstract value is
1113 | simply rounded and packed into the quadruple-precision format, with the
1114 | inexact exception raised if the abstract input cannot be represented
1115 | exactly.  However, if the abstract value is too large, the overflow and
1116 | inexact exceptions are raised and an infinity or maximal finite value is
1117 | returned.  If the abstract value is too small, the input value is rounded to
1118 | a subnormal number, and the underflow and inexact exceptions are raised if
1119 | the abstract input cannot be represented exactly as a subnormal quadruple-
1120 | precision floating-point number.
1121 |     The input significand must be normalized or smaller.  If the input
1122 | significand is not normalized, `zExp' must be 0; in that case, the result
1123 | returned is a subnormal number, and it must not require rounding.  In the
1124 | usual case that the input significand is normalized, `zExp' must be 1 less
1125 | than the ``true'' floating-point exponent.  The handling of underflow and
1126 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128 
/* Rounds the significand zSig0:zSig1, with zSig2 holding the bits below the
 * rounding point, according to the rounding mode in `status', handles
 * exponents at or beyond the ends of the representable range, and packs the
 * result. */
1129 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1130                                      uint64_t zSig0, uint64_t zSig1,
1131                                      uint64_t zSig2, float_status *status)
1132 {
1133     int8_t roundingMode;
1134     flag roundNearestEven, increment, isTiny;
1135 
1136     roundingMode = status->float_rounding_mode;
1137     roundNearestEven = ( roundingMode == float_round_nearest_even );
         /* Decide whether to add one to zSig0:zSig1.  Nearest modes look at
          * the top bit of zSig2; directed modes increment on any nonzero
          * zSig2 when the mode rounds away from zero for this sign;
          * round-to-odd increments only when bit 0 of zSig1 is clear and
          * zSig2 is nonzero. */
1138     switch (roundingMode) {
1139     case float_round_nearest_even:
1140     case float_round_ties_away:
1141         increment = ((int64_t)zSig2 < 0);
1142         break;
1143     case float_round_to_zero:
1144         increment = 0;
1145         break;
1146     case float_round_up:
1147         increment = !zSign && zSig2;
1148         break;
1149     case float_round_down:
1150         increment = zSign && zSig2;
1151         break;
1152     case float_round_to_odd:
1153         increment = !(zSig1 & 0x1) && zSig2;
1154         break;
1155     default:
1156         abort();
1157     }
         /* As an unsigned comparison this is true exactly when zExp < 0 or
          * zExp >= 0x7FFD, i.e. for the overflow and subnormal candidates
          * handled inside. */
1158     if ( 0x7FFD <= (uint32_t) zExp ) {
             /* Overflow when the exponent is already too large, or when it
              * is 0x7FFD, the significand is all ones and rounding would
              * increment it. */
1159         if (    ( 0x7FFD < zExp )
1160              || (    ( zExp == 0x7FFD )
1161                   && eq128(
1162                          LIT64( 0x0001FFFFFFFFFFFF ),
1163                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1164                          zSig0,
1165                          zSig1
1166                      )
1167                   && increment
1168                 )
1169            ) {
1170             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /* Modes that round toward zero for this sign, and
                  * round-to-odd, return exponent 0x7FFE with an all-ones
                  * fraction; all others return exponent 0x7FFF with a zero
                  * fraction. */
1171             if (    ( roundingMode == float_round_to_zero )
1172                  || ( zSign && ( roundingMode == float_round_up ) )
1173                  || ( ! zSign && ( roundingMode == float_round_down ) )
1174                  || (roundingMode == float_round_to_odd)
1175                ) {
1176                 return
1177                     packFloat128(
1178                         zSign,
1179                         0x7FFE,
1180                         LIT64( 0x0000FFFFFFFFFFFF ),
1181                         LIT64( 0xFFFFFFFFFFFFFFFF )
1182                     );
1183             }
1184             return packFloat128( zSign, 0x7FFF, 0, 0 );
1185         }
1186         if ( zExp < 0 ) {
                 /* With flush-to-zero enabled a signed zero is returned and
                  * the output-denormal flag is raised. */
1187             if (status->flush_to_zero) {
1188                 float_raise(float_flag_output_denormal, status);
1189                 return packFloat128(zSign, 0, 0, 0);
1190             }
                 /* Tiny when tininess is detected before rounding, when
                  * zExp < -1, when no increment is pending, or when the
                  * significand is below all ones. */
1191             isTiny =
1192                    (status->float_detect_tininess
1193                     == float_tininess_before_rounding)
1194                 || ( zExp < -1 )
1195                 || ! increment
1196                 || lt128(
1197                        zSig0,
1198                        zSig1,
1199                        LIT64( 0x0001FFFFFFFFFFFF ),
1200                        LIT64( 0xFFFFFFFFFFFFFFFF )
1201                    );
                 /* Denormalize the three-word value by -zExp places (helper
                  * defined elsewhere). */
1202             shift128ExtraRightJamming(
1203                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1204             zExp = 0;
1205             if (isTiny && zSig2) {
1206                 float_raise(float_flag_underflow, status);
1207             }
                 /* Recompute the rounding decision from the shifted words;
                  * execution then falls through to the common tail below. */
1208             switch (roundingMode) {
1209             case float_round_nearest_even:
1210             case float_round_ties_away:
1211                 increment = ((int64_t)zSig2 < 0);
1212                 break;
1213             case float_round_to_zero:
1214                 increment = 0;
1215                 break;
1216             case float_round_up:
1217                 increment = !zSign && zSig2;
1218                 break;
1219             case float_round_down:
1220                 increment = zSign && zSig2;
1221                 break;
1222             case float_round_to_odd:
1223                 increment = !(zSig1 & 0x1) && zSig2;
1224                 break;
1225             default:
1226                 abort();
1227             }
1228         }
1229     }
         /* Any nonzero bits below the rounding point make the result
          * inexact. */
1230     if (zSig2) {
1231         status->float_exception_flags |= float_flag_inexact;
1232     }
1233     if ( increment ) {
1234         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
             /* zSig2 + zSig2 == 0 means nothing below the top bit of zSig2
              * is set; in round-to-nearest-even, where the increment
              * requires that top bit, this is an exact tie and bit 0 of the
              * result is cleared. */
1235         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1236     }
1237     else {
             /* A zero significand is packed with a zero exponent. */
1238         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1239     }
1240     return packFloat128( zSign, zExp, zSig0, zSig1 );
1241 
1242 }
1243 
1244 /*----------------------------------------------------------------------------
1245 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1246 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1247 | returns the proper quadruple-precision floating-point value corresponding
1248 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1249 | except that the input significand has fewer bits and does not have to be
1250 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1251 | point exponent.
1252 *----------------------------------------------------------------------------*/
1253 
1254 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1255                                               uint64_t zSig0, uint64_t zSig1,
1256                                               float_status *status)
1257 {
1258     int8_t shiftCount;
1259     uint64_t zSig2;
1260 
1261     if ( zSig0 == 0 ) {
1262         zSig0 = zSig1;
1263         zSig1 = 0;
1264         zExp -= 64;
1265     }
1266     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1267     if ( 0 <= shiftCount ) {
1268         zSig2 = 0;
1269         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1270     }
1271     else {
1272         shift128ExtraRightJamming(
1273             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1274     }
1275     zExp -= shiftCount;
1276     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1277 
1278 }
1279 
1280 /*----------------------------------------------------------------------------
1281 | Returns the result of converting the 32-bit two's complement integer `a'
1282 | to the single-precision floating-point format.  The conversion is performed
1283 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1284 *----------------------------------------------------------------------------*/
1285 
1286 float32 int32_to_float32(int32_t a, float_status *status)
1287 {
1288     flag zSign;
1289 
1290     if ( a == 0 ) return float32_zero;
1291     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1292     zSign = ( a < 0 );
1293     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1294 }
1295 
1296 /*----------------------------------------------------------------------------
1297 | Returns the result of converting the 32-bit two's complement integer `a'
1298 | to the double-precision floating-point format.  The conversion is performed
1299 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1300 *----------------------------------------------------------------------------*/
1301 
1302 float64 int32_to_float64(int32_t a, float_status *status)
1303 {
1304     flag zSign;
1305     uint32_t absA;
1306     int8_t shiftCount;
1307     uint64_t zSig;
1308 
1309     if ( a == 0 ) return float64_zero;
1310     zSign = ( a < 0 );
1311     absA = zSign ? - a : a;
1312     shiftCount = countLeadingZeros32( absA ) + 21;
1313     zSig = absA;
1314     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1315 
1316 }
1317 
1318 /*----------------------------------------------------------------------------
1319 | Returns the result of converting the 32-bit two's complement integer `a'
1320 | to the extended double-precision floating-point format.  The conversion
1321 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1322 | Arithmetic.
1323 *----------------------------------------------------------------------------*/
1324 
1325 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1326 {
1327     flag zSign;
1328     uint32_t absA;
1329     int8_t shiftCount;
1330     uint64_t zSig;
1331 
1332     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1333     zSign = ( a < 0 );
1334     absA = zSign ? - a : a;
1335     shiftCount = countLeadingZeros32( absA ) + 32;
1336     zSig = absA;
1337     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1338 
1339 }
1340 
1341 /*----------------------------------------------------------------------------
1342 | Returns the result of converting the 32-bit two's complement integer `a' to
1343 | the quadruple-precision floating-point format.  The conversion is performed
1344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1345 *----------------------------------------------------------------------------*/
1346 
1347 float128 int32_to_float128(int32_t a, float_status *status)
1348 {
1349     flag zSign;
1350     uint32_t absA;
1351     int8_t shiftCount;
1352     uint64_t zSig0;
1353 
1354     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1355     zSign = ( a < 0 );
1356     absA = zSign ? - a : a;
1357     shiftCount = countLeadingZeros32( absA ) + 17;
1358     zSig0 = absA;
1359     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1360 
1361 }
1362 
1363 /*----------------------------------------------------------------------------
1364 | Returns the result of converting the 64-bit two's complement integer `a'
1365 | to the single-precision floating-point format.  The conversion is performed
1366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1367 *----------------------------------------------------------------------------*/
1368 
1369 float32 int64_to_float32(int64_t a, float_status *status)
1370 {
1371     flag zSign;
1372     uint64_t absA;
1373     int8_t shiftCount;
1374 
1375     if ( a == 0 ) return float32_zero;
1376     zSign = ( a < 0 );
1377     absA = zSign ? - a : a;
1378     shiftCount = countLeadingZeros64( absA ) - 40;
1379     if ( 0 <= shiftCount ) {
1380         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1381     }
1382     else {
1383         shiftCount += 7;
1384         if ( shiftCount < 0 ) {
1385             shift64RightJamming( absA, - shiftCount, &absA );
1386         }
1387         else {
1388             absA <<= shiftCount;
1389         }
1390         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1391     }
1392 
1393 }
1394 
1395 /*----------------------------------------------------------------------------
1396 | Returns the result of converting the 64-bit two's complement integer `a'
1397 | to the double-precision floating-point format.  The conversion is performed
1398 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1399 *----------------------------------------------------------------------------*/
1400 
1401 float64 int64_to_float64(int64_t a, float_status *status)
1402 {
1403     flag zSign;
1404 
1405     if ( a == 0 ) return float64_zero;
1406     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1407         return packFloat64( 1, 0x43E, 0 );
1408     }
1409     zSign = ( a < 0 );
1410     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1411 }
1412 
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the 64-bit two's complement integer `a'
1415 | to the extended double-precision floating-point format.  The conversion
1416 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic.
1418 *----------------------------------------------------------------------------*/
1419 
1420 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1421 {
1422     flag zSign;
1423     uint64_t absA;
1424     int8_t shiftCount;
1425 
1426     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1427     zSign = ( a < 0 );
1428     absA = zSign ? - a : a;
1429     shiftCount = countLeadingZeros64( absA );
1430     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1431 
1432 }
1433 
1434 /*----------------------------------------------------------------------------
1435 | Returns the result of converting the 64-bit two's complement integer `a' to
1436 | the quadruple-precision floating-point format.  The conversion is performed
1437 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1438 *----------------------------------------------------------------------------*/
1439 
1440 float128 int64_to_float128(int64_t a, float_status *status)
1441 {
1442     flag zSign;
1443     uint64_t absA;
1444     int8_t shiftCount;
1445     int32_t zExp;
1446     uint64_t zSig0, zSig1;
1447 
1448     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1449     zSign = ( a < 0 );
1450     absA = zSign ? - a : a;
1451     shiftCount = countLeadingZeros64( absA ) + 49;
1452     zExp = 0x406E - shiftCount;
1453     if ( 64 <= shiftCount ) {
1454         zSig1 = 0;
1455         zSig0 = absA;
1456         shiftCount -= 64;
1457     }
1458     else {
1459         zSig1 = absA;
1460         zSig0 = 0;
1461     }
1462     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1463     return packFloat128( zSign, zExp, zSig0, zSig1 );
1464 
1465 }
1466 
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the 64-bit unsigned integer `a'
1469 | to the single-precision floating-point format.  The conversion is performed
1470 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1471 *----------------------------------------------------------------------------*/
1472 
1473 float32 uint64_to_float32(uint64_t a, float_status *status)
1474 {
1475     int shiftcount;
1476 
1477     if (a == 0) {
1478         return float32_zero;
1479     }
1480 
1481     /* Determine (left) shift needed to put first set bit into bit posn 23
1482      * (since packFloat32() expects the binary point between bits 23 and 22);
1483      * this is the fast case for smallish numbers.
1484      */
1485     shiftcount = countLeadingZeros64(a) - 40;
1486     if (shiftcount >= 0) {
1487         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1488     }
1489     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1490      * expects the binary point between bits 30 and 29, hence the + 7.
1491      */
1492     shiftcount += 7;
1493     if (shiftcount < 0) {
1494         shift64RightJamming(a, -shiftcount, &a);
1495     } else {
1496         a <<= shiftcount;
1497     }
1498 
1499     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1500 }
1501 
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the 64-bit unsigned integer `a'
1504 | to the double-precision floating-point format.  The conversion is performed
1505 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1506 *----------------------------------------------------------------------------*/
1507 
1508 float64 uint64_to_float64(uint64_t a, float_status *status)
1509 {
1510     int exp = 0x43C;
1511     int shiftcount;
1512 
1513     if (a == 0) {
1514         return float64_zero;
1515     }
1516 
1517     shiftcount = countLeadingZeros64(a) - 1;
1518     if (shiftcount < 0) {
1519         shift64RightJamming(a, -shiftcount, &a);
1520     } else {
1521         a <<= shiftcount;
1522     }
1523     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1524 }
1525 
1526 /*----------------------------------------------------------------------------
1527 | Returns the result of converting the 64-bit unsigned integer `a'
1528 | to the quadruple-precision floating-point format.  The conversion is performed
1529 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1530 *----------------------------------------------------------------------------*/
1531 
1532 float128 uint64_to_float128(uint64_t a, float_status *status)
1533 {
1534     if (a == 0) {
1535         return float128_zero;
1536     }
1537     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1538 }
1539 
1540 /*----------------------------------------------------------------------------
1541 | Returns the result of converting the single-precision floating-point value
1542 | `a' to the 32-bit two's complement integer format.  The conversion is
1543 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 | Arithmetic---which means in particular that the conversion is rounded
1545 | according to the current rounding mode.  If `a' is a NaN, the largest
1546 | positive integer is returned.  Otherwise, if the conversion overflows, the
1547 | largest integer with the same sign as `a' is returned.
1548 *----------------------------------------------------------------------------*/
1549 
1550 int32_t float32_to_int32(float32 a, float_status *status)
1551 {
1552     flag aSign;
1553     int aExp;
1554     int shiftCount;
1555     uint32_t aSig;
1556     uint64_t aSig64;
1557 
1558     a = float32_squash_input_denormal(a, status);
1559     aSig = extractFloat32Frac( a );
1560     aExp = extractFloat32Exp( a );
1561     aSign = extractFloat32Sign( a );
1562     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1563     if ( aExp ) aSig |= 0x00800000;
1564     shiftCount = 0xAF - aExp;
1565     aSig64 = aSig;
1566     aSig64 <<= 32;
1567     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1568     return roundAndPackInt32(aSign, aSig64, status);
1569 
1570 }
1571 
1572 /*----------------------------------------------------------------------------
1573 | Returns the result of converting the single-precision floating-point value
1574 | `a' to the 32-bit two's complement integer format.  The conversion is
1575 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 | Arithmetic, except that the conversion is always rounded toward zero.
1577 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1578 | the conversion overflows, the largest integer with the same sign as `a' is
1579 | returned.
1580 *----------------------------------------------------------------------------*/
1581 
1582 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1583 {
1584     flag aSign;
1585     int aExp;
1586     int shiftCount;
1587     uint32_t aSig;
1588     int32_t z;
1589     a = float32_squash_input_denormal(a, status);
1590 
1591     aSig = extractFloat32Frac( a );
1592     aExp = extractFloat32Exp( a );
1593     aSign = extractFloat32Sign( a );
1594     shiftCount = aExp - 0x9E;
1595     if ( 0 <= shiftCount ) {
1596         if ( float32_val(a) != 0xCF000000 ) {
1597             float_raise(float_flag_invalid, status);
1598             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1599         }
1600         return (int32_t) 0x80000000;
1601     }
1602     else if ( aExp <= 0x7E ) {
1603         if (aExp | aSig) {
1604             status->float_exception_flags |= float_flag_inexact;
1605         }
1606         return 0;
1607     }
1608     aSig = ( aSig | 0x00800000 )<<8;
1609     z = aSig>>( - shiftCount );
1610     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1611         status->float_exception_flags |= float_flag_inexact;
1612     }
1613     if ( aSign ) z = - z;
1614     return z;
1615 
1616 }
1617 
1618 /*----------------------------------------------------------------------------
1619 | Returns the result of converting the single-precision floating-point value
1620 | `a' to the 16-bit two's complement integer format.  The conversion is
1621 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1622 | Arithmetic, except that the conversion is always rounded toward zero.
1623 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1624 | the conversion overflows, the largest integer with the same sign as `a' is
1625 | returned.
1626 *----------------------------------------------------------------------------*/
1627 
1628 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1629 {
1630     flag aSign;
1631     int aExp;
1632     int shiftCount;
1633     uint32_t aSig;
1634     int32_t z;
1635 
1636     aSig = extractFloat32Frac( a );
1637     aExp = extractFloat32Exp( a );
1638     aSign = extractFloat32Sign( a );
1639     shiftCount = aExp - 0x8E;
1640     if ( 0 <= shiftCount ) {
1641         if ( float32_val(a) != 0xC7000000 ) {
1642             float_raise(float_flag_invalid, status);
1643             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1644                 return 0x7FFF;
1645             }
1646         }
1647         return (int32_t) 0xffff8000;
1648     }
1649     else if ( aExp <= 0x7E ) {
1650         if ( aExp | aSig ) {
1651             status->float_exception_flags |= float_flag_inexact;
1652         }
1653         return 0;
1654     }
1655     shiftCount -= 0x10;
1656     aSig = ( aSig | 0x00800000 )<<8;
1657     z = aSig>>( - shiftCount );
1658     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1659         status->float_exception_flags |= float_flag_inexact;
1660     }
1661     if ( aSign ) {
1662         z = - z;
1663     }
1664     return z;
1665 
1666 }
1667 
1668 /*----------------------------------------------------------------------------
1669 | Returns the result of converting the single-precision floating-point value
1670 | `a' to the 64-bit two's complement integer format.  The conversion is
1671 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1672 | Arithmetic---which means in particular that the conversion is rounded
1673 | according to the current rounding mode.  If `a' is a NaN, the largest
1674 | positive integer is returned.  Otherwise, if the conversion overflows, the
1675 | largest integer with the same sign as `a' is returned.
1676 *----------------------------------------------------------------------------*/
1677 
1678 int64_t float32_to_int64(float32 a, float_status *status)
1679 {
1680     flag aSign;
1681     int aExp;
1682     int shiftCount;
1683     uint32_t aSig;
1684     uint64_t aSig64, aSigExtra;
1685     a = float32_squash_input_denormal(a, status);
1686 
1687     aSig = extractFloat32Frac( a );
1688     aExp = extractFloat32Exp( a );
1689     aSign = extractFloat32Sign( a );
1690     shiftCount = 0xBE - aExp;
1691     if ( shiftCount < 0 ) {
1692         float_raise(float_flag_invalid, status);
1693         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1694             return LIT64( 0x7FFFFFFFFFFFFFFF );
1695         }
1696         return (int64_t) LIT64( 0x8000000000000000 );
1697     }
1698     if ( aExp ) aSig |= 0x00800000;
1699     aSig64 = aSig;
1700     aSig64 <<= 40;
1701     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1702     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1703 
1704 }
1705 
1706 /*----------------------------------------------------------------------------
1707 | Returns the result of converting the single-precision floating-point value
1708 | `a' to the 64-bit unsigned integer format.  The conversion is
1709 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1710 | Arithmetic---which means in particular that the conversion is rounded
1711 | according to the current rounding mode.  If `a' is a NaN, the largest
1712 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1713 | largest unsigned integer is returned.  If `a' is negative, the result
1714 | is rounded and zero is returned; values that do not round to zero will
1715 | raise the inexact exception flag.
1716 *----------------------------------------------------------------------------*/
1717 
1718 uint64_t float32_to_uint64(float32 a, float_status *status)
1719 {
1720     flag aSign;
1721     int aExp;
1722     int shiftCount;
1723     uint32_t aSig;
1724     uint64_t aSig64, aSigExtra;
1725     a = float32_squash_input_denormal(a, status);
1726 
1727     aSig = extractFloat32Frac(a);
1728     aExp = extractFloat32Exp(a);
1729     aSign = extractFloat32Sign(a);
1730     if ((aSign) && (aExp > 126)) {
1731         float_raise(float_flag_invalid, status);
1732         if (float32_is_any_nan(a)) {
1733             return LIT64(0xFFFFFFFFFFFFFFFF);
1734         } else {
1735             return 0;
1736         }
1737     }
1738     shiftCount = 0xBE - aExp;
1739     if (aExp) {
1740         aSig |= 0x00800000;
1741     }
1742     if (shiftCount < 0) {
1743         float_raise(float_flag_invalid, status);
1744         return LIT64(0xFFFFFFFFFFFFFFFF);
1745     }
1746 
1747     aSig64 = aSig;
1748     aSig64 <<= 40;
1749     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1750     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1751 }
1752 
1753 /*----------------------------------------------------------------------------
1754 | Returns the result of converting the single-precision floating-point value
1755 | `a' to the 64-bit unsigned integer format.  The conversion is
1756 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1757 | Arithmetic, except that the conversion is always rounded toward zero.  If
1758 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1759 | conversion overflows, the largest unsigned integer is returned.  If
1760 | `a' is negative, the result is rounded and zero is returned; values that do
1761 | not round to zero will raise the inexact flag.
1762 *----------------------------------------------------------------------------*/
1763 
1764 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1765 {
1766     signed char current_rounding_mode = status->float_rounding_mode;
1767     set_float_rounding_mode(float_round_to_zero, status);
1768     int64_t v = float32_to_uint64(a, status);
1769     set_float_rounding_mode(current_rounding_mode, status);
1770     return v;
1771 }
1772 
1773 /*----------------------------------------------------------------------------
1774 | Returns the result of converting the single-precision floating-point value
1775 | `a' to the 64-bit two's complement integer format.  The conversion is
1776 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1777 | Arithmetic, except that the conversion is always rounded toward zero.  If
1778 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1779 | conversion overflows, the largest integer with the same sign as `a' is
1780 | returned.
1781 *----------------------------------------------------------------------------*/
1782 
1783 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1784 {
1785     flag aSign;
1786     int aExp;
1787     int shiftCount;
1788     uint32_t aSig;
1789     uint64_t aSig64;
1790     int64_t z;
1791     a = float32_squash_input_denormal(a, status);
1792 
1793     aSig = extractFloat32Frac( a );
1794     aExp = extractFloat32Exp( a );
1795     aSign = extractFloat32Sign( a );
1796     shiftCount = aExp - 0xBE;
1797     if ( 0 <= shiftCount ) {
1798         if ( float32_val(a) != 0xDF000000 ) {
1799             float_raise(float_flag_invalid, status);
1800             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1801                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1802             }
1803         }
1804         return (int64_t) LIT64( 0x8000000000000000 );
1805     }
1806     else if ( aExp <= 0x7E ) {
1807         if (aExp | aSig) {
1808             status->float_exception_flags |= float_flag_inexact;
1809         }
1810         return 0;
1811     }
1812     aSig64 = aSig | 0x00800000;
1813     aSig64 <<= 40;
1814     z = aSig64>>( - shiftCount );
1815     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1816         status->float_exception_flags |= float_flag_inexact;
1817     }
1818     if ( aSign ) z = - z;
1819     return z;
1820 
1821 }
1822 
1823 /*----------------------------------------------------------------------------
1824 | Returns the result of converting the single-precision floating-point value
1825 | `a' to the double-precision floating-point format.  The conversion is
1826 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1827 | Arithmetic.
1828 *----------------------------------------------------------------------------*/
1829 
1830 float64 float32_to_float64(float32 a, float_status *status)
1831 {
1832     flag aSign;
1833     int aExp;
1834     uint32_t aSig;
1835     a = float32_squash_input_denormal(a, status);
1836 
1837     aSig = extractFloat32Frac( a );
1838     aExp = extractFloat32Exp( a );
1839     aSign = extractFloat32Sign( a );
1840     if ( aExp == 0xFF ) {
1841         if (aSig) {
1842             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1843         }
1844         return packFloat64( aSign, 0x7FF, 0 );
1845     }
1846     if ( aExp == 0 ) {
1847         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1848         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1849         --aExp;
1850     }
1851     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1852 
1853 }
1854 
1855 /*----------------------------------------------------------------------------
1856 | Returns the result of converting the single-precision floating-point value
1857 | `a' to the extended double-precision floating-point format.  The conversion
1858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1859 | Arithmetic.
1860 *----------------------------------------------------------------------------*/
1861 
1862 floatx80 float32_to_floatx80(float32 a, float_status *status)
1863 {
1864     flag aSign;
1865     int aExp;
1866     uint32_t aSig;
1867 
1868     a = float32_squash_input_denormal(a, status);
1869     aSig = extractFloat32Frac( a );
1870     aExp = extractFloat32Exp( a );
1871     aSign = extractFloat32Sign( a );
1872     if ( aExp == 0xFF ) {
1873         if (aSig) {
1874             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1875         }
1876         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1877     }
1878     if ( aExp == 0 ) {
1879         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1880         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1881     }
1882     aSig |= 0x00800000;
1883     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1884 
1885 }
1886 
1887 /*----------------------------------------------------------------------------
1888 | Returns the result of converting the single-precision floating-point value
1889 | `a' to the quadruple-precision floating-point format.  The conversion is
1890 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1891 | Arithmetic.
1892 *----------------------------------------------------------------------------*/
1893 
1894 float128 float32_to_float128(float32 a, float_status *status)
1895 {
1896     flag aSign;
1897     int aExp;
1898     uint32_t aSig;
1899 
1900     a = float32_squash_input_denormal(a, status);
1901     aSig = extractFloat32Frac( a );
1902     aExp = extractFloat32Exp( a );
1903     aSign = extractFloat32Sign( a );
1904     if ( aExp == 0xFF ) {
1905         if (aSig) {
1906             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1907         }
1908         return packFloat128( aSign, 0x7FFF, 0, 0 );
1909     }
1910     if ( aExp == 0 ) {
1911         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1912         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1913         --aExp;
1914     }
1915     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1916 
1917 }
1918 
1919 /*----------------------------------------------------------------------------
1920 | Rounds the single-precision floating-point value `a' to an integer, and
1921 | returns the result as a single-precision floating-point value.  The
1922 | operation is performed according to the IEC/IEEE Standard for Binary
1923 | Floating-Point Arithmetic.
1924 *----------------------------------------------------------------------------*/
1925 
1926 float32 float32_round_to_int(float32 a, float_status *status)
1927 {
1928     flag aSign;
1929     int aExp;
1930     uint32_t lastBitMask, roundBitsMask;
1931     uint32_t z;
1932     a = float32_squash_input_denormal(a, status);
1933 
1934     aExp = extractFloat32Exp( a );
1935     if ( 0x96 <= aExp ) {
1936         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1937             return propagateFloat32NaN(a, a, status);
1938         }
1939         return a;
1940     }
1941     if ( aExp <= 0x7E ) {
1942         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1943         status->float_exception_flags |= float_flag_inexact;
1944         aSign = extractFloat32Sign( a );
1945         switch (status->float_rounding_mode) {
1946          case float_round_nearest_even:
1947             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1948                 return packFloat32( aSign, 0x7F, 0 );
1949             }
1950             break;
1951         case float_round_ties_away:
1952             if (aExp == 0x7E) {
1953                 return packFloat32(aSign, 0x7F, 0);
1954             }
1955             break;
1956          case float_round_down:
1957             return make_float32(aSign ? 0xBF800000 : 0);
1958          case float_round_up:
1959             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1960         }
1961         return packFloat32( aSign, 0, 0 );
1962     }
1963     lastBitMask = 1;
1964     lastBitMask <<= 0x96 - aExp;
1965     roundBitsMask = lastBitMask - 1;
1966     z = float32_val(a);
1967     switch (status->float_rounding_mode) {
1968     case float_round_nearest_even:
1969         z += lastBitMask>>1;
1970         if ((z & roundBitsMask) == 0) {
1971             z &= ~lastBitMask;
1972         }
1973         break;
1974     case float_round_ties_away:
1975         z += lastBitMask >> 1;
1976         break;
1977     case float_round_to_zero:
1978         break;
1979     case float_round_up:
1980         if (!extractFloat32Sign(make_float32(z))) {
1981             z += roundBitsMask;
1982         }
1983         break;
1984     case float_round_down:
1985         if (extractFloat32Sign(make_float32(z))) {
1986             z += roundBitsMask;
1987         }
1988         break;
1989     default:
1990         abort();
1991     }
1992     z &= ~ roundBitsMask;
1993     if (z != float32_val(a)) {
1994         status->float_exception_flags |= float_flag_inexact;
1995     }
1996     return make_float32(z);
1997 
1998 }
1999 
2000 /*----------------------------------------------------------------------------
2001 | Returns the result of adding the absolute values of the single-precision
2002 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2003 | before being returned.  `zSign' is ignored if the result is a NaN.
2004 | The addition is performed according to the IEC/IEEE Standard for Binary
2005 | Floating-Point Arithmetic.
2006 *----------------------------------------------------------------------------*/
2007 
2008 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2009                               float_status *status)
2010 {
2011     int aExp, bExp, zExp;
2012     uint32_t aSig, bSig, zSig;
2013     int expDiff;
2014 
2015     aSig = extractFloat32Frac( a );
2016     aExp = extractFloat32Exp( a );
2017     bSig = extractFloat32Frac( b );
2018     bExp = extractFloat32Exp( b );
2019     expDiff = aExp - bExp;
2020     aSig <<= 6;
2021     bSig <<= 6;
2022     if ( 0 < expDiff ) {
2023         if ( aExp == 0xFF ) {
2024             if (aSig) {
2025                 return propagateFloat32NaN(a, b, status);
2026             }
2027             return a;
2028         }
2029         if ( bExp == 0 ) {
2030             --expDiff;
2031         }
2032         else {
2033             bSig |= 0x20000000;
2034         }
2035         shift32RightJamming( bSig, expDiff, &bSig );
2036         zExp = aExp;
2037     }
2038     else if ( expDiff < 0 ) {
2039         if ( bExp == 0xFF ) {
2040             if (bSig) {
2041                 return propagateFloat32NaN(a, b, status);
2042             }
2043             return packFloat32( zSign, 0xFF, 0 );
2044         }
2045         if ( aExp == 0 ) {
2046             ++expDiff;
2047         }
2048         else {
2049             aSig |= 0x20000000;
2050         }
2051         shift32RightJamming( aSig, - expDiff, &aSig );
2052         zExp = bExp;
2053     }
2054     else {
2055         if ( aExp == 0xFF ) {
2056             if (aSig | bSig) {
2057                 return propagateFloat32NaN(a, b, status);
2058             }
2059             return a;
2060         }
2061         if ( aExp == 0 ) {
2062             if (status->flush_to_zero) {
2063                 if (aSig | bSig) {
2064                     float_raise(float_flag_output_denormal, status);
2065                 }
2066                 return packFloat32(zSign, 0, 0);
2067             }
2068             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2069         }
2070         zSig = 0x40000000 + aSig + bSig;
2071         zExp = aExp;
2072         goto roundAndPack;
2073     }
2074     aSig |= 0x20000000;
2075     zSig = ( aSig + bSig )<<1;
2076     --zExp;
2077     if ( (int32_t) zSig < 0 ) {
2078         zSig = aSig + bSig;
2079         ++zExp;
2080     }
2081  roundAndPack:
2082     return roundAndPackFloat32(zSign, zExp, zSig, status);
2083 
2084 }
2085 
2086 /*----------------------------------------------------------------------------
2087 | Returns the result of subtracting the absolute values of the single-
2088 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2089 | difference is negated before being returned.  `zSign' is ignored if the
2090 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2091 | Standard for Binary Floating-Point Arithmetic.
2092 *----------------------------------------------------------------------------*/
2093 
2094 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2095                               float_status *status)
2096 {
2097     int aExp, bExp, zExp;
2098     uint32_t aSig, bSig, zSig;
2099     int expDiff;
2100 
2101     aSig = extractFloat32Frac( a );
2102     aExp = extractFloat32Exp( a );
2103     bSig = extractFloat32Frac( b );
2104     bExp = extractFloat32Exp( b );
2105     expDiff = aExp - bExp;
2106     aSig <<= 7;
2107     bSig <<= 7;
2108     if ( 0 < expDiff ) goto aExpBigger;
2109     if ( expDiff < 0 ) goto bExpBigger;
2110     if ( aExp == 0xFF ) {
2111         if (aSig | bSig) {
2112             return propagateFloat32NaN(a, b, status);
2113         }
2114         float_raise(float_flag_invalid, status);
2115         return float32_default_nan(status);
2116     }
2117     if ( aExp == 0 ) {
2118         aExp = 1;
2119         bExp = 1;
2120     }
2121     if ( bSig < aSig ) goto aBigger;
2122     if ( aSig < bSig ) goto bBigger;
2123     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2124  bExpBigger:
2125     if ( bExp == 0xFF ) {
2126         if (bSig) {
2127             return propagateFloat32NaN(a, b, status);
2128         }
2129         return packFloat32( zSign ^ 1, 0xFF, 0 );
2130     }
2131     if ( aExp == 0 ) {
2132         ++expDiff;
2133     }
2134     else {
2135         aSig |= 0x40000000;
2136     }
2137     shift32RightJamming( aSig, - expDiff, &aSig );
2138     bSig |= 0x40000000;
2139  bBigger:
2140     zSig = bSig - aSig;
2141     zExp = bExp;
2142     zSign ^= 1;
2143     goto normalizeRoundAndPack;
2144  aExpBigger:
2145     if ( aExp == 0xFF ) {
2146         if (aSig) {
2147             return propagateFloat32NaN(a, b, status);
2148         }
2149         return a;
2150     }
2151     if ( bExp == 0 ) {
2152         --expDiff;
2153     }
2154     else {
2155         bSig |= 0x40000000;
2156     }
2157     shift32RightJamming( bSig, expDiff, &bSig );
2158     aSig |= 0x40000000;
2159  aBigger:
2160     zSig = aSig - bSig;
2161     zExp = aExp;
2162  normalizeRoundAndPack:
2163     --zExp;
2164     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2165 
2166 }
2167 
2168 /*----------------------------------------------------------------------------
2169 | Returns the result of adding the single-precision floating-point values `a'
2170 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2171 | Binary Floating-Point Arithmetic.
2172 *----------------------------------------------------------------------------*/
2173 
2174 float32 float32_add(float32 a, float32 b, float_status *status)
2175 {
2176     flag aSign, bSign;
2177     a = float32_squash_input_denormal(a, status);
2178     b = float32_squash_input_denormal(b, status);
2179 
2180     aSign = extractFloat32Sign( a );
2181     bSign = extractFloat32Sign( b );
2182     if ( aSign == bSign ) {
2183         return addFloat32Sigs(a, b, aSign, status);
2184     }
2185     else {
2186         return subFloat32Sigs(a, b, aSign, status);
2187     }
2188 
2189 }
2190 
2191 /*----------------------------------------------------------------------------
2192 | Returns the result of subtracting the single-precision floating-point values
2193 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2194 | for Binary Floating-Point Arithmetic.
2195 *----------------------------------------------------------------------------*/
2196 
2197 float32 float32_sub(float32 a, float32 b, float_status *status)
2198 {
2199     flag aSign, bSign;
2200     a = float32_squash_input_denormal(a, status);
2201     b = float32_squash_input_denormal(b, status);
2202 
2203     aSign = extractFloat32Sign( a );
2204     bSign = extractFloat32Sign( b );
2205     if ( aSign == bSign ) {
2206         return subFloat32Sigs(a, b, aSign, status);
2207     }
2208     else {
2209         return addFloat32Sigs(a, b, aSign, status);
2210     }
2211 
2212 }
2213 
2214 /*----------------------------------------------------------------------------
2215 | Returns the result of multiplying the single-precision floating-point values
2216 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2217 | for Binary Floating-Point Arithmetic.
2218 *----------------------------------------------------------------------------*/
2219 
2220 float32 float32_mul(float32 a, float32 b, float_status *status)
2221 {
2222     flag aSign, bSign, zSign;
2223     int aExp, bExp, zExp;
2224     uint32_t aSig, bSig;
2225     uint64_t zSig64;
2226     uint32_t zSig;
2227 
2228     a = float32_squash_input_denormal(a, status);
2229     b = float32_squash_input_denormal(b, status);
2230 
2231     aSig = extractFloat32Frac( a );
2232     aExp = extractFloat32Exp( a );
2233     aSign = extractFloat32Sign( a );
2234     bSig = extractFloat32Frac( b );
2235     bExp = extractFloat32Exp( b );
2236     bSign = extractFloat32Sign( b );
2237     zSign = aSign ^ bSign;
2238     if ( aExp == 0xFF ) {
2239         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2240             return propagateFloat32NaN(a, b, status);
2241         }
2242         if ( ( bExp | bSig ) == 0 ) {
2243             float_raise(float_flag_invalid, status);
2244             return float32_default_nan(status);
2245         }
2246         return packFloat32( zSign, 0xFF, 0 );
2247     }
2248     if ( bExp == 0xFF ) {
2249         if (bSig) {
2250             return propagateFloat32NaN(a, b, status);
2251         }
2252         if ( ( aExp | aSig ) == 0 ) {
2253             float_raise(float_flag_invalid, status);
2254             return float32_default_nan(status);
2255         }
2256         return packFloat32( zSign, 0xFF, 0 );
2257     }
2258     if ( aExp == 0 ) {
2259         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2260         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2261     }
2262     if ( bExp == 0 ) {
2263         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2264         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2265     }
2266     zExp = aExp + bExp - 0x7F;
2267     aSig = ( aSig | 0x00800000 )<<7;
2268     bSig = ( bSig | 0x00800000 )<<8;
2269     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2270     zSig = zSig64;
2271     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2272         zSig <<= 1;
2273         --zExp;
2274     }
2275     return roundAndPackFloat32(zSign, zExp, zSig, status);
2276 
2277 }
2278 
2279 /*----------------------------------------------------------------------------
2280 | Returns the result of dividing the single-precision floating-point value `a'
2281 | by the corresponding value `b'.  The operation is performed according to the
2282 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2283 *----------------------------------------------------------------------------*/
2284 
2285 float32 float32_div(float32 a, float32 b, float_status *status)
2286 {
2287     flag aSign, bSign, zSign;
2288     int aExp, bExp, zExp;
2289     uint32_t aSig, bSig, zSig;
2290     a = float32_squash_input_denormal(a, status);
2291     b = float32_squash_input_denormal(b, status);
2292 
2293     aSig = extractFloat32Frac( a );
2294     aExp = extractFloat32Exp( a );
2295     aSign = extractFloat32Sign( a );
2296     bSig = extractFloat32Frac( b );
2297     bExp = extractFloat32Exp( b );
2298     bSign = extractFloat32Sign( b );
2299     zSign = aSign ^ bSign;
2300     if ( aExp == 0xFF ) {
2301         if (aSig) {
2302             return propagateFloat32NaN(a, b, status);
2303         }
2304         if ( bExp == 0xFF ) {
2305             if (bSig) {
2306                 return propagateFloat32NaN(a, b, status);
2307             }
2308             float_raise(float_flag_invalid, status);
2309             return float32_default_nan(status);
2310         }
2311         return packFloat32( zSign, 0xFF, 0 );
2312     }
2313     if ( bExp == 0xFF ) {
2314         if (bSig) {
2315             return propagateFloat32NaN(a, b, status);
2316         }
2317         return packFloat32( zSign, 0, 0 );
2318     }
2319     if ( bExp == 0 ) {
2320         if ( bSig == 0 ) {
2321             if ( ( aExp | aSig ) == 0 ) {
2322                 float_raise(float_flag_invalid, status);
2323                 return float32_default_nan(status);
2324             }
2325             float_raise(float_flag_divbyzero, status);
2326             return packFloat32( zSign, 0xFF, 0 );
2327         }
2328         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2329     }
2330     if ( aExp == 0 ) {
2331         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2332         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2333     }
2334     zExp = aExp - bExp + 0x7D;
2335     aSig = ( aSig | 0x00800000 )<<7;
2336     bSig = ( bSig | 0x00800000 )<<8;
2337     if ( bSig <= ( aSig + aSig ) ) {
2338         aSig >>= 1;
2339         ++zExp;
2340     }
2341     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2342     if ( ( zSig & 0x3F ) == 0 ) {
2343         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2344     }
2345     return roundAndPackFloat32(zSign, zExp, zSig, status);
2346 
2347 }
2348 
2349 /*----------------------------------------------------------------------------
2350 | Returns the remainder of the single-precision floating-point value `a'
2351 | with respect to the corresponding value `b'.  The operation is performed
2352 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2353 *----------------------------------------------------------------------------*/
2354 
2355 float32 float32_rem(float32 a, float32 b, float_status *status)
2356 {
2357     flag aSign, zSign;
2358     int aExp, bExp, expDiff;
2359     uint32_t aSig, bSig;
2360     uint32_t q;
2361     uint64_t aSig64, bSig64, q64;
2362     uint32_t alternateASig;
2363     int32_t sigMean;
2364     a = float32_squash_input_denormal(a, status);
2365     b = float32_squash_input_denormal(b, status);
2366 
2367     aSig = extractFloat32Frac( a );
2368     aExp = extractFloat32Exp( a );
2369     aSign = extractFloat32Sign( a );
2370     bSig = extractFloat32Frac( b );
2371     bExp = extractFloat32Exp( b );
2372     if ( aExp == 0xFF ) {
2373         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2374             return propagateFloat32NaN(a, b, status);
2375         }
2376         float_raise(float_flag_invalid, status);
2377         return float32_default_nan(status);
2378     }
2379     if ( bExp == 0xFF ) {
2380         if (bSig) {
2381             return propagateFloat32NaN(a, b, status);
2382         }
2383         return a;
2384     }
2385     if ( bExp == 0 ) {
2386         if ( bSig == 0 ) {
2387             float_raise(float_flag_invalid, status);
2388             return float32_default_nan(status);
2389         }
2390         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2391     }
2392     if ( aExp == 0 ) {
2393         if ( aSig == 0 ) return a;
2394         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2395     }
2396     expDiff = aExp - bExp;
2397     aSig |= 0x00800000;
2398     bSig |= 0x00800000;
2399     if ( expDiff < 32 ) {
2400         aSig <<= 8;
2401         bSig <<= 8;
2402         if ( expDiff < 0 ) {
2403             if ( expDiff < -1 ) return a;
2404             aSig >>= 1;
2405         }
2406         q = ( bSig <= aSig );
2407         if ( q ) aSig -= bSig;
2408         if ( 0 < expDiff ) {
2409             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2410             q >>= 32 - expDiff;
2411             bSig >>= 2;
2412             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2413         }
2414         else {
2415             aSig >>= 2;
2416             bSig >>= 2;
2417         }
2418     }
2419     else {
2420         if ( bSig <= aSig ) aSig -= bSig;
2421         aSig64 = ( (uint64_t) aSig )<<40;
2422         bSig64 = ( (uint64_t) bSig )<<40;
2423         expDiff -= 64;
2424         while ( 0 < expDiff ) {
2425             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2426             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2427             aSig64 = - ( ( bSig * q64 )<<38 );
2428             expDiff -= 62;
2429         }
2430         expDiff += 64;
2431         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2432         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2433         q = q64>>( 64 - expDiff );
2434         bSig <<= 6;
2435         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2436     }
2437     do {
2438         alternateASig = aSig;
2439         ++q;
2440         aSig -= bSig;
2441     } while ( 0 <= (int32_t) aSig );
2442     sigMean = aSig + alternateASig;
2443     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2444         aSig = alternateASig;
2445     }
2446     zSign = ( (int32_t) aSig < 0 );
2447     if ( zSign ) aSig = - aSig;
2448     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2449 }
2450 
2451 /*----------------------------------------------------------------------------
2452 | Returns the result of multiplying the single-precision floating-point values
2453 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2454 | multiplication.  The operation is performed according to the IEC/IEEE
2455 | Standard for Binary Floating-Point Arithmetic 754-2008.
2456 | The flags argument allows the caller to select negation of the
2457 | addend, the intermediate product, or the final result. (The difference
2458 | between this and having the caller do a separate negation is that negating
2459 | externally will flip the sign bit on NaNs.)
2460 *----------------------------------------------------------------------------*/
2461 
2462 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2463                        float_status *status)
2464 {
2465     flag aSign, bSign, cSign, zSign;
2466     int aExp, bExp, cExp, pExp, zExp, expDiff;
2467     uint32_t aSig, bSig, cSig;
2468     flag pInf, pZero, pSign;
2469     uint64_t pSig64, cSig64, zSig64;
2470     uint32_t pSig;
2471     int shiftcount;
2472     flag signflip, infzero;
2473 
2474     a = float32_squash_input_denormal(a, status);
2475     b = float32_squash_input_denormal(b, status);
2476     c = float32_squash_input_denormal(c, status);
2477     aSig = extractFloat32Frac(a);
2478     aExp = extractFloat32Exp(a);
2479     aSign = extractFloat32Sign(a);
2480     bSig = extractFloat32Frac(b);
2481     bExp = extractFloat32Exp(b);
2482     bSign = extractFloat32Sign(b);
2483     cSig = extractFloat32Frac(c);
2484     cExp = extractFloat32Exp(c);
2485     cSign = extractFloat32Sign(c);
2486 
2487     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2488                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2489 
2490     /* It is implementation-defined whether the cases of (0,inf,qnan)
2491      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2492      * they return if they do), so we have to hand this information
2493      * off to the target-specific pick-a-NaN routine.
2494      */
2495     if (((aExp == 0xff) && aSig) ||
2496         ((bExp == 0xff) && bSig) ||
2497         ((cExp == 0xff) && cSig)) {
2498         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2499     }
2500 
2501     if (infzero) {
2502         float_raise(float_flag_invalid, status);
2503         return float32_default_nan(status);
2504     }
2505 
2506     if (flags & float_muladd_negate_c) {
2507         cSign ^= 1;
2508     }
2509 
2510     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2511 
2512     /* Work out the sign and type of the product */
2513     pSign = aSign ^ bSign;
2514     if (flags & float_muladd_negate_product) {
2515         pSign ^= 1;
2516     }
2517     pInf = (aExp == 0xff) || (bExp == 0xff);
2518     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2519 
2520     if (cExp == 0xff) {
2521         if (pInf && (pSign ^ cSign)) {
2522             /* addition of opposite-signed infinities => InvalidOperation */
2523             float_raise(float_flag_invalid, status);
2524             return float32_default_nan(status);
2525         }
2526         /* Otherwise generate an infinity of the same sign */
2527         return packFloat32(cSign ^ signflip, 0xff, 0);
2528     }
2529 
2530     if (pInf) {
2531         return packFloat32(pSign ^ signflip, 0xff, 0);
2532     }
2533 
2534     if (pZero) {
2535         if (cExp == 0) {
2536             if (cSig == 0) {
2537                 /* Adding two exact zeroes */
2538                 if (pSign == cSign) {
2539                     zSign = pSign;
2540                 } else if (status->float_rounding_mode == float_round_down) {
2541                     zSign = 1;
2542                 } else {
2543                     zSign = 0;
2544                 }
2545                 return packFloat32(zSign ^ signflip, 0, 0);
2546             }
2547             /* Exact zero plus a denorm */
2548             if (status->flush_to_zero) {
2549                 float_raise(float_flag_output_denormal, status);
2550                 return packFloat32(cSign ^ signflip, 0, 0);
2551             }
2552         }
2553         /* Zero plus something non-zero : just return the something */
2554         if (flags & float_muladd_halve_result) {
2555             if (cExp == 0) {
2556                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2557             }
2558             /* Subtract one to halve, and one again because roundAndPackFloat32
2559              * wants one less than the true exponent.
2560              */
2561             cExp -= 2;
2562             cSig = (cSig | 0x00800000) << 7;
2563             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2564         }
2565         return packFloat32(cSign ^ signflip, cExp, cSig);
2566     }
2567 
2568     if (aExp == 0) {
2569         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2570     }
2571     if (bExp == 0) {
2572         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2573     }
2574 
2575     /* Calculate the actual result a * b + c */
2576 
2577     /* Multiply first; this is easy. */
2578     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2579      * because we want the true exponent, not the "one-less-than"
2580      * flavour that roundAndPackFloat32() takes.
2581      */
2582     pExp = aExp + bExp - 0x7e;
2583     aSig = (aSig | 0x00800000) << 7;
2584     bSig = (bSig | 0x00800000) << 8;
2585     pSig64 = (uint64_t)aSig * bSig;
2586     if ((int64_t)(pSig64 << 1) >= 0) {
2587         pSig64 <<= 1;
2588         pExp--;
2589     }
2590 
2591     zSign = pSign ^ signflip;
2592 
2593     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2594      * position 62.
2595      */
2596     if (cExp == 0) {
2597         if (!cSig) {
2598             /* Throw out the special case of c being an exact zero now */
2599             shift64RightJamming(pSig64, 32, &pSig64);
2600             pSig = pSig64;
2601             if (flags & float_muladd_halve_result) {
2602                 pExp--;
2603             }
2604             return roundAndPackFloat32(zSign, pExp - 1,
2605                                        pSig, status);
2606         }
2607         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2608     }
2609 
2610     cSig64 = (uint64_t)cSig << (62 - 23);
2611     cSig64 |= LIT64(0x4000000000000000);
2612     expDiff = pExp - cExp;
2613 
2614     if (pSign == cSign) {
2615         /* Addition */
2616         if (expDiff > 0) {
2617             /* scale c to match p */
2618             shift64RightJamming(cSig64, expDiff, &cSig64);
2619             zExp = pExp;
2620         } else if (expDiff < 0) {
2621             /* scale p to match c */
2622             shift64RightJamming(pSig64, -expDiff, &pSig64);
2623             zExp = cExp;
2624         } else {
2625             /* no scaling needed */
2626             zExp = cExp;
2627         }
2628         /* Add significands and make sure explicit bit ends up in posn 62 */
2629         zSig64 = pSig64 + cSig64;
2630         if ((int64_t)zSig64 < 0) {
2631             shift64RightJamming(zSig64, 1, &zSig64);
2632         } else {
2633             zExp--;
2634         }
2635     } else {
2636         /* Subtraction */
2637         if (expDiff > 0) {
2638             shift64RightJamming(cSig64, expDiff, &cSig64);
2639             zSig64 = pSig64 - cSig64;
2640             zExp = pExp;
2641         } else if (expDiff < 0) {
2642             shift64RightJamming(pSig64, -expDiff, &pSig64);
2643             zSig64 = cSig64 - pSig64;
2644             zExp = cExp;
2645             zSign ^= 1;
2646         } else {
2647             zExp = pExp;
2648             if (cSig64 < pSig64) {
2649                 zSig64 = pSig64 - cSig64;
2650             } else if (pSig64 < cSig64) {
2651                 zSig64 = cSig64 - pSig64;
2652                 zSign ^= 1;
2653             } else {
2654                 /* Exact zero */
2655                 zSign = signflip;
2656                 if (status->float_rounding_mode == float_round_down) {
2657                     zSign ^= 1;
2658                 }
2659                 return packFloat32(zSign, 0, 0);
2660             }
2661         }
2662         --zExp;
2663         /* Normalize to put the explicit bit back into bit 62. */
2664         shiftcount = countLeadingZeros64(zSig64) - 1;
2665         zSig64 <<= shiftcount;
2666         zExp -= shiftcount;
2667     }
2668     if (flags & float_muladd_halve_result) {
2669         zExp--;
2670     }
2671 
2672     shift64RightJamming(zSig64, 32, &zSig64);
2673     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2674 }
2675 
2676 
2677 /*----------------------------------------------------------------------------
2678 | Returns the square root of the single-precision floating-point value `a'.
2679 | The operation is performed according to the IEC/IEEE Standard for Binary
2680 | Floating-Point Arithmetic.
2681 *----------------------------------------------------------------------------*/
2682 
2683 float32 float32_sqrt(float32 a, float_status *status)
2684 {
2685     flag aSign;
2686     int aExp, zExp;
2687     uint32_t aSig, zSig;
2688     uint64_t rem, term;
2689     a = float32_squash_input_denormal(a, status);
2690 
2691     aSig = extractFloat32Frac( a );
2692     aExp = extractFloat32Exp( a );
2693     aSign = extractFloat32Sign( a );
2694     if ( aExp == 0xFF ) {
2695         if (aSig) {
2696             return propagateFloat32NaN(a, float32_zero, status);
2697         }
2698         if ( ! aSign ) return a;
2699         float_raise(float_flag_invalid, status);
2700         return float32_default_nan(status);
2701     }
2702     if ( aSign ) {
2703         if ( ( aExp | aSig ) == 0 ) return a;
2704         float_raise(float_flag_invalid, status);
2705         return float32_default_nan(status);
2706     }
2707     if ( aExp == 0 ) {
2708         if ( aSig == 0 ) return float32_zero;
2709         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2710     }
2711     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2712     aSig = ( aSig | 0x00800000 )<<8;
2713     zSig = estimateSqrt32( aExp, aSig ) + 2;
2714     if ( ( zSig & 0x7F ) <= 5 ) {
2715         if ( zSig < 2 ) {
2716             zSig = 0x7FFFFFFF;
2717             goto roundAndPack;
2718         }
2719         aSig >>= aExp & 1;
2720         term = ( (uint64_t) zSig ) * zSig;
2721         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2722         while ( (int64_t) rem < 0 ) {
2723             --zSig;
2724             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2725         }
2726         zSig |= ( rem != 0 );
2727     }
2728     shift32RightJamming( zSig, 1, &zSig );
2729  roundAndPack:
2730     return roundAndPackFloat32(0, zExp, zSig, status);
2731 
2732 }
2733 
2734 /*----------------------------------------------------------------------------
2735 | Returns the binary exponential of the single-precision floating-point value
2736 | `a'. The operation is performed according to the IEC/IEEE Standard for
2737 | Binary Floating-Point Arithmetic.
2738 |
2739 | Uses the following identities:
2740 |
2741 | 1. -------------------------------------------------------------------------
2742 |      x    x*ln(2)
2743 |     2  = e
2744 |
2745 | 2. -------------------------------------------------------------------------
2746 |                      2     3     4     5           n
2747 |      x        x     x     x     x     x           x
2748 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2749 |               1!    2!    3!    4!    5!          n!
2750 *----------------------------------------------------------------------------*/
2751 
2752 static const float64 float32_exp2_coefficients[15] =
2753 {
2754     const_float64( 0x3ff0000000000000ll ), /*  1 */
2755     const_float64( 0x3fe0000000000000ll ), /*  2 */
2756     const_float64( 0x3fc5555555555555ll ), /*  3 */
2757     const_float64( 0x3fa5555555555555ll ), /*  4 */
2758     const_float64( 0x3f81111111111111ll ), /*  5 */
2759     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2760     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2761     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2762     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2763     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2764     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2765     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2766     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2767     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2768     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2769 };
2770 
2771 float32 float32_exp2(float32 a, float_status *status)
2772 {
2773     flag aSign;
2774     int aExp;
2775     uint32_t aSig;
2776     float64 r, x, xn;
2777     int i;
2778     a = float32_squash_input_denormal(a, status);
2779 
2780     aSig = extractFloat32Frac( a );
2781     aExp = extractFloat32Exp( a );
2782     aSign = extractFloat32Sign( a );
2783 
2784     if ( aExp == 0xFF) {
2785         if (aSig) {
2786             return propagateFloat32NaN(a, float32_zero, status);
2787         }
2788         return (aSign) ? float32_zero : a;
2789     }
2790     if (aExp == 0) {
2791         if (aSig == 0) return float32_one;
2792     }
2793 
2794     float_raise(float_flag_inexact, status);
2795 
2796     /* ******************************* */
2797     /* using float64 for approximation */
2798     /* ******************************* */
2799     x = float32_to_float64(a, status);
2800     x = float64_mul(x, float64_ln2, status);
2801 
2802     xn = x;
2803     r = float64_one;
2804     for (i = 0 ; i < 15 ; i++) {
2805         float64 f;
2806 
2807         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2808         r = float64_add(r, f, status);
2809 
2810         xn = float64_mul(xn, x, status);
2811     }
2812 
2813     return float64_to_float32(r, status);
2814 }
2815 
2816 /*----------------------------------------------------------------------------
2817 | Returns the binary log of the single-precision floating-point value `a'.
2818 | The operation is performed according to the IEC/IEEE Standard for Binary
2819 | Floating-Point Arithmetic.
2820 *----------------------------------------------------------------------------*/
2821 float32 float32_log2(float32 a, float_status *status)
2822 {
2823     flag aSign, zSign;
2824     int aExp;
2825     uint32_t aSig, zSig, i;
2826 
2827     a = float32_squash_input_denormal(a, status);
2828     aSig = extractFloat32Frac( a );
2829     aExp = extractFloat32Exp( a );
2830     aSign = extractFloat32Sign( a );
2831 
2832     if ( aExp == 0 ) {
2833         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2834         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2835     }
2836     if ( aSign ) {
2837         float_raise(float_flag_invalid, status);
2838         return float32_default_nan(status);
2839     }
2840     if ( aExp == 0xFF ) {
2841         if (aSig) {
2842             return propagateFloat32NaN(a, float32_zero, status);
2843         }
2844         return a;
2845     }
2846 
2847     aExp -= 0x7F;
2848     aSig |= 0x00800000;
2849     zSign = aExp < 0;
2850     zSig = aExp << 23;
2851 
2852     for (i = 1 << 22; i > 0; i >>= 1) {
2853         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2854         if ( aSig & 0x01000000 ) {
2855             aSig >>= 1;
2856             zSig |= i;
2857         }
2858     }
2859 
2860     if ( zSign )
2861         zSig = -zSig;
2862 
2863     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2864 }
2865 
2866 /*----------------------------------------------------------------------------
2867 | Returns 1 if the single-precision floating-point value `a' is equal to
2868 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2869 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2870 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2871 *----------------------------------------------------------------------------*/
2872 
2873 int float32_eq(float32 a, float32 b, float_status *status)
2874 {
2875     uint32_t av, bv;
2876     a = float32_squash_input_denormal(a, status);
2877     b = float32_squash_input_denormal(b, status);
2878 
2879     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2880          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2881        ) {
2882         float_raise(float_flag_invalid, status);
2883         return 0;
2884     }
2885     av = float32_val(a);
2886     bv = float32_val(b);
2887     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2888 }
2889 
2890 /*----------------------------------------------------------------------------
2891 | Returns 1 if the single-precision floating-point value `a' is less than
2892 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2893 | exception is raised if either operand is a NaN.  The comparison is performed
2894 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2895 *----------------------------------------------------------------------------*/
2896 
2897 int float32_le(float32 a, float32 b, float_status *status)
2898 {
2899     flag aSign, bSign;
2900     uint32_t av, bv;
2901     a = float32_squash_input_denormal(a, status);
2902     b = float32_squash_input_denormal(b, status);
2903 
2904     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2905          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2906        ) {
2907         float_raise(float_flag_invalid, status);
2908         return 0;
2909     }
2910     aSign = extractFloat32Sign( a );
2911     bSign = extractFloat32Sign( b );
2912     av = float32_val(a);
2913     bv = float32_val(b);
2914     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2915     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2916 
2917 }
2918 
2919 /*----------------------------------------------------------------------------
2920 | Returns 1 if the single-precision floating-point value `a' is less than
2921 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2922 | raised if either operand is a NaN.  The comparison is performed according
2923 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2924 *----------------------------------------------------------------------------*/
2925 
2926 int float32_lt(float32 a, float32 b, float_status *status)
2927 {
2928     flag aSign, bSign;
2929     uint32_t av, bv;
2930     a = float32_squash_input_denormal(a, status);
2931     b = float32_squash_input_denormal(b, status);
2932 
2933     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2934          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2935        ) {
2936         float_raise(float_flag_invalid, status);
2937         return 0;
2938     }
2939     aSign = extractFloat32Sign( a );
2940     bSign = extractFloat32Sign( b );
2941     av = float32_val(a);
2942     bv = float32_val(b);
2943     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2944     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2945 
2946 }
2947 
2948 /*----------------------------------------------------------------------------
2949 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2950 | be compared, and 0 otherwise.  The invalid exception is raised if either
2951 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2952 | Standard for Binary Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2954 
2955 int float32_unordered(float32 a, float32 b, float_status *status)
2956 {
2957     a = float32_squash_input_denormal(a, status);
2958     b = float32_squash_input_denormal(b, status);
2959 
2960     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2961          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2962        ) {
2963         float_raise(float_flag_invalid, status);
2964         return 1;
2965     }
2966     return 0;
2967 }
2968 
2969 /*----------------------------------------------------------------------------
2970 | Returns 1 if the single-precision floating-point value `a' is equal to
2971 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2972 | exception.  The comparison is performed according to the IEC/IEEE Standard
2973 | for Binary Floating-Point Arithmetic.
2974 *----------------------------------------------------------------------------*/
2975 
2976 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2977 {
2978     a = float32_squash_input_denormal(a, status);
2979     b = float32_squash_input_denormal(b, status);
2980 
2981     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2982          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2983        ) {
2984         if (float32_is_signaling_nan(a, status)
2985          || float32_is_signaling_nan(b, status)) {
2986             float_raise(float_flag_invalid, status);
2987         }
2988         return 0;
2989     }
2990     return ( float32_val(a) == float32_val(b) ) ||
2991             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2992 }
2993 
2994 /*----------------------------------------------------------------------------
2995 | Returns 1 if the single-precision floating-point value `a' is less than or
2996 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2997 | cause an exception.  Otherwise, the comparison is performed according to the
2998 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2999 *----------------------------------------------------------------------------*/
3000 
3001 int float32_le_quiet(float32 a, float32 b, float_status *status)
3002 {
3003     flag aSign, bSign;
3004     uint32_t av, bv;
3005     a = float32_squash_input_denormal(a, status);
3006     b = float32_squash_input_denormal(b, status);
3007 
3008     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3009          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3010        ) {
3011         if (float32_is_signaling_nan(a, status)
3012          || float32_is_signaling_nan(b, status)) {
3013             float_raise(float_flag_invalid, status);
3014         }
3015         return 0;
3016     }
3017     aSign = extractFloat32Sign( a );
3018     bSign = extractFloat32Sign( b );
3019     av = float32_val(a);
3020     bv = float32_val(b);
3021     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3022     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3023 
3024 }
3025 
3026 /*----------------------------------------------------------------------------
3027 | Returns 1 if the single-precision floating-point value `a' is less than
3028 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3029 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3030 | Standard for Binary Floating-Point Arithmetic.
3031 *----------------------------------------------------------------------------*/
3032 
3033 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3034 {
3035     flag aSign, bSign;
3036     uint32_t av, bv;
3037     a = float32_squash_input_denormal(a, status);
3038     b = float32_squash_input_denormal(b, status);
3039 
3040     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3041          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3042        ) {
3043         if (float32_is_signaling_nan(a, status)
3044          || float32_is_signaling_nan(b, status)) {
3045             float_raise(float_flag_invalid, status);
3046         }
3047         return 0;
3048     }
3049     aSign = extractFloat32Sign( a );
3050     bSign = extractFloat32Sign( b );
3051     av = float32_val(a);
3052     bv = float32_val(b);
3053     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3054     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3055 
3056 }
3057 
3058 /*----------------------------------------------------------------------------
3059 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3060 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3061 | comparison is performed according to the IEC/IEEE Standard for Binary
3062 | Floating-Point Arithmetic.
3063 *----------------------------------------------------------------------------*/
3064 
3065 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3066 {
3067     a = float32_squash_input_denormal(a, status);
3068     b = float32_squash_input_denormal(b, status);
3069 
3070     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3071          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3072        ) {
3073         if (float32_is_signaling_nan(a, status)
3074          || float32_is_signaling_nan(b, status)) {
3075             float_raise(float_flag_invalid, status);
3076         }
3077         return 1;
3078     }
3079     return 0;
3080 }
3081 
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the double-precision floating-point value
3084 | `a' to the 32-bit two's complement integer format.  The conversion is
3085 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3086 | Arithmetic---which means in particular that the conversion is rounded
3087 | according to the current rounding mode.  If `a' is a NaN, the largest
3088 | positive integer is returned.  Otherwise, if the conversion overflows, the
3089 | largest integer with the same sign as `a' is returned.
3090 *----------------------------------------------------------------------------*/
3091 
3092 int32_t float64_to_int32(float64 a, float_status *status)
3093 {
3094     flag aSign;
3095     int aExp;
3096     int shiftCount;
3097     uint64_t aSig;
3098     a = float64_squash_input_denormal(a, status);
3099 
3100     aSig = extractFloat64Frac( a );
3101     aExp = extractFloat64Exp( a );
3102     aSign = extractFloat64Sign( a );
3103     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3104     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3105     shiftCount = 0x42C - aExp;
3106     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3107     return roundAndPackInt32(aSign, aSig, status);
3108 
3109 }
3110 
3111 /*----------------------------------------------------------------------------
3112 | Returns the result of converting the double-precision floating-point value
3113 | `a' to the 32-bit two's complement integer format.  The conversion is
3114 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3115 | Arithmetic, except that the conversion is always rounded toward zero.
3116 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3117 | the conversion overflows, the largest integer with the same sign as `a' is
3118 | returned.
3119 *----------------------------------------------------------------------------*/
3120 
3121 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3122 {
3123     flag aSign;
3124     int aExp;
3125     int shiftCount;
3126     uint64_t aSig, savedASig;
3127     int32_t z;
3128     a = float64_squash_input_denormal(a, status);
3129 
3130     aSig = extractFloat64Frac( a );
3131     aExp = extractFloat64Exp( a );
3132     aSign = extractFloat64Sign( a );
3133     if ( 0x41E < aExp ) {
3134         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3135         goto invalid;
3136     }
3137     else if ( aExp < 0x3FF ) {
3138         if (aExp || aSig) {
3139             status->float_exception_flags |= float_flag_inexact;
3140         }
3141         return 0;
3142     }
3143     aSig |= LIT64( 0x0010000000000000 );
3144     shiftCount = 0x433 - aExp;
3145     savedASig = aSig;
3146     aSig >>= shiftCount;
3147     z = aSig;
3148     if ( aSign ) z = - z;
3149     if ( ( z < 0 ) ^ aSign ) {
3150  invalid:
3151         float_raise(float_flag_invalid, status);
3152         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3153     }
3154     if ( ( aSig<<shiftCount ) != savedASig ) {
3155         status->float_exception_flags |= float_flag_inexact;
3156     }
3157     return z;
3158 
3159 }
3160 
3161 /*----------------------------------------------------------------------------
3162 | Returns the result of converting the double-precision floating-point value
3163 | `a' to the 16-bit two's complement integer format.  The conversion is
3164 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3165 | Arithmetic, except that the conversion is always rounded toward zero.
3166 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3167 | the conversion overflows, the largest integer with the same sign as `a' is
3168 | returned.
3169 *----------------------------------------------------------------------------*/
3170 
3171 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3172 {
3173     flag aSign;
3174     int aExp;
3175     int shiftCount;
3176     uint64_t aSig, savedASig;
3177     int32_t z;
3178 
3179     aSig = extractFloat64Frac( a );
3180     aExp = extractFloat64Exp( a );
3181     aSign = extractFloat64Sign( a );
3182     if ( 0x40E < aExp ) {
3183         if ( ( aExp == 0x7FF ) && aSig ) {
3184             aSign = 0;
3185         }
3186         goto invalid;
3187     }
3188     else if ( aExp < 0x3FF ) {
3189         if ( aExp || aSig ) {
3190             status->float_exception_flags |= float_flag_inexact;
3191         }
3192         return 0;
3193     }
3194     aSig |= LIT64( 0x0010000000000000 );
3195     shiftCount = 0x433 - aExp;
3196     savedASig = aSig;
3197     aSig >>= shiftCount;
3198     z = aSig;
3199     if ( aSign ) {
3200         z = - z;
3201     }
3202     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3203  invalid:
3204         float_raise(float_flag_invalid, status);
3205         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3206     }
3207     if ( ( aSig<<shiftCount ) != savedASig ) {
3208         status->float_exception_flags |= float_flag_inexact;
3209     }
3210     return z;
3211 }
3212 
3213 /*----------------------------------------------------------------------------
3214 | Returns the result of converting the double-precision floating-point value
3215 | `a' to the 64-bit two's complement integer format.  The conversion is
3216 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3217 | Arithmetic---which means in particular that the conversion is rounded
3218 | according to the current rounding mode.  If `a' is a NaN, the largest
3219 | positive integer is returned.  Otherwise, if the conversion overflows, the
3220 | largest integer with the same sign as `a' is returned.
3221 *----------------------------------------------------------------------------*/
3222 
3223 int64_t float64_to_int64(float64 a, float_status *status)
3224 {
3225     flag aSign;
3226     int aExp;
3227     int shiftCount;
3228     uint64_t aSig, aSigExtra;
3229     a = float64_squash_input_denormal(a, status);
3230 
3231     aSig = extractFloat64Frac( a );
3232     aExp = extractFloat64Exp( a );
3233     aSign = extractFloat64Sign( a );
3234     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3235     shiftCount = 0x433 - aExp;
3236     if ( shiftCount <= 0 ) {
3237         if ( 0x43E < aExp ) {
3238             float_raise(float_flag_invalid, status);
3239             if (    ! aSign
3240                  || (    ( aExp == 0x7FF )
3241                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3242                ) {
3243                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3244             }
3245             return (int64_t) LIT64( 0x8000000000000000 );
3246         }
3247         aSigExtra = 0;
3248         aSig <<= - shiftCount;
3249     }
3250     else {
3251         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3252     }
3253     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3254 
3255 }
3256 
3257 /*----------------------------------------------------------------------------
3258 | Returns the result of converting the double-precision floating-point value
3259 | `a' to the 64-bit two's complement integer format.  The conversion is
3260 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3261 | Arithmetic, except that the conversion is always rounded toward zero.
3262 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3263 | the conversion overflows, the largest integer with the same sign as `a' is
3264 | returned.
3265 *----------------------------------------------------------------------------*/
3266 
3267 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3268 {
3269     flag aSign;
3270     int aExp;
3271     int shiftCount;
3272     uint64_t aSig;
3273     int64_t z;
3274     a = float64_squash_input_denormal(a, status);
3275 
3276     aSig = extractFloat64Frac( a );
3277     aExp = extractFloat64Exp( a );
3278     aSign = extractFloat64Sign( a );
3279     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3280     shiftCount = aExp - 0x433;
3281     if ( 0 <= shiftCount ) {
3282         if ( 0x43E <= aExp ) {
3283             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3284                 float_raise(float_flag_invalid, status);
3285                 if (    ! aSign
3286                      || (    ( aExp == 0x7FF )
3287                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3288                    ) {
3289                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3290                 }
3291             }
3292             return (int64_t) LIT64( 0x8000000000000000 );
3293         }
3294         z = aSig<<shiftCount;
3295     }
3296     else {
3297         if ( aExp < 0x3FE ) {
3298             if (aExp | aSig) {
3299                 status->float_exception_flags |= float_flag_inexact;
3300             }
3301             return 0;
3302         }
3303         z = aSig>>( - shiftCount );
3304         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3305             status->float_exception_flags |= float_flag_inexact;
3306         }
3307     }
3308     if ( aSign ) z = - z;
3309     return z;
3310 
3311 }
3312 
3313 /*----------------------------------------------------------------------------
3314 | Returns the result of converting the double-precision floating-point value
3315 | `a' to the single-precision floating-point format.  The conversion is
3316 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3317 | Arithmetic.
3318 *----------------------------------------------------------------------------*/
3319 
3320 float32 float64_to_float32(float64 a, float_status *status)
3321 {
3322     flag aSign;
3323     int aExp;
3324     uint64_t aSig;
3325     uint32_t zSig;
3326     a = float64_squash_input_denormal(a, status);
3327 
3328     aSig = extractFloat64Frac( a );
3329     aExp = extractFloat64Exp( a );
3330     aSign = extractFloat64Sign( a );
3331     if ( aExp == 0x7FF ) {
3332         if (aSig) {
3333             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3334         }
3335         return packFloat32( aSign, 0xFF, 0 );
3336     }
3337     shift64RightJamming( aSig, 22, &aSig );
3338     zSig = aSig;
3339     if ( aExp || zSig ) {
3340         zSig |= 0x40000000;
3341         aExp -= 0x381;
3342     }
3343     return roundAndPackFloat32(aSign, aExp, zSig, status);
3344 
3345 }
3346 
3347 
3348 /*----------------------------------------------------------------------------
3349 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3350 | half-precision floating-point value, returning the result.  After being
3351 | shifted into the proper positions, the three fields are simply added
3352 | together to form the result.  This means that any integer portion of `zSig'
3353 | will be added into the exponent.  Since a properly normalized significand
3354 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3355 | than the desired result exponent whenever `zSig' is a complete, normalized
3356 | significand.
3357 *----------------------------------------------------------------------------*/
3358 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3359 {
3360     return make_float16(
3361         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3362 }
3363 
3364 /*----------------------------------------------------------------------------
3365 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3366 | and significand `zSig', and returns the proper half-precision floating-
3367 | point value corresponding to the abstract input.  Ordinarily, the abstract
3368 | value is simply rounded and packed into the half-precision format, with
3369 | the inexact exception raised if the abstract input cannot be represented
3370 | exactly.  However, if the abstract value is too large, the overflow and
3371 | inexact exceptions are raised and an infinity or maximal finite value is
3372 | returned.  If the abstract value is too small, the input value is rounded to
3373 | a subnormal number, and the underflow and inexact exceptions are raised if
3374 | the abstract input cannot be represented exactly as a subnormal half-
3375 | precision floating-point number.
3376 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3377 | ARM-style "alternative representation", which omits the NaN and Inf
3378 | encodings in order to raise the maximum representable exponent by one.
3379 |     The input significand `zSig' has its binary point between bits 22
3380 | and 23, which is 13 bits to the left of the usual location.  This shifted
3381 | significand must be normalized or smaller.  If `zSig' is not normalized,
3382 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3383 | and it must not require rounding.  In the usual case that `zSig' is
3384 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3385 | Note the slightly odd position of the binary point in zSig compared with the
3386 | other roundAndPackFloat functions. This should probably be fixed if we
3387 | need to implement more float16 routines than just conversion.
3388 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3389 | Binary Floating-Point Arithmetic.
3390 *----------------------------------------------------------------------------*/
3391 
3392 static float16 roundAndPackFloat16(flag zSign, int zExp,
3393                                    uint32_t zSig, flag ieee,
3394                                    float_status *status)
3395 {
3396     int maxexp = ieee ? 29 : 30;
3397     uint32_t mask;
3398     uint32_t increment;
3399     bool rounding_bumps_exp;
3400     bool is_tiny = false;
3401 
3402     /* Calculate the mask of bits of the mantissa which are not
3403      * representable in half-precision and will be lost.
3404      */
3405     if (zExp < 1) {
3406         /* Will be denormal in halfprec */
3407         mask = 0x00ffffff;
3408         if (zExp >= -11) {
3409             mask >>= 11 + zExp;
3410         }
3411     } else {
3412         /* Normal number in halfprec */
3413         mask = 0x00001fff;
3414     }
3415 
3416     switch (status->float_rounding_mode) {
3417     case float_round_nearest_even:
3418         increment = (mask + 1) >> 1;
3419         if ((zSig & mask) == increment) {
3420             increment = zSig & (increment << 1);
3421         }
3422         break;
3423     case float_round_ties_away:
3424         increment = (mask + 1) >> 1;
3425         break;
3426     case float_round_up:
3427         increment = zSign ? 0 : mask;
3428         break;
3429     case float_round_down:
3430         increment = zSign ? mask : 0;
3431         break;
3432     default: /* round_to_zero */
3433         increment = 0;
3434         break;
3435     }
3436 
3437     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3438 
3439     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3440         if (ieee) {
3441             float_raise(float_flag_overflow | float_flag_inexact, status);
3442             return packFloat16(zSign, 0x1f, 0);
3443         } else {
3444             float_raise(float_flag_invalid, status);
3445             return packFloat16(zSign, 0x1f, 0x3ff);
3446         }
3447     }
3448 
3449     if (zExp < 0) {
3450         /* Note that flush-to-zero does not affect half-precision results */
3451         is_tiny =
3452             (status->float_detect_tininess == float_tininess_before_rounding)
3453             || (zExp < -1)
3454             || (!rounding_bumps_exp);
3455     }
3456     if (zSig & mask) {
3457         float_raise(float_flag_inexact, status);
3458         if (is_tiny) {
3459             float_raise(float_flag_underflow, status);
3460         }
3461     }
3462 
3463     zSig += increment;
3464     if (rounding_bumps_exp) {
3465         zSig >>= 1;
3466         zExp++;
3467     }
3468 
3469     if (zExp < -10) {
3470         return packFloat16(zSign, 0, 0);
3471     }
3472     if (zExp < 0) {
3473         zSig >>= -zExp;
3474         zExp = 0;
3475     }
3476     return packFloat16(zSign, zExp, zSig >> 13);
3477 }
3478 
3479 /*----------------------------------------------------------------------------
3480 | If `a' is denormal and we are in flush-to-zero mode then set the
3481 | input-denormal exception and return zero. Otherwise just return the value.
3482 *----------------------------------------------------------------------------*/
3483 float16 float16_squash_input_denormal(float16 a, float_status *status)
3484 {
3485     if (status->flush_inputs_to_zero) {
3486         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3487             float_raise(float_flag_input_denormal, status);
3488             return make_float16(float16_val(a) & 0x8000);
3489         }
3490     }
3491     return a;
3492 }
3493 
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Left shift that moves the leading set bit of aSig up to bit 10. */
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - lshift;
    *zSigPtr = aSig << lshift;
}
3501 
3502 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3503    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3504 
3505 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3506 {
3507     flag aSign;
3508     int aExp;
3509     uint32_t aSig;
3510 
3511     aSign = extractFloat16Sign(a);
3512     aExp = extractFloat16Exp(a);
3513     aSig = extractFloat16Frac(a);
3514 
3515     if (aExp == 0x1f && ieee) {
3516         if (aSig) {
3517             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3518         }
3519         return packFloat32(aSign, 0xff, 0);
3520     }
3521     if (aExp == 0) {
3522         if (aSig == 0) {
3523             return packFloat32(aSign, 0, 0);
3524         }
3525 
3526         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3527         aExp--;
3528     }
3529     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3530 }
3531 
3532 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3533 {
3534     flag aSign;
3535     int aExp;
3536     uint32_t aSig;
3537 
3538     a = float32_squash_input_denormal(a, status);
3539 
3540     aSig = extractFloat32Frac( a );
3541     aExp = extractFloat32Exp( a );
3542     aSign = extractFloat32Sign( a );
3543     if ( aExp == 0xFF ) {
3544         if (aSig) {
3545             /* Input is a NaN */
3546             if (!ieee) {
3547                 float_raise(float_flag_invalid, status);
3548                 return packFloat16(aSign, 0, 0);
3549             }
3550             return commonNaNToFloat16(
3551                 float32ToCommonNaN(a, status), status);
3552         }
3553         /* Infinity */
3554         if (!ieee) {
3555             float_raise(float_flag_invalid, status);
3556             return packFloat16(aSign, 0x1f, 0x3ff);
3557         }
3558         return packFloat16(aSign, 0x1f, 0);
3559     }
3560     if (aExp == 0 && aSig == 0) {
3561         return packFloat16(aSign, 0, 0);
3562     }
3563     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3564      * even if the input is denormal; however this is harmless because
3565      * the largest possible single-precision denormal is still smaller
3566      * than the smallest representable half-precision denormal, and so we
3567      * will end up ignoring aSig and returning via the "always return zero"
3568      * codepath.
3569      */
3570     aSig |= 0x00800000;
3571     aExp -= 0x71;
3572 
3573     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3574 }
3575 
3576 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3577 {
3578     flag aSign;
3579     int aExp;
3580     uint32_t aSig;
3581 
3582     aSign = extractFloat16Sign(a);
3583     aExp = extractFloat16Exp(a);
3584     aSig = extractFloat16Frac(a);
3585 
3586     if (aExp == 0x1f && ieee) {
3587         if (aSig) {
3588             return commonNaNToFloat64(
3589                 float16ToCommonNaN(a, status), status);
3590         }
3591         return packFloat64(aSign, 0x7ff, 0);
3592     }
3593     if (aExp == 0) {
3594         if (aSig == 0) {
3595             return packFloat64(aSign, 0, 0);
3596         }
3597 
3598         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3599         aExp--;
3600     }
3601     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3602 }
3603 
3604 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3605 {
3606     flag aSign;
3607     int aExp;
3608     uint64_t aSig;
3609     uint32_t zSig;
3610 
3611     a = float64_squash_input_denormal(a, status);
3612 
3613     aSig = extractFloat64Frac(a);
3614     aExp = extractFloat64Exp(a);
3615     aSign = extractFloat64Sign(a);
3616     if (aExp == 0x7FF) {
3617         if (aSig) {
3618             /* Input is a NaN */
3619             if (!ieee) {
3620                 float_raise(float_flag_invalid, status);
3621                 return packFloat16(aSign, 0, 0);
3622             }
3623             return commonNaNToFloat16(
3624                 float64ToCommonNaN(a, status), status);
3625         }
3626         /* Infinity */
3627         if (!ieee) {
3628             float_raise(float_flag_invalid, status);
3629             return packFloat16(aSign, 0x1f, 0x3ff);
3630         }
3631         return packFloat16(aSign, 0x1f, 0);
3632     }
3633     shift64RightJamming(aSig, 29, &aSig);
3634     zSig = aSig;
3635     if (aExp == 0 && zSig == 0) {
3636         return packFloat16(aSign, 0, 0);
3637     }
3638     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3639      * even if the input is denormal; however this is harmless because
3640      * the largest possible single-precision denormal is still smaller
3641      * than the smallest representable half-precision denormal, and so we
3642      * will end up ignoring aSig and returning via the "always return zero"
3643      * codepath.
3644      */
3645     zSig |= 0x00800000;
3646     aExp -= 0x3F1;
3647 
3648     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3649 }
3650 
3651 /*----------------------------------------------------------------------------
3652 | Returns the result of converting the double-precision floating-point value
3653 | `a' to the extended double-precision floating-point format.  The conversion
3654 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3655 | Arithmetic.
3656 *----------------------------------------------------------------------------*/
3657 
3658 floatx80 float64_to_floatx80(float64 a, float_status *status)
3659 {
3660     flag aSign;
3661     int aExp;
3662     uint64_t aSig;
3663 
3664     a = float64_squash_input_denormal(a, status);
3665     aSig = extractFloat64Frac( a );
3666     aExp = extractFloat64Exp( a );
3667     aSign = extractFloat64Sign( a );
3668     if ( aExp == 0x7FF ) {
3669         if (aSig) {
3670             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3671         }
3672         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3673     }
3674     if ( aExp == 0 ) {
3675         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3676         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3677     }
3678     return
3679         packFloatx80(
3680             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3681 
3682 }
3683 
3684 /*----------------------------------------------------------------------------
3685 | Returns the result of converting the double-precision floating-point value
3686 | `a' to the quadruple-precision floating-point format.  The conversion is
3687 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3688 | Arithmetic.
3689 *----------------------------------------------------------------------------*/
3690 
3691 float128 float64_to_float128(float64 a, float_status *status)
3692 {
3693     flag aSign;
3694     int aExp;
3695     uint64_t aSig, zSig0, zSig1;
3696 
3697     a = float64_squash_input_denormal(a, status);
3698     aSig = extractFloat64Frac( a );
3699     aExp = extractFloat64Exp( a );
3700     aSign = extractFloat64Sign( a );
3701     if ( aExp == 0x7FF ) {
3702         if (aSig) {
3703             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3704         }
3705         return packFloat128( aSign, 0x7FFF, 0, 0 );
3706     }
3707     if ( aExp == 0 ) {
3708         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3709         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3710         --aExp;
3711     }
3712     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3713     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3714 
3715 }
3716 
3717 /*----------------------------------------------------------------------------
3718 | Rounds the double-precision floating-point value `a' to an integer, and
3719 | returns the result as a double-precision floating-point value.  The
3720 | operation is performed according to the IEC/IEEE Standard for Binary
3721 | Floating-Point Arithmetic.
3722 *----------------------------------------------------------------------------*/
3723 
3724 float64 float64_round_to_int(float64 a, float_status *status)
3725 {
3726     flag aSign;
3727     int aExp;
3728     uint64_t lastBitMask, roundBitsMask;
3729     uint64_t z;
3730     a = float64_squash_input_denormal(a, status);
3731 
3732     aExp = extractFloat64Exp( a );
3733     if ( 0x433 <= aExp ) {
3734         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3735             return propagateFloat64NaN(a, a, status);
3736         }
3737         return a;
3738     }
3739     if ( aExp < 0x3FF ) {
3740         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3741         status->float_exception_flags |= float_flag_inexact;
3742         aSign = extractFloat64Sign( a );
3743         switch (status->float_rounding_mode) {
3744          case float_round_nearest_even:
3745             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3746                 return packFloat64( aSign, 0x3FF, 0 );
3747             }
3748             break;
3749         case float_round_ties_away:
3750             if (aExp == 0x3FE) {
3751                 return packFloat64(aSign, 0x3ff, 0);
3752             }
3753             break;
3754          case float_round_down:
3755             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3756          case float_round_up:
3757             return make_float64(
3758             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3759         }
3760         return packFloat64( aSign, 0, 0 );
3761     }
3762     lastBitMask = 1;
3763     lastBitMask <<= 0x433 - aExp;
3764     roundBitsMask = lastBitMask - 1;
3765     z = float64_val(a);
3766     switch (status->float_rounding_mode) {
3767     case float_round_nearest_even:
3768         z += lastBitMask >> 1;
3769         if ((z & roundBitsMask) == 0) {
3770             z &= ~lastBitMask;
3771         }
3772         break;
3773     case float_round_ties_away:
3774         z += lastBitMask >> 1;
3775         break;
3776     case float_round_to_zero:
3777         break;
3778     case float_round_up:
3779         if (!extractFloat64Sign(make_float64(z))) {
3780             z += roundBitsMask;
3781         }
3782         break;
3783     case float_round_down:
3784         if (extractFloat64Sign(make_float64(z))) {
3785             z += roundBitsMask;
3786         }
3787         break;
3788     default:
3789         abort();
3790     }
3791     z &= ~ roundBitsMask;
3792     if (z != float64_val(a)) {
3793         status->float_exception_flags |= float_flag_inexact;
3794     }
3795     return make_float64(z);
3796 
3797 }
3798 
3799 float64 float64_trunc_to_int(float64 a, float_status *status)
3800 {
3801     int oldmode;
3802     float64 res;
3803     oldmode = status->float_rounding_mode;
3804     status->float_rounding_mode = float_round_to_zero;
3805     res = float64_round_to_int(a, status);
3806     status->float_rounding_mode = oldmode;
3807     return res;
3808 }
3809 
3810 /*----------------------------------------------------------------------------
3811 | Returns the result of adding the absolute values of the double-precision
3812 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3813 | before being returned.  `zSign' is ignored if the result is a NaN.
3814 | The addition is performed according to the IEC/IEEE Standard for Binary
3815 | Floating-Point Arithmetic.
3816 *----------------------------------------------------------------------------*/
3817 
3818 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3819                               float_status *status)
3820 {
3821     int aExp, bExp, zExp;
3822     uint64_t aSig, bSig, zSig;
3823     int expDiff;
3824 
3825     aSig = extractFloat64Frac( a );
3826     aExp = extractFloat64Exp( a );
3827     bSig = extractFloat64Frac( b );
3828     bExp = extractFloat64Exp( b );
3829     expDiff = aExp - bExp;
3830     aSig <<= 9;
3831     bSig <<= 9;
3832     if ( 0 < expDiff ) {
3833         if ( aExp == 0x7FF ) {
3834             if (aSig) {
3835                 return propagateFloat64NaN(a, b, status);
3836             }
3837             return a;
3838         }
3839         if ( bExp == 0 ) {
3840             --expDiff;
3841         }
3842         else {
3843             bSig |= LIT64( 0x2000000000000000 );
3844         }
3845         shift64RightJamming( bSig, expDiff, &bSig );
3846         zExp = aExp;
3847     }
3848     else if ( expDiff < 0 ) {
3849         if ( bExp == 0x7FF ) {
3850             if (bSig) {
3851                 return propagateFloat64NaN(a, b, status);
3852             }
3853             return packFloat64( zSign, 0x7FF, 0 );
3854         }
3855         if ( aExp == 0 ) {
3856             ++expDiff;
3857         }
3858         else {
3859             aSig |= LIT64( 0x2000000000000000 );
3860         }
3861         shift64RightJamming( aSig, - expDiff, &aSig );
3862         zExp = bExp;
3863     }
3864     else {
3865         if ( aExp == 0x7FF ) {
3866             if (aSig | bSig) {
3867                 return propagateFloat64NaN(a, b, status);
3868             }
3869             return a;
3870         }
3871         if ( aExp == 0 ) {
3872             if (status->flush_to_zero) {
3873                 if (aSig | bSig) {
3874                     float_raise(float_flag_output_denormal, status);
3875                 }
3876                 return packFloat64(zSign, 0, 0);
3877             }
3878             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3879         }
3880         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3881         zExp = aExp;
3882         goto roundAndPack;
3883     }
3884     aSig |= LIT64( 0x2000000000000000 );
3885     zSig = ( aSig + bSig )<<1;
3886     --zExp;
3887     if ( (int64_t) zSig < 0 ) {
3888         zSig = aSig + bSig;
3889         ++zExp;
3890     }
3891  roundAndPack:
3892     return roundAndPackFloat64(zSign, zExp, zSig, status);
3893 
3894 }
3895 
3896 /*----------------------------------------------------------------------------
3897 | Returns the result of subtracting the absolute values of the double-
3898 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3899 | difference is negated before being returned.  `zSign' is ignored if the
3900 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3901 | Standard for Binary Floating-Point Arithmetic.
3902 *----------------------------------------------------------------------------*/
3903 
3904 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3905                               float_status *status)
3906 {
3907     int aExp, bExp, zExp;
3908     uint64_t aSig, bSig, zSig;
3909     int expDiff;
3910 
3911     aSig = extractFloat64Frac( a );
3912     aExp = extractFloat64Exp( a );
3913     bSig = extractFloat64Frac( b );
3914     bExp = extractFloat64Exp( b );
3915     expDiff = aExp - bExp;
3916     aSig <<= 10;
3917     bSig <<= 10;
3918     if ( 0 < expDiff ) goto aExpBigger;
3919     if ( expDiff < 0 ) goto bExpBigger;
3920     if ( aExp == 0x7FF ) {
3921         if (aSig | bSig) {
3922             return propagateFloat64NaN(a, b, status);
3923         }
3924         float_raise(float_flag_invalid, status);
3925         return float64_default_nan(status);
3926     }
3927     if ( aExp == 0 ) {
3928         aExp = 1;
3929         bExp = 1;
3930     }
3931     if ( bSig < aSig ) goto aBigger;
3932     if ( aSig < bSig ) goto bBigger;
3933     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3934  bExpBigger:
3935     if ( bExp == 0x7FF ) {
3936         if (bSig) {
3937             return propagateFloat64NaN(a, b, status);
3938         }
3939         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3940     }
3941     if ( aExp == 0 ) {
3942         ++expDiff;
3943     }
3944     else {
3945         aSig |= LIT64( 0x4000000000000000 );
3946     }
3947     shift64RightJamming( aSig, - expDiff, &aSig );
3948     bSig |= LIT64( 0x4000000000000000 );
3949  bBigger:
3950     zSig = bSig - aSig;
3951     zExp = bExp;
3952     zSign ^= 1;
3953     goto normalizeRoundAndPack;
3954  aExpBigger:
3955     if ( aExp == 0x7FF ) {
3956         if (aSig) {
3957             return propagateFloat64NaN(a, b, status);
3958         }
3959         return a;
3960     }
3961     if ( bExp == 0 ) {
3962         --expDiff;
3963     }
3964     else {
3965         bSig |= LIT64( 0x4000000000000000 );
3966     }
3967     shift64RightJamming( bSig, expDiff, &bSig );
3968     aSig |= LIT64( 0x4000000000000000 );
3969  aBigger:
3970     zSig = aSig - bSig;
3971     zExp = aExp;
3972  normalizeRoundAndPack:
3973     --zExp;
3974     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3975 
3976 }
3977 
3978 /*----------------------------------------------------------------------------
3979 | Returns the result of adding the double-precision floating-point values `a'
3980 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3981 | Binary Floating-Point Arithmetic.
3982 *----------------------------------------------------------------------------*/
3983 
3984 float64 float64_add(float64 a, float64 b, float_status *status)
3985 {
3986     flag aSign, bSign;
3987     a = float64_squash_input_denormal(a, status);
3988     b = float64_squash_input_denormal(b, status);
3989 
3990     aSign = extractFloat64Sign( a );
3991     bSign = extractFloat64Sign( b );
3992     if ( aSign == bSign ) {
3993         return addFloat64Sigs(a, b, aSign, status);
3994     }
3995     else {
3996         return subFloat64Sigs(a, b, aSign, status);
3997     }
3998 
3999 }
4000 
4001 /*----------------------------------------------------------------------------
4002 | Returns the result of subtracting the double-precision floating-point values
4003 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4004 | for Binary Floating-Point Arithmetic.
4005 *----------------------------------------------------------------------------*/
4006 
4007 float64 float64_sub(float64 a, float64 b, float_status *status)
4008 {
4009     flag aSign, bSign;
4010     a = float64_squash_input_denormal(a, status);
4011     b = float64_squash_input_denormal(b, status);
4012 
4013     aSign = extractFloat64Sign( a );
4014     bSign = extractFloat64Sign( b );
4015     if ( aSign == bSign ) {
4016         return subFloat64Sigs(a, b, aSign, status);
4017     }
4018     else {
4019         return addFloat64Sigs(a, b, aSign, status);
4020     }
4021 
4022 }
4023 
4024 /*----------------------------------------------------------------------------
4025 | Returns the result of multiplying the double-precision floating-point values
4026 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4027 | for Binary Floating-Point Arithmetic.
4028 *----------------------------------------------------------------------------*/
4029 
4030 float64 float64_mul(float64 a, float64 b, float_status *status)
4031 {
4032     flag aSign, bSign, zSign;
4033     int aExp, bExp, zExp;
4034     uint64_t aSig, bSig, zSig0, zSig1;
4035 
4036     a = float64_squash_input_denormal(a, status);
4037     b = float64_squash_input_denormal(b, status);
4038 
4039     aSig = extractFloat64Frac( a );
4040     aExp = extractFloat64Exp( a );
4041     aSign = extractFloat64Sign( a );
4042     bSig = extractFloat64Frac( b );
4043     bExp = extractFloat64Exp( b );
4044     bSign = extractFloat64Sign( b );
4045     zSign = aSign ^ bSign;
4046     if ( aExp == 0x7FF ) {
4047         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4048             return propagateFloat64NaN(a, b, status);
4049         }
4050         if ( ( bExp | bSig ) == 0 ) {
4051             float_raise(float_flag_invalid, status);
4052             return float64_default_nan(status);
4053         }
4054         return packFloat64( zSign, 0x7FF, 0 );
4055     }
4056     if ( bExp == 0x7FF ) {
4057         if (bSig) {
4058             return propagateFloat64NaN(a, b, status);
4059         }
4060         if ( ( aExp | aSig ) == 0 ) {
4061             float_raise(float_flag_invalid, status);
4062             return float64_default_nan(status);
4063         }
4064         return packFloat64( zSign, 0x7FF, 0 );
4065     }
4066     if ( aExp == 0 ) {
4067         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4068         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4069     }
4070     if ( bExp == 0 ) {
4071         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4072         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4073     }
4074     zExp = aExp + bExp - 0x3FF;
4075     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4076     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4077     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4078     zSig0 |= ( zSig1 != 0 );
4079     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4080         zSig0 <<= 1;
4081         --zExp;
4082     }
4083     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4084 
4085 }
4086 
4087 /*----------------------------------------------------------------------------
4088 | Returns the result of dividing the double-precision floating-point value `a'
4089 | by the corresponding value `b'.  The operation is performed according to
4090 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4092 
4093 float64 float64_div(float64 a, float64 b, float_status *status)
4094 {
4095     flag aSign, bSign, zSign;
4096     int aExp, bExp, zExp;
4097     uint64_t aSig, bSig, zSig;
4098     uint64_t rem0, rem1;
4099     uint64_t term0, term1;
4100     a = float64_squash_input_denormal(a, status);
4101     b = float64_squash_input_denormal(b, status);
4102 
4103     aSig = extractFloat64Frac( a );
4104     aExp = extractFloat64Exp( a );
4105     aSign = extractFloat64Sign( a );
4106     bSig = extractFloat64Frac( b );
4107     bExp = extractFloat64Exp( b );
4108     bSign = extractFloat64Sign( b );
4109     zSign = aSign ^ bSign;
4110     if ( aExp == 0x7FF ) {
4111         if (aSig) {
4112             return propagateFloat64NaN(a, b, status);
4113         }
4114         if ( bExp == 0x7FF ) {
4115             if (bSig) {
4116                 return propagateFloat64NaN(a, b, status);
4117             }
4118             float_raise(float_flag_invalid, status);
4119             return float64_default_nan(status);
4120         }
4121         return packFloat64( zSign, 0x7FF, 0 );
4122     }
4123     if ( bExp == 0x7FF ) {
4124         if (bSig) {
4125             return propagateFloat64NaN(a, b, status);
4126         }
4127         return packFloat64( zSign, 0, 0 );
4128     }
4129     if ( bExp == 0 ) {
4130         if ( bSig == 0 ) {
4131             if ( ( aExp | aSig ) == 0 ) {
4132                 float_raise(float_flag_invalid, status);
4133                 return float64_default_nan(status);
4134             }
4135             float_raise(float_flag_divbyzero, status);
4136             return packFloat64( zSign, 0x7FF, 0 );
4137         }
4138         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4139     }
4140     if ( aExp == 0 ) {
4141         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4142         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4143     }
4144     zExp = aExp - bExp + 0x3FD;
4145     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4146     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4147     if ( bSig <= ( aSig + aSig ) ) {
4148         aSig >>= 1;
4149         ++zExp;
4150     }
4151     zSig = estimateDiv128To64( aSig, 0, bSig );
4152     if ( ( zSig & 0x1FF ) <= 2 ) {
4153         mul64To128( bSig, zSig, &term0, &term1 );
4154         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4155         while ( (int64_t) rem0 < 0 ) {
4156             --zSig;
4157             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4158         }
4159         zSig |= ( rem1 != 0 );
4160     }
4161     return roundAndPackFloat64(zSign, zExp, zSig, status);
4162 
4163 }
4164 
4165 /*----------------------------------------------------------------------------
4166 | Returns the remainder of the double-precision floating-point value `a'
4167 | with respect to the corresponding value `b'.  The operation is performed
4168 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4169 *----------------------------------------------------------------------------*/
4170 
4171 float64 float64_rem(float64 a, float64 b, float_status *status)
4172 {
4173     flag aSign, zSign;
4174     int aExp, bExp, expDiff;
4175     uint64_t aSig, bSig;
4176     uint64_t q, alternateASig;
4177     int64_t sigMean;
4178 
4179     a = float64_squash_input_denormal(a, status);
4180     b = float64_squash_input_denormal(b, status);
4181     aSig = extractFloat64Frac( a );
4182     aExp = extractFloat64Exp( a );
4183     aSign = extractFloat64Sign( a );
4184     bSig = extractFloat64Frac( b );
4185     bExp = extractFloat64Exp( b );
         /* a is an infinity or a NaN: a NaN operand is propagated, otherwise
          * the invalid flag is raised and the default NaN returned.
          */
4186     if ( aExp == 0x7FF ) {
4187         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4188             return propagateFloat64NaN(a, b, status);
4189         }
4190         float_raise(float_flag_invalid, status);
4191         return float64_default_nan(status);
4192     }
         /* b is a NaN (propagated) or an infinity, in which case a is
          * returned as it is.
          */
4193     if ( bExp == 0x7FF ) {
4194         if (bSig) {
4195             return propagateFloat64NaN(a, b, status);
4196         }
4197         return a;
4198     }
         /* b is a zero (invalid operation) or a subnormal (normalized). */
4199     if ( bExp == 0 ) {
4200         if ( bSig == 0 ) {
4201             float_raise(float_flag_invalid, status);
4202             return float64_default_nan(status);
4203         }
4204         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4205     }
         /* a is a zero (returned as it is) or a subnormal (normalized). */
4206     if ( aExp == 0 ) {
4207         if ( aSig == 0 ) return a;
4208         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4209     }
4210     expDiff = aExp - bExp;
         /* Both significands get their implicit leading bit and are moved to
          * the top of the 64-bit word.
          */
4211     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4212     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4213     if ( expDiff < 0 ) {
             /* a is returned unchanged when its exponent is at least two
              * below that of b.
              */
4214         if ( expDiff < -1 ) return a;
4215         aSig >>= 1;
4216     }
         /* q starts as 1 when one bSig can be taken off aSig, else as 0. */
4217     q = ( bSig <= aSig );
4218     if ( q ) aSig -= bSig;
4219     expDiff -= 64;
         /* The exponent difference is consumed 62 bits per iteration.  Each
          * quotient estimate is lowered by 2 and clamped at 0 -- presumably
          * so that it never overshoots; see estimateDiv128To64() to confirm.
          */
4220     while ( 0 < expDiff ) {
4221         q = estimateDiv128To64( aSig, 0, bSig );
4222         q = ( 2 < q ) ? q - 2 : 0;
4223         aSig = - ( ( bSig>>2 ) * q );
4224         expDiff -= 62;
4225     }
4226     expDiff += 64;
4227     if ( 0 < expDiff ) {
             /* Last partial step: only the top expDiff bits of the estimate
              * are kept in q.
              */
4228         q = estimateDiv128To64( aSig, 0, bSig );
4229         q = ( 2 < q ) ? q - 2 : 0;
4230         q >>= 64 - expDiff;
4231         bSig >>= 2;
4232         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4233     }
4234     else {
4235         aSig >>= 2;
4236         bSig >>= 2;
4237     }
         /* Keep subtracting bSig, counting in q, until aSig turns negative
          * when read as a signed value; alternateASig holds the value from
          * just before the last subtraction.
          */
4238     do {
4239         alternateASig = aSig;
4240         ++q;
4241         aSig -= bSig;
4242     } while ( 0 <= (int64_t) aSig );
         /* Choose between the two candidates: alternateASig is taken when
          * their sum is negative, or when the sum is zero and q is odd.
          */
4243     sigMean = aSig + alternateASig;
4244     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4245         aSig = alternateASig;
4246     }
         /* A negative remainder is negated, and the sign passed on is that
          * of a inverted.
          */
4247     zSign = ( (int64_t) aSig < 0 );
4248     if ( zSign ) aSig = - aSig;
4249     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4250 
4251 }
4252 
4253 /*----------------------------------------------------------------------------
4254 | Returns the result of multiplying the double-precision floating-point values
4255 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4256 | multiplication.  The operation is performed according to the IEC/IEEE
4257 | Standard for Binary Floating-Point Arithmetic 754-2008.
4258 | The flags argument allows the caller to select negation of the
4259 | addend, the intermediate product, or the final result. (The difference
4260 | between this and having the caller do a separate negation is that negating
4261 | externally will flip the sign bit on NaNs.)
4262 *----------------------------------------------------------------------------*/
4263 
4264 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4265                        float_status *status)
4266 {
         /* Naming used below: a* and b* describe the multiplicands, c* the
          * addend, p* the product a * b and z* the value being returned.
          */
4267     flag aSign, bSign, cSign, zSign;
4268     int aExp, bExp, cExp, pExp, zExp, expDiff;
4269     uint64_t aSig, bSig, cSig;
4270     flag pInf, pZero, pSign;
4271     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4272     int shiftcount;
4273     flag signflip, infzero;
4274 
4275     a = float64_squash_input_denormal(a, status);
4276     b = float64_squash_input_denormal(b, status);
4277     c = float64_squash_input_denormal(c, status);
4278     aSig = extractFloat64Frac(a);
4279     aExp = extractFloat64Exp(a);
4280     aSign = extractFloat64Sign(a);
4281     bSig = extractFloat64Frac(b);
4282     bExp = extractFloat64Exp(b);
4283     bSign = extractFloat64Sign(b);
4284     cSig = extractFloat64Frac(c);
4285     cExp = extractFloat64Exp(c);
4286     cSign = extractFloat64Sign(c);
4287 
         /* infzero: one multiplicand is a zero and the other an infinity. */
4288     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4289                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4290 
4291     /* It is implementation-defined whether the cases of (0,inf,qnan)
4292      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4293      * they return if they do), so we have to hand this information
4294      * off to the target-specific pick-a-NaN routine.
4295      */
4296     if (((aExp == 0x7ff) && aSig) ||
4297         ((bExp == 0x7ff) && bSig) ||
4298         ((cExp == 0x7ff) && cSig)) {
4299         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4300     }
4301 
         /* No operand is a NaN: zero times infinity is an invalid operation. */
4302     if (infzero) {
4303         float_raise(float_flag_invalid, status);
4304         return float64_default_nan(status);
4305     }
4306 
4307     if (flags & float_muladd_negate_c) {
4308         cSign ^= 1;
4309     }
4310 
         /* signflip is XORed into the sign of every non-NaN result below. */
4311     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4312 
4313     /* Work out the sign and type of the product */
4314     pSign = aSign ^ bSign;
4315     if (flags & float_muladd_negate_product) {
4316         pSign ^= 1;
4317     }
         /* NaNs were returned above, so an exponent of 0x7ff means infinity. */
4318     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4319     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4320 
4321     if (cExp == 0x7ff) {
4322         if (pInf && (pSign ^ cSign)) {
4323             /* addition of opposite-signed infinities => InvalidOperation */
4324             float_raise(float_flag_invalid, status);
4325             return float64_default_nan(status);
4326         }
4327         /* Otherwise generate an infinity of the same sign */
4328         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4329     }
4330 
         /* Infinite product plus a finite addend. */
4331     if (pInf) {
4332         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4333     }
4334 
4335     if (pZero) {
4336         if (cExp == 0) {
4337             if (cSig == 0) {
4338                 /* Adding two exact zeroes */
4339                 if (pSign == cSign) {
4340                     zSign = pSign;
4341                 } else if (status->float_rounding_mode == float_round_down) {
4342                     zSign = 1;
4343                 } else {
4344                     zSign = 0;
4345                 }
4346                 return packFloat64(zSign ^ signflip, 0, 0);
4347             }
4348             /* Exact zero plus a denorm */
4349             if (status->flush_to_zero) {
4350                 float_raise(float_flag_output_denormal, status);
4351                 return packFloat64(cSign ^ signflip, 0, 0);
4352             }
4353         }
4354         /* Zero plus something non-zero : just return the something */
4355         if (flags & float_muladd_halve_result) {
4356             if (cExp == 0) {
4357                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4358             }
4359             /* Subtract one to halve, and one again because roundAndPackFloat64
4360              * wants one less than the true exponent.
4361              */
4362             cExp -= 2;
4363             cSig = (cSig | 0x0010000000000000ULL) << 10;
4364             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4365         }
             /* c is re-packed from its fields, with its sign as adjusted
              * by the flags.
              */
4366         return packFloat64(cSign ^ signflip, cExp, cSig);
4367     }
4368 
         /* Neither multiplicand is a zero here (pZero was handled), so a
          * zero exponent means a subnormal that has to be normalized.
          */
4369     if (aExp == 0) {
4370         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4371     }
4372     if (bExp == 0) {
4373         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4374     }
4375 
4376     /* Calculate the actual result a * b + c */
4377 
4378     /* Multiply first; this is easy. */
4379     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4380      * because we want the true exponent, not the "one-less-than"
4381      * flavour that roundAndPackFloat64() takes.
4382      */
4383     pExp = aExp + bExp - 0x3fe;
4384     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4385     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4386     mul64To128(aSig, bSig, &pSig0, &pSig1);
         /* Bit 62 of pSig0 clear: shift the product up by one bit and lower
          * the exponent to compensate.
          */
4387     if ((int64_t)(pSig0 << 1) >= 0) {
4388         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4389         pExp--;
4390     }
4391 
4392     zSign = pSign ^ signflip;
4393 
4394     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4395      * bit in position 126.
4396      */
4397     if (cExp == 0) {
4398         if (!cSig) {
4399             /* Throw out the special case of c being an exact zero now */
4400             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
                 /* float_muladd_halve_result: one less on the exponent. */
4401             if (flags & float_muladd_halve_result) {
4402                 pExp--;
4403             }
4404             return roundAndPackFloat64(zSign, pExp - 1,
4405                                        pSig1, status);
4406         }
4407         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4408     }
4409 
4410     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4411      * significand of the addend, with the explicit bit in position 126.
4412      */
4413     cSig0 = cSig << (126 - 64 - 52);
4414     cSig1 = 0;
4415     cSig0 |= LIT64(0x4000000000000000);
4416     expDiff = pExp - cExp;
4417 
4418     if (pSign == cSign) {
4419         /* Addition */
4420         if (expDiff > 0) {
4421             /* scale c to match p */
4422             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4423             zExp = pExp;
4424         } else if (expDiff < 0) {
4425             /* scale p to match c */
4426             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4427             zExp = cExp;
4428         } else {
4429             /* no scaling needed */
4430             zExp = cExp;
4431         }
4432         /* Add significands and make sure explicit bit ends up in posn 126 */
4433         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4434         if ((int64_t)zSig0 < 0) {
4435             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4436         } else {
4437             zExp--;
4438         }
             /* After this 64-bit right shift the significand handed to
              * roundAndPackFloat64() is taken from zSig1.
              */
4439         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4440         if (flags & float_muladd_halve_result) {
4441             zExp--;
4442         }
4443         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4444     } else {
4445         /* Subtraction */
             /* The operand of larger magnitude is the minuend; whenever that
              * is c, the result sign is inverted.
              */
4446         if (expDiff > 0) {
4447             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4448             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4449             zExp = pExp;
4450         } else if (expDiff < 0) {
4451             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4452             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4453             zExp = cExp;
4454             zSign ^= 1;
4455         } else {
4456             zExp = pExp;
4457             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4458                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4459             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4460                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4461                 zSign ^= 1;
4462             } else {
4463                 /* Exact zero */
4464                 zSign = signflip;
4465                 if (status->float_rounding_mode == float_round_down) {
4466                     zSign ^= 1;
4467                 }
4468                 return packFloat64(zSign, 0, 0);
4469             }
4470         }
4471         --zExp;
4472         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4473          * starting with the significand in a pair of uint64_t.
4474          */
4475         if (zSig0) {
4476             shiftcount = countLeadingZeros64(zSig0) - 1;
4477             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
                 /* Whatever is left in the low word becomes a sticky bit. */
4478             if (zSig1) {
4479                 zSig0 |= 1;
4480             }
4481             zExp -= shiftcount;
4482         } else {
                 /* High word is empty: the result is built from zSig1. */
4483             shiftcount = countLeadingZeros64(zSig1);
4484             if (shiftcount == 0) {
                     /* Top bit set: shift right by one, keeping the bit
                      * shifted out as a sticky bit.
                      */
4485                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4486                 zExp -= 63;
4487             } else {
4488                 shiftcount--;
4489                 zSig0 = zSig1 << shiftcount;
4490                 zExp -= (shiftcount + 64);
4491             }
4492         }
4493         if (flags & float_muladd_halve_result) {
4494             zExp--;
4495         }
4496         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4497     }
4498 }
4499 
4500 /*----------------------------------------------------------------------------
4501 | Returns the square root of the double-precision floating-point value `a'.
4502 | The operation is performed according to the IEC/IEEE Standard for Binary
4503 | Floating-Point Arithmetic.
4504 *----------------------------------------------------------------------------*/
4505 
4506 float64 float64_sqrt(float64 a, float_status *status)
4507 {
4508     flag aSign;
4509     int aExp, zExp;
4510     uint64_t aSig, zSig, doubleZSig;
4511     uint64_t rem0, rem1, term0, term1;
4512     a = float64_squash_input_denormal(a, status);
4513 
4514     aSig = extractFloat64Frac( a );
4515     aExp = extractFloat64Exp( a );
4516     aSign = extractFloat64Sign( a );
4517     if ( aExp == 0x7FF ) {
4518         if (aSig) {
4519             return propagateFloat64NaN(a, a, status);
4520         }
4521         if ( ! aSign ) return a;
4522         float_raise(float_flag_invalid, status);
4523         return float64_default_nan(status);
4524     }
4525     if ( aSign ) {
4526         if ( ( aExp | aSig ) == 0 ) return a;
4527         float_raise(float_flag_invalid, status);
4528         return float64_default_nan(status);
4529     }
4530     if ( aExp == 0 ) {
4531         if ( aSig == 0 ) return float64_zero;
4532         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4533     }
4534     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4535     aSig |= LIT64( 0x0010000000000000 );
4536     zSig = estimateSqrt32( aExp, aSig>>21 );
4537     aSig <<= 9 - ( aExp & 1 );
4538     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4539     if ( ( zSig & 0x1FF ) <= 5 ) {
4540         doubleZSig = zSig<<1;
4541         mul64To128( zSig, zSig, &term0, &term1 );
4542         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4543         while ( (int64_t) rem0 < 0 ) {
4544             --zSig;
4545             doubleZSig -= 2;
4546             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4547         }
4548         zSig |= ( ( rem0 | rem1 ) != 0 );
4549     }
4550     return roundAndPackFloat64(0, zExp, zSig, status);
4551 
4552 }
4553 
4554 /*----------------------------------------------------------------------------
4555 | Returns the binary log of the double-precision floating-point value `a'.
4556 | The operation is performed according to the IEC/IEEE Standard for Binary
4557 | Floating-Point Arithmetic.
4558 *----------------------------------------------------------------------------*/
4559 float64 float64_log2(float64 a, float_status *status)
4560 {
4561     flag aSign, zSign;
4562     int aExp;
4563     uint64_t aSig, aSig0, aSig1, zSig, i;
4564     a = float64_squash_input_denormal(a, status);
4565 
4566     aSig = extractFloat64Frac( a );
4567     aExp = extractFloat64Exp( a );
4568     aSign = extractFloat64Sign( a );
4569 
4570     if ( aExp == 0 ) {
4571         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4572         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4573     }
4574     if ( aSign ) {
4575         float_raise(float_flag_invalid, status);
4576         return float64_default_nan(status);
4577     }
4578     if ( aExp == 0x7FF ) {
4579         if (aSig) {
4580             return propagateFloat64NaN(a, float64_zero, status);
4581         }
4582         return a;
4583     }
4584 
4585     aExp -= 0x3FF;
4586     aSig |= LIT64( 0x0010000000000000 );
4587     zSign = aExp < 0;
4588     zSig = (uint64_t)aExp << 52;
4589     for (i = 1LL << 51; i > 0; i >>= 1) {
4590         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4591         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4592         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4593             aSig >>= 1;
4594             zSig |= i;
4595         }
4596     }
4597 
4598     if ( zSign )
4599         zSig = -zSig;
4600     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4601 }
4602 
4603 /*----------------------------------------------------------------------------
4604 | Returns 1 if the double-precision floating-point value `a' is equal to the
4605 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4606 | if either operand is a NaN.  Otherwise, the comparison is performed
4607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4608 *----------------------------------------------------------------------------*/
4609 
4610 int float64_eq(float64 a, float64 b, float_status *status)
4611 {
4612     uint64_t av, bv;
4613     a = float64_squash_input_denormal(a, status);
4614     b = float64_squash_input_denormal(b, status);
4615 
4616     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4617          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4618        ) {
4619         float_raise(float_flag_invalid, status);
4620         return 0;
4621     }
4622     av = float64_val(a);
4623     bv = float64_val(b);
4624     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4625 
4626 }
4627 
4628 /*----------------------------------------------------------------------------
4629 | Returns 1 if the double-precision floating-point value `a' is less than or
4630 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4631 | exception is raised if either operand is a NaN.  The comparison is performed
4632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4633 *----------------------------------------------------------------------------*/
4634 
4635 int float64_le(float64 a, float64 b, float_status *status)
4636 {
4637     flag aSign, bSign;
4638     uint64_t av, bv;
4639     a = float64_squash_input_denormal(a, status);
4640     b = float64_squash_input_denormal(b, status);
4641 
4642     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4643          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4644        ) {
4645         float_raise(float_flag_invalid, status);
4646         return 0;
4647     }
4648     aSign = extractFloat64Sign( a );
4649     bSign = extractFloat64Sign( b );
4650     av = float64_val(a);
4651     bv = float64_val(b);
4652     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4653     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4654 
4655 }
4656 
4657 /*----------------------------------------------------------------------------
4658 | Returns 1 if the double-precision floating-point value `a' is less than
4659 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4660 | raised if either operand is a NaN.  The comparison is performed according
4661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4663 
4664 int float64_lt(float64 a, float64 b, float_status *status)
4665 {
4666     flag aSign, bSign;
4667     uint64_t av, bv;
4668 
4669     a = float64_squash_input_denormal(a, status);
4670     b = float64_squash_input_denormal(b, status);
4671     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4672          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4673        ) {
4674         float_raise(float_flag_invalid, status);
4675         return 0;
4676     }
4677     aSign = extractFloat64Sign( a );
4678     bSign = extractFloat64Sign( b );
4679     av = float64_val(a);
4680     bv = float64_val(b);
4681     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4682     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4683 
4684 }
4685 
4686 /*----------------------------------------------------------------------------
4687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4688 | be compared, and 0 otherwise.  The invalid exception is raised if either
4689 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4690 | Standard for Binary Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4692 
4693 int float64_unordered(float64 a, float64 b, float_status *status)
4694 {
4695     a = float64_squash_input_denormal(a, status);
4696     b = float64_squash_input_denormal(b, status);
4697 
4698     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4699          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4700        ) {
4701         float_raise(float_flag_invalid, status);
4702         return 1;
4703     }
4704     return 0;
4705 }
4706 
4707 /*----------------------------------------------------------------------------
4708 | Returns 1 if the double-precision floating-point value `a' is equal to the
4709 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4710 | exception.  The comparison is performed according to the IEC/IEEE Standard
4711 | for Binary Floating-Point Arithmetic.
4712 *----------------------------------------------------------------------------*/
4713 
4714 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4715 {
4716     uint64_t av, bv;
4717     a = float64_squash_input_denormal(a, status);
4718     b = float64_squash_input_denormal(b, status);
4719 
4720     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4721          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4722        ) {
4723         if (float64_is_signaling_nan(a, status)
4724          || float64_is_signaling_nan(b, status)) {
4725             float_raise(float_flag_invalid, status);
4726         }
4727         return 0;
4728     }
4729     av = float64_val(a);
4730     bv = float64_val(b);
4731     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4732 
4733 }
4734 
4735 /*----------------------------------------------------------------------------
4736 | Returns 1 if the double-precision floating-point value `a' is less than or
4737 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4738 | cause an exception.  Otherwise, the comparison is performed according to the
4739 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4740 *----------------------------------------------------------------------------*/
4741 
4742 int float64_le_quiet(float64 a, float64 b, float_status *status)
4743 {
4744     flag aSign, bSign;
4745     uint64_t av, bv;
4746     a = float64_squash_input_denormal(a, status);
4747     b = float64_squash_input_denormal(b, status);
4748 
4749     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4750          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4751        ) {
4752         if (float64_is_signaling_nan(a, status)
4753          || float64_is_signaling_nan(b, status)) {
4754             float_raise(float_flag_invalid, status);
4755         }
4756         return 0;
4757     }
4758     aSign = extractFloat64Sign( a );
4759     bSign = extractFloat64Sign( b );
4760     av = float64_val(a);
4761     bv = float64_val(b);
4762     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4763     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4764 
4765 }
4766 
4767 /*----------------------------------------------------------------------------
4768 | Returns 1 if the double-precision floating-point value `a' is less than
4769 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4770 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4771 | Standard for Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773 
4774 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4775 {
4776     flag aSign, bSign;
4777     uint64_t av, bv;
4778     a = float64_squash_input_denormal(a, status);
4779     b = float64_squash_input_denormal(b, status);
4780 
4781     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4782          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4783        ) {
4784         if (float64_is_signaling_nan(a, status)
4785          || float64_is_signaling_nan(b, status)) {
4786             float_raise(float_flag_invalid, status);
4787         }
4788         return 0;
4789     }
4790     aSign = extractFloat64Sign( a );
4791     bSign = extractFloat64Sign( b );
4792     av = float64_val(a);
4793     bv = float64_val(b);
4794     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4795     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4796 
4797 }
4798 
4799 /*----------------------------------------------------------------------------
4800 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4801 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4802 | comparison is performed according to the IEC/IEEE Standard for Binary
4803 | Floating-Point Arithmetic.
4804 *----------------------------------------------------------------------------*/
4805 
4806 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4807 {
4808     a = float64_squash_input_denormal(a, status);
4809     b = float64_squash_input_denormal(b, status);
4810 
4811     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4812          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4813        ) {
4814         if (float64_is_signaling_nan(a, status)
4815          || float64_is_signaling_nan(b, status)) {
4816             float_raise(float_flag_invalid, status);
4817         }
4818         return 1;
4819     }
4820     return 0;
4821 }
4822 
4823 /*----------------------------------------------------------------------------
4824 | Returns the result of converting the extended double-precision floating-
4825 | point value `a' to the 32-bit two's complement integer format.  The
4826 | conversion is performed according to the IEC/IEEE Standard for Binary
4827 | Floating-Point Arithmetic---which means in particular that the conversion
4828 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4829 | largest positive integer is returned.  Otherwise, if the conversion
4830 | overflows, the largest integer with the same sign as `a' is returned.
4831 *----------------------------------------------------------------------------*/
4832 
4833 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4834 {
4835     flag aSign;
4836     int32_t aExp, shiftCount;
4837     uint64_t aSig;
4838 
4839     if (floatx80_invalid_encoding(a)) {
4840         float_raise(float_flag_invalid, status);
4841         return 1 << 31;
4842     }
4843     aSig = extractFloatx80Frac( a );
4844     aExp = extractFloatx80Exp( a );
4845     aSign = extractFloatx80Sign( a );
4846     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4847     shiftCount = 0x4037 - aExp;
4848     if ( shiftCount <= 0 ) shiftCount = 1;
4849     shift64RightJamming( aSig, shiftCount, &aSig );
4850     return roundAndPackInt32(aSign, aSig, status);
4851 
4852 }
4853 
4854 /*----------------------------------------------------------------------------
4855 | Returns the result of converting the extended double-precision floating-
4856 | point value `a' to the 32-bit two's complement integer format.  The
4857 | conversion is performed according to the IEC/IEEE Standard for Binary
4858 | Floating-Point Arithmetic, except that the conversion is always rounded
4859 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4860 | Otherwise, if the conversion overflows, the largest integer with the same
4861 | sign as `a' is returned.
4862 *----------------------------------------------------------------------------*/
4863 
4864 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4865 {
4866     flag aSign;
4867     int32_t aExp, shiftCount;
4868     uint64_t aSig, savedASig;
4869     int32_t z;
4870 
4871     if (floatx80_invalid_encoding(a)) {
4872         float_raise(float_flag_invalid, status);
4873         return 1 << 31;
4874     }
4875     aSig = extractFloatx80Frac( a );
4876     aExp = extractFloatx80Exp( a );
4877     aSign = extractFloatx80Sign( a );
4878     if ( 0x401E < aExp ) {
4879         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4880         goto invalid;
4881     }
4882     else if ( aExp < 0x3FFF ) {
4883         if (aExp || aSig) {
4884             status->float_exception_flags |= float_flag_inexact;
4885         }
4886         return 0;
4887     }
4888     shiftCount = 0x403E - aExp;
4889     savedASig = aSig;
4890     aSig >>= shiftCount;
4891     z = aSig;
4892     if ( aSign ) z = - z;
4893     if ( ( z < 0 ) ^ aSign ) {
4894  invalid:
4895         float_raise(float_flag_invalid, status);
4896         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4897     }
4898     if ( ( aSig<<shiftCount ) != savedASig ) {
4899         status->float_exception_flags |= float_flag_inexact;
4900     }
4901     return z;
4902 
4903 }
4904 
4905 /*----------------------------------------------------------------------------
4906 | Returns the result of converting the extended double-precision floating-
4907 | point value `a' to the 64-bit two's complement integer format.  The
4908 | conversion is performed according to the IEC/IEEE Standard for Binary
4909 | Floating-Point Arithmetic---which means in particular that the conversion
4910 | is rounded according to the current rounding mode.  If `a' is a NaN,
4911 | the largest positive integer is returned.  Otherwise, if the conversion
4912 | overflows, the largest integer with the same sign as `a' is returned.
4913 *----------------------------------------------------------------------------*/
4914 
4915 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4916 {
4917     flag aSign;
4918     int32_t aExp, shiftCount;
4919     uint64_t aSig, aSigExtra;
4920 
4921     if (floatx80_invalid_encoding(a)) {
4922         float_raise(float_flag_invalid, status);
4923         return 1ULL << 63;
4924     }
4925     aSig = extractFloatx80Frac( a );
4926     aExp = extractFloatx80Exp( a );
4927     aSign = extractFloatx80Sign( a );
4928     shiftCount = 0x403E - aExp;
4929     if ( shiftCount <= 0 ) {
4930         if ( shiftCount ) {
4931             float_raise(float_flag_invalid, status);
4932             if (    ! aSign
4933                  || (    ( aExp == 0x7FFF )
4934                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4935                ) {
4936                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4937             }
4938             return (int64_t) LIT64( 0x8000000000000000 );
4939         }
4940         aSigExtra = 0;
4941     }
4942     else {
4943         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4944     }
4945     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4946 
4947 }
4948 
4949 /*----------------------------------------------------------------------------
4950 | Returns the result of converting the extended double-precision floating-
4951 | point value `a' to the 64-bit two's complement integer format.  The
4952 | conversion is performed according to the IEC/IEEE Standard for Binary
4953 | Floating-Point Arithmetic, except that the conversion is always rounded
4954 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4955 | Otherwise, if the conversion overflows, the largest integer with the same
4956 | sign as `a' is returned.
4957 *----------------------------------------------------------------------------*/
4958 
4959 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4960 {
4961     flag aSign;
4962     int32_t aExp, shiftCount;
4963     uint64_t aSig;
4964     int64_t z;
4965 
4966     if (floatx80_invalid_encoding(a)) {
4967         float_raise(float_flag_invalid, status);
4968         return 1ULL << 63;
4969     }
4970     aSig = extractFloatx80Frac( a );
4971     aExp = extractFloatx80Exp( a );
4972     aSign = extractFloatx80Sign( a );
4973     shiftCount = aExp - 0x403E;
4974     if ( 0 <= shiftCount ) {
4975         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4976         if ( ( a.high != 0xC03E ) || aSig ) {
4977             float_raise(float_flag_invalid, status);
4978             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4979                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4980             }
4981         }
4982         return (int64_t) LIT64( 0x8000000000000000 );
4983     }
4984     else if ( aExp < 0x3FFF ) {
4985         if (aExp | aSig) {
4986             status->float_exception_flags |= float_flag_inexact;
4987         }
4988         return 0;
4989     }
4990     z = aSig>>( - shiftCount );
4991     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4992         status->float_exception_flags |= float_flag_inexact;
4993     }
4994     if ( aSign ) z = - z;
4995     return z;
4996 
4997 }
4998 
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the extended double-precision floating-
5001 | point value `a' to the single-precision floating-point format.  The
5002 | conversion is performed according to the IEC/IEEE Standard for Binary
5003 | Floating-Point Arithmetic.
5004 *----------------------------------------------------------------------------*/
5005 
5006 float32 floatx80_to_float32(floatx80 a, float_status *status)
5007 {
5008     flag aSign;
5009     int32_t aExp;
5010     uint64_t aSig;
5011 
5012     if (floatx80_invalid_encoding(a)) {
5013         float_raise(float_flag_invalid, status);
5014         return float32_default_nan(status);
5015     }
5016     aSig = extractFloatx80Frac( a );
5017     aExp = extractFloatx80Exp( a );
5018     aSign = extractFloatx80Sign( a );
5019     if ( aExp == 0x7FFF ) {
5020         if ( (uint64_t) ( aSig<<1 ) ) {
5021             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5022         }
5023         return packFloat32( aSign, 0xFF, 0 );
5024     }
5025     shift64RightJamming( aSig, 33, &aSig );
5026     if ( aExp || aSig ) aExp -= 0x3F81;
5027     return roundAndPackFloat32(aSign, aExp, aSig, status);
5028 
5029 }
5030 
5031 /*----------------------------------------------------------------------------
5032 | Returns the result of converting the extended double-precision floating-
5033 | point value `a' to the double-precision floating-point format.  The
5034 | conversion is performed according to the IEC/IEEE Standard for Binary
5035 | Floating-Point Arithmetic.
5036 *----------------------------------------------------------------------------*/
5037 
5038 float64 floatx80_to_float64(floatx80 a, float_status *status)
5039 {
5040     flag aSign;
5041     int32_t aExp;
5042     uint64_t aSig, zSig;
5043 
5044     if (floatx80_invalid_encoding(a)) {
5045         float_raise(float_flag_invalid, status);
5046         return float64_default_nan(status);
5047     }
5048     aSig = extractFloatx80Frac( a );
5049     aExp = extractFloatx80Exp( a );
5050     aSign = extractFloatx80Sign( a );
5051     if ( aExp == 0x7FFF ) {
5052         if ( (uint64_t) ( aSig<<1 ) ) {
5053             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5054         }
5055         return packFloat64( aSign, 0x7FF, 0 );
5056     }
5057     shift64RightJamming( aSig, 1, &zSig );
5058     if ( aExp || aSig ) aExp -= 0x3C01;
5059     return roundAndPackFloat64(aSign, aExp, zSig, status);
5060 
5061 }
5062 
5063 /*----------------------------------------------------------------------------
5064 | Returns the result of converting the extended double-precision floating-
5065 | point value `a' to the quadruple-precision floating-point format.  The
5066 | conversion is performed according to the IEC/IEEE Standard for Binary
5067 | Floating-Point Arithmetic.
5068 *----------------------------------------------------------------------------*/
5069 
5070 float128 floatx80_to_float128(floatx80 a, float_status *status)
5071 {
5072     flag aSign;
5073     int aExp;
5074     uint64_t aSig, zSig0, zSig1;
5075 
5076     if (floatx80_invalid_encoding(a)) {
5077         float_raise(float_flag_invalid, status);
5078         return float128_default_nan(status);
5079     }
5080     aSig = extractFloatx80Frac( a );
5081     aExp = extractFloatx80Exp( a );
5082     aSign = extractFloatx80Sign( a );
5083     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5084         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5085     }
5086     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5087     return packFloat128( aSign, aExp, zSig0, zSig1 );
5088 
5089 }
5090 
5091 /*----------------------------------------------------------------------------
5092 | Rounds the extended double-precision floating-point value `a'
5093 | to the precision provided by floatx80_rounding_precision and returns the
5094 | result as an extended double-precision floating-point value.
5095 | The operation is performed according to the IEC/IEEE Standard for Binary
5096 | Floating-Point Arithmetic.
5097 *----------------------------------------------------------------------------*/
5098 
5099 floatx80 floatx80_round(floatx80 a, float_status *status)
5100 {
5101     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5102                                 extractFloatx80Sign(a),
5103                                 extractFloatx80Exp(a),
5104                                 extractFloatx80Frac(a), 0, status);
5105 }
5106 
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
5113 
5114 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5115 {
5116     flag aSign;
5117     int32_t aExp;
5118     uint64_t lastBitMask, roundBitsMask;
5119     floatx80 z;
5120 
5121     if (floatx80_invalid_encoding(a)) {
5122         float_raise(float_flag_invalid, status);
5123         return floatx80_default_nan(status);
5124     }
5125     aExp = extractFloatx80Exp( a );
5126     if ( 0x403E <= aExp ) {
5127         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5128             return propagateFloatx80NaN(a, a, status);
5129         }
5130         return a;
5131     }
5132     if ( aExp < 0x3FFF ) {
5133         if (    ( aExp == 0 )
5134              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5135             return a;
5136         }
5137         status->float_exception_flags |= float_flag_inexact;
5138         aSign = extractFloatx80Sign( a );
5139         switch (status->float_rounding_mode) {
5140          case float_round_nearest_even:
5141             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5142                ) {
5143                 return
5144                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5145             }
5146             break;
5147         case float_round_ties_away:
5148             if (aExp == 0x3FFE) {
5149                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5150             }
5151             break;
5152          case float_round_down:
5153             return
5154                   aSign ?
5155                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5156                 : packFloatx80( 0, 0, 0 );
5157          case float_round_up:
5158             return
5159                   aSign ? packFloatx80( 1, 0, 0 )
5160                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5161         }
5162         return packFloatx80( aSign, 0, 0 );
5163     }
5164     lastBitMask = 1;
5165     lastBitMask <<= 0x403E - aExp;
5166     roundBitsMask = lastBitMask - 1;
5167     z = a;
5168     switch (status->float_rounding_mode) {
5169     case float_round_nearest_even:
5170         z.low += lastBitMask>>1;
5171         if ((z.low & roundBitsMask) == 0) {
5172             z.low &= ~lastBitMask;
5173         }
5174         break;
5175     case float_round_ties_away:
5176         z.low += lastBitMask >> 1;
5177         break;
5178     case float_round_to_zero:
5179         break;
5180     case float_round_up:
5181         if (!extractFloatx80Sign(z)) {
5182             z.low += roundBitsMask;
5183         }
5184         break;
5185     case float_round_down:
5186         if (extractFloatx80Sign(z)) {
5187             z.low += roundBitsMask;
5188         }
5189         break;
5190     default:
5191         abort();
5192     }
5193     z.low &= ~ roundBitsMask;
5194     if ( z.low == 0 ) {
5195         ++z.high;
5196         z.low = LIT64( 0x8000000000000000 );
5197     }
5198     if (z.low != a.low) {
5199         status->float_exception_flags |= float_flag_inexact;
5200     }
5201     return z;
5202 
5203 }
5204 
5205 /*----------------------------------------------------------------------------
5206 | Returns the result of adding the absolute values of the extended double-
5207 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5208 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5209 | The addition is performed according to the IEC/IEEE Standard for Binary
5210 | Floating-Point Arithmetic.
5211 *----------------------------------------------------------------------------*/
5212 
5213 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5214                                 float_status *status)
5215 {
5216     int32_t aExp, bExp, zExp;
5217     uint64_t aSig, bSig, zSig0, zSig1;
5218     int32_t expDiff;
5219 
5220     aSig = extractFloatx80Frac( a );
5221     aExp = extractFloatx80Exp( a );
5222     bSig = extractFloatx80Frac( b );
5223     bExp = extractFloatx80Exp( b );
5224     expDiff = aExp - bExp;
5225     if ( 0 < expDiff ) {
5226         if ( aExp == 0x7FFF ) {
5227             if ((uint64_t)(aSig << 1)) {
5228                 return propagateFloatx80NaN(a, b, status);
5229             }
5230             return a;
5231         }
5232         if ( bExp == 0 ) --expDiff;
5233         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5234         zExp = aExp;
5235     }
5236     else if ( expDiff < 0 ) {
5237         if ( bExp == 0x7FFF ) {
5238             if ((uint64_t)(bSig << 1)) {
5239                 return propagateFloatx80NaN(a, b, status);
5240             }
5241             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5242         }
5243         if ( aExp == 0 ) ++expDiff;
5244         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5245         zExp = bExp;
5246     }
5247     else {
5248         if ( aExp == 0x7FFF ) {
5249             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5250                 return propagateFloatx80NaN(a, b, status);
5251             }
5252             return a;
5253         }
5254         zSig1 = 0;
5255         zSig0 = aSig + bSig;
5256         if ( aExp == 0 ) {
5257             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5258             goto roundAndPack;
5259         }
5260         zExp = aExp;
5261         goto shiftRight1;
5262     }
5263     zSig0 = aSig + bSig;
5264     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5265  shiftRight1:
5266     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5267     zSig0 |= LIT64( 0x8000000000000000 );
5268     ++zExp;
5269  roundAndPack:
5270     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5271                                 zSign, zExp, zSig0, zSig1, status);
5272 }
5273 
5274 /*----------------------------------------------------------------------------
5275 | Returns the result of subtracting the absolute values of the extended
5276 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5277 | difference is negated before being returned.  `zSign' is ignored if the
5278 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5279 | Standard for Binary Floating-Point Arithmetic.
5280 *----------------------------------------------------------------------------*/
5281 
5282 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5283                                 float_status *status)
5284 {
5285     int32_t aExp, bExp, zExp;
5286     uint64_t aSig, bSig, zSig0, zSig1;
5287     int32_t expDiff;
5288 
5289     aSig = extractFloatx80Frac( a );
5290     aExp = extractFloatx80Exp( a );
5291     bSig = extractFloatx80Frac( b );
5292     bExp = extractFloatx80Exp( b );
5293     expDiff = aExp - bExp;
5294     if ( 0 < expDiff ) goto aExpBigger;
5295     if ( expDiff < 0 ) goto bExpBigger;
5296     if ( aExp == 0x7FFF ) {
5297         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5298             return propagateFloatx80NaN(a, b, status);
5299         }
5300         float_raise(float_flag_invalid, status);
5301         return floatx80_default_nan(status);
5302     }
5303     if ( aExp == 0 ) {
5304         aExp = 1;
5305         bExp = 1;
5306     }
5307     zSig1 = 0;
5308     if ( bSig < aSig ) goto aBigger;
5309     if ( aSig < bSig ) goto bBigger;
5310     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5311  bExpBigger:
5312     if ( bExp == 0x7FFF ) {
5313         if ((uint64_t)(bSig << 1)) {
5314             return propagateFloatx80NaN(a, b, status);
5315         }
5316         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5317     }
5318     if ( aExp == 0 ) ++expDiff;
5319     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5320  bBigger:
5321     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5322     zExp = bExp;
5323     zSign ^= 1;
5324     goto normalizeRoundAndPack;
5325  aExpBigger:
5326     if ( aExp == 0x7FFF ) {
5327         if ((uint64_t)(aSig << 1)) {
5328             return propagateFloatx80NaN(a, b, status);
5329         }
5330         return a;
5331     }
5332     if ( bExp == 0 ) --expDiff;
5333     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5334  aBigger:
5335     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5336     zExp = aExp;
5337  normalizeRoundAndPack:
5338     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5339                                          zSign, zExp, zSig0, zSig1, status);
5340 }
5341 
5342 /*----------------------------------------------------------------------------
5343 | Returns the result of adding the extended double-precision floating-point
5344 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5345 | Standard for Binary Floating-Point Arithmetic.
5346 *----------------------------------------------------------------------------*/
5347 
5348 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5349 {
5350     flag aSign, bSign;
5351 
5352     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5353         float_raise(float_flag_invalid, status);
5354         return floatx80_default_nan(status);
5355     }
5356     aSign = extractFloatx80Sign( a );
5357     bSign = extractFloatx80Sign( b );
5358     if ( aSign == bSign ) {
5359         return addFloatx80Sigs(a, b, aSign, status);
5360     }
5361     else {
5362         return subFloatx80Sigs(a, b, aSign, status);
5363     }
5364 
5365 }
5366 
5367 /*----------------------------------------------------------------------------
5368 | Returns the result of subtracting the extended double-precision floating-
5369 | point values `a' and `b'.  The operation is performed according to the
5370 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5371 *----------------------------------------------------------------------------*/
5372 
5373 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5374 {
5375     flag aSign, bSign;
5376 
5377     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5378         float_raise(float_flag_invalid, status);
5379         return floatx80_default_nan(status);
5380     }
5381     aSign = extractFloatx80Sign( a );
5382     bSign = extractFloatx80Sign( b );
5383     if ( aSign == bSign ) {
5384         return subFloatx80Sigs(a, b, aSign, status);
5385     }
5386     else {
5387         return addFloatx80Sigs(a, b, aSign, status);
5388     }
5389 
5390 }
5391 
5392 /*----------------------------------------------------------------------------
5393 | Returns the result of multiplying the extended double-precision floating-
5394 | point values `a' and `b'.  The operation is performed according to the
5395 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5396 *----------------------------------------------------------------------------*/
5397 
5398 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5399 {
5400     flag aSign, bSign, zSign;
5401     int32_t aExp, bExp, zExp;
5402     uint64_t aSig, bSig, zSig0, zSig1;
5403 
5404     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5405         float_raise(float_flag_invalid, status);
5406         return floatx80_default_nan(status);
5407     }
5408     aSig = extractFloatx80Frac( a );
5409     aExp = extractFloatx80Exp( a );
5410     aSign = extractFloatx80Sign( a );
5411     bSig = extractFloatx80Frac( b );
5412     bExp = extractFloatx80Exp( b );
5413     bSign = extractFloatx80Sign( b );
5414     zSign = aSign ^ bSign;
5415     if ( aExp == 0x7FFF ) {
5416         if (    (uint64_t) ( aSig<<1 )
5417              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5418             return propagateFloatx80NaN(a, b, status);
5419         }
5420         if ( ( bExp | bSig ) == 0 ) goto invalid;
5421         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5422     }
5423     if ( bExp == 0x7FFF ) {
5424         if ((uint64_t)(bSig << 1)) {
5425             return propagateFloatx80NaN(a, b, status);
5426         }
5427         if ( ( aExp | aSig ) == 0 ) {
5428  invalid:
5429             float_raise(float_flag_invalid, status);
5430             return floatx80_default_nan(status);
5431         }
5432         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5433     }
5434     if ( aExp == 0 ) {
5435         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5436         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5437     }
5438     if ( bExp == 0 ) {
5439         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5440         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5441     }
5442     zExp = aExp + bExp - 0x3FFE;
5443     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5444     if ( 0 < (int64_t) zSig0 ) {
5445         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5446         --zExp;
5447     }
5448     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5449                                 zSign, zExp, zSig0, zSig1, status);
5450 }
5451 
5452 /*----------------------------------------------------------------------------
5453 | Returns the result of dividing the extended double-precision floating-point
5454 | value `a' by the corresponding value `b'.  The operation is performed
5455 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5456 *----------------------------------------------------------------------------*/
5457 
5458 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5459 {
5460     flag aSign, bSign, zSign;
5461     int32_t aExp, bExp, zExp;
5462     uint64_t aSig, bSig, zSig0, zSig1;
5463     uint64_t rem0, rem1, rem2, term0, term1, term2;
5464 
5465     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5466         float_raise(float_flag_invalid, status);
5467         return floatx80_default_nan(status);
5468     }
5469     aSig = extractFloatx80Frac( a );
5470     aExp = extractFloatx80Exp( a );
5471     aSign = extractFloatx80Sign( a );
5472     bSig = extractFloatx80Frac( b );
5473     bExp = extractFloatx80Exp( b );
5474     bSign = extractFloatx80Sign( b );
5475     zSign = aSign ^ bSign;
5476     if ( aExp == 0x7FFF ) {
5477         if ((uint64_t)(aSig << 1)) {
5478             return propagateFloatx80NaN(a, b, status);
5479         }
5480         if ( bExp == 0x7FFF ) {
5481             if ((uint64_t)(bSig << 1)) {
5482                 return propagateFloatx80NaN(a, b, status);
5483             }
5484             goto invalid;
5485         }
5486         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5487     }
5488     if ( bExp == 0x7FFF ) {
5489         if ((uint64_t)(bSig << 1)) {
5490             return propagateFloatx80NaN(a, b, status);
5491         }
5492         return packFloatx80( zSign, 0, 0 );
5493     }
5494     if ( bExp == 0 ) {
5495         if ( bSig == 0 ) {
5496             if ( ( aExp | aSig ) == 0 ) {
5497  invalid:
5498                 float_raise(float_flag_invalid, status);
5499                 return floatx80_default_nan(status);
5500             }
5501             float_raise(float_flag_divbyzero, status);
5502             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5503         }
5504         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5505     }
5506     if ( aExp == 0 ) {
5507         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5508         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5509     }
5510     zExp = aExp - bExp + 0x3FFE;
5511     rem1 = 0;
5512     if ( bSig <= aSig ) {
5513         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5514         ++zExp;
5515     }
5516     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5517     mul64To128( bSig, zSig0, &term0, &term1 );
5518     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5519     while ( (int64_t) rem0 < 0 ) {
5520         --zSig0;
5521         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5522     }
5523     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5524     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5525         mul64To128( bSig, zSig1, &term1, &term2 );
5526         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5527         while ( (int64_t) rem1 < 0 ) {
5528             --zSig1;
5529             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5530         }
5531         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5532     }
5533     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5534                                 zSign, zExp, zSig0, zSig1, status);
5535 }
5536 
5537 /*----------------------------------------------------------------------------
5538 | Returns the remainder of the extended double-precision floating-point value
5539 | `a' with respect to the corresponding value `b'.  The operation is performed
5540 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5541 *----------------------------------------------------------------------------*/
5542 
/*
 * Operands:
 *   a      - dividend
 *   b      - divisor
 *   status - receives raised exception flags and is handed on to the NaN
 *            propagation and rounding/packing helpers
 *
 * Special cases, in the order they are tested below:
 *   - either operand rejected by floatx80_invalid_encoding(): invalid is
 *     raised and the default NaN is returned
 *   - either operand a NaN: result of propagateFloatx80NaN()
 *   - a with exponent 0x7FFF and no NaN payload (infinite), or b zero:
 *     invalid is raised and the default NaN is returned
 *   - b with exponent 0x7FFF and no NaN payload (infinite), or a zero:
 *     a is returned unchanged
 */
5543 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5544 {
5545     flag aSign, zSign;
5546     int32_t aExp, bExp, expDiff;
5547     uint64_t aSig0, aSig1, bSig;
5548     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5549 
5550     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5551         float_raise(float_flag_invalid, status);
5552         return floatx80_default_nan(status);
5553     }
5554     aSig0 = extractFloatx80Frac( a );
5555     aExp = extractFloatx80Exp( a );
5556     aSign = extractFloatx80Sign( a );
5557     bSig = extractFloatx80Frac( b );
5558     bExp = extractFloatx80Exp( b );
5559     if ( aExp == 0x7FFF ) {
             /* NaN in either operand wins; otherwise a is infinite: invalid. */
5560         if (    (uint64_t) ( aSig0<<1 )
5561              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5562             return propagateFloatx80NaN(a, b, status);
5563         }
5564         goto invalid;
5565     }
5566     if ( bExp == 0x7FFF ) {
5567         if ((uint64_t)(bSig << 1)) {
5568             return propagateFloatx80NaN(a, b, status);
5569         }
             /* b is infinite and a is finite: the remainder is a itself. */
5570         return a;
5571     }
5572     if ( bExp == 0 ) {
5573         if ( bSig == 0 ) {
                 /* Remainder by zero; also the target for an infinite a. */
5574  invalid:
5575             float_raise(float_flag_invalid, status);
5576             return floatx80_default_nan(status);
5577         }
5578         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5579     }
5580     if ( aExp == 0 ) {
             /*
              * Test the whole significand, exactly as is done for b above.
              * With a zero exponent the explicit integer bit (bit 63) may
              * still be set (a "pseudo-denormal"); such a value is nonzero
              * and must be normalized and reduced like any other operand,
              * so bit 63 must not be shifted out before the comparison.
              */
5581         if ( aSig0 == 0 ) return a;
5582         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5583     }
5584     bSig |= LIT64( 0x8000000000000000 );
         /* The result starts out with the sign of a; it may be flipped below. */
5585     zSign = aSign;
5586     expDiff = aExp - bExp;
5587     aSig1 = 0;
5588     if ( expDiff < 0 ) {
             /* a's exponent is at least two below b's: a is already the result. */
5589         if ( expDiff < -1 ) return a;
             /* Exactly one below: halve a into the 128-bit pair aSig0:aSig1. */
5590         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5591         expDiff = 0;
5592     }
         /* Leading quotient bit. */
5593     q = ( bSig <= aSig0 );
5594     if ( q ) aSig0 -= bSig;
5595     expDiff -= 64;
         /*
          * Reduce 62 exponent steps per pass.  The quotient estimate is backed
          * off by 2 (clamped at 0) before its product with bSig is subtracted
          * from the running 128-bit remainder.
          */
5596     while ( 0 < expDiff ) {
5597         q = estimateDiv128To64( aSig0, aSig1, bSig );
5598         q = ( 2 < q ) ? q - 2 : 0;
5599         mul64To128( bSig, q, &term0, &term1 );
5600         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5601         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5602         expDiff -= 62;
5603     }
5604     expDiff += 64;
5605     if ( 0 < expDiff ) {
             /*
              * Final partial step: only the top expDiff bits of the estimate
              * are kept.  term0:term1 is then set to bSig shifted to the
              * matching position, and q is stepped up while the remainder is
              * still at least that large.
              */
5606         q = estimateDiv128To64( aSig0, aSig1, bSig );
5607         q = ( 2 < q ) ? q - 2 : 0;
5608         q >>= 64 - expDiff;
5609         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5610         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5611         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5612         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5613             ++q;
5614             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5615         }
5616     }
5617     else {
             /* No further reduction: the divisor term is bSig in the high word. */
5618         term1 = 0;
5619         term0 = bSig;
5620     }
         /*
          * alternateASig is (divisor term - remainder).  It replaces the
          * remainder, with the sign flipped, when it is strictly smaller, or
          * on an exact tie when q is odd.
          */
5621     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5622     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5623          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5624               && ( q & 1 ) )
5625        ) {
5626         aSig0 = alternateASig0;
5627         aSig1 = alternateASig1;
5628         zSign = ! zSign;
5629     }
5630     return
5631         normalizeRoundAndPackFloatx80(
5632             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5633 
5634 }
5635 
5636 /*----------------------------------------------------------------------------
5637 | Returns the square root of the extended double-precision floating-point
5638 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5639 | for Binary Floating-Point Arithmetic.
5640 *----------------------------------------------------------------------------*/
5641 
/*
 * Operands:
 *   a      - operand
 *   status - receives raised exception flags and supplies
 *            floatx80_rounding_precision for the final rounding
 *
 * Special cases handled before the main computation:
 *   - operand rejected by floatx80_invalid_encoding(): invalid, default NaN
 *   - NaN: result of propagateFloatx80NaN(a, a, status)
 *   - positive operand with exponent 0x7FFF and no NaN payload: returned as is
 *   - negative operand whose exponent and significand are both zero:
 *     returned as is
 *   - any other negative operand: invalid is raised, default NaN returned
 *   - positive zero: built with packFloatx80(0, 0, 0)
 */
5642 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5643 {
5644     flag aSign;
5645     int32_t aExp, zExp;
5646     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5647     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5648 
5649     if (floatx80_invalid_encoding(a)) {
5650         float_raise(float_flag_invalid, status);
5651         return floatx80_default_nan(status);
5652     }
5653     aSig0 = extractFloatx80Frac( a );
5654     aExp = extractFloatx80Exp( a );
5655     aSign = extractFloatx80Sign( a );
5656     if ( aExp == 0x7FFF ) {
5657         if ((uint64_t)(aSig0 << 1)) {
5658             return propagateFloatx80NaN(a, a, status);
5659         }
5660         if ( ! aSign ) return a;
5661         goto invalid;
5662     }
5663     if ( aSign ) {
5664         if ( ( aExp | aSig0 ) == 0 ) return a;
5665  invalid:
5666         float_raise(float_flag_invalid, status);
5667         return floatx80_default_nan(status);
5668     }
5669     if ( aExp == 0 ) {
5670         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5671         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5672     }
         /* Halve the unbiased exponent (bias 0x3FFF) to get the result exponent. */
5673     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
         /* Starting estimate computed from the top 32 significand bits. */
5674     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
         /* Shift by 2, or by 3 when the exponent is odd, into aSig0:aSig1. */
5675     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
         /* Refine the estimate to 64 bits with one division step. */
5676     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5677     doubleZSig0 = zSig0<<1;
         /* rem = a - zSig0^2; step zSig0 down while that remainder is negative. */
5678     mul64To128( zSig0, zSig0, &term0, &term1 );
5679     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5680     while ( (int64_t) rem0 < 0 ) {
5681         --zSig0;
5682         doubleZSig0 -= 2;
5683         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5684     }
         /* Estimate for the next 64 result bits. */
5685     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
         /*
          * Only when the low 62 bits of that estimate are <= 5 is the exact
          * 192-bit remainder worked out, so that zSig1 can be corrected
          * downwards and a sticky bit ORed in when the remainder is nonzero.
          * Otherwise the estimate is used as it stands.
          */
5686     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5687         if ( zSig1 == 0 ) zSig1 = 1;
5688         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5689         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5690         mul64To128( zSig1, zSig1, &term2, &term3 );
5691         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5692         while ( (int64_t) rem1 < 0 ) {
5693             --zSig1;
5694             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5695             term3 |= 1;
5696             term2 |= doubleZSig0;
5697             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5698         }
5699         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5700     }
         /* Assemble the 128-bit significand: zSig1 doubled, doubleZSig0 on top. */
5701     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5702     zSig0 |= doubleZSig0;
5703     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5704                                 0, zExp, zSig0, zSig1, status);
5705 }
5706 
5707 /*----------------------------------------------------------------------------
5708 | Returns 1 if the extended double-precision floating-point value `a' is equal
5709 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5710 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5711 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5712 *----------------------------------------------------------------------------*/
5713 
/*
 * Signaling equality test.  Returns 0 and raises invalid when either operand
 * is rejected by floatx80_invalid_encoding() or is a NaN (exponent 0x7FFF
 * with any significand bit below bit 63 set).
 */
5714 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5715 {
5716 
5717     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5718         || (extractFloatx80Exp(a) == 0x7FFF
5719             && (uint64_t) (extractFloatx80Frac(a) << 1))
5720         || (extractFloatx80Exp(b) == 0x7FFF
5721             && (uint64_t) (extractFloatx80Frac(b) << 1))
5722        ) {
5723         float_raise(float_flag_invalid, status);
5724         return 0;
5725     }
         /*
          * Equal when the `low' words match and either the `high' words match
          * as well, or `low' is zero and both `high' words are zero once
          * bit 15 has been shifted out.  The second case makes two zeros
          * compare equal whatever bit 15 holds (presumably the sign bit --
          * extractFloatx80Sign is defined elsewhere).
          */
5726     return
5727            ( a.low == b.low )
5728         && (    ( a.high == b.high )
5729              || (    ( a.low == 0 )
5730                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5731            );
5732 
5733 }
5734 
5735 /*----------------------------------------------------------------------------
5736 | Returns 1 if the extended double-precision floating-point value `a' is
5737 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5738 | invalid exception is raised if either operand is a NaN.  The comparison is
5739 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5740 | Arithmetic.
5741 *----------------------------------------------------------------------------*/
5742 
/*
 * Signaling less-than-or-equal test.  Returns 0 and raises invalid when
 * either operand is rejected by floatx80_invalid_encoding() or is a NaN
 * (exponent 0x7FFF with any significand bit below bit 63 set).
 */
5743 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5744 {
5745     flag aSign, bSign;
5746 
5747     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5748         || (extractFloatx80Exp(a) == 0x7FFF
5749             && (uint64_t) (extractFloatx80Frac(a) << 1))
5750         || (extractFloatx80Exp(b) == 0x7FFF
5751             && (uint64_t) (extractFloatx80Frac(b) << 1))
5752        ) {
5753         float_raise(float_flag_invalid, status);
5754         return 0;
5755     }
5756     aSign = extractFloatx80Sign( a );
5757     bSign = extractFloatx80Sign( b );
5758     if ( aSign != bSign ) {
             /*
              * Signs differ: true when a is the negative operand, or when
              * both operands are zero (all bits clear apart from bit 15 of
              * the `high' words).
              */
5759         return
5760                aSign
5761             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5762                  == 0 );
5763     }
         /*
          * Same sign: compare the (high, low) pairs with le128(), with the
          * operands swapped when both are negative.
          */
5764     return
5765           aSign ? le128( b.high, b.low, a.high, a.low )
5766         : le128( a.high, a.low, b.high, b.low );
5767 
5768 }
5769 
5770 /*----------------------------------------------------------------------------
5771 | Returns 1 if the extended double-precision floating-point value `a' is
5772 | less than the corresponding value `b', and 0 otherwise.  The invalid
5773 | exception is raised if either operand is a NaN.  The comparison is performed
5774 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5775 *----------------------------------------------------------------------------*/
5776 
/*
 * Signaling less-than test.  Returns 0 and raises invalid when either
 * operand is rejected by floatx80_invalid_encoding() or is a NaN (exponent
 * 0x7FFF with any significand bit below bit 63 set).
 */
5777 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5778 {
5779     flag aSign, bSign;
5780 
5781     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5782         || (extractFloatx80Exp(a) == 0x7FFF
5783             && (uint64_t) (extractFloatx80Frac(a) << 1))
5784         || (extractFloatx80Exp(b) == 0x7FFF
5785             && (uint64_t) (extractFloatx80Frac(b) << 1))
5786        ) {
5787         float_raise(float_flag_invalid, status);
5788         return 0;
5789     }
5790     aSign = extractFloatx80Sign( a );
5791     bSign = extractFloatx80Sign( b );
5792     if ( aSign != bSign ) {
             /*
              * Signs differ: true only when a is the negative operand and the
              * operands are not both zero (some bit other than bit 15 of the
              * `high' words is set).
              */
5793         return
5794                aSign
5795             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5796                  != 0 );
5797     }
         /*
          * Same sign: compare the (high, low) pairs with lt128(), with the
          * operands swapped when both are negative.
          */
5798     return
5799           aSign ? lt128( b.high, b.low, a.high, a.low )
5800         : lt128( a.high, a.low, b.high, b.low );
5801 
5802 }
5803 
5804 /*----------------------------------------------------------------------------
5805 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5806 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5807 | either operand is a NaN.   The comparison is performed according to the
5808 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5809 *----------------------------------------------------------------------------*/
/*
 * Signaling unordered test.  Returns 1 and raises invalid when either
 * operand is rejected by floatx80_invalid_encoding() or is a NaN (exponent
 * 0x7FFF with any significand bit below bit 63 set); returns 0 otherwise,
 * without raising anything.
 */
5810 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5811 {
5812     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5813         || (extractFloatx80Exp(a) == 0x7FFF
5814             && (uint64_t) (extractFloatx80Frac(a) << 1))
5815         || (extractFloatx80Exp(b) == 0x7FFF
5816             && (uint64_t) (extractFloatx80Frac(b) << 1))
5817        ) {
5818         float_raise(float_flag_invalid, status);
5819         return 1;
5820     }
5821     return 0;
5822 }
5823 
5824 /*----------------------------------------------------------------------------
5825 | Returns 1 if the extended double-precision floating-point value `a' is
5826 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5827 | cause an exception.  The comparison is performed according to the IEC/IEEE
5828 | Standard for Binary Floating-Point Arithmetic.
5829 *----------------------------------------------------------------------------*/
5830 
/*
 * Quiet equality test.  An operand rejected by floatx80_invalid_encoding()
 * always raises invalid and gives 0.  A NaN operand gives 0 as well, but
 * invalid is raised only if floatx80_is_signaling_nan() reports one of the
 * operands as signaling.
 */
5831 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5832 {
5833 
5834     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5835         float_raise(float_flag_invalid, status);
5836         return 0;
5837     }
5838     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5839               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5840          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5841               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5842        ) {
5843         if (floatx80_is_signaling_nan(a, status)
5844          || floatx80_is_signaling_nan(b, status)) {
5845             float_raise(float_flag_invalid, status);
5846         }
5847         return 0;
5848     }
         /*
          * Equal when the `low' words match and either the `high' words match
          * as well, or `low' is zero and both `high' words are zero once
          * bit 15 has been shifted out (two zeros, whatever bit 15 holds).
          */
5849     return
5850            ( a.low == b.low )
5851         && (    ( a.high == b.high )
5852              || (    ( a.low == 0 )
5853                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5854            );
5855 
5856 }
5857 
5858 /*----------------------------------------------------------------------------
5859 | Returns 1 if the extended double-precision floating-point value `a' is less
5860 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5861 | do not cause an exception.  Otherwise, the comparison is performed according
5862 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5863 *----------------------------------------------------------------------------*/
5864 
/*
 * Quiet less-than-or-equal test.  An operand rejected by
 * floatx80_invalid_encoding() always raises invalid and gives 0.  A NaN
 * operand gives 0 as well, but invalid is raised only if
 * floatx80_is_signaling_nan() reports one of the operands as signaling.
 */
5865 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5866 {
5867     flag aSign, bSign;
5868 
5869     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5870         float_raise(float_flag_invalid, status);
5871         return 0;
5872     }
5873     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5874               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5875          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5876               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5877        ) {
5878         if (floatx80_is_signaling_nan(a, status)
5879          || floatx80_is_signaling_nan(b, status)) {
5880             float_raise(float_flag_invalid, status);
5881         }
5882         return 0;
5883     }
5884     aSign = extractFloatx80Sign( a );
5885     bSign = extractFloatx80Sign( b );
5886     if ( aSign != bSign ) {
             /*
              * Signs differ: true when a is the negative operand, or when
              * both operands are zero (all bits clear apart from bit 15 of
              * the `high' words).
              */
5887         return
5888                aSign
5889             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5890                  == 0 );
5891     }
         /*
          * Same sign: compare the (high, low) pairs with le128(), with the
          * operands swapped when both are negative.
          */
5892     return
5893           aSign ? le128( b.high, b.low, a.high, a.low )
5894         : le128( a.high, a.low, b.high, b.low );
5895 
5896 }
5897 
5898 /*----------------------------------------------------------------------------
5899 | Returns 1 if the extended double-precision floating-point value `a' is less
5900 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5901 | an exception.  Otherwise, the comparison is performed according to the
5902 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5903 *----------------------------------------------------------------------------*/
5904 
/*
 * Quiet less-than test.  An operand rejected by floatx80_invalid_encoding()
 * always raises invalid and gives 0.  A NaN operand gives 0 as well, but
 * invalid is raised only if floatx80_is_signaling_nan() reports one of the
 * operands as signaling.
 */
5905 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5906 {
5907     flag aSign, bSign;
5908 
5909     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5910         float_raise(float_flag_invalid, status);
5911         return 0;
5912     }
5913     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5914               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5915          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5916               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5917        ) {
5918         if (floatx80_is_signaling_nan(a, status)
5919          || floatx80_is_signaling_nan(b, status)) {
5920             float_raise(float_flag_invalid, status);
5921         }
5922         return 0;
5923     }
5924     aSign = extractFloatx80Sign( a );
5925     bSign = extractFloatx80Sign( b );
5926     if ( aSign != bSign ) {
             /*
              * Signs differ: true only when a is the negative operand and the
              * operands are not both zero (some bit other than bit 15 of the
              * `high' words is set).
              */
5927         return
5928                aSign
5929             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5930                  != 0 );
5931     }
         /*
          * Same sign: compare the (high, low) pairs with lt128(), with the
          * operands swapped when both are negative.
          */
5932     return
5933           aSign ? lt128( b.high, b.low, a.high, a.low )
5934         : lt128( a.high, a.low, b.high, b.low );
5935 
5936 }
5937 
5938 /*----------------------------------------------------------------------------
5939 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5940 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5941 | The comparison is performed according to the IEC/IEEE Standard for Binary
5942 | Floating-Point Arithmetic.
5943 *----------------------------------------------------------------------------*/
/*
 * Quiet unordered test.  An operand rejected by floatx80_invalid_encoding()
 * raises invalid and gives 1.  A NaN operand gives 1 as well, but invalid is
 * raised only if floatx80_is_signaling_nan() reports one of the operands as
 * signaling.  Anything else gives 0.
 */
5944 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5945 {
5946     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5947         float_raise(float_flag_invalid, status);
5948         return 1;
5949     }
5950     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5951               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5952          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5953               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5954        ) {
5955         if (floatx80_is_signaling_nan(a, status)
5956          || floatx80_is_signaling_nan(b, status)) {
5957             float_raise(float_flag_invalid, status);
5958         }
5959         return 1;
5960     }
5961     return 0;
5962 }
5963 
5964 /*----------------------------------------------------------------------------
5965 | Returns the result of converting the quadruple-precision floating-point
5966 | value `a' to the 32-bit two's complement integer format.  The conversion
5967 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5968 | Arithmetic---which means in particular that the conversion is rounded
5969 | according to the current rounding mode.  If `a' is a NaN, the largest
5970 | positive integer is returned.  Otherwise, if the conversion overflows, the
5971 | largest integer with the same sign as `a' is returned.
5972 *----------------------------------------------------------------------------*/
5973 
/*
 * Operands:
 *   a      - value to convert
 *   status - passed to roundAndPackInt32(), which performs the rounding and
 *            produces the final result (that helper is not shown here)
 */
5974 int32_t float128_to_int32(float128 a, float_status *status)
5975 {
5976     flag aSign;
5977     int32_t aExp, shiftCount;
5978     uint64_t aSig0, aSig1;
5979 
5980     aSig1 = extractFloat128Frac1( a );
5981     aSig0 = extractFloat128Frac0( a );
5982     aExp = extractFloat128Exp( a );
5983     aSign = extractFloat128Sign( a );
         /* A NaN (exponent 0x7FFF, nonzero fraction) is handled as positive. */
5984     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
         /* Nonzero exponent: make the leading significand bit (bit 48) explicit. */
5985     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
         /* Collapse the low 64 fraction bits into a sticky bit in aSig0. */
5986     aSig0 |= ( aSig1 != 0 );
5987     shiftCount = 0x4028 - aExp;
         /* Align for the rounding helper; bits shifted out stay sticky. */
5988     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5989     return roundAndPackInt32(aSign, aSig0, status);
5990 
5991 }
5992 
5993 /*----------------------------------------------------------------------------
5994 | Returns the result of converting the quadruple-precision floating-point
5995 | value `a' to the 32-bit two's complement integer format.  The conversion
5996 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5997 | Arithmetic, except that the conversion is always rounded toward zero.  If
5998 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5999 | conversion overflows, the largest integer with the same sign as `a' is
6000 | returned.
6001 *----------------------------------------------------------------------------*/
6002 
/*
 * Operands:
 *   a      - value to convert
 *   status - receives the invalid flag (via float_raise) on overflow or NaN,
 *            and the inexact flag when nonzero bits are discarded
 *
 * Returns the truncated value; on the invalid path, (int32_t) 0x80000000 for
 * a negative non-NaN operand and 0x7FFFFFFF otherwise.
 */
6003 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6004 {
6005     flag aSign;
6006     int32_t aExp, shiftCount;
6007     uint64_t aSig0, aSig1, savedASig;
6008     int32_t z;
6009 
6010     aSig1 = extractFloat128Frac1( a );
6011     aSig0 = extractFloat128Frac0( a );
6012     aExp = extractFloat128Exp( a );
6013     aSign = extractFloat128Sign( a );
         /* Collapse the low 64 fraction bits into a sticky bit in aSig0. */
6014     aSig0 |= ( aSig1 != 0 );
6015     if ( 0x401E < aExp ) {
             /* Exponent too large, which includes exponent 0x7FFF; a NaN is
              * handled as positive. */
6016         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6017         goto invalid;
6018     }
6019     else if ( aExp < 0x3FFF ) {
             /* Exponent below the bias 0x3FFF: result is 0, inexact unless
              * the operand itself is zero. */
6020         if (aExp || aSig0) {
6021             status->float_exception_flags |= float_flag_inexact;
6022         }
6023         return 0;
6024     }
         /* Make the leading significand bit (bit 48) explicit. */
6025     aSig0 |= LIT64( 0x0001000000000000 );
         /* Between 17 and 48 for the exponents that reach this point. */
6026     shiftCount = 0x402F - aExp;
6027     savedASig = aSig0;
6028     aSig0 >>= shiftCount;
6029     z = aSig0;
6030     if ( aSign ) z = - z;
         /* A result whose sign disagrees with the operand's did not fit. */
6031     if ( ( z < 0 ) ^ aSign ) {
6032  invalid:
6033         float_raise(float_flag_invalid, status);
6034         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6035     }
         /* Shifting back does not restore the saved value: bits were dropped. */
6036     if ( ( aSig0<<shiftCount ) != savedASig ) {
6037         status->float_exception_flags |= float_flag_inexact;
6038     }
6039     return z;
6040 
6041 }
6042 
6043 /*----------------------------------------------------------------------------
6044 | Returns the result of converting the quadruple-precision floating-point
6045 | value `a' to the 64-bit two's complement integer format.  The conversion
6046 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6047 | Arithmetic---which means in particular that the conversion is rounded
6048 | according to the current rounding mode.  If `a' is a NaN, the largest
6049 | positive integer is returned.  Otherwise, if the conversion overflows, the
6050 | largest integer with the same sign as `a' is returned.
6051 *----------------------------------------------------------------------------*/
6052 
/*
 * Operands:
 *   a      - value to convert
 *   status - receives the invalid flag when the exponent exceeds 0x403E;
 *            otherwise passed to roundAndPackInt64(), which performs the
 *            rounding (that helper is not shown here)
 */
6053 int64_t float128_to_int64(float128 a, float_status *status)
6054 {
6055     flag aSign;
6056     int32_t aExp, shiftCount;
6057     uint64_t aSig0, aSig1;
6058 
6059     aSig1 = extractFloat128Frac1( a );
6060     aSig0 = extractFloat128Frac0( a );
6061     aExp = extractFloat128Exp( a );
6062     aSign = extractFloat128Sign( a );
         /* Nonzero exponent: make the leading significand bit (bit 48) explicit. */
6063     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6064     shiftCount = 0x402F - aExp;
6065     if ( shiftCount <= 0 ) {
6066         if ( 0x403E < aExp ) {
6067             float_raise(float_flag_invalid, status);
                 /*
                  * Largest positive value for a positive operand or a NaN.
                  * Bit 48 was ORed into aSig0 above, so a NaN is recognised
                  * by aSig0 differing from that bit alone, or aSig1 != 0.
                  */
6068             if (    ! aSign
6069                  || (    ( aExp == 0x7FFF )
6070                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6071                     )
6072                ) {
6073                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6074             }
6075             return (int64_t) LIT64( 0x8000000000000000 );
6076         }
             /* Exponent 0x402F..0x403E: shift the significand pair left. */
6077         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6078     }
6079     else {
             /* Smaller exponent: shift right; aSig1 receives the extra bits. */
6080         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6081     }
6082     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6083 
6084 }
6085 
6086 /*----------------------------------------------------------------------------
6087 | Returns the result of converting the quadruple-precision floating-point
6088 | value `a' to the 64-bit two's complement integer format.  The conversion
6089 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6090 | Arithmetic, except that the conversion is always rounded toward zero.
6091 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6092 | the conversion overflows, the largest integer with the same sign as `a' is
6093 | returned.
6094 *----------------------------------------------------------------------------*/
6095 
/*
 * Operands:
 *   a      - value to convert
 *   status - receives the invalid flag (via float_raise) on overflow or NaN,
 *            and the inexact flag when nonzero bits are discarded
 *
 * Note that shiftCount here is aExp - 0x402F, i.e. positive means a left
 * shift of the significand.
 */
6096 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6097 {
6098     flag aSign;
6099     int32_t aExp, shiftCount;
6100     uint64_t aSig0, aSig1;
6101     int64_t z;
6102 
6103     aSig1 = extractFloat128Frac1( a );
6104     aSig0 = extractFloat128Frac0( a );
6105     aExp = extractFloat128Exp( a );
6106     aSign = extractFloat128Sign( a );
         /* Nonzero exponent: make the leading significand bit (bit 48) explicit. */
6107     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6108     shiftCount = aExp - 0x402F;
6109     if ( 0 < shiftCount ) {
6110         if ( 0x403E <= aExp ) {
                 /* Drop the explicit bit again so aSig0 | aSig1 tests the
                  * fraction alone in the NaN check below. */
6111             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
                 /*
                  * Only one case at this exponent is accepted: high word
                  * 0xC03E000000000000 with a low word below 2^49.  It yields
                  * the most negative value, inexact when any low bit is set.
                  */
6112             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6113                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6114                 if (aSig1) {
6115                     status->float_exception_flags |= float_flag_inexact;
6116                 }
6117             }
6118             else {
6119                 float_raise(float_flag_invalid, status);
                     /* Positive operand or NaN: largest positive value. */
6120                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6121                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6122                 }
6123             }
6124             return (int64_t) LIT64( 0x8000000000000000 );
6125         }
             /* Left shift by 1..14, pulling the top bits of aSig1 in below. */
6126         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
             /* Whatever is left of aSig1 below the integer part was dropped. */
6127         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6128             status->float_exception_flags |= float_flag_inexact;
6129         }
6130     }
6131     else {
6132         if ( aExp < 0x3FFF ) {
                 /* Exponent below the bias 0x3FFF: result is 0, inexact
                  * unless the operand itself is zero. */
6133             if ( aExp | aSig0 | aSig1 ) {
6134                 status->float_exception_flags |= float_flag_inexact;
6135             }
6136             return 0;
6137         }
6138         z = aSig0>>( - shiftCount );
             /*
              * Inexact when the low word is nonzero, or when the right shift
              * above dropped set bits of aSig0; shiftCount & 63 is the
              * complementary left-shift amount for a negative shiftCount.
              */
6139         if (    aSig1
6140              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6141             status->float_exception_flags |= float_flag_inexact;
6142         }
6143     }
6144     if ( aSign ) z = - z;
6145     return z;
6146 
6147 }
6148 
6149 /*----------------------------------------------------------------------------
6150 | Returns the result of converting the quadruple-precision floating-point value
6151 | `a' to the 64-bit unsigned integer format.  The conversion is
6152 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6153 | Arithmetic---which means in particular that the conversion is rounded
6154 | according to the current rounding mode.  If `a' is a NaN, the largest
6155 | positive integer is returned.  If the conversion overflows, the
6156 | largest unsigned integer is returned.  If 'a' is negative, the value is
6157 | rounded and zero is returned; negative values that do not round to zero
6158 | will raise the inexact exception.
6159 *----------------------------------------------------------------------------*/
6160 
/*
 * Operands:
 *   a      - value to convert
 *   status - receives the invalid flag on the paths below; otherwise passed
 *            to roundAndPackUint64(), which performs the rounding (that
 *            helper is not shown here)
 *
 * Paths decided in this function:
 *   - negative operand with exponent above 0x3FFE: invalid is raised; the
 *     result is 0xFFFFFFFFFFFFFFFF if float128_is_any_nan(a), otherwise 0
 *   - exponent above 0x403E: invalid is raised, result 0xFFFFFFFFFFFFFFFF
 */
6161 uint64_t float128_to_uint64(float128 a, float_status *status)
6162 {
6163     flag aSign;
6164     int aExp;
6165     int shiftCount;
6166     uint64_t aSig0, aSig1;
6167 
6168     aSig0 = extractFloat128Frac0(a);
6169     aSig1 = extractFloat128Frac1(a);
6170     aExp = extractFloat128Exp(a);
6171     aSign = extractFloat128Sign(a);
6172     if (aSign && (aExp > 0x3FFE)) {
6173         float_raise(float_flag_invalid, status);
6174         if (float128_is_any_nan(a)) {
6175             return LIT64(0xFFFFFFFFFFFFFFFF);
6176         } else {
6177             return 0;
6178         }
6179     }
         /* Nonzero exponent: make the leading significand bit (bit 48) explicit. */
6180     if (aExp) {
6181         aSig0 |= LIT64(0x0001000000000000);
6182     }
6183     shiftCount = 0x402F - aExp;
6184     if (shiftCount <= 0) {
6185         if (0x403E < aExp) {
6186             float_raise(float_flag_invalid, status);
6187             return LIT64(0xFFFFFFFFFFFFFFFF);
6188         }
             /* Exponent 0x402F..0x403E: shift the significand pair left. */
6189         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6190     } else {
             /* Smaller exponent: shift right; aSig1 receives the extra bits. */
6191         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6192     }
6193     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6194 }
6195 
/*
 * Converts `a' to a 64-bit unsigned integer by calling float128_to_uint64()
 * with the rounding mode in `status' temporarily set to float_round_to_zero.
 * The rounding mode found on entry is put back before returning; exception
 * flags raised by the conversion are left in `status'.
 */
6196 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6197 {
6198     uint64_t v;
         /* Remember the caller's rounding mode so it can be restored. */
6199     signed char current_rounding_mode = status->float_rounding_mode;
6200 
6201     set_float_rounding_mode(float_round_to_zero, status);
6202     v = float128_to_uint64(a, status);
6203     set_float_rounding_mode(current_rounding_mode, status);
6204 
6205     return v;
6206 }
6207 
6208 /*----------------------------------------------------------------------------
6209 | Returns the result of converting the quadruple-precision floating-point
6210 | value `a' to the 32-bit unsigned integer format.  The conversion
6211 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6212 | Arithmetic except that the conversion is always rounded toward zero.
6213 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6214 | if the conversion overflows, the largest unsigned integer is returned.
6215 | If 'a' is negative, the value is rounded and zero is returned; negative
6216 | values that do not round to zero will raise the inexact exception.
6217 *----------------------------------------------------------------------------*/
6218 
/*
 * Implemented on top of float128_to_uint64_round_to_zero().
 *
 *   a      - value to convert
 *   status - exception flags; when the 64-bit result fits in 32 bits, the
 *            flags raised by the 64-bit conversion are kept as they are
 *
 * When the 64-bit result exceeds 0xffffffff, the flags are first reset to
 * their value on entry, then invalid alone is raised and 0xffffffff is
 * returned.
 */
6219 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6220 {
6221     uint64_t v;
6222     uint32_t res;
         /* Snapshot of the flags, used only on the out-of-range path. */
6223     int old_exc_flags = get_float_exception_flags(status);
6224 
6225     v = float128_to_uint64_round_to_zero(a, status);
6226     if (v > 0xffffffff) {
6227         res = 0xffffffff;
6228     } else {
6229         return v;
6230     }
         /* Discard whatever the 64-bit conversion raised; report invalid only. */
6231     set_float_exception_flags(old_exc_flags, status);
6232     float_raise(float_flag_invalid, status);
6233     return res;
6234 }
6235 
6236 /*----------------------------------------------------------------------------
6237 | Returns the result of converting the quadruple-precision floating-point
6238 | value `a' to the single-precision floating-point format.  The conversion
6239 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6240 | Arithmetic.
6241 *----------------------------------------------------------------------------*/
6242 
/*
 * Operands:
 *   a      - value to convert
 *   status - passed to the NaN conversion helpers and to
 *            roundAndPackFloat32(), which performs the rounding
 *
 * Exponent 0x7FFF is handled up front: a nonzero fraction goes through
 * float128ToCommonNaN()/commonNaNToFloat32(); a zero fraction gives
 * packFloat32(aSign, 0xFF, 0).
 */
6243 float32 float128_to_float32(float128 a, float_status *status)
6244 {
6245     flag aSign;
6246     int32_t aExp;
6247     uint64_t aSig0, aSig1;
6248     uint32_t zSig;
6249 
6250     aSig1 = extractFloat128Frac1( a );
6251     aSig0 = extractFloat128Frac0( a );
6252     aExp = extractFloat128Exp( a );
6253     aSign = extractFloat128Sign( a );
6254     if ( aExp == 0x7FFF ) {
6255         if ( aSig0 | aSig1 ) {
6256             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6257         }
6258         return packFloat32( aSign, 0xFF, 0 );
6259     }
         /* Collapse the low 64 fraction bits into a sticky bit in aSig0. */
6260     aSig0 |= ( aSig1 != 0 );
         /* Reduce to 32 bits; bits shifted out stay sticky. */
6261     shift64RightJamming( aSig0, 18, &aSig0 );
6262     zSig = aSig0;
         /* Anything but an exact zero: set bit 30 and rebase the exponent
          * before handing over to the rounding helper. */
6263     if ( aExp || zSig ) {
6264         zSig |= 0x40000000;
6265         aExp -= 0x3F81;
6266     }
6267     return roundAndPackFloat32(aSign, aExp, zSig, status);
6268 
6269 }
6270 
6271 /*----------------------------------------------------------------------------
6272 | Returns the result of converting the quadruple-precision floating-point
6273 | value `a' to the double-precision floating-point format.  The conversion
6274 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6275 | Arithmetic.
6276 *----------------------------------------------------------------------------*/
6277 
/*
 * Operands:
 *   a      - value to convert
 *   status - passed to the NaN conversion helpers and to
 *            roundAndPackFloat64(), which performs the rounding
 *
 * Exponent 0x7FFF is handled up front: a nonzero fraction goes through
 * float128ToCommonNaN()/commonNaNToFloat64(); a zero fraction gives
 * packFloat64(aSign, 0x7FF, 0).
 */
6278 float64 float128_to_float64(float128 a, float_status *status)
6279 {
6280     flag aSign;
6281     int32_t aExp;
6282     uint64_t aSig0, aSig1;
6283 
6284     aSig1 = extractFloat128Frac1( a );
6285     aSig0 = extractFloat128Frac0( a );
6286     aExp = extractFloat128Exp( a );
6287     aSign = extractFloat128Sign( a );
6288     if ( aExp == 0x7FFF ) {
6289         if ( aSig0 | aSig1 ) {
6290             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6291         }
6292         return packFloat64( aSign, 0x7FF, 0 );
6293     }
         /* Move the fraction up by 14 bits, then fold what remains in the
          * low word into a sticky bit. */
6294     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6295     aSig0 |= ( aSig1 != 0 );
         /* Anything but an exact zero: set bit 62 and rebase the exponent
          * before handing over to the rounding helper. */
6296     if ( aExp || aSig0 ) {
6297         aSig0 |= LIT64( 0x4000000000000000 );
6298         aExp -= 0x3C01;
6299     }
6300     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6301 
6302 }
6303 
6304 /*----------------------------------------------------------------------------
6305 | Returns the result of converting the quadruple-precision floating-point
6306 | value `a' to the extended double-precision floating-point format.  The
6307 | conversion is performed according to the IEC/IEEE Standard for Binary
6308 | Floating-Point Arithmetic.
6309 *----------------------------------------------------------------------------*/
6310 
6311 floatx80 float128_to_floatx80(float128 a, float_status *status)
6312 {
6313     flag aSign;
6314     int32_t aExp;
6315     uint64_t aSig0, aSig1;
6316 
6317     aSig1 = extractFloat128Frac1( a );
6318     aSig0 = extractFloat128Frac0( a );
6319     aExp = extractFloat128Exp( a );
6320     aSign = extractFloat128Sign( a );
6321     if ( aExp == 0x7FFF ) {
6322         if ( aSig0 | aSig1 ) {
6323             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6324         }
6325         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6326     }
6327     if ( aExp == 0 ) {
6328         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6329         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6330     }
6331     else {
6332         aSig0 |= LIT64( 0x0001000000000000 );
6333     }
6334     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6335     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6336 
6337 }
6338 
6339 /*----------------------------------------------------------------------------
6340 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6341 | returns the result as a quadruple-precision floating-point value.  The
6342 | operation is performed according to the IEC/IEEE Standard for Binary
6343 | Floating-Point Arithmetic.
6344 *----------------------------------------------------------------------------*/
6345 
float128 float128_round_to_int(float128 a, float_status *status)
{
    /*
     * Three exponent ranges are handled separately:
     *   aExp >= 0x402F : the rounding position lies in z.low (and for
     *                    aExp >= 0x406F nothing is rounded at all);
     *   aExp <  0x3FFF : the result is a signed zero or a value packed with
     *                    exponent 0x3FFF and a zero fraction;
     *   otherwise      : the rounding position lies in z.high.
     * The rounding mode is read from status->float_rounding_mode, and
     * float_flag_inexact is OR-ed into status->float_exception_flags when
     * rounding changes the value.
     */
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* Nothing to round: only a NaN (maximum exponent with a
             * non-zero fraction) is not returned verbatim. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * lastBitMask selects the lowest bit of z.low that is kept and
         * roundBitsMask the bits below it.  The shift is split in two so
         * that for aExp == 0x402F (total shift of 64) lastBitMask becomes 0
         * instead of shifting by the full width; roundBitsMask is then all
         * ones and the lowest kept bit is bit 0 of z.high.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /* Add half of the last kept bit; if the discarded bits are
                 * then all zero the input was an exact tie, so clear the
                 * last kept bit to land on the even value. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* All of z.low is discarded: its top bit means "at least
                 * half", and no other bit set means an exact tie. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as above but without the tie-to-even correction. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            /* Truncation: the final mask below is all that is needed. */
            break;
        case float_round_up:
            /* Increase the magnitude only when the sign bit is clear. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Increase the magnitude only when the sign bit is set. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Drop the discarded bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Every bit other than the sign is zero: return the input
             * unchanged without touching the flags. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * This switch has no default: a mode without a case here
             * (float_round_to_zero, for instance) and every `break' end
             * at the signed-zero return after the switch.
             */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction gives the
                 * exponent-0x3FFF result; a zero fraction falls through
                 * to the signed zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Here exponent 0x3FFE alone is enough, whatever the
                 * fraction. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /*
         * Rounding position inside the high word.  The low word of the
         * result is always zero; a.low only matters as "were any lower bits
         * set".
         */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie only if the discarded high-word bits and all of
             * a.low are zero. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* Fold a non-zero a.low into bit 0 so that the addition of
                 * roundBitsMask carries into the kept bits. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to either word means bits were discarded or added. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
6476 
6477 /*----------------------------------------------------------------------------
6478 | Returns the result of adding the absolute values of the quadruple-precision
6479 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6480 | before being returned.  `zSign' is ignored if the result is a NaN.
6481 | The addition is performed according to the IEC/IEEE Standard for Binary
6482 | Floating-Point Arithmetic.
6483 *----------------------------------------------------------------------------*/
6484 
static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    /*
     * The operand with the smaller exponent is shifted right to line up
     * with the other, the significands are added, and the sum is handed to
     * roundAndPackFloat128 together with zSign.  zSig2 carries the bits
     * produced by the "extra" jamming shifts.
     */
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            /* Maximum exponent: NaN if the fraction is non-zero, otherwise
             * `a' is returned unchanged. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent on b: shift one place less and do not set the
             * leading bit. */
            --expDiff;
        }
        else {
            /* Presumably the implicit leading bit of b's significand. */
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: mirror image of the branch above. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Repacked with zSign rather than returning `b' itself. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponents are zero: the raw sum is packed with exponent
             * 0, unless flush_to_zero is set, in which case a signed zero
             * is returned and float_flag_output_denormal is raised for a
             * non-zero sum. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* 0x0002... is twice the 0x0001... bit used above, i.e. presumably
         * both leading bits at once; the sum therefore always needs the
         * one-place right shift at shiftRight1. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /*
     * Reached from both unequal-exponent branches.  OR-ing the 0x0001...
     * bit into aSig0 here supplies the leading bit of whichever operand was
     * NOT shifted.
     * NOTE(review): this relies on that bit being clear in aSig0 at this
     * point in both branches -- confirm against extractFloat128Frac0.
     */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past the 0x0001... bit: round with the decremented
     * exponent.  Otherwise undo the decrement and shift right once. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6567 
6568 /*----------------------------------------------------------------------------
6569 | Returns the result of subtracting the absolute values of the quadruple-
6570 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6571 | difference is negated before being returned.  `zSign' is ignored if the
6572 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6573 | Standard for Binary Floating-Point Arithmetic.
6574 *----------------------------------------------------------------------------*/
6575 
static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    /*
     * Both significands are first shifted left by 14 bits.  The smaller
     * operand is aligned to the larger one, subtracted from it, and the
     * difference is passed to normalizeRoundAndPackFloat128.  zSign is
     * inverted whenever b turns out to be the larger magnitude.
     */
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here to the bExpBigger label. */
    if ( aExp == 0x7FFF ) {
        /* Any non-zero fraction: propagate the NaN.  Otherwise both
         * operands have maximum exponent and zero fraction: raise invalid
         * and return the default NaN. */
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Zero exponents are treated as exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare the significands, high word first. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: the zero result has its sign bit set only when
     * the rounding mode is float_round_down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Maximum exponent, zero fraction, with the sign inverted. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero exponent on a: shift one place less and do not set the
         * leading bit. */
        ++expDiff;
    }
    else {
        /* 0x4000... is the 0x0001000000000000 bit moved up by the 14-bit
         * pre-shift applied above. */
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The extra -14 matches the 14-bit left shift applied to both
     * significands at the top. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
6655 
6656 /*----------------------------------------------------------------------------
6657 | Returns the result of adding the quadruple-precision floating-point values
6658 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6659 | for Binary Floating-Point Arithmetic.
6660 *----------------------------------------------------------------------------*/
6661 
6662 float128 float128_add(float128 a, float128 b, float_status *status)
6663 {
6664     flag aSign, bSign;
6665 
6666     aSign = extractFloat128Sign( a );
6667     bSign = extractFloat128Sign( b );
6668     if ( aSign == bSign ) {
6669         return addFloat128Sigs(a, b, aSign, status);
6670     }
6671     else {
6672         return subFloat128Sigs(a, b, aSign, status);
6673     }
6674 
6675 }
6676 
6677 /*----------------------------------------------------------------------------
6678 | Returns the result of subtracting the quadruple-precision floating-point
6679 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6680 | Standard for Binary Floating-Point Arithmetic.
6681 *----------------------------------------------------------------------------*/
6682 
6683 float128 float128_sub(float128 a, float128 b, float_status *status)
6684 {
6685     flag aSign, bSign;
6686 
6687     aSign = extractFloat128Sign( a );
6688     bSign = extractFloat128Sign( b );
6689     if ( aSign == bSign ) {
6690         return subFloat128Sigs(a, b, aSign, status);
6691     }
6692     else {
6693         return addFloat128Sigs(a, b, aSign, status);
6694     }
6695 
6696 }
6697 
6698 /*----------------------------------------------------------------------------
6699 | Returns the result of multiplying the quadruple-precision floating-point
6700 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6701 | Standard for Binary Floating-Point Arithmetic.
6702 *----------------------------------------------------------------------------*/
6703 
6704 float128 float128_mul(float128 a, float128 b, float_status *status)
6705 {
6706     flag aSign, bSign, zSign;
6707     int32_t aExp, bExp, zExp;
6708     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6709 
6710     aSig1 = extractFloat128Frac1( a );
6711     aSig0 = extractFloat128Frac0( a );
6712     aExp = extractFloat128Exp( a );
6713     aSign = extractFloat128Sign( a );
6714     bSig1 = extractFloat128Frac1( b );
6715     bSig0 = extractFloat128Frac0( b );
6716     bExp = extractFloat128Exp( b );
6717     bSign = extractFloat128Sign( b );
6718     zSign = aSign ^ bSign;
6719     if ( aExp == 0x7FFF ) {
6720         if (    ( aSig0 | aSig1 )
6721              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6722             return propagateFloat128NaN(a, b, status);
6723         }
6724         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6725         return packFloat128( zSign, 0x7FFF, 0, 0 );
6726     }
6727     if ( bExp == 0x7FFF ) {
6728         if (bSig0 | bSig1) {
6729             return propagateFloat128NaN(a, b, status);
6730         }
6731         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6732  invalid:
6733             float_raise(float_flag_invalid, status);
6734             return float128_default_nan(status);
6735         }
6736         return packFloat128( zSign, 0x7FFF, 0, 0 );
6737     }
6738     if ( aExp == 0 ) {
6739         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6740         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6741     }
6742     if ( bExp == 0 ) {
6743         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6744         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6745     }
6746     zExp = aExp + bExp - 0x4000;
6747     aSig0 |= LIT64( 0x0001000000000000 );
6748     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6749     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6750     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6751     zSig2 |= ( zSig3 != 0 );
6752     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6753         shift128ExtraRightJamming(
6754             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6755         ++zExp;
6756     }
6757     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6758 
6759 }
6760 
6761 /*----------------------------------------------------------------------------
6762 | Returns the result of dividing the quadruple-precision floating-point value
6763 | `a' by the corresponding value `b'.  The operation is performed according to
6764 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6765 *----------------------------------------------------------------------------*/
6766 
float128 float128_div(float128 a, float128 b, float_status *status)
{
    /*
     * Special operands are dispatched first.  The quotient is then built
     * as two 64-bit words, each from an estimate that is corrected
     * downwards using the exact remainder.
     */
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* a has the maximum exponent. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Both operands: maximum exponent with zero fraction. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b has maximum exponent and zero fraction: signed zero. */
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* b is zero: invalid if a is zero too, otherwise raise
             * divbyzero and return the maximum exponent. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the 0x0001... bit on both significands and shift them left by
     * 15 bits. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* If b's significand is not larger than a's, halve a and bump the
     * exponent, so that the dividend is below the divisor afterwards. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* High quotient word: estimate, compute the remainder, and step the
     * estimate down while the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Low quotient word.  The exact correction is done only when the low
     * 14 bits of the estimate are at most 4; presumably the estimate's
     * error cannot influence rounding otherwise. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* A non-zero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6847 
6848 /*----------------------------------------------------------------------------
6849 | Returns the remainder of the quadruple-precision floating-point value `a'
6850 | with respect to the corresponding value `b'.  The operation is performed
6851 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6852 *----------------------------------------------------------------------------*/
6853 
float128 float128_rem(float128 a, float128 b, float_status *status)
{
    /*
     * Special operands are dispatched first.  The significand of a is then
     * reduced against b's in steps driven by expDiff, and the final stage
     * picks between the last non-negative and the first negative partial
     * remainder.
     */
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    if ( aExp == 0x7FFF ) {
        /* a has the maximum exponent: propagate a NaN from either
         * operand, otherwise the operation is invalid. */
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* b has maximum exponent and zero fraction: a is returned as is. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* b is zero. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* a's exponent is at least two below b's: a is returned unchanged. */
    if ( expDiff < -1 ) return a;
    /* Set the 0x0001... bit on both significands and shift left by 15;
     * a is shifted one place less when expDiff is -1. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* q starts as 1 if b's significand fits into a's once, else 0. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Each pass takes 61 off expDiff.  The quotient estimate is lowered
     * by 4 (clamped at 0), presumably so that it never exceeds the true
     * quotient. */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: the estimate is scaled down by -expDiff
         * bits, and both significands are moved to the alignment used by
         * the last stage (b shifted right by 12). */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Subtract b until the running value turns negative.  alternateASig
     * keeps the value from just before the last subtraction, and q counts
     * the subtractions. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* aSig is negative and alternateASig is not, so the sign of their sum
     * shows which has the smaller magnitude.  A negative sum selects
     * alternateASig.  A zero sum is a tie and selects alternateASig when
     * q is odd, i.e. when the count belonging to alternateASig (q - 1) is
     * even. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* A negative remainder is negated and flips the sign of the result. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
6954 
6955 /*----------------------------------------------------------------------------
6956 | Returns the square root of the quadruple-precision floating-point value `a'.
6957 | The operation is performed according to the IEC/IEEE Standard for Binary
6958 | Floating-Point Arithmetic.
6959 *----------------------------------------------------------------------------*/
6960 
float128 float128_sqrt(float128 a, float_status *status)
{
    /*
     * Special operands are dispatched first.  The root is then built as
     * two 64-bit words, each from an estimate that is corrected downwards
     * using the exact remainder.
     */
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp == 0x7FFF ) {
        /* Maximum exponent: propagate a NaN; a zero fraction is returned
         * as is when the sign is clear and is invalid when it is set. */
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Sign bit set: an all-zero exponent and fraction is returned
         * unchanged, anything else is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the exponent's distance from 0x3FFF and re-offset by 0x3FFE. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* First estimate from the top bits of the significand. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* An odd exponent shifts the significand one place less (12 instead
     * of 13). */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder after squaring the high word; step the estimate down
     * while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low word.  The exact correction is done only when the low 13 bits
     * of the estimate are at most 5; presumably the estimate's error
     * cannot influence rounding otherwise. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A non-zero remainder becomes a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    /* The result sign is always 0 on this path. */
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7022 
7023 /*----------------------------------------------------------------------------
7024 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7025 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7026 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7027 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7028 *----------------------------------------------------------------------------*/
7029 
7030 int float128_eq(float128 a, float128 b, float_status *status)
7031 {
7032 
7033     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7034               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7035          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7036               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7037        ) {
7038         float_raise(float_flag_invalid, status);
7039         return 0;
7040     }
7041     return
7042            ( a.low == b.low )
7043         && (    ( a.high == b.high )
7044              || (    ( a.low == 0 )
7045                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7046            );
7047 
7048 }
7049 
7050 /*----------------------------------------------------------------------------
7051 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7052 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7053 | exception is raised if either operand is a NaN.  The comparison is performed
7054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7055 *----------------------------------------------------------------------------*/
7056 
7057 int float128_le(float128 a, float128 b, float_status *status)
7058 {
7059     flag aSign, bSign;
7060 
7061     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7062               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7063          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7064               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7065        ) {
7066         float_raise(float_flag_invalid, status);
7067         return 0;
7068     }
7069     aSign = extractFloat128Sign( a );
7070     bSign = extractFloat128Sign( b );
7071     if ( aSign != bSign ) {
7072         return
7073                aSign
7074             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7075                  == 0 );
7076     }
7077     return
7078           aSign ? le128( b.high, b.low, a.high, a.low )
7079         : le128( a.high, a.low, b.high, b.low );
7080 
7081 }
7082 
7083 /*----------------------------------------------------------------------------
7084 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7085 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7086 | raised if either operand is a NaN.  The comparison is performed according
7087 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7088 *----------------------------------------------------------------------------*/
7089 
7090 int float128_lt(float128 a, float128 b, float_status *status)
7091 {
7092     flag aSign, bSign;
7093 
7094     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7095               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7096          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7097               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7098        ) {
7099         float_raise(float_flag_invalid, status);
7100         return 0;
7101     }
7102     aSign = extractFloat128Sign( a );
7103     bSign = extractFloat128Sign( b );
7104     if ( aSign != bSign ) {
7105         return
7106                aSign
7107             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7108                  != 0 );
7109     }
7110     return
7111           aSign ? lt128( b.high, b.low, a.high, a.low )
7112         : lt128( a.high, a.low, b.high, b.low );
7113 
7114 }
7115 
7116 /*----------------------------------------------------------------------------
7117 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7118 | be compared, and 0 otherwise.  The invalid exception is raised if either
7119 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7120 | Standard for Binary Floating-Point Arithmetic.
7121 *----------------------------------------------------------------------------*/
7122 
7123 int float128_unordered(float128 a, float128 b, float_status *status)
7124 {
7125     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7126               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7127          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7128               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7129        ) {
7130         float_raise(float_flag_invalid, status);
7131         return 1;
7132     }
7133     return 0;
7134 }
7135 
7136 /*----------------------------------------------------------------------------
7137 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7138 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7139 | exception.  The comparison is performed according to the IEC/IEEE Standard
7140 | for Binary Floating-Point Arithmetic.
7141 *----------------------------------------------------------------------------*/
7142 
7143 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7144 {
7145 
7146     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7147               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7148          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7149               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7150        ) {
7151         if (float128_is_signaling_nan(a, status)
7152          || float128_is_signaling_nan(b, status)) {
7153             float_raise(float_flag_invalid, status);
7154         }
7155         return 0;
7156     }
7157     return
7158            ( a.low == b.low )
7159         && (    ( a.high == b.high )
7160              || (    ( a.low == 0 )
7161                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7162            );
7163 
7164 }
7165 
7166 /*----------------------------------------------------------------------------
7167 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7168 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7169 | cause an exception.  Otherwise, the comparison is performed according to the
7170 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7171 *----------------------------------------------------------------------------*/
7172 
7173 int float128_le_quiet(float128 a, float128 b, float_status *status)
7174 {
7175     flag aSign, bSign;
7176 
7177     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7178               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7179          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7180               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7181        ) {
7182         if (float128_is_signaling_nan(a, status)
7183          || float128_is_signaling_nan(b, status)) {
7184             float_raise(float_flag_invalid, status);
7185         }
7186         return 0;
7187     }
7188     aSign = extractFloat128Sign( a );
7189     bSign = extractFloat128Sign( b );
7190     if ( aSign != bSign ) {
7191         return
7192                aSign
7193             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7194                  == 0 );
7195     }
7196     return
7197           aSign ? le128( b.high, b.low, a.high, a.low )
7198         : le128( a.high, a.low, b.high, b.low );
7199 
7200 }
7201 
7202 /*----------------------------------------------------------------------------
7203 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7204 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7205 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7206 | Standard for Binary Floating-Point Arithmetic.
7207 *----------------------------------------------------------------------------*/
7208 
7209 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7210 {
7211     flag aSign, bSign;
7212 
7213     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7214               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7215          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7216               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7217        ) {
7218         if (float128_is_signaling_nan(a, status)
7219          || float128_is_signaling_nan(b, status)) {
7220             float_raise(float_flag_invalid, status);
7221         }
7222         return 0;
7223     }
7224     aSign = extractFloat128Sign( a );
7225     bSign = extractFloat128Sign( b );
7226     if ( aSign != bSign ) {
7227         return
7228                aSign
7229             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7230                  != 0 );
7231     }
7232     return
7233           aSign ? lt128( b.high, b.low, a.high, a.low )
7234         : lt128( a.high, a.low, b.high, b.low );
7235 
7236 }
7237 
7238 /*----------------------------------------------------------------------------
7239 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7240 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7241 | comparison is performed according to the IEC/IEEE Standard for Binary
7242 | Floating-Point Arithmetic.
7243 *----------------------------------------------------------------------------*/
7244 
7245 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7246 {
7247     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7248               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7249          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7250               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7251        ) {
7252         if (float128_is_signaling_nan(a, status)
7253          || float128_is_signaling_nan(b, status)) {
7254             float_raise(float_flag_invalid, status);
7255         }
7256         return 1;
7257     }
7258     return 0;
7259 }
7260 
7261 /* misc functions */
7262 float32 uint32_to_float32(uint32_t a, float_status *status)
7263 {
7264     return int64_to_float32(a, status);
7265 }
7266 
7267 float64 uint32_to_float64(uint32_t a, float_status *status)
7268 {
7269     return int64_to_float64(a, status);
7270 }
7271 
7272 uint32_t float32_to_uint32(float32 a, float_status *status)
7273 {
7274     int64_t v;
7275     uint32_t res;
7276     int old_exc_flags = get_float_exception_flags(status);
7277 
7278     v = float32_to_int64(a, status);
7279     if (v < 0) {
7280         res = 0;
7281     } else if (v > 0xffffffff) {
7282         res = 0xffffffff;
7283     } else {
7284         return v;
7285     }
7286     set_float_exception_flags(old_exc_flags, status);
7287     float_raise(float_flag_invalid, status);
7288     return res;
7289 }
7290 
7291 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7292 {
7293     int64_t v;
7294     uint32_t res;
7295     int old_exc_flags = get_float_exception_flags(status);
7296 
7297     v = float32_to_int64_round_to_zero(a, status);
7298     if (v < 0) {
7299         res = 0;
7300     } else if (v > 0xffffffff) {
7301         res = 0xffffffff;
7302     } else {
7303         return v;
7304     }
7305     set_float_exception_flags(old_exc_flags, status);
7306     float_raise(float_flag_invalid, status);
7307     return res;
7308 }
7309 
7310 int16_t float32_to_int16(float32 a, float_status *status)
7311 {
7312     int32_t v;
7313     int16_t res;
7314     int old_exc_flags = get_float_exception_flags(status);
7315 
7316     v = float32_to_int32(a, status);
7317     if (v < -0x8000) {
7318         res = -0x8000;
7319     } else if (v > 0x7fff) {
7320         res = 0x7fff;
7321     } else {
7322         return v;
7323     }
7324 
7325     set_float_exception_flags(old_exc_flags, status);
7326     float_raise(float_flag_invalid, status);
7327     return res;
7328 }
7329 
7330 uint16_t float32_to_uint16(float32 a, float_status *status)
7331 {
7332     int32_t v;
7333     uint16_t res;
7334     int old_exc_flags = get_float_exception_flags(status);
7335 
7336     v = float32_to_int32(a, status);
7337     if (v < 0) {
7338         res = 0;
7339     } else if (v > 0xffff) {
7340         res = 0xffff;
7341     } else {
7342         return v;
7343     }
7344 
7345     set_float_exception_flags(old_exc_flags, status);
7346     float_raise(float_flag_invalid, status);
7347     return res;
7348 }
7349 
7350 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7351 {
7352     int64_t v;
7353     uint16_t res;
7354     int old_exc_flags = get_float_exception_flags(status);
7355 
7356     v = float32_to_int64_round_to_zero(a, status);
7357     if (v < 0) {
7358         res = 0;
7359     } else if (v > 0xffff) {
7360         res = 0xffff;
7361     } else {
7362         return v;
7363     }
7364     set_float_exception_flags(old_exc_flags, status);
7365     float_raise(float_flag_invalid, status);
7366     return res;
7367 }
7368 
7369 uint32_t float64_to_uint32(float64 a, float_status *status)
7370 {
7371     uint64_t v;
7372     uint32_t res;
7373     int old_exc_flags = get_float_exception_flags(status);
7374 
7375     v = float64_to_uint64(a, status);
7376     if (v > 0xffffffff) {
7377         res = 0xffffffff;
7378     } else {
7379         return v;
7380     }
7381     set_float_exception_flags(old_exc_flags, status);
7382     float_raise(float_flag_invalid, status);
7383     return res;
7384 }
7385 
7386 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7387 {
7388     uint64_t v;
7389     uint32_t res;
7390     int old_exc_flags = get_float_exception_flags(status);
7391 
7392     v = float64_to_uint64_round_to_zero(a, status);
7393     if (v > 0xffffffff) {
7394         res = 0xffffffff;
7395     } else {
7396         return v;
7397     }
7398     set_float_exception_flags(old_exc_flags, status);
7399     float_raise(float_flag_invalid, status);
7400     return res;
7401 }
7402 
7403 int16_t float64_to_int16(float64 a, float_status *status)
7404 {
7405     int64_t v;
7406     int16_t res;
7407     int old_exc_flags = get_float_exception_flags(status);
7408 
7409     v = float64_to_int32(a, status);
7410     if (v < -0x8000) {
7411         res = -0x8000;
7412     } else if (v > 0x7fff) {
7413         res = 0x7fff;
7414     } else {
7415         return v;
7416     }
7417 
7418     set_float_exception_flags(old_exc_flags, status);
7419     float_raise(float_flag_invalid, status);
7420     return res;
7421 }
7422 
7423 uint16_t float64_to_uint16(float64 a, float_status *status)
7424 {
7425     int64_t v;
7426     uint16_t res;
7427     int old_exc_flags = get_float_exception_flags(status);
7428 
7429     v = float64_to_int32(a, status);
7430     if (v < 0) {
7431         res = 0;
7432     } else if (v > 0xffff) {
7433         res = 0xffff;
7434     } else {
7435         return v;
7436     }
7437 
7438     set_float_exception_flags(old_exc_flags, status);
7439     float_raise(float_flag_invalid, status);
7440     return res;
7441 }
7442 
7443 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7444 {
7445     int64_t v;
7446     uint16_t res;
7447     int old_exc_flags = get_float_exception_flags(status);
7448 
7449     v = float64_to_int64_round_to_zero(a, status);
7450     if (v < 0) {
7451         res = 0;
7452     } else if (v > 0xffff) {
7453         res = 0xffff;
7454     } else {
7455         return v;
7456     }
7457     set_float_exception_flags(old_exc_flags, status);
7458     float_raise(float_flag_invalid, status);
7459     return res;
7460 }
7461 
7462 /*----------------------------------------------------------------------------
7463 | Returns the result of converting the double-precision floating-point value
7464 | `a' to the 64-bit unsigned integer format.  The conversion is
7465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7466 | Arithmetic---which means in particular that the conversion is rounded
7467 | according to the current rounding mode.  If `a' is a NaN, the largest
7468 | positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If `a' is negative, the value is
| rounded and zero is returned; negative values whose magnitude is at least
| one raise the invalid exception before any rounding is attempted.
7472 *----------------------------------------------------------------------------*/
7473 
7474 uint64_t float64_to_uint64(float64 a, float_status *status)
7475 {
7476     flag aSign;
7477     int aExp;
7478     int shiftCount;
7479     uint64_t aSig, aSigExtra;
7480     a = float64_squash_input_denormal(a, status);
7481 
7482     aSig = extractFloat64Frac(a);
7483     aExp = extractFloat64Exp(a);
7484     aSign = extractFloat64Sign(a);
7485     if (aSign && (aExp > 1022)) {
7486         float_raise(float_flag_invalid, status);
7487         if (float64_is_any_nan(a)) {
7488             return LIT64(0xFFFFFFFFFFFFFFFF);
7489         } else {
7490             return 0;
7491         }
7492     }
7493     if (aExp) {
7494         aSig |= LIT64(0x0010000000000000);
7495     }
7496     shiftCount = 0x433 - aExp;
7497     if (shiftCount <= 0) {
7498         if (0x43E < aExp) {
7499             float_raise(float_flag_invalid, status);
7500             return LIT64(0xFFFFFFFFFFFFFFFF);
7501         }
7502         aSigExtra = 0;
7503         aSig <<= -shiftCount;
7504     } else {
7505         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7506     }
7507     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7508 }
7509 
7510 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7511 {
7512     signed char current_rounding_mode = status->float_rounding_mode;
7513     set_float_rounding_mode(float_round_to_zero, status);
7514     uint64_t v = float64_to_uint64(a, status);
7515     set_float_rounding_mode(current_rounding_mode, status);
7516     return v;
7517 }
7518 
7519 #define COMPARE(s, nan_exp)                                                  \
7520 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7521                                       int is_quiet, float_status *status)    \
7522 {                                                                            \
7523     flag aSign, bSign;                                                       \
7524     uint ## s ## _t av, bv;                                                  \
7525     a = float ## s ## _squash_input_denormal(a, status);                     \
7526     b = float ## s ## _squash_input_denormal(b, status);                     \
7527                                                                              \
7528     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7529          extractFloat ## s ## Frac( a ) ) ||                                 \
7530         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7531           extractFloat ## s ## Frac( b ) )) {                                \
7532         if (!is_quiet ||                                                     \
7533             float ## s ## _is_signaling_nan(a, status) ||                  \
7534             float ## s ## _is_signaling_nan(b, status)) {                 \
7535             float_raise(float_flag_invalid, status);                         \
7536         }                                                                    \
7537         return float_relation_unordered;                                     \
7538     }                                                                        \
7539     aSign = extractFloat ## s ## Sign( a );                                  \
7540     bSign = extractFloat ## s ## Sign( b );                                  \
7541     av = float ## s ## _val(a);                                              \
7542     bv = float ## s ## _val(b);                                              \
7543     if ( aSign != bSign ) {                                                  \
7544         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7545             /* zero case */                                                  \
7546             return float_relation_equal;                                     \
7547         } else {                                                             \
7548             return 1 - (2 * aSign);                                          \
7549         }                                                                    \
7550     } else {                                                                 \
7551         if (av == bv) {                                                      \
7552             return float_relation_equal;                                     \
7553         } else {                                                             \
7554             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7555         }                                                                    \
7556     }                                                                        \
7557 }                                                                            \
7558                                                                              \
7559 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7560 {                                                                            \
7561     return float ## s ## _compare_internal(a, b, 0, status);                 \
7562 }                                                                            \
7563                                                                              \
7564 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7565                                  float_status *status)                       \
7566 {                                                                            \
7567     return float ## s ## _compare_internal(a, b, 1, status);                 \
7568 }
7569 
7570 COMPARE(32, 0xff)
7571 COMPARE(64, 0x7ff)
7572 
7573 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7574                                             int is_quiet, float_status *status)
7575 {
7576     flag aSign, bSign;
7577 
7578     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7579         float_raise(float_flag_invalid, status);
7580         return float_relation_unordered;
7581     }
7582     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7583           ( extractFloatx80Frac( a )<<1 ) ) ||
7584         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7585           ( extractFloatx80Frac( b )<<1 ) )) {
7586         if (!is_quiet ||
7587             floatx80_is_signaling_nan(a, status) ||
7588             floatx80_is_signaling_nan(b, status)) {
7589             float_raise(float_flag_invalid, status);
7590         }
7591         return float_relation_unordered;
7592     }
7593     aSign = extractFloatx80Sign( a );
7594     bSign = extractFloatx80Sign( b );
7595     if ( aSign != bSign ) {
7596 
7597         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7598              ( ( a.low | b.low ) == 0 ) ) {
7599             /* zero case */
7600             return float_relation_equal;
7601         } else {
7602             return 1 - (2 * aSign);
7603         }
7604     } else {
7605         if (a.low == b.low && a.high == b.high) {
7606             return float_relation_equal;
7607         } else {
7608             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7609         }
7610     }
7611 }
7612 
7613 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7614 {
7615     return floatx80_compare_internal(a, b, 0, status);
7616 }
7617 
7618 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7619 {
7620     return floatx80_compare_internal(a, b, 1, status);
7621 }
7622 
7623 static inline int float128_compare_internal(float128 a, float128 b,
7624                                             int is_quiet, float_status *status)
7625 {
7626     flag aSign, bSign;
7627 
7628     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7629           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7630         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7631           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7632         if (!is_quiet ||
7633             float128_is_signaling_nan(a, status) ||
7634             float128_is_signaling_nan(b, status)) {
7635             float_raise(float_flag_invalid, status);
7636         }
7637         return float_relation_unordered;
7638     }
7639     aSign = extractFloat128Sign( a );
7640     bSign = extractFloat128Sign( b );
7641     if ( aSign != bSign ) {
7642         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7643             /* zero case */
7644             return float_relation_equal;
7645         } else {
7646             return 1 - (2 * aSign);
7647         }
7648     } else {
7649         if (a.low == b.low && a.high == b.high) {
7650             return float_relation_equal;
7651         } else {
7652             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7653         }
7654     }
7655 }
7656 
7657 int float128_compare(float128 a, float128 b, float_status *status)
7658 {
7659     return float128_compare_internal(a, b, 0, status);
7660 }
7661 
7662 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7663 {
7664     return float128_compare_internal(a, b, 1, status);
7665 }
7666 
7667 /* min() and max() functions. These can't be implemented as
7668  * 'compare and pick one input' because that would mishandle
7669  * NaNs and +0 vs -0.
7670  *
7671  * minnum() and maxnum() functions. These are similar to the min()
7672  * and max() functions but if one of the arguments is a QNaN and
7673  * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
7675  * and maxNum() operations. min() and max() are the typical min/max
7676  * semantics provided by many CPUs which predate that specification.
7677  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
7680  */
7681 #define MINMAX(s)                                                       \
7682 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7683                                                int ismin, int isieee,   \
7684                                                int ismag,               \
7685                                                float_status *status)    \
7686 {                                                                       \
7687     flag aSign, bSign;                                                  \
7688     uint ## s ## _t av, bv, aav, abv;                                   \
7689     a = float ## s ## _squash_input_denormal(a, status);                \
7690     b = float ## s ## _squash_input_denormal(b, status);                \
7691     if (float ## s ## _is_any_nan(a) ||                                 \
7692         float ## s ## _is_any_nan(b)) {                                 \
7693         if (isieee) {                                                   \
7694             if (float ## s ## _is_quiet_nan(a, status) &&               \
7695                 !float ## s ##_is_any_nan(b)) {                         \
7696                 return b;                                               \
7697             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7698                        !float ## s ## _is_any_nan(a)) {                \
7699                 return a;                                               \
7700             }                                                           \
7701         }                                                               \
7702         return propagateFloat ## s ## NaN(a, b, status);                \
7703     }                                                                   \
7704     aSign = extractFloat ## s ## Sign(a);                               \
7705     bSign = extractFloat ## s ## Sign(b);                               \
7706     av = float ## s ## _val(a);                                         \
7707     bv = float ## s ## _val(b);                                         \
7708     if (ismag) {                                                        \
7709         aav = float ## s ## _abs(av);                                   \
7710         abv = float ## s ## _abs(bv);                                   \
7711         if (aav != abv) {                                               \
7712             if (ismin) {                                                \
7713                 return (aav < abv) ? a : b;                             \
7714             } else {                                                    \
7715                 return (aav < abv) ? b : a;                             \
7716             }                                                           \
7717         }                                                               \
7718     }                                                                   \
7719     if (aSign != bSign) {                                               \
7720         if (ismin) {                                                    \
7721             return aSign ? a : b;                                       \
7722         } else {                                                        \
7723             return aSign ? b : a;                                       \
7724         }                                                               \
7725     } else {                                                            \
7726         if (ismin) {                                                    \
7727             return (aSign ^ (av < bv)) ? a : b;                         \
7728         } else {                                                        \
7729             return (aSign ^ (av < bv)) ? b : a;                         \
7730         }                                                               \
7731     }                                                                   \
7732 }                                                                       \
7733                                                                         \
7734 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7735                               float_status *status)                     \
7736 {                                                                       \
7737     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7738 }                                                                       \
7739                                                                         \
7740 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7741                               float_status *status)                     \
7742 {                                                                       \
7743     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7744 }                                                                       \
7745                                                                         \
7746 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7747                                  float_status *status)                  \
7748 {                                                                       \
7749     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7750 }                                                                       \
7751                                                                         \
7752 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7753                                  float_status *status)                  \
7754 {                                                                       \
7755     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7756 }                                                                       \
7757                                                                         \
7758 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7759                                     float_status *status)               \
7760 {                                                                       \
7761     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7762 }                                                                       \
7763                                                                         \
7764 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7765                                     float_status *status)               \
7766 {                                                                       \
7767     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7768 }
7769 
7770 MINMAX(32)
7771 MINMAX(64)
7772 
7773 
7774 /* Multiply A by 2 raised to the power N.  */
7775 float32 float32_scalbn(float32 a, int n, float_status *status)
7776 {
7777     flag aSign;
7778     int16_t aExp;
7779     uint32_t aSig;
7780 
7781     a = float32_squash_input_denormal(a, status);
7782     aSig = extractFloat32Frac( a );
7783     aExp = extractFloat32Exp( a );
7784     aSign = extractFloat32Sign( a );
7785 
7786     if ( aExp == 0xFF ) {
7787         if ( aSig ) {
7788             return propagateFloat32NaN(a, a, status);
7789         }
7790         return a;
7791     }
7792     if (aExp != 0) {
7793         aSig |= 0x00800000;
7794     } else if (aSig == 0) {
7795         return a;
7796     } else {
7797         aExp++;
7798     }
7799 
7800     if (n > 0x200) {
7801         n = 0x200;
7802     } else if (n < -0x200) {
7803         n = -0x200;
7804     }
7805 
7806     aExp += n - 1;
7807     aSig <<= 7;
7808     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7809 }
7810 
7811 float64 float64_scalbn(float64 a, int n, float_status *status)
7812 {
7813     flag aSign;
7814     int16_t aExp;
7815     uint64_t aSig;
7816 
7817     a = float64_squash_input_denormal(a, status);
7818     aSig = extractFloat64Frac( a );
7819     aExp = extractFloat64Exp( a );
7820     aSign = extractFloat64Sign( a );
7821 
7822     if ( aExp == 0x7FF ) {
7823         if ( aSig ) {
7824             return propagateFloat64NaN(a, a, status);
7825         }
7826         return a;
7827     }
7828     if (aExp != 0) {
7829         aSig |= LIT64( 0x0010000000000000 );
7830     } else if (aSig == 0) {
7831         return a;
7832     } else {
7833         aExp++;
7834     }
7835 
7836     if (n > 0x1000) {
7837         n = 0x1000;
7838     } else if (n < -0x1000) {
7839         n = -0x1000;
7840     }
7841 
7842     aExp += n - 1;
7843     aSig <<= 10;
7844     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7845 }
7846 
7847 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7848 {
7849     flag aSign;
7850     int32_t aExp;
7851     uint64_t aSig;
7852 
7853     if (floatx80_invalid_encoding(a)) {
7854         float_raise(float_flag_invalid, status);
7855         return floatx80_default_nan(status);
7856     }
7857     aSig = extractFloatx80Frac( a );
7858     aExp = extractFloatx80Exp( a );
7859     aSign = extractFloatx80Sign( a );
7860 
7861     if ( aExp == 0x7FFF ) {
7862         if ( aSig<<1 ) {
7863             return propagateFloatx80NaN(a, a, status);
7864         }
7865         return a;
7866     }
7867 
7868     if (aExp == 0) {
7869         if (aSig == 0) {
7870             return a;
7871         }
7872         aExp++;
7873     }
7874 
7875     if (n > 0x10000) {
7876         n = 0x10000;
7877     } else if (n < -0x10000) {
7878         n = -0x10000;
7879     }
7880 
7881     aExp += n;
7882     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7883                                          aSign, aExp, aSig, 0, status);
7884 }
7885 
7886 float128 float128_scalbn(float128 a, int n, float_status *status)
7887 {
7888     flag aSign;
7889     int32_t aExp;
7890     uint64_t aSig0, aSig1;
7891 
7892     aSig1 = extractFloat128Frac1( a );
7893     aSig0 = extractFloat128Frac0( a );
7894     aExp = extractFloat128Exp( a );
7895     aSign = extractFloat128Sign( a );
7896     if ( aExp == 0x7FFF ) {
7897         if ( aSig0 | aSig1 ) {
7898             return propagateFloat128NaN(a, a, status);
7899         }
7900         return a;
7901     }
7902     if (aExp != 0) {
7903         aSig0 |= LIT64( 0x0001000000000000 );
7904     } else if (aSig0 == 0 && aSig1 == 0) {
7905         return a;
7906     } else {
7907         aExp++;
7908     }
7909 
7910     if (n > 0x10000) {
7911         n = 0x10000;
7912     } else if (n < -0x10000) {
7913         n = -0x10000;
7914     }
7915 
7916     aExp += n - 1;
7917     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7918                                          , status);
7919 
7920 }
7921