/* xref: /qemu/include/fpu/softfloat-macros.h (revision b4d09b1794b08cbed5b33ea4bd146f523a2c3c1f) */
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84 
85 #include "fpu/softfloat-types.h"
86 #include "qemu/host-utils.h"
87 
/*----------------------------------------------------------------------------
| Right-shifts `a' by `count' bits, ORing a sticky bit into the least
| significant position of the result whenever any nonzero bit is shifted
| out.  `count' may be arbitrarily large: once it reaches 32 or more, the
| result collapses to 0 or 1 according to whether `a' is nonzero.  The
| result is written through `zPtr'.
*----------------------------------------------------------------------------*/

static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
{
    uint32_t result;

    if (count == 0) {
        result = a;
    } else if (count < 32) {
        /* Any of the low `count' bits being set makes the sticky bit. */
        uint32_t lost = a << ((-count) & 31);
        result = (a >> count) | (lost != 0);
    } else {
        /* All bits are shifted out; only the sticky bit can survive. */
        result = (a != 0);
    }
    *zPtr = result;
}
113 
/*----------------------------------------------------------------------------
| Right-shifts `a' by `count' bits, ORing a sticky bit into the least
| significant position of the result whenever any nonzero bit is shifted
| out.  `count' may be arbitrarily large: once it reaches 64 or more, the
| result collapses to 0 or 1 according to whether `a' is nonzero.  The
| result is written through `zPtr'.
*----------------------------------------------------------------------------*/

static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
{
    uint64_t result;

    if (count == 0) {
        result = a;
    } else if (count < 64) {
        /* Any of the low `count' bits being set makes the sticky bit. */
        uint64_t lost = a << ((-count) & 63);
        result = (a >> count) | (lost != 0);
    } else {
        /* All bits are shifted out; only the sticky bit can survive. */
        result = (a != 0);
    }
    *zPtr = result;
}
139 
/*----------------------------------------------------------------------------
| Treats `a0'.`a1' as a fixed-point value with the binary point between the
| two words, shifts it right by 64 plus `count' bits, and stores the integer
| part through `z0Ptr'.  The bits shifted off form an extra 64-bit result
| stored through `z1Ptr': its most-significant bit is the last bit shifted
| off, and its remaining 63 bits are zero if and only if every other bit
| shifted off was zero (a "jammed" sticky encoding).  `count' may be
| arbitrarily large.
*----------------------------------------------------------------------------*/

static inline void
 shift64ExtraRightJamming(
     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t z0, z1;

    if (count == 0) {
        z1 = a1;
        z0 = a0;
    } else if (count < 64) {
        /* Bits of a0 falling below the binary point land in z1; any
         * nonzero bit already below it (all of a1) becomes sticky. */
        z1 = (a0 << ((-count) & 63)) | (a1 != 0);
        z0 = a0 >> count;
    } else if (count == 64) {
        z1 = a0 | (a1 != 0);
        z0 = 0;
    } else {
        /* Everything is shifted past the extra word: sticky only. */
        z1 = ((a0 | a1) != 0);
        z0 = 0;
    }
    *z1Ptr = z1;
    *z0Ptr = z0;
}
185 
/*----------------------------------------------------------------------------
| Right-shifts the 128-bit value `a0'.`a1' by `count' bits, discarding any
| bits shifted off.  `count' may be arbitrarily large; for `count' of 128 or
| more the result is 0.  The two 64-bit halves of the result are stored
| through `z0Ptr' (high) and `z1Ptr' (low).
*----------------------------------------------------------------------------*/

static inline void
 shift128Right(
     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t hi, lo;

    if (count == 0) {
        hi = a0;
        lo = a1;
    } else if (count < 64) {
        /* Low bits of a0 slide down into the top of the low word. */
        hi = a0 >> count;
        lo = (a0 << ((-count) & 63)) | (a1 >> count);
    } else {
        hi = 0;
        lo = (count < 128) ? (a0 >> (count & 63)) : 0;
    }
    *z1Ptr = lo;
    *z0Ptr = hi;
}
217 
/*----------------------------------------------------------------------------
| Right-shifts the 128-bit value `a0'.`a1' by `count' bits; if any nonzero
| bit is shifted off, it is "jammed" into the least significant bit of the
| result.  `count' may be arbitrarily large: for `count' of 128 or more the
| result is 0 or 1 according to whether `a0'.`a1' is nonzero.  The two
| 64-bit halves of the result are stored through `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void
 shift128RightJamming(
     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    int negCount = (-count) & 63;
    uint64_t z0, z1;

    if (count == 0) {
        z1 = a1;
        z0 = a0;
    } else if (count < 64) {
        /* Sticky bit from the low `count' bits of a1. */
        z1 = (a0 << negCount) | (a1 >> count) | ((a1 << negCount) != 0);
        z0 = a0 >> count;
    } else if (count == 64) {
        z1 = a0 | (a1 != 0);
        z0 = 0;
    } else if (count < 128) {
        /* Sticky bit from all of a1 plus the shifted-out bits of a0. */
        z1 = (a0 >> (count & 63)) | (((a0 << negCount) | a1) != 0);
        z0 = 0;
    } else {
        z1 = ((a0 | a1) != 0);
        z0 = 0;
    }
    *z1Ptr = z1;
    *z0Ptr = z0;
}
260 
/*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
| by 64 _plus_ the number of bits given in `count'.  The shifted result is
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
| off form a third 64-bit result as follows:  The _last_ bit shifted off is
| the most-significant bit of the extra result, and the other 63 bits of the
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
| were all zero.  This extra result is stored in the location pointed to by
| `z2Ptr'.  The value of `count' can be arbitrarily large.
|     (This routine makes more sense if `a0', `a1', and `a2' are considered
| to form a fixed-point value with binary point between `a1' and `a2'.  This
| fixed-point value is shifted right by the number of bits given in `count',
| and the integer part of the result is returned at the locations pointed to
| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
| corrupted as described above, and is returned at the location pointed to by
| `z2Ptr'.)
*----------------------------------------------------------------------------*/

static inline void
 shift128ExtraRightJamming(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0, z1, z2;
    int8_t negCount = ( - count ) & 63;

    if ( count == 0 ) {
        z2 = a2;
        z1 = a1;
        z0 = a0;
    }
    else {
        if ( count < 64 ) {
            /* Bits shifted out of a1 become the top of the extra word. */
            z2 = a1<<negCount;
            z1 = ( a0<<negCount ) | ( a1>>count );
            z0 = a0>>count;
        }
        else {
            if ( count == 64 ) {
                z2 = a1;
                z1 = a0;
            }
            else {
                /* a1 is shifted out entirely; fold it into the sticky
                 * accumulator (note: deliberately mutates the a2 local). */
                a2 |= a1;
                if ( count < 128 ) {
                    z2 = a0<<negCount;
                    z1 = a0>>( count & 63 );
                }
                else {
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
                    z1 = 0;
                }
            }
            z0 = 0;
        }
        /* Jam any nonzero bits shifted below the extra word into its LSB. */
        z2 |= ( a2 != 0 );
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
330 
/*----------------------------------------------------------------------------
| Left-shifts the 128-bit value `a0'.`a1' by `count' bits, discarding any
| bits shifted off.  `count' must be less than 64.  The two 64-bit halves
| of the result are stored through `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    uint64_t hi = a0;

    if (count != 0) {
        /* Top bits of a1 carry into the bottom of the high word. */
        hi = (a0 << count) | (a1 >> (-count & 63));
    }
    *z1Ptr = a1 << count;
    *z0Ptr = hi;
}
344 
/*----------------------------------------------------------------------------
| Left-shifts the 128-bit value `a0'.`a1' by `count' bits, discarding any
| bits shifted off.  `count' may be 64 or greater.  The two 64-bit halves
| of the result are stored through `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
                                uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    if (count >= 64) {
        /* The whole low word moves into (or past) the high word. */
        *z1Ptr = 0;
        *z0Ptr = a1 << (count - 64);
    } else if (count == 0) {
        *z1Ptr = a1;
        *z0Ptr = a0;
    } else {
        *z1Ptr = a1 << count;
        *z0Ptr = (a0 << count) | (a1 >> (-count & 63));
    }
}
363 
/*----------------------------------------------------------------------------
| Left-shifts the 192-bit value `a0'.`a1'.`a2' by `count' bits, discarding
| any bits shifted off.  `count' must be less than 64.  The three 64-bit
| pieces of the result are stored through `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void
 shortShift192Left(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0 = a0 << count;
    uint64_t z1 = a1 << count;
    uint64_t z2 = a2 << count;

    if (count > 0) {
        /* Carry the high bits of each lower word into the word above. */
        int negCount = (-count) & 63;
        z1 |= a2 >> negCount;
        z0 |= a1 >> negCount;
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;
}
399 
/*----------------------------------------------------------------------------
| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
| value formed by concatenating `b0' and `b1'.  Addition is modulo 2^128, so
| any carry out is lost.  The result is broken into two 64-bit pieces which
| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool c = 0;
    /* Add the low halves first; `c' carries into the high-half addition. */
    *z1Ptr = uadd64_carry(a1, b1, &c);
    *z0Ptr = uadd64_carry(a0, b0, &c);
}
414 
/*----------------------------------------------------------------------------
| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
| 192-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
| modulo 2^192, so any carry out is lost.  The result is broken into three
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool c = 0;
    /* Least-significant words first; the carry flag ripples upward. */
    *z2Ptr = uadd64_carry(a2, b2, &c);
    *z1Ptr = uadd64_carry(a1, b1, &c);
    *z0Ptr = uadd64_carry(a0, b0, &c);
}
432 
/*----------------------------------------------------------------------------
| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
| 128-bit value formed by concatenating `a0' and `a1'.  Subtraction is modulo
| 2^128, so any borrow out (carry out) is lost.  The result is broken into two
| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
| `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool c = 0;
    /* Subtract the low halves first; `c' borrows into the high halves. */
    *z1Ptr = usub64_borrow(a1, b1, &c);
    *z0Ptr = usub64_borrow(a0, b0, &c);
}
448 
/*----------------------------------------------------------------------------
| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
| from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
| Subtraction is modulo 2^192, so any borrow out (carry out) is lost.  The
| result is broken into three 64-bit pieces which are stored at the locations
| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool c = 0;
    /* Least-significant words first; the borrow flag ripples upward. */
    *z2Ptr = usub64_borrow(a2, b2, &c);
    *z1Ptr = usub64_borrow(a1, b1, &c);
    *z0Ptr = usub64_borrow(a0, b0, &c);
}
466 
/*----------------------------------------------------------------------------
| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
| into two 64-bit pieces which are stored at the locations pointed to by
| `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void
mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* mulu64() (qemu/host-utils.h) stores the low half of the product
     * through its first pointer and the high half through its second,
     * hence z1 (low) before z0 (high) here. */
    mulu64(z1Ptr, z0Ptr, a, b);
}
478 
/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
| `b' to obtain a 192-bit product.  The product is broken into three 64-bit
| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
| `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void
 mul128By64To192(
     uint64_t a0,
     uint64_t a1,
     uint64_t b,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0, z1, z2, more1;

    /* (a0:a1) * b = (a0*b)*2^64 + a1*b.  The two 128-bit partial products
     * overlap in the middle word, so fold them together with a 128-bit add
     * that propagates the carry into the top word. */
    mul64To128( a1, b, &z1, &z2 );
    mul64To128( a0, b, &z0, &more1 );
    add128( z0, more1, 0, z1, &z0, &z1 );
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
506 
/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
| product.  The product is broken into four 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
*----------------------------------------------------------------------------*/

static inline void
 mul128To256(
     uint64_t a0,
     uint64_t a1,
     uint64_t b0,
     uint64_t b1,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr,
     uint64_t *z3Ptr
 )
{
    uint64_t z0, z1, z2, z3;
    uint64_t more1, more2;

    /* Schoolbook multiply using four 64x64->128 partial products:
     *   (a0:a1)*(b0:b1) = a1*b1 + (a1*b0 + a0*b1)*2^64 + (a0*b0)*2^128
     * accumulated with explicit 128-bit carry-propagating adds. */
    mul64To128( a1, b1, &z2, &z3 );
    mul64To128( a1, b0, &z1, &more2 );
    add128( z1, more2, 0, z2, &z1, &z2 );
    mul64To128( a0, b0, &z0, &more1 );
    add128( z0, more1, 0, z1, &z0, &z1 );
    mul64To128( a0, b1, &more1, &more2 );
    add128( more1, more2, 0, z2, &more1, &z2 );
    /* Fold the high word of the a0*b1 accumulation into z1/z0. */
    add128( z0, z1, 0, more1, &z0, &z1 );
    *z3Ptr = z3;
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
543 
/*----------------------------------------------------------------------------
| Returns an approximation to the 64-bit integer quotient obtained by dividing
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
| toward zero, the approximation returned lies between q and q + 2 inclusive.
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
| unsigned integer is returned.
*----------------------------------------------------------------------------*/

static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Quotient would not fit in 64 bits: saturate. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* First guess at the upper 32 quotient bits, using only the top
     * 32 bits of the divisor. */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* The guess may be too large; step it down (adding back b*2^32 each
     * time) until the partial remainder is non-negative. */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    /* Estimate the lower 32 quotient bits from the updated remainder. */
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;

}
574 
/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 *
 * Licensed under the GPLv2/LGPLv3
 */

/* Divides the 128-bit value n1:n0 by d, returning the 64-bit quotient and
 * storing the 64-bit remainder through *r.
 * NOTE(review): the generic fallback divides by d1 = d >> 32, so it
 * requires d >= 2^32 (presumably callers pass a normalized divisor with
 * the high bit set) -- confirm before reusing in a new context.
 */
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
                                  uint64_t n0, uint64_t d)
{
#if defined(__x86_64__)
    uint64_t q;
    /* divq divides rdx:rax by the operand: quotient -> rax, remainder -> rdx. */
    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
    return q;
#elif defined(__s390x__) && !defined(__clang__)
    /* Need to use a TImode type to get an even register pair for DLGR.  */
    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
    *r = n >> 64;
    return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
    /* From Power ISA 2.06, programming note for divdeu.  */
    uint64_t q1, q2, Q, r1, r2, R;
    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
        : "=&r"(q1), "=r"(q2)
        : "r"(n1), "r"(n0), "r"(d));
    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
    r2 = n0 - (q2 * d);
    Q = q1 + q2;
    R = r1 + r2;
    if (R >= d || R < r2) { /* overflow implies R > d */
        Q += 1;
        R -= d;
    }
    *r = R;
    return Q;
#else
    /* Generic long division in base 2^32: compute the high quotient
     * digit q1, then the low digit q0, correcting each estimate at most
     * twice (the classic Knuth D3 adjustment). */
    uint64_t d0, d1, q0, q1, r1, r0, m;

    d0 = (uint32_t)d;
    d1 = d >> 32;

    r1 = n1 % d1;
    q1 = n1 / d1;
    m = q1 * d0;
    r1 = (r1 << 32) | (n0 >> 32);
    if (r1 < m) {
        q1 -= 1;
        r1 += d;
        if (r1 >= d) {
            if (r1 < m) {
                q1 -= 1;
                r1 += d;
            }
        }
    }
    r1 -= m;

    /* Second digit: divide the updated remainder (with the low 32 bits
     * of n0 appended) by d1, with the same correction steps. */
    r0 = r1 % d1;
    q0 = r1 / d1;
    m = q0 * d0;
    r0 = (r0 << 32) | (uint32_t)n0;
    if (r0 < m) {
        q0 -= 1;
        r0 += d;
        if (r0 >= d) {
            if (r0 < m) {
                q0 -= 1;
                r0 += d;
            }
        }
    }
    r0 -= m;

    *r = r0;
    return (q1 << 32) | q0;
#endif
}
651 
/*----------------------------------------------------------------------------
| Returns an approximation to the square root of the 32-bit significand given
| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
| `aExp' (the least significant bit) is 1, the integer returned approximates
| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
| case, the approximation returned lies strictly within +/-2 of the exact
| value.
*----------------------------------------------------------------------------*/

static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
{
    /* Table-driven seed corrections, indexed by the four bits of `a'
     * just below its leading (always-set) bit. */
    static const uint16_t sqrtOddAdjustments[] = {
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
    };
    static const uint16_t sqrtEvenAdjustments[] = {
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
    };
    int8_t index;
    uint32_t z;

    index = ( a>>27 ) & 15;
    if ( aExp & 1 ) {
        /* Odd exponent case: build a 16-bit seed, then scale. */
        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
        z = ( ( a / z )<<14 ) + ( z<<15 );
        a >>= 1;
    }
    else {
        /* Even exponent case. */
        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
        z = a / z + z;
        /* If z >= 0x20000, z<<15 would exceed 32 bits; clamp instead. */
        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
    }
    /* One Newton-Raphson refinement step: (z + (a<<31)/z) / 2. */
    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );

}
690 
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit values `a0'.`a1' and `b0'.`b1' are equal, and 0
| otherwise.
*----------------------------------------------------------------------------*/

static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Equal iff no bit differs in either half. */
    return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}
701 
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value `a0'.`a1' is less than or equal to the
| 128-bit value `b0'.`b1', and 0 otherwise.
*----------------------------------------------------------------------------*/

static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* The high words decide unless they are equal. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 <= b1;
}
712 
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit value `a0'.`a1' is strictly less than the 128-bit
| value `b0'.`b1', and 0 otherwise.
*----------------------------------------------------------------------------*/

static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* The high words decide unless they are equal. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 < b1;
}
723 
/*----------------------------------------------------------------------------
| Returns 1 if the 128-bit values `a0'.`a1' and `b0'.`b1' differ, and 0
| otherwise.
*----------------------------------------------------------------------------*/

static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Unequal iff some bit differs in either half. */
    return ((a0 ^ b0) | (a1 ^ b1)) != 0;
}
734 
735 #endif
736