xref: /qemu/include/fpu/softfloat-macros.h (revision 5ffb6bd9c44763a0ee11981c632b9be96ec68d8c)
1 /*
2  * QEMU float support macros
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 #ifndef FPU_SOFTFLOAT_MACROS_H
83 #define FPU_SOFTFLOAT_MACROS_H
84 
85 #include "fpu/softfloat-types.h"
86 #include "qemu/host-utils.h"
87 
/*----------------------------------------------------------------------------
| Right-shifts `a' by `count' bits, "jamming" any nonzero bits shifted off
| into the least-significant bit of the result: if the discarded bits were
| not all zero, bit 0 of the result is forced to 1.  `count' may be
| arbitrarily large; for count >= 32 the result degenerates to 0 or 1
| according to whether `a' is nonzero.  The result is written through `zPtr'.
*----------------------------------------------------------------------------*/

static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
{
    uint32_t result;

    if (count == 0) {
        result = a;
    } else if (count < 32) {
        /* (-count) & 31 == 32 - count here; the left shift isolates the
         * bits that fall off the bottom so they can be OR-jammed in. */
        uint32_t discarded = a << ((-count) & 31);

        result = (a >> count) | (discarded != 0);
    } else {
        /* Everything is shifted out; only the sticky bit survives. */
        result = (a != 0);
    }
    *zPtr = result;
}
113 
/*----------------------------------------------------------------------------
| Right-shifts `a' by `count' bits, "jamming" any nonzero bits shifted off
| into the least-significant bit of the result: if the discarded bits were
| not all zero, bit 0 of the result is forced to 1.  `count' may be
| arbitrarily large; for count >= 64 the result degenerates to 0 or 1
| according to whether `a' is nonzero.  The result is written through `zPtr'.
*----------------------------------------------------------------------------*/

static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
{
    uint64_t result;

    if (count == 0) {
        result = a;
    } else if (count < 64) {
        /* (-count) & 63 == 64 - count here; the left shift isolates the
         * bits that fall off the bottom so they can be OR-jammed in. */
        uint64_t discarded = a << ((-count) & 63);

        result = (a >> count) | (discarded != 0);
    } else {
        /* Everything is shifted out; only the sticky bit survives. */
        result = (a != 0);
    }
    *zPtr = result;
}
139 
/*----------------------------------------------------------------------------
| Right-shifts the 128-bit quantity (a0:a1) by 64 _plus_ `count' bits.  The
| (at most 64-bit) integer part of the shifted value is written through
| `z0Ptr'.  The bits shifted off are condensed into a second 64-bit word,
| written through `z1Ptr': its most-significant bit is the _last_ bit shifted
| off, and its remaining 63 bits are all zero only if every earlier
| shifted-off bit was zero (a "sticky" encoding of the discarded fraction).
| `count' may be arbitrarily large.
|     (Viewed as a fixed-point value with the binary point between `a0' and
| `a1', the value is shifted right by `count'; the integer part goes to
| `z0Ptr' and the slightly-corrupted fraction to `z1Ptr'.)
*----------------------------------------------------------------------------*/

static inline void
 shift64ExtraRightJamming(
     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    int backShift = (-count) & 63;
    uint64_t intPart, fracPart;

    if (count == 0) {
        intPart = a0;
        fracPart = a1;
    } else if (count < 64) {
        intPart = a0 >> count;
        /* Bits of a0 below the shift point head the fraction word; all of
         * a1 collapses into its sticky (low) bit. */
        fracPart = (a0 << backShift) | (a1 != 0);
    } else if (count == 64) {
        intPart = 0;
        fracPart = a0 | (a1 != 0);
    } else {
        /* Shifted out entirely: only a single sticky bit remains. */
        intPart = 0;
        fracPart = ((a0 | a1) != 0);
    }
    *z0Ptr = intPart;
    *z1Ptr = fracPart;
}
185 
/*----------------------------------------------------------------------------
| Right-shifts the 128-bit quantity (a0:a1) by `count' bits; bits shifted
| off are simply lost.  `count' may be arbitrarily large (a shift of 128 or
| more yields zero).  The high and low halves of the result are written
| through `z0Ptr' and `z1Ptr' respectively.
*----------------------------------------------------------------------------*/

static inline void
 shift128Right(
     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    int backShift = (-count) & 63;
    uint64_t hi = 0, lo = 0;

    if (count == 0) {
        hi = a0;
        lo = a1;
    } else if (count < 64) {
        hi = a0 >> count;
        lo = (a1 >> count) | (a0 << backShift);
    } else if (count < 128) {
        /* a1 is gone; the surviving bits of a0 land in the low word. */
        lo = a0 >> (count & 63);
    }
    /* count >= 128: both halves stay zero. */
    *z0Ptr = hi;
    *z1Ptr = lo;
}
217 
/*----------------------------------------------------------------------------
| Right-shifts the 128-bit quantity (a0:a1) by `count' bits, "jamming" any
| nonzero bits shifted off into the least-significant bit of the result.
| `count' may be arbitrarily large; for count >= 128 the result degenerates
| to 0 or 1 according to whether (a0:a1) is nonzero.  The high and low
| halves of the result are written through `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void
 shift128RightJamming(
     uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    int backShift = (-count) & 63;
    uint64_t hi = 0, lo;

    if (count == 0) {
        hi = a0;
        lo = a1;
    } else if (count < 64) {
        hi = a0 >> count;
        /* Jam the bits of a1 that fall off the bottom into bit 0. */
        lo = (a0 << backShift) | (a1 >> count) | ((a1 << backShift) != 0);
    } else if (count == 64) {
        lo = a0 | (a1 != 0);
    } else if (count < 128) {
        /* a1 and the low bits of a0 all fold into the sticky bit. */
        lo = (a0 >> (count & 63)) | (((a0 << backShift) | a1) != 0);
    } else {
        lo = ((a0 | a1) != 0);
    }
    *z0Ptr = hi;
    *z1Ptr = lo;
}
260 
/*----------------------------------------------------------------------------
| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
| by 64 _plus_ the number of bits given in `count'.  The shifted result is
| at most 128 nonzero bits; these are broken into two 64-bit pieces which are
| stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
| off form a third 64-bit result as follows:  The _last_ bit shifted off is
| the most-significant bit of the extra result, and the other 63 bits of the
| extra result are all zero if and only if _all_but_the_last_ bits shifted off
| were all zero.  This extra result is stored in the location pointed to by
| `z2Ptr'.  The value of `count' can be arbitrarily large.
|     (This routine makes more sense if `a0', `a1', and `a2' are considered
| to form a fixed-point value with binary point between `a1' and `a2'.  This
| fixed-point value is shifted right by the number of bits given in `count',
| and the integer part of the result is returned at the locations pointed to
| by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
| corrupted as described above, and is returned at the location pointed to by
| `z2Ptr'.)
*----------------------------------------------------------------------------*/

static inline void
 shift128ExtraRightJamming(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t z0, z1, z2;
    int8_t negCount = ( - count ) & 63;

    if ( count == 0 ) {
        /* No shift: all three words pass through unchanged. */
        z2 = a2;
        z1 = a1;
        z0 = a0;
    }
    else {
        if ( count < 64 ) {
            /* Bits of a1 falling below the binary point become the high
             * bits of the sticky word z2. */
            z2 = a1<<negCount;
            z1 = ( a0<<negCount ) | ( a1>>count );
            z0 = a0>>count;
        }
        else {
            if ( count == 64 ) {
                /* Whole-word shift: a1 itself becomes the sticky word. */
                z2 = a1;
                z1 = a0;
            }
            else {
                /* a1 is shifted out entirely; fold it into a2 so it is
                 * accounted for by the final sticky OR below. */
                a2 |= a1;
                if ( count < 128 ) {
                    z2 = a0<<negCount;
                    z1 = a0>>( count & 63 );
                }
                else {
                    /* count == 128 keeps a0 as the sticky word verbatim;
                     * beyond that a0 collapses to a single sticky bit. */
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
                    z1 = 0;
                }
            }
            z0 = 0;
        }
        /* Jam: any nonzero bits shifted past z2 set its low bit. */
        z2 |= ( a2 != 0 );
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;

}
330 
/*----------------------------------------------------------------------------
| Left-shifts the 128-bit quantity (a0:a1) by `count' bits; bits shifted off
| the top are lost.  `count' must be less than 64.  The high and low halves
| of the result are written through `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
                                     uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    if (count == 0) {
        *z0Ptr = a0;
        *z1Ptr = a1;
    } else {
        /* 0 < count < 64, so the cross-word shift amount is well-defined. */
        *z0Ptr = (a0 << count) | (a1 >> (64 - count));
        *z1Ptr = a1 << count;
    }
}
344 
/*----------------------------------------------------------------------------
| Left-shifts the 128-bit quantity (a0:a1) by `count' bits; bits shifted off
| the top are lost.  `count' may be 64 or greater.  The high and low halves
| of the result are written through `z0Ptr' and `z1Ptr'.
*----------------------------------------------------------------------------*/

static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
                                uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    if (count >= 64) {
        /* a0 is shifted out entirely; a1 supplies the high word. */
        *z0Ptr = a1 << (count - 64);
        *z1Ptr = 0;
    } else if (count == 0) {
        *z0Ptr = a0;
        *z1Ptr = a1;
    } else {
        *z0Ptr = (a0 << count) | (a1 >> (64 - count));
        *z1Ptr = a1 << count;
    }
}
363 
/*----------------------------------------------------------------------------
| Left-shifts the 192-bit quantity (a0:a1:a2) by `count' bits; bits shifted
| off the top are lost.  `count' must be less than 64.  The three 64-bit
| pieces of the result are written through `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void
 shortShift192Left(
     uint64_t a0,
     uint64_t a1,
     uint64_t a2,
     int count,
     uint64_t *z0Ptr,
     uint64_t *z1Ptr,
     uint64_t *z2Ptr
 )
{
    uint64_t hi = a0 << count;
    uint64_t mid = a1 << count;
    uint64_t lo = a2 << count;

    if (count > 0) {
        /* Carry the top bits of each lower word into the word above. */
        int backShift = (-count) & 63;

        hi |= a1 >> backShift;
        mid |= a2 >> backShift;
    }
    *z0Ptr = hi;
    *z1Ptr = mid;
    *z2Ptr = lo;
}
399 
/*----------------------------------------------------------------------------
| 128-bit addition: (a0:a1) + (b0:b1), modulo 2^128 (any carry out of the
| top word is discarded).  The high and low halves of the sum are written
| through `z0Ptr' and `z1Ptr' respectively.
*----------------------------------------------------------------------------*/

static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool carry = false;

    /* Low words first; uadd64_carry ripples the carry into the high words. */
    *z1Ptr = uadd64_carry(a1, b1, &carry);
    *z0Ptr = uadd64_carry(a0, b0, &carry);
}
414 
/*----------------------------------------------------------------------------
| 192-bit addition: (a0:a1:a2) + (b0:b1:b2), modulo 2^192 (any carry out of
| the top word is discarded).  The three 64-bit pieces of the sum are
| written through `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool carry = false;

    /* Ripple the carry from the least-significant word upward. */
    *z2Ptr = uadd64_carry(a2, b2, &carry);
    *z1Ptr = uadd64_carry(a1, b1, &carry);
    *z0Ptr = uadd64_carry(a0, b0, &carry);
}
432 
/*----------------------------------------------------------------------------
| 128-bit subtraction: (a0:a1) - (b0:b1), modulo 2^128 (any borrow out of
| the top word is discarded).  The high and low halves of the difference
| are written through `z0Ptr' and `z1Ptr' respectively.
*----------------------------------------------------------------------------*/

static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool borrow = false;

    /* Low words first; usub64_borrow ripples the borrow into the high words. */
    *z1Ptr = usub64_borrow(a1, b1, &borrow);
    *z0Ptr = usub64_borrow(a0, b0, &borrow);
}
448 
/*----------------------------------------------------------------------------
| 192-bit subtraction: (a0:a1:a2) - (b0:b1:b2), modulo 2^192 (any borrow out
| of the top word is discarded).  The three 64-bit pieces of the difference
| are written through `z0Ptr', `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool borrow = false;

    /* Ripple the borrow from the least-significant word upward. */
    *z2Ptr = usub64_borrow(a2, b2, &borrow);
    *z1Ptr = usub64_borrow(a1, b1, &borrow);
    *z0Ptr = usub64_borrow(a0, b0, &borrow);
}
466 
/*----------------------------------------------------------------------------
| Multiplies `a' by `b' to obtain a 128-bit product.  The product is broken
| into two 64-bit pieces which are stored at the locations pointed to by
| `z0Ptr' (high half) and `z1Ptr' (low half).
*----------------------------------------------------------------------------*/

static inline void
mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* mulu64() takes its (low, high) destination pointers in that order,
     * opposite to this function's (high, low) convention. */
    mulu64(z1Ptr, z0Ptr, a, b);
}
478 
/*----------------------------------------------------------------------------
| Multiplies the 128-bit quantity (a0:a1) by the 64-bit `b', producing a
| 192-bit product whose three 64-bit pieces are written through `z0Ptr',
| `z1Ptr', and `z2Ptr'.
*----------------------------------------------------------------------------*/

static inline void
mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
                uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    uint64_t hi, mid, cross;

    /* Low partial product a1 * b: its low half is already final. */
    mul64To128(a1, b, &cross, z2Ptr);
    /* High partial product a0 * b. */
    mul64To128(a0, b, &hi, &mid);
    /* Fold the cross word into the middle, carrying into the top word. */
    add128(hi, mid, 0, cross, z0Ptr, z1Ptr);
}
496 
/*----------------------------------------------------------------------------
| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
| product.  The product is broken into four 64-bit pieces which are stored at
| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
*----------------------------------------------------------------------------*/

static inline void mul128To256(uint64_t a0, uint64_t a1,
                               uint64_t b0, uint64_t b1,
                               uint64_t *z0Ptr, uint64_t *z1Ptr,
                               uint64_t *z2Ptr, uint64_t *z3Ptr)
{
    uint64_t z0, z1, z2;
    uint64_t m0, m1, m2, n1, n2;

    /* Cross partial products: a1*b0 -> (m1:m2), a0*b1 -> (n1:n2). */
    mul64To128(a1, b0, &m1, &m2);
    mul64To128(a0, b1, &n1, &n2);
    /* Outer partial products: a1*b1 -> (z2:*z3Ptr), a0*b0 -> (z0:z1). */
    mul64To128(a1, b1, &z2, z3Ptr);
    mul64To128(a0, b0, &z0, &z1);

    /* Sum the two cross terms first; m0 captures their carry out, which
     * must propagate into the top word of the final sum. */
    add192( 0, m1, m2,  0, n1, n2, &m0, &m1, &m2);
    add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
}
520 
/*----------------------------------------------------------------------------
| Returns an approximation to the 64-bit integer quotient obtained by dividing
| `b' into the 128-bit value formed by concatenating `a0' and `a1'.  The
| divisor `b' must be at least 2^63.  If q is the exact quotient truncated
| toward zero, the approximation returned lies between q and q + 2 inclusive.
| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
| unsigned integer is returned.
*----------------------------------------------------------------------------*/

static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Exact quotient would not fit in 64 bits: saturate. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* First guess at the high 32 quotient bits, using only the top 32 bits
     * of the divisor; this may overestimate by a small amount. */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* Correct the overestimate: step the quotient down by 2^32 (adding
     * b * 2^32, i.e. (b0:b1), back to the remainder) until the partial
     * remainder is non-negative. */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    /* Estimate the low 32 quotient bits from the updated remainder,
     * saturating them if they would exceed 32 bits. */
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;

}
551 
/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
 * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
 *
 * Licensed under the GPLv2/LGPLv3
 */
/*
 * Divides the 128-bit value (n1:n0) by the 64-bit `d', returning the
 * 64-bit quotient and storing the 64-bit remainder through `r'.
 * NOTE(review): like GMP's __udiv_qrnnd, this presumes n1 < d (so the
 * quotient fits in 64 bits); the generic fallback also appears to require
 * a normalized divisor (high bit of `d' set, hence d1 != 0) -- confirm
 * at call sites.
 */
static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1,
                                  uint64_t n0, uint64_t d)
{
#if defined(__x86_64__)
    /* DIVQ divides RDX:RAX by the operand; quotient lands in RAX ("=a"),
     * remainder in RDX ("=d"). */
    uint64_t q;
    asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d));
    return q;
#elif defined(__s390x__) && !defined(__clang__)
    /* Need to use a TImode type to get an even register pair for DLGR.  */
    unsigned __int128 n = (unsigned __int128)n1 << 64 | n0;
    asm("dlgr %0, %1" : "+r"(n) : "r"(d));
    /* DLGR leaves the remainder in the high register of the pair and the
     * quotient in the low one. */
    *r = n >> 64;
    return n;
#elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7)
    /* From Power ISA 2.06, programming note for divdeu.  */
    uint64_t q1, q2, Q, r1, r2, R;
    asm("divdeu %0,%2,%4; divdu %1,%3,%4"
        : "=&r"(q1), "=r"(q2)
        : "r"(n1), "r"(n0), "r"(d));
    r1 = -(q1 * d);         /* low part of (n1<<64) - (q1 * d) */
    r2 = n0 - (q2 * d);
    Q = q1 + q2;
    R = r1 + r2;
    if (R >= d || R < r2) { /* overflow implies R > d */
        Q += 1;
        R -= d;
    }
    *r = R;
    return Q;
#else
    /* Generic schoolbook fallback: compute the quotient 32 bits at a time,
     * estimating each half-quotient from the top 32 bits of the divisor
     * and correcting downward at most twice. */
    uint64_t d0, d1, q0, q1, r1, r0, m;

    d0 = (uint32_t)d;
    d1 = d >> 32;

    /* High 32 quotient bits: estimate from n1 / d1, then check against
     * the full divisor via m = q1 * d0. */
    r1 = n1 % d1;
    q1 = n1 / d1;
    m = q1 * d0;
    r1 = (r1 << 32) | (n0 >> 32);
    if (r1 < m) {
        q1 -= 1;
        r1 += d;
        /* If r1 + d wrapped around, the estimate is already correct and
         * the second adjustment must be skipped. */
        if (r1 >= d) {
            if (r1 < m) {
                q1 -= 1;
                r1 += d;
            }
        }
    }
    r1 -= m;

    /* Low 32 quotient bits: same estimate-and-correct on the remainder. */
    r0 = r1 % d1;
    q0 = r1 / d1;
    m = q0 * d0;
    r0 = (r0 << 32) | (uint32_t)n0;
    if (r0 < m) {
        q0 -= 1;
        r0 += d;
        if (r0 >= d) {
            if (r0 < m) {
                q0 -= 1;
                r0 += d;
            }
        }
    }
    r0 -= m;

    *r = r0;
    return (q1 << 32) | q0;
#endif
}
628 
/*----------------------------------------------------------------------------
| Returns an approximation to the square root of the 32-bit significand given
| by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
| `aExp' (the least significant bit) is 1, the integer returned approximates
| 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
| case, the approximation returned lies strictly within +/-2 of the exact
| value.
*----------------------------------------------------------------------------*/

static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
{
    /* Seed-correction tables indexed by the 4 bits of `a' just below its
     * leading bit ((a>>27) & 15); one table per exponent parity. */
    static const uint16_t sqrtOddAdjustments[] = {
        0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
        0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
    };
    static const uint16_t sqrtEvenAdjustments[] = {
        0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
        0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
    };
    int8_t index;
    uint32_t z;

    index = ( a>>27 ) & 15;
    if ( aExp & 1 ) {
        /* Odd exponent: build a 16-bit seed, refine it once, and scale it
         * into the 0x8000xxxx range; `a' is halved so the final Newton
         * step below targets the odd-exponent scaling. */
        z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
        z = ( ( a / z )<<14 ) + ( z<<15 );
        a >>= 1;
    }
    else {
        /* Even exponent: seed plus one refinement step (a/z + z). */
        z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
        z = a / z + z;
        /* Scale up by 2^15, saturating if the estimate overflows. */
        z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
        /* If the estimate fell at or below `a', fall back to a/2 (with
         * sign-propagating shift, since a >= 2^31). */
        if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
    }
    /* Final Newton-Raphson step in fixed point: (a*2^31/z + z) / 2. */
    return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );

}
667 
/*----------------------------------------------------------------------------
| Returns true when the 128-bit quantities (a0:a1) and (b0:b1) are equal,
| false otherwise.
*----------------------------------------------------------------------------*/

static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Both halves match exactly when neither XOR leaves any bit set. */
    return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}
678 
/*----------------------------------------------------------------------------
| Returns true when the 128-bit quantity (a0:a1) is less than or equal to
| (b0:b1), false otherwise.
*----------------------------------------------------------------------------*/

static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* The high words decide unless they are equal. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 <= b1;
}
689 
/*----------------------------------------------------------------------------
| Returns true when the 128-bit quantity (a0:a1) is strictly less than
| (b0:b1), false otherwise.
*----------------------------------------------------------------------------*/

static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* The high words decide unless they are equal. */
    if (a0 != b0) {
        return a0 < b0;
    }
    return a1 < b1;
}
700 
/*----------------------------------------------------------------------------
| Returns true when the 128-bit quantities (a0:a1) and (b0:b1) differ,
| false otherwise.
*----------------------------------------------------------------------------*/

static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
{
    /* Any differing bit in either half shows up in the XORs. */
    return ((a0 ^ b0) | (a1 ^ b1)) != 0;
}
711 
712 #endif
713