xref: /qemu/target/arm/tcg/vfp_helper.c (revision e34cfba5e8d7bd631398a09d658dee40b1aef085)
1 /*
2  * ARM VFP floating-point operations
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/helper-proto.h"
23 #include "internals.h"
24 #include "cpu-features.h"
25 #include "fpu/softfloat.h"
26 #include "qemu/log.h"
27 
28 /*
29  * VFP support.  We follow the convention used for VFP instructions:
30  * Single precision routines have a "s" suffix, double precision a
31  * "d" suffix.
32  */
33 
34 #define VFP_HELPER(name, p) HELPER(glue(glue(vfp_,name),p))
35 
36 #define VFP_BINOP(name) \
37 dh_ctype_f16 VFP_HELPER(name, h)(dh_ctype_f16 a, dh_ctype_f16 b, float_status *fpst) \
38 { \
39     return float16_ ## name(a, b, fpst); \
40 } \
41 float32 VFP_HELPER(name, s)(float32 a, float32 b, float_status *fpst) \
42 { \
43     return float32_ ## name(a, b, fpst); \
44 } \
45 float64 VFP_HELPER(name, d)(float64 a, float64 b, float_status *fpst) \
46 { \
47     return float64_ ## name(a, b, fpst); \
48 }
49 VFP_BINOP(add)
50 VFP_BINOP(sub)
51 VFP_BINOP(mul)
52 VFP_BINOP(div)
53 VFP_BINOP(min)
54 VFP_BINOP(max)
55 VFP_BINOP(minnum)
56 VFP_BINOP(maxnum)
57 #undef VFP_BINOP
58 
59 dh_ctype_f16 VFP_HELPER(sqrt, h)(dh_ctype_f16 a, float_status *fpst)
60 {
61     return float16_sqrt(a, fpst);
62 }
63 
64 float32 VFP_HELPER(sqrt, s)(float32 a, float_status *fpst)
65 {
66     return float32_sqrt(a, fpst);
67 }
68 
69 float64 VFP_HELPER(sqrt, d)(float64 a, float_status *fpst)
70 {
71     return float64_sqrt(a, fpst);
72 }
73 
74 static void softfloat_to_vfp_compare(CPUARMState *env, FloatRelation cmp)
75 {
76     uint32_t flags;
77     switch (cmp) {
78     case float_relation_equal:
79         flags = 0x6;
80         break;
81     case float_relation_less:
82         flags = 0x8;
83         break;
84     case float_relation_greater:
85         flags = 0x2;
86         break;
87     case float_relation_unordered:
88         flags = 0x3;
89         break;
90     default:
91         g_assert_not_reached();
92     }
93     env->vfp.fpsr = deposit64(env->vfp.fpsr, 28, 4, flags); /* NZCV */
94 }
95 
96 /* XXX: check quiet/signaling case */
97 #define DO_VFP_cmp(P, FLOATTYPE, ARGTYPE, FPST) \
98 void VFP_HELPER(cmp, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env)  \
99 { \
100     softfloat_to_vfp_compare(env, \
101         FLOATTYPE ## _compare_quiet(a, b, &env->vfp.fp_status[FPST])); \
102 } \
103 void VFP_HELPER(cmpe, P)(ARGTYPE a, ARGTYPE b, CPUARMState *env) \
104 { \
105     softfloat_to_vfp_compare(env, \
106         FLOATTYPE ## _compare(a, b, &env->vfp.fp_status[FPST])); \
107 }
108 DO_VFP_cmp(h, float16, dh_ctype_f16, FPST_A32_F16)
109 DO_VFP_cmp(s, float32, float32, FPST_A32)
110 DO_VFP_cmp(d, float64, float64, FPST_A32)
111 #undef DO_VFP_cmp
112 
113 /* Integer to float and float to integer conversions */
114 
115 #define CONV_ITOF(name, ftype, fsz, sign)                           \
116 ftype HELPER(name)(uint32_t x, float_status *fpst)                  \
117 {                                                                   \
118     return sign##int32_to_##float##fsz((sign##int32_t)x, fpst);     \
119 }
120 
121 #define CONV_FTOI(name, ftype, fsz, sign, round)                \
122 sign##int32_t HELPER(name)(ftype x, float_status *fpst)         \
123 {                                                               \
124     if (float##fsz##_is_any_nan(x)) {                           \
125         float_raise(float_flag_invalid, fpst);                  \
126         return 0;                                               \
127     }                                                           \
128     return float##fsz##_to_##sign##int32##round(x, fpst);       \
129 }
130 
131 #define FLOAT_CONVS(name, p, ftype, fsz, sign)            \
132     CONV_ITOF(vfp_##name##to##p, ftype, fsz, sign)        \
133     CONV_FTOI(vfp_to##name##p, ftype, fsz, sign, )        \
134     CONV_FTOI(vfp_to##name##z##p, ftype, fsz, sign, _round_to_zero)
135 
136 FLOAT_CONVS(si, h, uint32_t, 16, )
137 FLOAT_CONVS(si, s, float32, 32, )
138 FLOAT_CONVS(si, d, float64, 64, )
139 FLOAT_CONVS(ui, h, uint32_t, 16, u)
140 FLOAT_CONVS(ui, s, float32, 32, u)
141 FLOAT_CONVS(ui, d, float64, 64, u)
142 
143 #undef CONV_ITOF
144 #undef CONV_FTOI
145 #undef FLOAT_CONVS
146 
147 /* floating point conversion */
148 float64 VFP_HELPER(fcvtd, s)(float32 x, float_status *status)
149 {
150     return float32_to_float64(x, status);
151 }
152 
153 float32 VFP_HELPER(fcvts, d)(float64 x, float_status *status)
154 {
155     return float64_to_float32(x, status);
156 }
157 
158 uint32_t HELPER(bfcvt)(float32 x, float_status *status)
159 {
160     return float32_to_bfloat16(x, status);
161 }
162 
163 uint32_t HELPER(bfcvt_pair)(uint64_t pair, float_status *status)
164 {
165     bfloat16 lo = float32_to_bfloat16(extract64(pair, 0, 32), status);
166     bfloat16 hi = float32_to_bfloat16(extract64(pair, 32, 32), status);
167     return deposit32(lo, 16, 16, hi);
168 }
169 
170 /*
171  * VFP3 fixed point conversion. The AArch32 versions of fix-to-float
172  * must always round-to-nearest; the AArch64 ones honour the FPSCR
173  * rounding mode. (For AArch32 Neon the standard-FPSCR is set to
174  * round-to-nearest so either helper will work.) AArch32 float-to-fix
175  * must round-to-zero.
176  */
177 #define VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype)            \
178 ftype HELPER(vfp_##name##to##p)(uint##isz##_t  x, uint32_t shift,      \
179                                 float_status *fpst)                    \
180 { return itype##_to_##float##fsz##_scalbn(x, -shift, fpst); }
181 
182 #define VFP_CONV_FIX_FLOAT_ROUND(name, p, fsz, ftype, isz, itype)      \
183     ftype HELPER(vfp_##name##to##p##_round_to_nearest)(uint##isz##_t  x, \
184                                                      uint32_t shift,   \
185                                                      float_status *fpst) \
186     {                                                                  \
187         ftype ret;                                                     \
188         FloatRoundMode oldmode = fpst->float_rounding_mode;            \
189         fpst->float_rounding_mode = float_round_nearest_even;          \
190         ret = itype##_to_##float##fsz##_scalbn(x, -shift, fpst);       \
191         fpst->float_rounding_mode = oldmode;                           \
192         return ret;                                                    \
193     }
194 
195 #define VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype, ROUND, suff) \
196 uint##isz##_t HELPER(vfp_to##name##p##suff)(ftype x, uint32_t shift,      \
197                                             float_status *fpst)           \
198 {                                                                         \
199     if (unlikely(float##fsz##_is_any_nan(x))) {                           \
200         float_raise(float_flag_invalid, fpst);                            \
201         return 0;                                                         \
202     }                                                                     \
203     return float##fsz##_to_##itype##_scalbn(x, ROUND, shift, fpst);       \
204 }
205 
206 #define VFP_CONV_FIX(name, p, fsz, ftype, isz, itype)            \
207 VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype)              \
208 VFP_CONV_FIX_FLOAT_ROUND(name, p, fsz, ftype, isz, itype)        \
209 VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype,        \
210                          float_round_to_zero, _round_to_zero)    \
211 VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype,        \
212                          get_float_rounding_mode(fpst), )
213 
214 #define VFP_CONV_FIX_A64(name, p, fsz, ftype, isz, itype)        \
215 VFP_CONV_FIX_FLOAT(name, p, fsz, ftype, isz, itype)              \
216 VFP_CONV_FLOAT_FIX_ROUND(name, p, fsz, ftype, isz, itype,        \
217                          get_float_rounding_mode(fpst), )
218 
219 VFP_CONV_FIX(sh, d, 64, float64, 64, int16)
220 VFP_CONV_FIX(sl, d, 64, float64, 64, int32)
221 VFP_CONV_FIX_A64(sq, d, 64, float64, 64, int64)
222 VFP_CONV_FIX(uh, d, 64, float64, 64, uint16)
223 VFP_CONV_FIX(ul, d, 64, float64, 64, uint32)
224 VFP_CONV_FIX_A64(uq, d, 64, float64, 64, uint64)
225 VFP_CONV_FIX(sh, s, 32, float32, 32, int16)
226 VFP_CONV_FIX(sl, s, 32, float32, 32, int32)
227 VFP_CONV_FIX_A64(sq, s, 32, float32, 64, int64)
228 VFP_CONV_FIX(uh, s, 32, float32, 32, uint16)
229 VFP_CONV_FIX(ul, s, 32, float32, 32, uint32)
230 VFP_CONV_FIX_A64(uq, s, 32, float32, 64, uint64)
231 VFP_CONV_FIX(sh, h, 16, dh_ctype_f16, 32, int16)
232 VFP_CONV_FIX(sl, h, 16, dh_ctype_f16, 32, int32)
233 VFP_CONV_FIX_A64(sq, h, 16, dh_ctype_f16, 64, int64)
234 VFP_CONV_FIX(uh, h, 16, dh_ctype_f16, 32, uint16)
235 VFP_CONV_FIX(ul, h, 16, dh_ctype_f16, 32, uint32)
236 VFP_CONV_FIX_A64(uq, h, 16, dh_ctype_f16, 64, uint64)
237 VFP_CONV_FLOAT_FIX_ROUND(sq, d, 64, float64, 64, int64,
238                          float_round_to_zero, _round_to_zero)
239 VFP_CONV_FLOAT_FIX_ROUND(uq, d, 64, float64, 64, uint64,
240                          float_round_to_zero, _round_to_zero)
241 
242 #undef VFP_CONV_FIX
243 #undef VFP_CONV_FIX_FLOAT
244 #undef VFP_CONV_FLOAT_FIX_ROUND
245 #undef VFP_CONV_FIX_A64
246 
247 /* Set the current fp rounding mode and return the old one.
248  * The argument is a softfloat float_round_ value.
249  */
250 uint32_t HELPER(set_rmode)(uint32_t rmode, float_status *fp_status)
251 {
252     uint32_t prev_rmode = get_float_rounding_mode(fp_status);
253     set_float_rounding_mode(rmode, fp_status);
254 
255     return prev_rmode;
256 }
257 
258 /* Half precision conversions.  */
259 float32 HELPER(vfp_fcvt_f16_to_f32)(uint32_t a, float_status *fpst,
260                                     uint32_t ahp_mode)
261 {
262     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
263      * it would affect flushing input denormals.
264      */
265     bool save = get_flush_inputs_to_zero(fpst);
266     set_flush_inputs_to_zero(false, fpst);
267     float32 r = float16_to_float32(a, !ahp_mode, fpst);
268     set_flush_inputs_to_zero(save, fpst);
269     return r;
270 }
271 
272 uint32_t HELPER(vfp_fcvt_f32_to_f16)(float32 a, float_status *fpst,
273                                      uint32_t ahp_mode)
274 {
275     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
276      * it would affect flushing output denormals.
277      */
278     bool save = get_flush_to_zero(fpst);
279     set_flush_to_zero(false, fpst);
280     float16 r = float32_to_float16(a, !ahp_mode, fpst);
281     set_flush_to_zero(save, fpst);
282     return r;
283 }
284 
285 float64 HELPER(vfp_fcvt_f16_to_f64)(uint32_t a, float_status *fpst,
286                                     uint32_t ahp_mode)
287 {
288     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
289      * it would affect flushing input denormals.
290      */
291     bool save = get_flush_inputs_to_zero(fpst);
292     set_flush_inputs_to_zero(false, fpst);
293     float64 r = float16_to_float64(a, !ahp_mode, fpst);
294     set_flush_inputs_to_zero(save, fpst);
295     return r;
296 }
297 
298 uint32_t HELPER(vfp_fcvt_f64_to_f16)(float64 a, float_status *fpst,
299                                      uint32_t ahp_mode)
300 {
301     /* Squash FZ16 to 0 for the duration of conversion.  In this case,
302      * it would affect flushing output denormals.
303      */
304     bool save = get_flush_to_zero(fpst);
305     set_flush_to_zero(false, fpst);
306     float16 r = float64_to_float16(a, !ahp_mode, fpst);
307     set_flush_to_zero(save, fpst);
308     return r;
309 }
310 
311 /* NEON helpers.  */
312 
313 /* Constants 256 and 512 are used in some helpers; we avoid relying on
314  * int->float conversions at run-time.  */
315 #define float64_256 make_float64(0x4070000000000000LL)
316 #define float64_512 make_float64(0x4080000000000000LL)
317 #define float16_maxnorm make_float16(0x7bff)
318 #define float32_maxnorm make_float32(0x7f7fffff)
319 #define float64_maxnorm make_float64(0x7fefffffffffffffLL)
320 
321 /* Reciprocal functions
322  *
323  * The algorithm that must be used to calculate the estimate
324  * is specified by the ARM ARM, see FPRecipEstimate()/RecipEstimate
325  */
326 
327 /* See RecipEstimate()
328  *
329  * input is a 9 bit fixed point number
330  * input range 256 .. 511 for a number from 0.5 <= x < 1.0.
331  * result range 256 .. 511 for a number from 1.0 to 511/256.
332  */
333 
334 static int recip_estimate(int input)
335 {
336     int a, b, r;
337     assert(256 <= input && input < 512);
338     a = (input * 2) + 1;
339     b = (1 << 19) / a;
340     r = (b + 1) >> 1;
341     assert(256 <= r && r < 512);
342     return r;
343 }
344 
345 /*
346  * Increased precision version:
347  * input is a 13 bit fixed point number
348  * input range 2048 .. 4095 for a number from 0.5 <= x < 1.0.
349  * result range 4096 .. 8191 for a number from 1.0 to 2.0
350  */
351 static int recip_estimate_incprec(int input)
352 {
353     int a, b, r;
354     assert(2048 <= input && input < 4096);
355     a = (input * 2) + 1;
356     /*
357      * The pseudocode expresses this as an operation on infinite
358      * precision reals where it calculates 2^25 / a and then looks
359      * at the error between that and the rounded-down-to-integer
360      * value to see if it should instead round up. We instead
361      * follow the same approach as the pseudocode for the 8-bit
362      * precision version, and calculate (2 * (2^25 / a)) as an
363      * integer so we can do the "add one and halve" to round it.
364      * So the 1 << 26 here is correct.
365      */
366     b = (1 << 26) / a;
367     r = (b + 1) >> 1;
368     assert(4096 <= r && r < 8192);
369     return r;
370 }
371 
372 /*
373  * Common wrapper to call recip_estimate
374  *
375  * The parameters are exponent and 64 bit fraction (without implicit
376  * bit) where the binary point is nominally at bit 52. Returns a
377  * float64 which can then be rounded to the appropriate size by the
378  * callee.
379  */
380 
381 static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac,
382                                     bool increasedprecision)
383 {
384     uint32_t scaled, estimate;
385     uint64_t result_frac;
386     int result_exp;
387 
388     /* Handle sub-normals */
389     if (*exp == 0) {
390         if (extract64(frac, 51, 1) == 0) {
391             *exp = -1;
392             frac <<= 2;
393         } else {
394             frac <<= 1;
395         }
396     }
397 
398     if (increasedprecision) {
399         /* scaled = UInt('1':fraction<51:41>) */
400         scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11));
401         estimate = recip_estimate_incprec(scaled);
402     } else {
403         /* scaled = UInt('1':fraction<51:44>) */
404         scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
405         estimate = recip_estimate(scaled);
406     }
407 
408     result_exp = exp_off - *exp;
409     if (increasedprecision) {
410         result_frac = deposit64(0, 40, 12, estimate);
411     } else {
412         result_frac = deposit64(0, 44, 8, estimate);
413     }
414     if (result_exp == 0) {
415         result_frac = deposit64(result_frac >> 1, 51, 1, 1);
416     } else if (result_exp == -1) {
417         result_frac = deposit64(result_frac >> 2, 50, 2, 1);
418         result_exp = 0;
419     }
420 
421     *exp = result_exp;
422 
423     return result_frac;
424 }
425 
426 static bool round_to_inf(float_status *fpst, bool sign_bit)
427 {
428     switch (fpst->float_rounding_mode) {
429     case float_round_nearest_even: /* Round to Nearest */
430         return true;
431     case float_round_up: /* Round to +Inf */
432         return !sign_bit;
433     case float_round_down: /* Round to -Inf */
434         return sign_bit;
435     case float_round_to_zero: /* Round to Zero */
436         return false;
437     default:
438         g_assert_not_reached();
439     }
440 }
441 
442 uint32_t HELPER(recpe_f16)(uint32_t input, float_status *fpst)
443 {
444     float16 f16 = float16_squash_input_denormal(input, fpst);
445     uint32_t f16_val = float16_val(f16);
446     uint32_t f16_sign = float16_is_neg(f16);
447     int f16_exp = extract32(f16_val, 10, 5);
448     uint32_t f16_frac = extract32(f16_val, 0, 10);
449     uint64_t f64_frac;
450 
451     if (float16_is_any_nan(f16)) {
452         float16 nan = f16;
453         if (float16_is_signaling_nan(f16, fpst)) {
454             float_raise(float_flag_invalid, fpst);
455             if (!fpst->default_nan_mode) {
456                 nan = float16_silence_nan(f16, fpst);
457             }
458         }
459         if (fpst->default_nan_mode) {
460             nan =  float16_default_nan(fpst);
461         }
462         return nan;
463     } else if (float16_is_infinity(f16)) {
464         return float16_set_sign(float16_zero, float16_is_neg(f16));
465     } else if (float16_is_zero(f16)) {
466         float_raise(float_flag_divbyzero, fpst);
467         return float16_set_sign(float16_infinity, float16_is_neg(f16));
468     } else if (float16_abs(f16) < (1 << 8)) {
469         /* Abs(value) < 2.0^-16 */
470         float_raise(float_flag_overflow | float_flag_inexact, fpst);
471         if (round_to_inf(fpst, f16_sign)) {
472             return float16_set_sign(float16_infinity, f16_sign);
473         } else {
474             return float16_set_sign(float16_maxnorm, f16_sign);
475         }
476     } else if (f16_exp >= 29 && fpst->flush_to_zero) {
477         float_raise(float_flag_underflow, fpst);
478         return float16_set_sign(float16_zero, float16_is_neg(f16));
479     }
480 
481     f64_frac = call_recip_estimate(&f16_exp, 29,
482                                    ((uint64_t) f16_frac) << (52 - 10), false);
483 
484     /* result = sign : result_exp<4:0> : fraction<51:42> */
485     f16_val = deposit32(0, 15, 1, f16_sign);
486     f16_val = deposit32(f16_val, 10, 5, f16_exp);
487     f16_val = deposit32(f16_val, 0, 10, extract64(f64_frac, 52 - 10, 10));
488     return make_float16(f16_val);
489 }
490 
491 /*
492  * FEAT_RPRES means the f32 FRECPE has an "increased precision" variant
493  * which is used when FPCR.AH == 1.
494  */
495 static float32 do_recpe_f32(float32 input, float_status *fpst, bool rpres)
496 {
497     float32 f32 = float32_squash_input_denormal(input, fpst);
498     uint32_t f32_val = float32_val(f32);
499     bool f32_sign = float32_is_neg(f32);
500     int f32_exp = extract32(f32_val, 23, 8);
501     uint32_t f32_frac = extract32(f32_val, 0, 23);
502     uint64_t f64_frac;
503 
504     if (float32_is_any_nan(f32)) {
505         float32 nan = f32;
506         if (float32_is_signaling_nan(f32, fpst)) {
507             float_raise(float_flag_invalid, fpst);
508             if (!fpst->default_nan_mode) {
509                 nan = float32_silence_nan(f32, fpst);
510             }
511         }
512         if (fpst->default_nan_mode) {
513             nan =  float32_default_nan(fpst);
514         }
515         return nan;
516     } else if (float32_is_infinity(f32)) {
517         return float32_set_sign(float32_zero, float32_is_neg(f32));
518     } else if (float32_is_zero(f32)) {
519         float_raise(float_flag_divbyzero, fpst);
520         return float32_set_sign(float32_infinity, float32_is_neg(f32));
521     } else if (float32_abs(f32) < (1ULL << 21)) {
522         /* Abs(value) < 2.0^-128 */
523         float_raise(float_flag_overflow | float_flag_inexact, fpst);
524         if (round_to_inf(fpst, f32_sign)) {
525             return float32_set_sign(float32_infinity, f32_sign);
526         } else {
527             return float32_set_sign(float32_maxnorm, f32_sign);
528         }
529     } else if (f32_exp >= 253 && fpst->flush_to_zero) {
530         float_raise(float_flag_underflow, fpst);
531         return float32_set_sign(float32_zero, float32_is_neg(f32));
532     }
533 
534     f64_frac = call_recip_estimate(&f32_exp, 253,
535                                    ((uint64_t) f32_frac) << (52 - 23), rpres);
536 
537     /* result = sign : result_exp<7:0> : fraction<51:29> */
538     f32_val = deposit32(0, 31, 1, f32_sign);
539     f32_val = deposit32(f32_val, 23, 8, f32_exp);
540     f32_val = deposit32(f32_val, 0, 23, extract64(f64_frac, 52 - 23, 23));
541     return make_float32(f32_val);
542 }
543 
544 float32 HELPER(recpe_f32)(float32 input, float_status *fpst)
545 {
546     return do_recpe_f32(input, fpst, false);
547 }
548 
549 float32 HELPER(recpe_rpres_f32)(float32 input, float_status *fpst)
550 {
551     return do_recpe_f32(input, fpst, true);
552 }
553 
554 float64 HELPER(recpe_f64)(float64 input, float_status *fpst)
555 {
556     float64 f64 = float64_squash_input_denormal(input, fpst);
557     uint64_t f64_val = float64_val(f64);
558     bool f64_sign = float64_is_neg(f64);
559     int f64_exp = extract64(f64_val, 52, 11);
560     uint64_t f64_frac = extract64(f64_val, 0, 52);
561 
562     /* Deal with any special cases */
563     if (float64_is_any_nan(f64)) {
564         float64 nan = f64;
565         if (float64_is_signaling_nan(f64, fpst)) {
566             float_raise(float_flag_invalid, fpst);
567             if (!fpst->default_nan_mode) {
568                 nan = float64_silence_nan(f64, fpst);
569             }
570         }
571         if (fpst->default_nan_mode) {
572             nan =  float64_default_nan(fpst);
573         }
574         return nan;
575     } else if (float64_is_infinity(f64)) {
576         return float64_set_sign(float64_zero, float64_is_neg(f64));
577     } else if (float64_is_zero(f64)) {
578         float_raise(float_flag_divbyzero, fpst);
579         return float64_set_sign(float64_infinity, float64_is_neg(f64));
580     } else if ((f64_val & ~(1ULL << 63)) < (1ULL << 50)) {
581         /* Abs(value) < 2.0^-1024 */
582         float_raise(float_flag_overflow | float_flag_inexact, fpst);
583         if (round_to_inf(fpst, f64_sign)) {
584             return float64_set_sign(float64_infinity, f64_sign);
585         } else {
586             return float64_set_sign(float64_maxnorm, f64_sign);
587         }
588     } else if (f64_exp >= 2045 && fpst->flush_to_zero) {
589         float_raise(float_flag_underflow, fpst);
590         return float64_set_sign(float64_zero, float64_is_neg(f64));
591     }
592 
593     f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac, false);
594 
595     /* result = sign : result_exp<10:0> : fraction<51:0>; */
596     f64_val = deposit64(0, 63, 1, f64_sign);
597     f64_val = deposit64(f64_val, 52, 11, f64_exp);
598     f64_val = deposit64(f64_val, 0, 52, f64_frac);
599     return make_float64(f64_val);
600 }
601 
602 /* The algorithm that must be used to calculate the estimate
603  * is specified by the ARM ARM.
604  */
605 
606 static int do_recip_sqrt_estimate(int a)
607 {
608     int b, estimate;
609 
610     assert(128 <= a && a < 512);
611     if (a < 256) {
612         a = a * 2 + 1;
613     } else {
614         a = (a >> 1) << 1;
615         a = (a + 1) * 2;
616     }
617     b = 512;
618     while (a * (b + 1) * (b + 1) < (1 << 28)) {
619         b += 1;
620     }
621     estimate = (b + 1) / 2;
622     assert(256 <= estimate && estimate < 512);
623 
624     return estimate;
625 }
626 
627 static int do_recip_sqrt_estimate_incprec(int a)
628 {
629     /*
630      * The Arm ARM describes the 12-bit precision version of RecipSqrtEstimate
631      * in terms of an infinite-precision floating point calculation of a
632      * square root. We implement this using the same kind of pure integer
633      * algorithm as the 8-bit mantissa, to get the same bit-for-bit result.
634      */
635     int64_t b, estimate;
636 
637     assert(1024 <= a && a < 4096);
638     if (a < 2048) {
639         a = a * 2 + 1;
640     } else {
641         a = (a >> 1) << 1;
642         a = (a + 1) * 2;
643     }
644     b = 8192;
645     while (a * (b + 1) * (b + 1) < (1ULL << 39)) {
646         b += 1;
647     }
648     estimate = (b + 1) / 2;
649 
650     assert(4096 <= estimate && estimate < 8192);
651 
652     return estimate;
653 }
654 
655 static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac,
656                                     bool increasedprecision)
657 {
658     int estimate;
659     uint32_t scaled;
660 
661     if (*exp == 0) {
662         while (extract64(frac, 51, 1) == 0) {
663             frac = frac << 1;
664             *exp -= 1;
665         }
666         frac = extract64(frac, 0, 51) << 1;
667     }
668 
669     if (increasedprecision) {
670         if (*exp & 1) {
671             /* scaled = UInt('01':fraction<51:42>) */
672             scaled = deposit32(1 << 10, 0, 10, extract64(frac, 42, 10));
673         } else {
674             /* scaled = UInt('1':fraction<51:41>) */
675             scaled = deposit32(1 << 11, 0, 11, extract64(frac, 41, 11));
676         }
677         estimate = do_recip_sqrt_estimate_incprec(scaled);
678     } else {
679         if (*exp & 1) {
680             /* scaled = UInt('01':fraction<51:45>) */
681             scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7));
682         } else {
683             /* scaled = UInt('1':fraction<51:44>) */
684             scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
685         }
686         estimate = do_recip_sqrt_estimate(scaled);
687     }
688 
689     *exp = (exp_off - *exp) / 2;
690     if (increasedprecision) {
691         return extract64(estimate, 0, 12) << 40;
692     } else {
693         return extract64(estimate, 0, 8) << 44;
694     }
695 }
696 
697 uint32_t HELPER(rsqrte_f16)(uint32_t input, float_status *s)
698 {
699     float16 f16 = float16_squash_input_denormal(input, s);
700     uint16_t val = float16_val(f16);
701     bool f16_sign = float16_is_neg(f16);
702     int f16_exp = extract32(val, 10, 5);
703     uint16_t f16_frac = extract32(val, 0, 10);
704     uint64_t f64_frac;
705 
706     if (float16_is_any_nan(f16)) {
707         float16 nan = f16;
708         if (float16_is_signaling_nan(f16, s)) {
709             float_raise(float_flag_invalid, s);
710             if (!s->default_nan_mode) {
711                 nan = float16_silence_nan(f16, s);
712             }
713         }
714         if (s->default_nan_mode) {
715             nan =  float16_default_nan(s);
716         }
717         return nan;
718     } else if (float16_is_zero(f16)) {
719         float_raise(float_flag_divbyzero, s);
720         return float16_set_sign(float16_infinity, f16_sign);
721     } else if (f16_sign) {
722         float_raise(float_flag_invalid, s);
723         return float16_default_nan(s);
724     } else if (float16_is_infinity(f16)) {
725         return float16_zero;
726     }
727 
728     /* Scale and normalize to a double-precision value between 0.25 and 1.0,
729      * preserving the parity of the exponent.  */
730 
731     f64_frac = ((uint64_t) f16_frac) << (52 - 10);
732 
733     f64_frac = recip_sqrt_estimate(&f16_exp, 44, f64_frac, false);
734 
735     /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(2) */
736     val = deposit32(0, 15, 1, f16_sign);
737     val = deposit32(val, 10, 5, f16_exp);
738     val = deposit32(val, 2, 8, extract64(f64_frac, 52 - 8, 8));
739     return make_float16(val);
740 }
741 
742 /*
743  * FEAT_RPRES means the f32 FRSQRTE has an "increased precision" variant
744  * which is used when FPCR.AH == 1.
745  */
746 static float32 do_rsqrte_f32(float32 input, float_status *s, bool rpres)
747 {
748     float32 f32 = float32_squash_input_denormal(input, s);
749     uint32_t val = float32_val(f32);
750     uint32_t f32_sign = float32_is_neg(f32);
751     int f32_exp = extract32(val, 23, 8);
752     uint32_t f32_frac = extract32(val, 0, 23);
753     uint64_t f64_frac;
754 
755     if (float32_is_any_nan(f32)) {
756         float32 nan = f32;
757         if (float32_is_signaling_nan(f32, s)) {
758             float_raise(float_flag_invalid, s);
759             if (!s->default_nan_mode) {
760                 nan = float32_silence_nan(f32, s);
761             }
762         }
763         if (s->default_nan_mode) {
764             nan =  float32_default_nan(s);
765         }
766         return nan;
767     } else if (float32_is_zero(f32)) {
768         float_raise(float_flag_divbyzero, s);
769         return float32_set_sign(float32_infinity, float32_is_neg(f32));
770     } else if (float32_is_neg(f32)) {
771         float_raise(float_flag_invalid, s);
772         return float32_default_nan(s);
773     } else if (float32_is_infinity(f32)) {
774         return float32_zero;
775     }
776 
777     /* Scale and normalize to a double-precision value between 0.25 and 1.0,
778      * preserving the parity of the exponent.  */
779 
780     f64_frac = ((uint64_t) f32_frac) << 29;
781 
782     f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac, rpres);
783 
784     /*
785      * result = sign : result_exp<7:0> : estimate<7:0> : Zeros(15)
786      * or for increased precision
787      * result = sign : result_exp<7:0> : estimate<11:0> : Zeros(11)
788      */
789     val = deposit32(0, 31, 1, f32_sign);
790     val = deposit32(val, 23, 8, f32_exp);
791     if (rpres) {
792         val = deposit32(val, 11, 12, extract64(f64_frac, 52 - 12, 12));
793     } else {
794         val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8));
795     }
796     return make_float32(val);
797 }
798 
799 float32 HELPER(rsqrte_f32)(float32 input, float_status *s)
800 {
801     return do_rsqrte_f32(input, s, false);
802 }
803 
804 float32 HELPER(rsqrte_rpres_f32)(float32 input, float_status *s)
805 {
806     return do_rsqrte_f32(input, s, true);
807 }
808 
809 float64 HELPER(rsqrte_f64)(float64 input, float_status *s)
810 {
811     float64 f64 = float64_squash_input_denormal(input, s);
812     uint64_t val = float64_val(f64);
813     bool f64_sign = float64_is_neg(f64);
814     int f64_exp = extract64(val, 52, 11);
815     uint64_t f64_frac = extract64(val, 0, 52);
816 
817     if (float64_is_any_nan(f64)) {
818         float64 nan = f64;
819         if (float64_is_signaling_nan(f64, s)) {
820             float_raise(float_flag_invalid, s);
821             if (!s->default_nan_mode) {
822                 nan = float64_silence_nan(f64, s);
823             }
824         }
825         if (s->default_nan_mode) {
826             nan =  float64_default_nan(s);
827         }
828         return nan;
829     } else if (float64_is_zero(f64)) {
830         float_raise(float_flag_divbyzero, s);
831         return float64_set_sign(float64_infinity, float64_is_neg(f64));
832     } else if (float64_is_neg(f64)) {
833         float_raise(float_flag_invalid, s);
834         return float64_default_nan(s);
835     } else if (float64_is_infinity(f64)) {
836         return float64_zero;
837     }
838 
839     f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac, false);
840 
841     /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(44) */
842     val = deposit64(0, 61, 1, f64_sign);
843     val = deposit64(val, 52, 11, f64_exp);
844     val = deposit64(val, 44, 8, extract64(f64_frac, 52 - 8, 8));
845     return make_float64(val);
846 }
847 
848 uint32_t HELPER(recpe_u32)(uint32_t a)
849 {
850     int input, estimate;
851 
852     if ((a & 0x80000000) == 0) {
853         return 0xffffffff;
854     }
855 
856     input = extract32(a, 23, 9);
857     estimate = recip_estimate(input);
858 
859     return deposit32(0, (32 - 9), 9, estimate);
860 }
861 
862 uint32_t HELPER(rsqrte_u32)(uint32_t a)
863 {
864     int estimate;
865 
866     if ((a & 0xc0000000) == 0) {
867         return 0xffffffff;
868     }
869 
870     estimate = do_recip_sqrt_estimate(extract32(a, 23, 9));
871 
872     return deposit32(0, 23, 9, estimate);
873 }
874 
875 /* VFPv4 fused multiply-accumulate */
876 dh_ctype_f16 VFP_HELPER(muladd, h)(dh_ctype_f16 a, dh_ctype_f16 b,
877                                    dh_ctype_f16 c, float_status *fpst)
878 {
879     return float16_muladd(a, b, c, 0, fpst);
880 }
881 
882 float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c,
883                               float_status *fpst)
884 {
885     return float32_muladd(a, b, c, 0, fpst);
886 }
887 
888 float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c,
889                               float_status *fpst)
890 {
891     return float64_muladd(a, b, c, 0, fpst);
892 }
893 
894 /* ARMv8 round to integral */
895 dh_ctype_f16 HELPER(rinth_exact)(dh_ctype_f16 x, float_status *fp_status)
896 {
897     return float16_round_to_int(x, fp_status);
898 }
899 
900 float32 HELPER(rints_exact)(float32 x, float_status *fp_status)
901 {
902     return float32_round_to_int(x, fp_status);
903 }
904 
905 float64 HELPER(rintd_exact)(float64 x, float_status *fp_status)
906 {
907     return float64_round_to_int(x, fp_status);
908 }
909 
910 dh_ctype_f16 HELPER(rinth)(dh_ctype_f16 x, float_status *fp_status)
911 {
912     int old_flags = get_float_exception_flags(fp_status), new_flags;
913     float16 ret;
914 
915     ret = float16_round_to_int(x, fp_status);
916 
917     /* Suppress any inexact exceptions the conversion produced */
918     if (!(old_flags & float_flag_inexact)) {
919         new_flags = get_float_exception_flags(fp_status);
920         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
921     }
922 
923     return ret;
924 }
925 
926 float32 HELPER(rints)(float32 x, float_status *fp_status)
927 {
928     int old_flags = get_float_exception_flags(fp_status), new_flags;
929     float32 ret;
930 
931     ret = float32_round_to_int(x, fp_status);
932 
933     /* Suppress any inexact exceptions the conversion produced */
934     if (!(old_flags & float_flag_inexact)) {
935         new_flags = get_float_exception_flags(fp_status);
936         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
937     }
938 
939     return ret;
940 }
941 
942 float64 HELPER(rintd)(float64 x, float_status *fp_status)
943 {
944     int old_flags = get_float_exception_flags(fp_status), new_flags;
945     float64 ret;
946 
947     ret = float64_round_to_int(x, fp_status);
948 
949     /* Suppress any inexact exceptions the conversion produced */
950     if (!(old_flags & float_flag_inexact)) {
951         new_flags = get_float_exception_flags(fp_status);
952         set_float_exception_flags(new_flags & ~float_flag_inexact, fp_status);
953     }
954 
955     return ret;
956 }
957 
958 /* Convert ARM rounding mode to softfloat */
959 const FloatRoundMode arm_rmode_to_sf_map[] = {
960     [FPROUNDING_TIEEVEN] = float_round_nearest_even,
961     [FPROUNDING_POSINF] = float_round_up,
962     [FPROUNDING_NEGINF] = float_round_down,
963     [FPROUNDING_ZERO] = float_round_to_zero,
964     [FPROUNDING_TIEAWAY] = float_round_ties_away,
965     [FPROUNDING_ODD] = float_round_to_odd,
966 };
967 
968 /*
969  * Implement float64 to int32_t conversion without saturation;
970  * the result is supplied modulo 2^32.
971  */
972 uint64_t HELPER(fjcvtzs)(float64 value, float_status *status)
973 {
974     uint32_t frac, e_old, e_new;
975     bool inexact;
976 
977     e_old = get_float_exception_flags(status);
978     set_float_exception_flags(0, status);
979     frac = float64_to_int32_modulo(value, float_round_to_zero, status);
980     e_new = get_float_exception_flags(status);
981     set_float_exception_flags(e_old | e_new, status);
982 
983     /* Normal inexact, denormal with flush-to-zero, or overflow or NaN */
984     inexact = e_new & (float_flag_inexact |
985                        float_flag_input_denormal_flushed |
986                        float_flag_invalid);
987 
988     /* While not inexact for IEEE FP, -0.0 is inexact for JavaScript. */
989     inexact |= value == float64_chs(float64_zero);
990 
991     /* Pack the result and the env->ZF representation of Z together.  */
992     return deposit64(frac, 32, 32, inexact);
993 }
994 
995 uint32_t HELPER(vjcvt)(float64 value, CPUARMState *env)
996 {
997     uint64_t pair = HELPER(fjcvtzs)(value, &env->vfp.fp_status[FPST_A32]);
998     uint32_t result = pair;
999     uint32_t z = (pair >> 32) == 0;
1000 
1001     /* Store Z, clear NCV, in FPSCR.NZCV.  */
1002     env->vfp.fpsr = (env->vfp.fpsr & ~FPSR_NZCV_MASK) | (z * FPSR_Z);
1003 
1004     return result;
1005 }
1006 
1007 /* Round a float32 to an integer that fits in int32_t or int64_t.  */
1008 static float32 frint_s(float32 f, float_status *fpst, int intsize)
1009 {
1010     int old_flags = get_float_exception_flags(fpst);
1011     uint32_t exp = extract32(f, 23, 8);
1012 
1013     if (unlikely(exp == 0xff)) {
1014         /* NaN or Inf.  */
1015         goto overflow;
1016     }
1017 
1018     /* Round and re-extract the exponent.  */
1019     f = float32_round_to_int(f, fpst);
1020     exp = extract32(f, 23, 8);
1021 
1022     /* Validate the range of the result.  */
1023     if (exp < 126 + intsize) {
1024         /* abs(F) <= INT{N}_MAX */
1025         return f;
1026     }
1027     if (exp == 126 + intsize) {
1028         uint32_t sign = extract32(f, 31, 1);
1029         uint32_t frac = extract32(f, 0, 23);
1030         if (sign && frac == 0) {
1031             /* F == INT{N}_MIN */
1032             return f;
1033         }
1034     }
1035 
1036  overflow:
1037     /*
1038      * Raise Invalid and return INT{N}_MIN as a float.  Revert any
1039      * inexact exception float32_round_to_int may have raised.
1040      */
1041     set_float_exception_flags(old_flags | float_flag_invalid, fpst);
1042     return (0x100u + 126u + intsize) << 23;
1043 }
1044 
1045 float32 HELPER(frint32_s)(float32 f, float_status *fpst)
1046 {
1047     return frint_s(f, fpst, 32);
1048 }
1049 
1050 float32 HELPER(frint64_s)(float32 f, float_status *fpst)
1051 {
1052     return frint_s(f, fpst, 64);
1053 }
1054 
1055 /* Round a float64 to an integer that fits in int32_t or int64_t.  */
1056 static float64 frint_d(float64 f, float_status *fpst, int intsize)
1057 {
1058     int old_flags = get_float_exception_flags(fpst);
1059     uint32_t exp = extract64(f, 52, 11);
1060 
1061     if (unlikely(exp == 0x7ff)) {
1062         /* NaN or Inf.  */
1063         goto overflow;
1064     }
1065 
1066     /* Round and re-extract the exponent.  */
1067     f = float64_round_to_int(f, fpst);
1068     exp = extract64(f, 52, 11);
1069 
1070     /* Validate the range of the result.  */
1071     if (exp < 1022 + intsize) {
1072         /* abs(F) <= INT{N}_MAX */
1073         return f;
1074     }
1075     if (exp == 1022 + intsize) {
1076         uint64_t sign = extract64(f, 63, 1);
1077         uint64_t frac = extract64(f, 0, 52);
1078         if (sign && frac == 0) {
1079             /* F == INT{N}_MIN */
1080             return f;
1081         }
1082     }
1083 
1084  overflow:
1085     /*
1086      * Raise Invalid and return INT{N}_MIN as a float.  Revert any
1087      * inexact exception float64_round_to_int may have raised.
1088      */
1089     set_float_exception_flags(old_flags | float_flag_invalid, fpst);
1090     return (uint64_t)(0x800 + 1022 + intsize) << 52;
1091 }
1092 
1093 float64 HELPER(frint32_d)(float64 f, float_status *fpst)
1094 {
1095     return frint_d(f, fpst, 32);
1096 }
1097 
1098 float64 HELPER(frint64_d)(float64 f, float_status *fpst)
1099 {
1100     return frint_d(f, fpst, 64);
1101 }
1102 
1103 void HELPER(check_hcr_el2_trap)(CPUARMState *env, uint32_t rt, uint32_t reg)
1104 {
1105     uint32_t syndrome;
1106 
1107     switch (reg) {
1108     case ARM_VFP_MVFR0:
1109     case ARM_VFP_MVFR1:
1110     case ARM_VFP_MVFR2:
1111         if (!(arm_hcr_el2_eff(env) & HCR_TID3)) {
1112             return;
1113         }
1114         break;
1115     case ARM_VFP_FPSID:
1116         if (!(arm_hcr_el2_eff(env) & HCR_TID0)) {
1117             return;
1118         }
1119         break;
1120     default:
1121         g_assert_not_reached();
1122     }
1123 
1124     syndrome = ((EC_FPIDTRAP << ARM_EL_EC_SHIFT)
1125                 | ARM_EL_IL
1126                 | (1 << 24) | (0xe << 20) | (7 << 14)
1127                 | (reg << 10) | (rt << 5) | 1);
1128 
1129     raise_exception(env, EXCP_HYP_TRAP, syndrome, 2);
1130 }
1131 
1132 uint32_t HELPER(vfp_get_fpscr)(CPUARMState *env)
1133 {
1134     return vfp_get_fpscr(env);
1135 }
1136 
1137 void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val)
1138 {
1139     vfp_set_fpscr(env, val);
1140 }
1141