xref: /qemu/target/i386/tcg/fpu_helper.c (revision 513823e7521a09ed7ad1e32e6454bac3b2cbf52d)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 #include "access.h"
31 
/*
 * float macros
 *
 * Shorthand accessors for the x87 register file.  They expand to
 * expressions on a variable named "env", so they are only usable where a
 * CPUX86State pointer with that name is in scope.
 */
#define FT0    (env->ft0)
/* Register at the current top of stack (index env->fpstt). */
#define ST0    (env->fpregs[env->fpstt].d)
/* Register n slots above the top of stack; the index wraps modulo 8. */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)

/*
 * Rounding-control field of the control word (env->fpuc), bits 10-11.
 * The FPU_RC_* values below are already shifted into position, so they
 * compare directly against (env->fpuc & FPU_RC_MASK).
 */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00

/*
 * 2^63 written as a double constant.
 * NOTE(review): presumably the argument limit for the trigonometric
 * helpers; its users are not visible in this part of the file.
 */
#define MAXTAN 9223372036854775808.0

/*
 * the following deal with x86 long double-precision numbers
 *
 * "fp" must be a CPU_LDoubleU lvalue: l.upper holds the sign bit (0x8000)
 * and the 15-bit exponent, l.lower holds the 64-bit significand.
 * BIASEXPONENT() expands to a bare assignment that replaces the exponent
 * field with EXPBIAS while keeping the sign bit; use it only as a
 * stand-alone statement.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
#define EXPD(fp)        (fp.l.upper & 0x7fff)
#define SIGND(fp)       ((fp.l.upper) & 0x8000)
#define MANTD(fp)       (fp.l.lower)
#define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS

/*
 * Bits of the status word (env->fpus).  IE/DE/ZE/OE/UE/PE are the
 * invalid, denormal, divide-by-zero, overflow, underflow and precision
 * exception flags that merge_exception_flags() derives from the softfloat
 * flags.  FPUS_SE and FPUS_B are set together by fpu_set_exception() when
 * an exception whose mask bit in env->fpuc is clear is pending.
 * FPUS_SF is status-word bit 6 (the stack-fault bit in the x87
 * architecture); nothing in this part of the file sets it.
 */
#define FPUS_IE (1 << 0)
#define FPUS_DE (1 << 1)
#define FPUS_ZE (1 << 2)
#define FPUS_OE (1 << 3)
#define FPUS_UE (1 << 4)
#define FPUS_PE (1 << 5)
#define FPUS_SF (1 << 6)
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)

/* The six exception-mask bits in the low part of the control word. */
#define FPUC_EM 0x3f

/*
 * Constants pushed by the FLDLG2/FLDL2E/FLDL2T/FLDLN2/FLDPI helpers.
 * The unsuffixed value is the default; the "_d" variant is selected when
 * the rounding control is round-down or chop, and the "_u" variant when
 * it is round-up (see the helper_fld*_ST0 functions).
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
75 
76 static inline void fpush(CPUX86State *env)
77 {
78     env->fpstt = (env->fpstt - 1) & 7;
79     env->fptags[env->fpstt] = 0; /* validate stack entry */
80 }
81 
82 static inline void fpop(CPUX86State *env)
83 {
84     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
85     env->fpstt = (env->fpstt + 1) & 7;
86 }
87 
88 static floatx80 do_fldt(X86Access *ac, target_ulong ptr)
89 {
90     CPU_LDoubleU temp;
91 
92     temp.l.lower = access_ldq(ac, ptr);
93     temp.l.upper = access_ldw(ac, ptr + 8);
94     return temp.d;
95 }
96 
97 static void do_fstt(X86Access *ac, target_ulong ptr, floatx80 f)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     access_stq(ac, ptr, temp.l.lower);
103     access_stw(ac, ptr + 8, temp.l.upper);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
129 
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
138 void cpu_init_fp_statuses(CPUX86State *env)
139 {
140     /*
141      * Initialise the non-runtime-varying fields of the various
142      * float_status words to x86 behaviour. This must be called at
143      * CPU reset because the float_status words are in the
144      * "zeroed on reset" portion of the CPU state struct.
145      * Fields in float_status that vary under guest control are set
146      * via the codepath for setting that register, eg cpu_set_fpuc().
147      */
148     /*
149      * Use x87 NaN propagation rules:
150      * SNaN + QNaN => return the QNaN
151      * two SNaNs => return the one with the larger significand, silenced
152      * two QNaNs => return the one with the larger significand
153      * SNaN and a non-NaN => return the SNaN, silenced
154      * QNaN and a non-NaN => return the QNaN
155      *
156      * If we get down to comparing significands and they are the same,
157      * return the NaN with the positive sign bit (if any).
158      */
159     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status);
160     /*
161      * TODO: These are incorrect: the x86 Software Developer's Manual vol 1
162      * section 4.8.3.5 "Operating on SNaNs and QNaNs" says that the
163      * "larger significand" behaviour is only used for x87 FPU operations.
164      * For SSE the required behaviour is to always return the first NaN,
165      * which is float_2nan_prop_ab.
166      *
167      * mmx_status is used only for the AMD 3DNow! instructions, which
168      * are documented in the "3DNow! Technology Manual" as not supporting
169      * NaNs or infinities as inputs. The result of passing two NaNs is
170      * documented as "undefined", so we can do what we choose.
171      * (Strictly there is some behaviour we don't implement correctly
172      * for these "unsupported" NaN and Inf values, like "NaN * 0 == 0".)
173      */
174     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->mmx_status);
175     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->sse_status);
176     /*
177      * Only SSE has multiply-add instructions. In the SDM Section 14.5.2
178      * "Fused-Multiply-ADD (FMA) Numeric Behavior" the NaN handling is
179      * specified -- for 0 * inf + NaN the input NaN is selected, and if
180      * there are multiple input NaNs they are selected in the order a, b, c.
181      * We also do not raise Invalid for the 0 * inf + (Q)NaN case.
182      */
183     set_float_infzeronan_rule(float_infzeronan_dnan_never |
184                               float_infzeronan_suppress_invalid,
185                               &env->sse_status);
186     set_float_3nan_prop_rule(float_3nan_prop_abc, &env->sse_status);
187     /* Default NaN: sign bit set, most significant frac bit set */
188     set_float_default_nan_pattern(0b11000000, &env->fp_status);
189     set_float_default_nan_pattern(0b11000000, &env->mmx_status);
190     set_float_default_nan_pattern(0b11000000, &env->sse_status);
191     /*
192      * TODO: x86 does flush-to-zero detection after rounding (the SDM
193      * section 10.2.3.3 on the FTZ bit of MXCSR says that we flush
194      * when we detect underflow, which x86 does after rounding).
195      */
196     set_float_ftz_detection(float_ftz_before_rounding, &env->fp_status);
197     set_float_ftz_detection(float_ftz_before_rounding, &env->mmx_status);
198     set_float_ftz_detection(float_ftz_before_rounding, &env->sse_status);
199 }
200 
201 static inline uint8_t save_exception_flags(CPUX86State *env)
202 {
203     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
204     set_float_exception_flags(0, &env->fp_status);
205     return old_flags;
206 }
207 
208 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
209 {
210     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
211     float_raise(old_flags, &env->fp_status);
212     fpu_set_exception(env,
213                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
214                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
215                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
216                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
217                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
218                        (new_flags & float_flag_input_denormal_flushed ? FPUS_DE : 0)));
219 }
220 
221 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
222 {
223     uint8_t old_flags = save_exception_flags(env);
224     floatx80 ret = floatx80_div(a, b, &env->fp_status);
225     merge_exception_flags(env, old_flags);
226     return ret;
227 }
228 
229 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
230 {
231     if (env->cr[0] & CR0_NE_MASK) {
232         raise_exception_ra(env, EXCP10_COPR, retaddr);
233     }
234 #if !defined(CONFIG_USER_ONLY)
235     else {
236         fpu_check_raise_ferr_irq(env);
237     }
238 #endif
239 }
240 
241 void helper_flds_FT0(CPUX86State *env, uint32_t val)
242 {
243     uint8_t old_flags = save_exception_flags(env);
244     union {
245         float32 f;
246         uint32_t i;
247     } u;
248 
249     u.i = val;
250     FT0 = float32_to_floatx80(u.f, &env->fp_status);
251     merge_exception_flags(env, old_flags);
252 }
253 
254 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
255 {
256     uint8_t old_flags = save_exception_flags(env);
257     union {
258         float64 f;
259         uint64_t i;
260     } u;
261 
262     u.i = val;
263     FT0 = float64_to_floatx80(u.f, &env->fp_status);
264     merge_exception_flags(env, old_flags);
265 }
266 
267 void helper_fildl_FT0(CPUX86State *env, int32_t val)
268 {
269     FT0 = int32_to_floatx80(val, &env->fp_status);
270 }
271 
272 void helper_flds_ST0(CPUX86State *env, uint32_t val)
273 {
274     uint8_t old_flags = save_exception_flags(env);
275     int new_fpstt;
276     union {
277         float32 f;
278         uint32_t i;
279     } u;
280 
281     new_fpstt = (env->fpstt - 1) & 7;
282     u.i = val;
283     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
284     env->fpstt = new_fpstt;
285     env->fptags[new_fpstt] = 0; /* validate stack entry */
286     merge_exception_flags(env, old_flags);
287 }
288 
289 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
290 {
291     uint8_t old_flags = save_exception_flags(env);
292     int new_fpstt;
293     union {
294         float64 f;
295         uint64_t i;
296     } u;
297 
298     new_fpstt = (env->fpstt - 1) & 7;
299     u.i = val;
300     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
301     env->fpstt = new_fpstt;
302     env->fptags[new_fpstt] = 0; /* validate stack entry */
303     merge_exception_flags(env, old_flags);
304 }
305 
306 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
307 {
308     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
309     set_floatx80_rounding_precision(floatx80_precision_x, st);
310     return old;
311 }
312 
313 void helper_fildl_ST0(CPUX86State *env, int32_t val)
314 {
315     int new_fpstt;
316     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
317 
318     new_fpstt = (env->fpstt - 1) & 7;
319     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
320     env->fpstt = new_fpstt;
321     env->fptags[new_fpstt] = 0; /* validate stack entry */
322 
323     set_floatx80_rounding_precision(old, &env->fp_status);
324 }
325 
326 void helper_fildll_ST0(CPUX86State *env, int64_t val)
327 {
328     int new_fpstt;
329     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
330 
331     new_fpstt = (env->fpstt - 1) & 7;
332     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
333     env->fpstt = new_fpstt;
334     env->fptags[new_fpstt] = 0; /* validate stack entry */
335 
336     set_floatx80_rounding_precision(old, &env->fp_status);
337 }
338 
339 uint32_t helper_fsts_ST0(CPUX86State *env)
340 {
341     uint8_t old_flags = save_exception_flags(env);
342     union {
343         float32 f;
344         uint32_t i;
345     } u;
346 
347     u.f = floatx80_to_float32(ST0, &env->fp_status);
348     merge_exception_flags(env, old_flags);
349     return u.i;
350 }
351 
352 uint64_t helper_fstl_ST0(CPUX86State *env)
353 {
354     uint8_t old_flags = save_exception_flags(env);
355     union {
356         float64 f;
357         uint64_t i;
358     } u;
359 
360     u.f = floatx80_to_float64(ST0, &env->fp_status);
361     merge_exception_flags(env, old_flags);
362     return u.i;
363 }
364 
365 int32_t helper_fist_ST0(CPUX86State *env)
366 {
367     uint8_t old_flags = save_exception_flags(env);
368     int32_t val;
369 
370     val = floatx80_to_int32(ST0, &env->fp_status);
371     if (val != (int16_t)val) {
372         set_float_exception_flags(float_flag_invalid, &env->fp_status);
373         val = -32768;
374     }
375     merge_exception_flags(env, old_flags);
376     return val;
377 }
378 
379 int32_t helper_fistl_ST0(CPUX86State *env)
380 {
381     uint8_t old_flags = save_exception_flags(env);
382     int32_t val;
383 
384     val = floatx80_to_int32(ST0, &env->fp_status);
385     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
386         val = 0x80000000;
387     }
388     merge_exception_flags(env, old_flags);
389     return val;
390 }
391 
392 int64_t helper_fistll_ST0(CPUX86State *env)
393 {
394     uint8_t old_flags = save_exception_flags(env);
395     int64_t val;
396 
397     val = floatx80_to_int64(ST0, &env->fp_status);
398     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
399         val = 0x8000000000000000ULL;
400     }
401     merge_exception_flags(env, old_flags);
402     return val;
403 }
404 
405 int32_t helper_fistt_ST0(CPUX86State *env)
406 {
407     uint8_t old_flags = save_exception_flags(env);
408     int32_t val;
409 
410     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
411     if (val != (int16_t)val) {
412         set_float_exception_flags(float_flag_invalid, &env->fp_status);
413         val = -32768;
414     }
415     merge_exception_flags(env, old_flags);
416     return val;
417 }
418 
419 int32_t helper_fisttl_ST0(CPUX86State *env)
420 {
421     uint8_t old_flags = save_exception_flags(env);
422     int32_t val;
423 
424     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
425     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
426         val = 0x80000000;
427     }
428     merge_exception_flags(env, old_flags);
429     return val;
430 }
431 
432 int64_t helper_fisttll_ST0(CPUX86State *env)
433 {
434     uint8_t old_flags = save_exception_flags(env);
435     int64_t val;
436 
437     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
438     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
439         val = 0x8000000000000000ULL;
440     }
441     merge_exception_flags(env, old_flags);
442     return val;
443 }
444 
445 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
446 {
447     int new_fpstt;
448     X86Access ac;
449 
450     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
451 
452     new_fpstt = (env->fpstt - 1) & 7;
453     env->fpregs[new_fpstt].d = do_fldt(&ac, ptr);
454     env->fpstt = new_fpstt;
455     env->fptags[new_fpstt] = 0; /* validate stack entry */
456 }
457 
458 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
459 {
460     X86Access ac;
461 
462     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
463     do_fstt(&ac, ptr, ST0);
464 }
465 
/*
 * TCG helper entry point for fpush(): decrements the top-of-stack index
 * and tags the new top slot as valid.
 */
void helper_fpush(CPUX86State *env)
{
    fpush(env);
}
470 
/*
 * TCG helper entry point for fpop(): tags the current top slot as empty
 * and increments the top-of-stack index.
 */
void helper_fpop(CPUX86State *env)
{
    fpop(env);
}
475 
476 void helper_fdecstp(CPUX86State *env)
477 {
478     env->fpstt = (env->fpstt - 1) & 7;
479     env->fpus &= ~0x4700;
480 }
481 
482 void helper_fincstp(CPUX86State *env)
483 {
484     env->fpstt = (env->fpstt + 1) & 7;
485     env->fpus &= ~0x4700;
486 }
487 
488 /* FPU move */
489 
490 void helper_ffree_STN(CPUX86State *env, int st_index)
491 {
492     env->fptags[(env->fpstt + st_index) & 7] = 1;
493 }
494 
/* Copy the temporary register FT0 into ST0; tags and status are untouched. */
void helper_fmov_ST0_FT0(CPUX86State *env)
{
    ST0 = FT0;
}
499 
/* Copy ST(st_index) into the temporary register FT0. */
void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
{
    FT0 = ST(st_index);
}
504 
/* Copy ST(st_index) into ST0; tags and status are untouched. */
void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
{
    ST0 = ST(st_index);
}
509 
/* Copy ST0 into ST(st_index); tags and status are untouched. */
void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
{
    ST(st_index) = ST0;
}
514 
515 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
516 {
517     floatx80 tmp;
518 
519     tmp = ST(st_index);
520     ST(st_index) = ST0;
521     ST0 = tmp;
522 }
523 
524 /* FPU operations */
525 
/*
 * Status-word condition-code patterns for the FCOM/FUCOM helpers,
 * indexed by (FloatRelation result + 1), i.e. in the order
 * less, equal, greater, unordered.
 */
static const int fcom_ccval[4] = {
    0x0100, /* ST0 <  operand */
    0x4000, /* ST0 == operand */
    0x0000, /* ST0 >  operand */
    0x4500, /* unordered */
};
527 
528 void helper_fcom_ST0_FT0(CPUX86State *env)
529 {
530     uint8_t old_flags = save_exception_flags(env);
531     FloatRelation ret;
532 
533     ret = floatx80_compare(ST0, FT0, &env->fp_status);
534     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
535     merge_exception_flags(env, old_flags);
536 }
537 
538 void helper_fucom_ST0_FT0(CPUX86State *env)
539 {
540     uint8_t old_flags = save_exception_flags(env);
541     FloatRelation ret;
542 
543     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
544     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
545     merge_exception_flags(env, old_flags);
546 }
547 
548 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
549 
550 void helper_fcomi_ST0_FT0(CPUX86State *env)
551 {
552     uint8_t old_flags = save_exception_flags(env);
553     int eflags;
554     FloatRelation ret;
555 
556     ret = floatx80_compare(ST0, FT0, &env->fp_status);
557     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
558     CC_SRC = eflags | fcomi_ccval[ret + 1];
559     CC_OP = CC_OP_EFLAGS;
560     merge_exception_flags(env, old_flags);
561 }
562 
563 void helper_fucomi_ST0_FT0(CPUX86State *env)
564 {
565     uint8_t old_flags = save_exception_flags(env);
566     int eflags;
567     FloatRelation ret;
568 
569     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
570     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
571     CC_SRC = eflags | fcomi_ccval[ret + 1];
572     CC_OP = CC_OP_EFLAGS;
573     merge_exception_flags(env, old_flags);
574 }
575 
576 void helper_fadd_ST0_FT0(CPUX86State *env)
577 {
578     uint8_t old_flags = save_exception_flags(env);
579     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
580     merge_exception_flags(env, old_flags);
581 }
582 
583 void helper_fmul_ST0_FT0(CPUX86State *env)
584 {
585     uint8_t old_flags = save_exception_flags(env);
586     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
587     merge_exception_flags(env, old_flags);
588 }
589 
590 void helper_fsub_ST0_FT0(CPUX86State *env)
591 {
592     uint8_t old_flags = save_exception_flags(env);
593     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
594     merge_exception_flags(env, old_flags);
595 }
596 
597 void helper_fsubr_ST0_FT0(CPUX86State *env)
598 {
599     uint8_t old_flags = save_exception_flags(env);
600     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
601     merge_exception_flags(env, old_flags);
602 }
603 
604 void helper_fdiv_ST0_FT0(CPUX86State *env)
605 {
606     ST0 = helper_fdiv(env, ST0, FT0);
607 }
608 
609 void helper_fdivr_ST0_FT0(CPUX86State *env)
610 {
611     ST0 = helper_fdiv(env, FT0, ST0);
612 }
613 
614 /* fp operations between STN and ST0 */
615 
616 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
617 {
618     uint8_t old_flags = save_exception_flags(env);
619     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
620     merge_exception_flags(env, old_flags);
621 }
622 
623 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
624 {
625     uint8_t old_flags = save_exception_flags(env);
626     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
627     merge_exception_flags(env, old_flags);
628 }
629 
630 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
631 {
632     uint8_t old_flags = save_exception_flags(env);
633     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
634     merge_exception_flags(env, old_flags);
635 }
636 
637 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
638 {
639     uint8_t old_flags = save_exception_flags(env);
640     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
641     merge_exception_flags(env, old_flags);
642 }
643 
644 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
645 {
646     floatx80 *p;
647 
648     p = &ST(st_index);
649     *p = helper_fdiv(env, *p, ST0);
650 }
651 
652 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
653 {
654     floatx80 *p;
655 
656     p = &ST(st_index);
657     *p = helper_fdiv(env, ST0, *p);
658 }
659 
660 /* misc FPU operations */
/*
 * FCHS: replace ST0 with floatx80_chs(ST0).  No float_status is involved,
 * so no exception flags are touched here.
 */
void helper_fchs_ST0(CPUX86State *env)
{
    ST0 = floatx80_chs(ST0);
}
665 
/*
 * FABS: replace ST0 with floatx80_abs(ST0).  No float_status is involved,
 * so no exception flags are touched here.
 */
void helper_fabs_ST0(CPUX86State *env)
{
    ST0 = floatx80_abs(ST0);
}
670 
/*
 * Overwrite ST0 with the constant 1.0.  The stack is not pushed here;
 * NOTE(review): presumably the translator emits the push separately.
 */
void helper_fld1_ST0(CPUX86State *env)
{
    ST0 = floatx80_one;
}
675 
676 void helper_fldl2t_ST0(CPUX86State *env)
677 {
678     switch (env->fpuc & FPU_RC_MASK) {
679     case FPU_RC_UP:
680         ST0 = floatx80_l2t_u;
681         break;
682     default:
683         ST0 = floatx80_l2t;
684         break;
685     }
686 }
687 
688 void helper_fldl2e_ST0(CPUX86State *env)
689 {
690     switch (env->fpuc & FPU_RC_MASK) {
691     case FPU_RC_DOWN:
692     case FPU_RC_CHOP:
693         ST0 = floatx80_l2e_d;
694         break;
695     default:
696         ST0 = floatx80_l2e;
697         break;
698     }
699 }
700 
701 void helper_fldpi_ST0(CPUX86State *env)
702 {
703     switch (env->fpuc & FPU_RC_MASK) {
704     case FPU_RC_DOWN:
705     case FPU_RC_CHOP:
706         ST0 = floatx80_pi_d;
707         break;
708     default:
709         ST0 = floatx80_pi;
710         break;
711     }
712 }
713 
714 void helper_fldlg2_ST0(CPUX86State *env)
715 {
716     switch (env->fpuc & FPU_RC_MASK) {
717     case FPU_RC_DOWN:
718     case FPU_RC_CHOP:
719         ST0 = floatx80_lg2_d;
720         break;
721     default:
722         ST0 = floatx80_lg2;
723         break;
724     }
725 }
726 
727 void helper_fldln2_ST0(CPUX86State *env)
728 {
729     switch (env->fpuc & FPU_RC_MASK) {
730     case FPU_RC_DOWN:
731     case FPU_RC_CHOP:
732         ST0 = floatx80_ln2_d;
733         break;
734     default:
735         ST0 = floatx80_ln2;
736         break;
737     }
738 }
739 
/* Overwrite ST0 with +0.0; the stack is not pushed here. */
void helper_fldz_ST0(CPUX86State *env)
{
    ST0 = floatx80_zero;
}
744 
/* Set the temporary register FT0 to +0.0. */
void helper_fldz_FT0(CPUX86State *env)
{
    FT0 = floatx80_zero;
}
749 
750 uint32_t helper_fnstsw(CPUX86State *env)
751 {
752     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
753 }
754 
/* FNSTCW: return the control word (env->fpuc) unchanged. */
uint32_t helper_fnstcw(CPUX86State *env)
{
    return env->fpuc;
}
759 
760 static void set_x86_rounding_mode(unsigned mode, float_status *status)
761 {
762     static FloatRoundMode x86_round_mode[4] = {
763         float_round_nearest_even,
764         float_round_down,
765         float_round_up,
766         float_round_to_zero
767     };
768     assert(mode < ARRAY_SIZE(x86_round_mode));
769     set_float_rounding_mode(x86_round_mode[mode], status);
770 }
771 
772 void update_fp_status(CPUX86State *env)
773 {
774     int rnd_mode;
775     FloatX80RoundPrec rnd_prec;
776 
777     /* set rounding mode */
778     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
779     set_x86_rounding_mode(rnd_mode, &env->fp_status);
780 
781     switch ((env->fpuc >> 8) & 3) {
782     case 0:
783         rnd_prec = floatx80_precision_s;
784         break;
785     case 2:
786         rnd_prec = floatx80_precision_d;
787         break;
788     case 3:
789     default:
790         rnd_prec = floatx80_precision_x;
791         break;
792     }
793     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
794 }
795 
/*
 * FLDCW: install @val as the new control word by delegating to
 * cpu_set_fpuc(), which is defined outside this part of the file.
 */
void helper_fldcw(CPUX86State *env, uint32_t val)
{
    cpu_set_fpuc(env, val);
}
800 
/*
 * FCLEX: clear the exception state in the status word.  The 0x7f00 mask
 * keeps bits 8-14 and clears bits 0-7 (the exception flags, FPUS_SF and
 * FPUS_SE) as well as bit 15 (FPUS_B).
 */
void helper_fclex(CPUX86State *env)
{
    env->fpus &= 0x7f00;
}
805 
806 void helper_fwait(CPUX86State *env)
807 {
808     if (env->fpus & FPUS_SE) {
809         fpu_raise_exception(env, GETPC());
810     }
811 }
812 
813 static void do_fninit(CPUX86State *env)
814 {
815     env->fpus = 0;
816     env->fpstt = 0;
817     env->fpcs = 0;
818     env->fpds = 0;
819     env->fpip = 0;
820     env->fpdp = 0;
821     cpu_set_fpuc(env, 0x37f);
822     env->fptags[0] = 1;
823     env->fptags[1] = 1;
824     env->fptags[2] = 1;
825     env->fptags[3] = 1;
826     env->fptags[4] = 1;
827     env->fptags[5] = 1;
828     env->fptags[6] = 1;
829     env->fptags[7] = 1;
830 }
831 
/* FNINIT: TCG helper entry point that resets the x87 state via do_fninit(). */
void helper_fninit(CPUX86State *env)
{
    do_fninit(env);
}
836 
837 /* BCD ops */
838 
839 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
840 {
841     X86Access ac;
842     floatx80 tmp;
843     uint64_t val;
844     unsigned int v;
845     int i;
846 
847     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
848 
849     val = 0;
850     for (i = 8; i >= 0; i--) {
851         v = access_ldb(&ac, ptr + i);
852         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
853     }
854     tmp = int64_to_floatx80(val, &env->fp_status);
855     if (access_ldb(&ac, ptr + 9) & 0x80) {
856         tmp = floatx80_chs(tmp);
857     }
858     fpush(env);
859     ST0 = tmp;
860 }
861 
862 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
863 {
864     uint8_t old_flags = save_exception_flags(env);
865     int v;
866     target_ulong mem_ref, mem_end;
867     int64_t val;
868     CPU_LDoubleU temp;
869     X86Access ac;
870 
871     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
872     temp.d = ST0;
873 
874     val = floatx80_to_int64(ST0, &env->fp_status);
875     mem_ref = ptr;
876     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
877         set_float_exception_flags(float_flag_invalid, &env->fp_status);
878         while (mem_ref < ptr + 7) {
879             access_stb(&ac, mem_ref++, 0);
880         }
881         access_stb(&ac, mem_ref++, 0xc0);
882         access_stb(&ac, mem_ref++, 0xff);
883         access_stb(&ac, mem_ref++, 0xff);
884         merge_exception_flags(env, old_flags);
885         return;
886     }
887     mem_end = mem_ref + 9;
888     if (SIGND(temp)) {
889         access_stb(&ac, mem_end, 0x80);
890         val = -val;
891     } else {
892         access_stb(&ac, mem_end, 0x00);
893     }
894     while (mem_ref < mem_end) {
895         if (val == 0) {
896             break;
897         }
898         v = val % 100;
899         val = val / 100;
900         v = ((v / 10) << 4) | (v % 10);
901         access_stb(&ac, mem_ref++, v);
902     }
903     while (mem_ref < mem_end) {
904         access_stb(&ac, mem_ref++, 0);
905     }
906     merge_exception_flags(env, old_flags);
907 }
908 
/*
 * 128-bit significand of log(2), split into two 64-bit halves.  The high
 * half is the same bit pattern as the significand of floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL

/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * f2xm1_coeff_0 has the exponent of floatx80_ln2_d and a significand one
 * ulp above ln2_sig_high, i.e. log(2) rounded up to floatx80.
 * NOTE(review): f2xm1_coeff_0_low is negative and tiny; presumably it is
 * the correction that brings coeff_0 back to the exact constant -- confirm
 * against the code that evaluates the polynomial.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
926 
/*
 * One entry of the lookup table used for 2^x - 1 (f2xm1_table, which has
 * 65 entries).  Each entry stores a sample point t together with
 * precomputed values of 2^t and 2^t - 1.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
938 
939 static const struct f2xm1_data f2xm1_table[65] = {
940     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
941       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
942       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
943     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
944       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
945       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
946     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
947       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
948       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
949     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
950       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
951       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
952     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
953       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
954       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
955     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
956       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
957       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
958     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
959       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
960       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
961     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
962       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
963       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
964     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
965       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
966       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
967     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
968       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
969       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
970     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
971       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
972       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
973     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
974       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
975       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
976     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
977       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
978       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
979     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
980       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
981       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
982     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
983       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
984       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
985     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
986       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
987       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
988     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
989       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
990       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
991     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
992       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
993       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
994     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
995       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
996       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
997     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
998       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
999       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
1000     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
1001       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
1002       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
1003     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
1004       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
1005       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
1006     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
1007       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
1008       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
1009     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
1010       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
1011       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
1012     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
1013       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
1014       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
1015     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
1016       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
1017       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
1018     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
1019       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
1020       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
1021     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
1022       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
1023       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
1024     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
1025       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
1026       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
1027     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
1028       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
1029       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
1030     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
1031       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
1032       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
1033     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
1034       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
1035       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
1036     { floatx80_zero_init,
1037       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1038       floatx80_zero_init },
1039     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
1040       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
1041       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
1042     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
1043       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
1044       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
1045     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
1046       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
1047       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
1048     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
1049       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
1050       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
1051     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
1052       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
1053       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
1054     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
1055       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
1056       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
1057     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
1058       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
1059       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
1060     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
1061       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1062       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1063     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1064       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1065       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1066     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1067       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1068       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1069     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1070       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1071       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1072     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1073       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1074       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1075     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1076       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1077       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1078     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1079       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1080       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1081     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1082       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1083       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1084     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1085       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1086       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1087     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1088       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1089       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1090     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1091       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1092       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1093     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1094       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1095       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1096     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1097       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1098       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1099     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1100       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1101       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1102     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1103       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1104       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1105     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1106       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1107       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1108     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1109       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1110       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1111     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1112       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1113       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1114     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1115       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1116       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1117     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1118       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1119       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1120     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1121       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1122       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1123     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1124       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1125       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1126     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1127       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1128       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1129     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1130       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1131       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1132     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1133       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1134       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1135 };
1136 
1137 void helper_f2xm1(CPUX86State *env)
1138 {
1139     uint8_t old_flags = save_exception_flags(env);
1140     uint64_t sig = extractFloatx80Frac(ST0);
1141     int32_t exp = extractFloatx80Exp(ST0);
1142     bool sign = extractFloatx80Sign(ST0);
1143 
1144     if (floatx80_invalid_encoding(ST0)) {
1145         float_raise(float_flag_invalid, &env->fp_status);
1146         ST0 = floatx80_default_nan(&env->fp_status);
1147     } else if (floatx80_is_any_nan(ST0)) {
1148         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1149             float_raise(float_flag_invalid, &env->fp_status);
1150             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1151         }
1152     } else if (exp > 0x3fff ||
1153                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1154         /* Out of range for the instruction, treat as invalid.  */
1155         float_raise(float_flag_invalid, &env->fp_status);
1156         ST0 = floatx80_default_nan(&env->fp_status);
1157     } else if (exp == 0x3fff) {
1158         /* Argument 1 or -1, exact result 1 or -0.5.  */
1159         if (sign) {
1160             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1161         }
1162     } else if (exp < 0x3fb0) {
1163         if (!floatx80_is_zero(ST0)) {
1164             /*
1165              * Multiplying the argument by an extra-precision version
1166              * of log(2) is sufficiently precise.  Zero arguments are
1167              * returned unchanged.
1168              */
1169             uint64_t sig0, sig1, sig2;
1170             if (exp == 0) {
1171                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1172             }
1173             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1174                             &sig2);
1175             /* This result is inexact.  */
1176             sig1 |= 1;
1177             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1178                                                 sign, exp, sig0, sig1,
1179                                                 &env->fp_status);
1180         }
1181     } else {
1182         floatx80 tmp, y, accum;
1183         bool asign, bsign;
1184         int32_t n, aexp, bexp;
1185         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1186         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1187         FloatX80RoundPrec save_prec =
1188             env->fp_status.floatx80_rounding_precision;
1189         env->fp_status.float_rounding_mode = float_round_nearest_even;
1190         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1191 
1192         /* Find the nearest multiple of 1/32 to the argument.  */
1193         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1194         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1195         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1196 
1197         if (floatx80_is_zero(y)) {
1198             /*
1199              * Use the value of 2^t - 1 from the table, to avoid
1200              * needing to special-case zero as a result of
1201              * multiplication below.
1202              */
1203             ST0 = f2xm1_table[n].t;
1204             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1205             env->fp_status.float_rounding_mode = save_mode;
1206         } else {
1207             /*
1208              * Compute the lower parts of a polynomial expansion for
1209              * (2^y - 1) / y.
1210              */
1211             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1212             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1213             accum = floatx80_mul(accum, y, &env->fp_status);
1214             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1215             accum = floatx80_mul(accum, y, &env->fp_status);
1216             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1217             accum = floatx80_mul(accum, y, &env->fp_status);
1218             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1219             accum = floatx80_mul(accum, y, &env->fp_status);
1220             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1221             accum = floatx80_mul(accum, y, &env->fp_status);
1222             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1223             accum = floatx80_mul(accum, y, &env->fp_status);
1224             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1225 
1226             /*
1227              * The full polynomial expansion is f2xm1_coeff_0 + accum
1228              * (where accum has much lower magnitude, and so, in
1229              * particular, carry out of the addition is not possible).
1230              * (This expansion is only accurate to about 70 bits, not
1231              * 128 bits.)
1232              */
1233             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1234             asign = extractFloatx80Sign(f2xm1_coeff_0);
1235             shift128RightJamming(extractFloatx80Frac(accum), 0,
1236                                  aexp - extractFloatx80Exp(accum),
1237                                  &asig0, &asig1);
1238             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1239             bsig1 = 0;
1240             if (asign == extractFloatx80Sign(accum)) {
1241                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1242             } else {
1243                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1244             }
1245             /* And thus compute an approximation to 2^y - 1.  */
1246             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1247                             &asig0, &asig1, &asig2);
1248             aexp += extractFloatx80Exp(y) - 0x3ffe;
1249             asign ^= extractFloatx80Sign(y);
1250             if (n != 32) {
1251                 /*
1252                  * Multiply this by the precomputed value of 2^t and
1253                  * add that of 2^t - 1.
1254                  */
1255                 mul128By64To192(asig0, asig1,
1256                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1257                                 &asig0, &asig1, &asig2);
1258                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1259                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1260                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1261                 bsig1 = 0;
1262                 if (bexp < aexp) {
1263                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1264                                          &bsig0, &bsig1);
1265                 } else if (aexp < bexp) {
1266                     shift128RightJamming(asig0, asig1, bexp - aexp,
1267                                          &asig0, &asig1);
1268                     aexp = bexp;
1269                 }
1270                 /* The sign of 2^t - 1 is always that of the result.  */
1271                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1272                 if (asign == bsign) {
1273                     /* Avoid possible carry out of the addition.  */
1274                     shift128RightJamming(asig0, asig1, 1,
1275                                          &asig0, &asig1);
1276                     shift128RightJamming(bsig0, bsig1, 1,
1277                                          &bsig0, &bsig1);
1278                     ++aexp;
1279                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1280                 } else {
1281                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1282                     asign = bsign;
1283                 }
1284             }
1285             env->fp_status.float_rounding_mode = save_mode;
1286             /* This result is inexact.  */
1287             asig1 |= 1;
1288             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1289                                                 asign, aexp, asig0, asig1,
1290                                                 &env->fp_status);
1291         }
1292 
1293         env->fp_status.floatx80_rounding_precision = save_prec;
1294     }
1295     merge_exception_flags(env, old_flags);
1296 }
1297 
1298 void helper_fptan(CPUX86State *env)
1299 {
1300     double fptemp = floatx80_to_double(env, ST0);
1301 
1302     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1303         env->fpus |= 0x400;
1304     } else {
1305         fptemp = tan(fptemp);
1306         ST0 = double_to_floatx80(env, fptemp);
1307         fpush(env);
1308         ST0 = floatx80_one;
1309         env->fpus &= ~0x400; /* C2 <-- 0 */
1310         /* the above code is for |arg| < 2**52 only */
1311     }
1312 }
1313 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision: a biased
 * exponent plus the high and low 64-bit halves of the significand.
 * pi/4, pi/2 and pi differ only by powers of two, so they share a
 * single significand and differ only in exponent.
 */
#define pi_4_exp 0x3ffe
#define pi_4_sig_high 0xc90fdaa22168c234ULL
#define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
#define pi_2_exp (pi_4_exp + 1)
#define pi_2_sig_high pi_4_sig_high
#define pi_2_sig_low pi_4_sig_low
#define pi_34_exp 0x4000
#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low 0x9394c9e8a0a5159dULL
#define pi_exp (pi_4_exp + 2)
#define pi_sig_high pi_4_sig_high
#define pi_sig_low pi_4_sig_low
1327 
/*
 * Polynomial coefficients for an approximation to atan(x), with only
 * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 * for some other approximations, no low part is needed for the first
 * coefficient here to achieve a sufficiently accurate result, because
 * the coefficient in this minimax approximation is very close to
 * exactly 1.)
 *
 * Each value is given as (sign + biased exponent, 64-bit significand).
 * The decoded values alternate in sign and sit close to 1, -1/3, 1/5,
 * -1/7, 1/9, -1/11 and roughly 0.076; presumably coeff_k multiplies
 * x^(2k+1) -- confirm against the evaluation code.
 */
#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL) /* 1 */
#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* ~ -1/3 */
#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* ~ 1/5 */
#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* ~ -1/7 */
#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* ~ 1/9 */
#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* ~ -1/11 */
#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* ~ 0.076 */
1343 
1344 struct fpatan_data {
1345     /* High and low parts of atan(x).  */
1346     floatx80 atan_high, atan_low;
1347 };
1348 
1349 static const struct fpatan_data fpatan_table[9] = {
1350     { floatx80_zero_init,
1351       floatx80_zero_init },
1352     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1353       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1354     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1355       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1356     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1357       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1358     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1359       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1360     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1361       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1362     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1363       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1364     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1365       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1366     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1367       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1368 };
1369 
1370 void helper_fpatan(CPUX86State *env)
1371 {
1372     uint8_t old_flags = save_exception_flags(env);
1373     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1374     int32_t arg0_exp = extractFloatx80Exp(ST0);
1375     bool arg0_sign = extractFloatx80Sign(ST0);
1376     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1377     int32_t arg1_exp = extractFloatx80Exp(ST1);
1378     bool arg1_sign = extractFloatx80Sign(ST1);
1379 
1380     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1381         float_raise(float_flag_invalid, &env->fp_status);
1382         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1383     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1384         float_raise(float_flag_invalid, &env->fp_status);
1385         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1386     } else if (floatx80_invalid_encoding(ST0) ||
1387                floatx80_invalid_encoding(ST1)) {
1388         float_raise(float_flag_invalid, &env->fp_status);
1389         ST1 = floatx80_default_nan(&env->fp_status);
1390     } else if (floatx80_is_any_nan(ST0)) {
1391         ST1 = ST0;
1392     } else if (floatx80_is_any_nan(ST1)) {
1393         /* Pass this NaN through.  */
1394     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1395         /* Pass this zero through.  */
1396     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1397                  arg0_exp - arg1_exp >= 80) &&
1398                !arg0_sign) {
1399         /*
1400          * Dividing ST1 by ST0 gives the correct result up to
1401          * rounding, and avoids spurious underflow exceptions that
1402          * might result from passing some small values through the
1403          * polynomial approximation, but if a finite nonzero result of
1404          * division is exact, the result of fpatan is still inexact
1405          * (and underflowing where appropriate).
1406          */
1407         FloatX80RoundPrec save_prec =
1408             env->fp_status.floatx80_rounding_precision;
1409         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1410         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1411         env->fp_status.floatx80_rounding_precision = save_prec;
1412         if (!floatx80_is_zero(ST1) &&
1413             !(get_float_exception_flags(&env->fp_status) &
1414               float_flag_inexact)) {
1415             /*
1416              * The mathematical result is very slightly closer to zero
1417              * than this exact result.  Round a value with the
1418              * significand adjusted accordingly to get the correct
1419              * exceptions, and possibly an adjusted result depending
1420              * on the rounding mode.
1421              */
1422             uint64_t sig = extractFloatx80Frac(ST1);
1423             int32_t exp = extractFloatx80Exp(ST1);
1424             bool sign = extractFloatx80Sign(ST1);
1425             if (exp == 0) {
1426                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1427             }
1428             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1429                                                 sign, exp, sig - 1,
1430                                                 -1, &env->fp_status);
1431         }
1432     } else {
1433         /* The result is inexact.  */
1434         bool rsign = arg1_sign;
1435         int32_t rexp;
1436         uint64_t rsig0, rsig1;
1437         if (floatx80_is_zero(ST1)) {
1438             /*
1439              * ST0 is negative.  The result is pi with the sign of
1440              * ST1.
1441              */
1442             rexp = pi_exp;
1443             rsig0 = pi_sig_high;
1444             rsig1 = pi_sig_low;
1445         } else if (floatx80_is_infinity(ST1)) {
1446             if (floatx80_is_infinity(ST0)) {
1447                 if (arg0_sign) {
1448                     rexp = pi_34_exp;
1449                     rsig0 = pi_34_sig_high;
1450                     rsig1 = pi_34_sig_low;
1451                 } else {
1452                     rexp = pi_4_exp;
1453                     rsig0 = pi_4_sig_high;
1454                     rsig1 = pi_4_sig_low;
1455                 }
1456             } else {
1457                 rexp = pi_2_exp;
1458                 rsig0 = pi_2_sig_high;
1459                 rsig1 = pi_2_sig_low;
1460             }
1461         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1462             rexp = pi_2_exp;
1463             rsig0 = pi_2_sig_high;
1464             rsig1 = pi_2_sig_low;
1465         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1466             /* ST0 is negative.  */
1467             rexp = pi_exp;
1468             rsig0 = pi_sig_high;
1469             rsig1 = pi_sig_low;
1470         } else {
1471             /*
1472              * ST0 and ST1 are finite, nonzero and with exponents not
1473              * too far apart.
1474              */
1475             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1476             int32_t azexp, axexp;
1477             bool adj_sub, ysign, zsign;
1478             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1479             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1480             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1481             uint64_t azsig0, azsig1;
1482             uint64_t azsig2, azsig3, axsig0, axsig1;
1483             floatx80 x8;
1484             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1485             FloatX80RoundPrec save_prec =
1486                 env->fp_status.floatx80_rounding_precision;
1487             env->fp_status.float_rounding_mode = float_round_nearest_even;
1488             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1489 
1490             if (arg0_exp == 0) {
1491                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1492             }
1493             if (arg1_exp == 0) {
1494                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1495             }
1496             if (arg0_exp > arg1_exp ||
1497                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1498                 /* Work with abs(ST1) / abs(ST0).  */
1499                 num_exp = arg1_exp;
1500                 num_sig = arg1_sig;
1501                 den_exp = arg0_exp;
1502                 den_sig = arg0_sig;
1503                 if (arg0_sign) {
1504                     /* The result is subtracted from pi.  */
1505                     adj_exp = pi_exp;
1506                     adj_sig0 = pi_sig_high;
1507                     adj_sig1 = pi_sig_low;
1508                     adj_sub = true;
1509                 } else {
1510                     /* The result is used as-is.  */
1511                     adj_exp = 0;
1512                     adj_sig0 = 0;
1513                     adj_sig1 = 0;
1514                     adj_sub = false;
1515                 }
1516             } else {
1517                 /* Work with abs(ST0) / abs(ST1).  */
1518                 num_exp = arg0_exp;
1519                 num_sig = arg0_sig;
1520                 den_exp = arg1_exp;
1521                 den_sig = arg1_sig;
1522                 /* The result is added to or subtracted from pi/2.  */
1523                 adj_exp = pi_2_exp;
1524                 adj_sig0 = pi_2_sig_high;
1525                 adj_sig1 = pi_2_sig_low;
1526                 adj_sub = !arg0_sign;
1527             }
1528 
1529             /*
1530              * Compute x = num/den, where 0 < x <= 1 and x is not too
1531              * small.
1532              */
1533             xexp = num_exp - den_exp + 0x3ffe;
1534             remsig0 = num_sig;
1535             remsig1 = 0;
1536             if (den_sig <= remsig0) {
1537                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1538                 ++xexp;
1539             }
1540             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1541             mul64To128(den_sig, xsig0, &msig0, &msig1);
1542             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1543             while ((int64_t) remsig0 < 0) {
1544                 --xsig0;
1545                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1546             }
1547             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1548             /*
1549              * No need to correct any estimation error in xsig1; even
1550              * with such error, it is accurate enough.
1551              */
1552 
1553             /*
1554              * Split x as x = t + y, where t = n/8 is the nearest
1555              * multiple of 1/8 to x.
1556              */
1557             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1558                                                false, xexp + 3, xsig0,
1559                                                xsig1, &env->fp_status);
1560             n = floatx80_to_int32(x8, &env->fp_status);
1561             if (n == 0) {
1562                 ysign = false;
1563                 yexp = xexp;
1564                 ysig0 = xsig0;
1565                 ysig1 = xsig1;
1566                 texp = 0;
1567                 tsig = 0;
1568             } else {
1569                 int shift = clz32(n) + 32;
1570                 texp = 0x403b - shift;
1571                 tsig = n;
1572                 tsig <<= shift;
1573                 if (texp == xexp) {
1574                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1575                     if ((int64_t) ysig0 >= 0) {
1576                         ysign = false;
1577                         if (ysig0 == 0) {
1578                             if (ysig1 == 0) {
1579                                 yexp = 0;
1580                             } else {
1581                                 shift = clz64(ysig1) + 64;
1582                                 yexp = xexp - shift;
1583                                 shift128Left(ysig0, ysig1, shift,
1584                                              &ysig0, &ysig1);
1585                             }
1586                         } else {
1587                             shift = clz64(ysig0);
1588                             yexp = xexp - shift;
1589                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1590                         }
1591                     } else {
1592                         ysign = true;
1593                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1594                         if (ysig0 == 0) {
1595                             shift = clz64(ysig1) + 64;
1596                         } else {
1597                             shift = clz64(ysig0);
1598                         }
1599                         yexp = xexp - shift;
1600                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1601                     }
1602                 } else {
1603                     /*
1604                      * t's exponent must be greater than x's because t
1605                      * is positive and the nearest multiple of 1/8 to
1606                      * x, and if x has a greater exponent, the power
1607                      * of 2 with that exponent is also a multiple of
1608                      * 1/8.
1609                      */
1610                     uint64_t usig0, usig1;
1611                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1612                                          &usig0, &usig1);
1613                     ysign = true;
1614                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1615                     if (ysig0 == 0) {
1616                         shift = clz64(ysig1) + 64;
1617                     } else {
1618                         shift = clz64(ysig0);
1619                     }
1620                     yexp = texp - shift;
1621                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1622                 }
1623             }
1624 
1625             /*
1626              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1627              * arctan(z).
1628              */
1629             zsign = ysign;
1630             if (texp == 0 || yexp == 0) {
1631                 zexp = yexp;
1632                 zsig0 = ysig0;
1633                 zsig1 = ysig1;
1634             } else {
1635                 /*
1636                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1637                  */
1638                 int32_t dexp = texp + xexp - 0x3ffe;
1639                 uint64_t dsig0, dsig1, dsig2;
1640                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1641                 /*
1642                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1643                  * bit).  Add 1 to produce the denominator 1+tx.
1644                  */
1645                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1646                                      &dsig0, &dsig1);
1647                 dsig0 |= 0x8000000000000000ULL;
1648                 zexp = yexp - 1;
1649                 remsig0 = ysig0;
1650                 remsig1 = ysig1;
1651                 remsig2 = 0;
1652                 if (dsig0 <= remsig0) {
1653                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1654                     ++zexp;
1655                 }
1656                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1657                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1658                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1659                        &remsig0, &remsig1, &remsig2);
1660                 while ((int64_t) remsig0 < 0) {
1661                     --zsig0;
1662                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1663                            &remsig0, &remsig1, &remsig2);
1664                 }
1665                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1666                 /* No need to correct any estimation error in zsig1.  */
1667             }
1668 
1669             if (zexp == 0) {
1670                 azexp = 0;
1671                 azsig0 = 0;
1672                 azsig1 = 0;
1673             } else {
1674                 floatx80 z2, accum;
1675                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1676                 /* Compute z^2.  */
1677                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1678                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1679                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1680                                                    zexp + zexp - 0x3ffe,
1681                                                    z2sig0, z2sig1,
1682                                                    &env->fp_status);
1683 
1684                 /* Compute the lower parts of the polynomial expansion.  */
1685                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1686                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1687                 accum = floatx80_mul(accum, z2, &env->fp_status);
1688                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1689                 accum = floatx80_mul(accum, z2, &env->fp_status);
1690                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1691                 accum = floatx80_mul(accum, z2, &env->fp_status);
1692                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1693                 accum = floatx80_mul(accum, z2, &env->fp_status);
1694                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1695                 accum = floatx80_mul(accum, z2, &env->fp_status);
1696 
1697                 /*
1698                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1699                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1700                  */
1701                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1702                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1703                                      aexp - extractFloatx80Exp(accum),
1704                                      &asig0, &asig1);
1705                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1706                        &asig0, &asig1);
1707                 /* Multiply by z to compute arctan(z).  */
1708                 azexp = aexp + zexp - 0x3ffe;
1709                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1710                             &azsig2, &azsig3);
1711             }
1712 
1713             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1714             if (texp == 0) {
1715                 /* z is positive.  */
1716                 axexp = azexp;
1717                 axsig0 = azsig0;
1718                 axsig1 = azsig1;
1719             } else {
1720                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1721                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1722                 uint64_t low_sig0 =
1723                     extractFloatx80Frac(fpatan_table[n].atan_low);
1724                 uint64_t low_sig1 = 0;
1725                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1726                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1727                 axsig1 = 0;
1728                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1729                                      &low_sig0, &low_sig1);
1730                 if (low_sign) {
1731                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1732                            &axsig0, &axsig1);
1733                 } else {
1734                     add128(axsig0, axsig1, low_sig0, low_sig1,
1735                            &axsig0, &axsig1);
1736                 }
1737                 if (azexp >= axexp) {
1738                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1739                                          &axsig0, &axsig1);
1740                     axexp = azexp + 1;
1741                     shift128RightJamming(azsig0, azsig1, 1,
1742                                          &azsig0, &azsig1);
1743                 } else {
1744                     shift128RightJamming(axsig0, axsig1, 1,
1745                                          &axsig0, &axsig1);
1746                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1747                                          &azsig0, &azsig1);
1748                     ++axexp;
1749                 }
1750                 if (zsign) {
1751                     sub128(axsig0, axsig1, azsig0, azsig1,
1752                            &axsig0, &axsig1);
1753                 } else {
1754                     add128(axsig0, axsig1, azsig0, azsig1,
1755                            &axsig0, &axsig1);
1756                 }
1757             }
1758 
1759             if (adj_exp == 0) {
1760                 rexp = axexp;
1761                 rsig0 = axsig0;
1762                 rsig1 = axsig1;
1763             } else {
1764                 /*
1765                  * Add or subtract arctan(x) (exponent axexp,
1766                  * significand axsig0 and axsig1, positive, not
1767                  * necessarily normalized) to the number given by
1768                  * adj_exp, adj_sig0 and adj_sig1, according to
1769                  * adj_sub.
1770                  */
1771                 if (adj_exp >= axexp) {
1772                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1773                                          &axsig0, &axsig1);
1774                     rexp = adj_exp + 1;
1775                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1776                                          &adj_sig0, &adj_sig1);
1777                 } else {
1778                     shift128RightJamming(axsig0, axsig1, 1,
1779                                          &axsig0, &axsig1);
1780                     shift128RightJamming(adj_sig0, adj_sig1,
1781                                          axexp - adj_exp + 1,
1782                                          &adj_sig0, &adj_sig1);
1783                     rexp = axexp + 1;
1784                 }
1785                 if (adj_sub) {
1786                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1787                            &rsig0, &rsig1);
1788                 } else {
1789                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1790                            &rsig0, &rsig1);
1791                 }
1792             }
1793 
1794             env->fp_status.float_rounding_mode = save_mode;
1795             env->fp_status.floatx80_rounding_precision = save_prec;
1796         }
1797         /* This result is inexact.  */
1798         rsig1 |= 1;
1799         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1800                                             rsig0, rsig1, &env->fp_status);
1801     }
1802 
1803     fpop(env);
1804     merge_exception_flags(env, old_flags);
1805 }
1806 
1807 void helper_fxtract(CPUX86State *env)
1808 {
1809     uint8_t old_flags = save_exception_flags(env);
1810     CPU_LDoubleU temp;
1811 
1812     temp.d = ST0;
1813 
1814     if (floatx80_is_zero(ST0)) {
1815         /* Easy way to generate -inf and raising division by 0 exception */
1816         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1817                            &env->fp_status);
1818         fpush(env);
1819         ST0 = temp.d;
1820     } else if (floatx80_invalid_encoding(ST0)) {
1821         float_raise(float_flag_invalid, &env->fp_status);
1822         ST0 = floatx80_default_nan(&env->fp_status);
1823         fpush(env);
1824         ST0 = ST1;
1825     } else if (floatx80_is_any_nan(ST0)) {
1826         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1827             float_raise(float_flag_invalid, &env->fp_status);
1828             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1829         }
1830         fpush(env);
1831         ST0 = ST1;
1832     } else if (floatx80_is_infinity(ST0)) {
1833         fpush(env);
1834         ST0 = ST1;
1835         ST1 = floatx80_infinity;
1836     } else {
1837         int expdif;
1838 
1839         if (EXPD(temp) == 0) {
1840             int shift = clz64(temp.l.lower);
1841             temp.l.lower <<= shift;
1842             expdif = 1 - EXPBIAS - shift;
1843             float_raise(float_flag_input_denormal_flushed, &env->fp_status);
1844         } else {
1845             expdif = EXPD(temp) - EXPBIAS;
1846         }
1847         /* DP exponent bias */
1848         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1849         fpush(env);
1850         BIASEXPONENT(temp);
1851         ST0 = temp.d;
1852     }
1853     merge_exception_flags(env, old_flags);
1854 }
1855 
1856 static void helper_fprem_common(CPUX86State *env, bool mod)
1857 {
1858     uint8_t old_flags = save_exception_flags(env);
1859     uint64_t quotient;
1860     CPU_LDoubleU temp0, temp1;
1861     int exp0, exp1, expdiff;
1862 
1863     temp0.d = ST0;
1864     temp1.d = ST1;
1865     exp0 = EXPD(temp0);
1866     exp1 = EXPD(temp1);
1867 
1868     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1869     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1870         exp0 == 0x7fff || exp1 == 0x7fff ||
1871         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1872         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1873     } else {
1874         if (exp0 == 0) {
1875             exp0 = 1 - clz64(temp0.l.lower);
1876         }
1877         if (exp1 == 0) {
1878             exp1 = 1 - clz64(temp1.l.lower);
1879         }
1880         expdiff = exp0 - exp1;
1881         if (expdiff < 64) {
1882             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1883             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1884             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1885             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1886         } else {
1887             /*
1888              * Partial remainder.  This choice of how many bits to
1889              * process at once is specified in AMD instruction set
1890              * manuals, and empirically is followed by Intel
1891              * processors as well; it ensures that the final remainder
1892              * operation in a loop does produce the correct low three
1893              * bits of the quotient.  AMD manuals specify that the
1894              * flags other than C2 are cleared, and empirically Intel
1895              * processors clear them as well.
1896              */
1897             int n = 32 + (expdiff % 32);
1898             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1899             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1900             env->fpus |= 0x400;  /* C2 <-- 1 */
1901         }
1902     }
1903     merge_exception_flags(env, old_flags);
1904 }
1905 
1906 void helper_fprem1(CPUX86State *env)
1907 {
1908     helper_fprem_common(env, false);
1909 }
1910 
1911 void helper_fprem(CPUX86State *env)
1912 {
1913     helper_fprem_common(env, true);
1914 }
1915 
/*
 * 128-bit significand of log2(e), split into its upper and lower 64-bit
 * halves so it can be passed to the 128-bit multiplication helpers (see
 * the small-argument path of helper_fyl2xp1).
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * helper_fyl2x_common evaluates them by Horner's scheme in x^2, from
 * fyl2x_coeff_9 down to fyl2x_coeff_1, and multiplies by x at the end.
 * The leading coefficient is kept in two pieces: fyl2x_coeff_0 is
 * combined with the rest using 128-bit integer arithmetic, while
 * fyl2x_coeff_0_low (sign bit set in its 0xbfbf exponent word) is a
 * small correction folded into the floating-point accumulation.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1937 
1938 /*
1939  * Compute an approximation of log2(1+arg), where 1+arg is in the
1940  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1941  * function is called, rounding precision is set to 80 and the
1942  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1943  * and must not be so close to zero that underflow might occur.
1944  */
1945 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1946                                 uint64_t *sig0, uint64_t *sig1)
1947 {
1948     uint64_t arg0_sig = extractFloatx80Frac(arg);
1949     int32_t arg0_exp = extractFloatx80Exp(arg);
1950     bool arg0_sign = extractFloatx80Sign(arg);
1951     bool asign;
1952     int32_t dexp, texp, aexp;
1953     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1954     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1955     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1956     floatx80 t2, accum;
1957 
1958     /*
1959      * Compute an approximation of arg/(2+arg), with extra precision,
1960      * as the argument to a polynomial approximation.  The extra
1961      * precision is only needed for the first term of the
1962      * approximation, with subsequent terms being significantly
1963      * smaller; the approximation only uses odd exponents, and the
1964      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1965      */
1966     if (arg0_sign) {
1967         dexp = 0x3fff;
1968         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1969         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1970     } else {
1971         dexp = 0x4000;
1972         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1973         dsig0 |= 0x8000000000000000ULL;
1974     }
1975     texp = arg0_exp - dexp + 0x3ffe;
1976     rsig0 = arg0_sig;
1977     rsig1 = 0;
1978     rsig2 = 0;
1979     if (dsig0 <= rsig0) {
1980         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1981         ++texp;
1982     }
1983     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1984     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1985     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1986            &rsig0, &rsig1, &rsig2);
1987     while ((int64_t) rsig0 < 0) {
1988         --tsig0;
1989         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1990                &rsig0, &rsig1, &rsig2);
1991     }
1992     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1993     /*
1994      * No need to correct any estimation error in tsig1; even with
1995      * such error, it is accurate enough.  Now compute the square of
1996      * that approximation.
1997      */
1998     mul128To256(tsig0, tsig1, tsig0, tsig1,
1999                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
2000     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
2001                                        texp + texp - 0x3ffe,
2002                                        t2sig0, t2sig1, &env->fp_status);
2003 
2004     /* Compute the lower parts of the polynomial expansion.  */
2005     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
2006     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
2007     accum = floatx80_mul(accum, t2, &env->fp_status);
2008     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
2009     accum = floatx80_mul(accum, t2, &env->fp_status);
2010     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
2011     accum = floatx80_mul(accum, t2, &env->fp_status);
2012     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
2013     accum = floatx80_mul(accum, t2, &env->fp_status);
2014     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
2015     accum = floatx80_mul(accum, t2, &env->fp_status);
2016     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
2017     accum = floatx80_mul(accum, t2, &env->fp_status);
2018     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
2019     accum = floatx80_mul(accum, t2, &env->fp_status);
2020     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
2021     accum = floatx80_mul(accum, t2, &env->fp_status);
2022     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
2023 
2024     /*
2025      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
2026      * accum has much lower magnitude, and so, in particular, carry
2027      * out of the addition is not possible), multiplied by t.  (This
2028      * expansion is only accurate to about 70 bits, not 128 bits.)
2029      */
2030     aexp = extractFloatx80Exp(fyl2x_coeff_0);
2031     asign = extractFloatx80Sign(fyl2x_coeff_0);
2032     shift128RightJamming(extractFloatx80Frac(accum), 0,
2033                          aexp - extractFloatx80Exp(accum),
2034                          &asig0, &asig1);
2035     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
2036     bsig1 = 0;
2037     if (asign == extractFloatx80Sign(accum)) {
2038         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2039     } else {
2040         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2041     }
2042     /* Multiply by t to compute the required result.  */
2043     mul128To256(asig0, asig1, tsig0, tsig1,
2044                 &asig0, &asig1, &asig2, &asig3);
2045     aexp += texp - 0x3ffe;
2046     *exp = aexp;
2047     *sig0 = asig0;
2048     *sig1 = asig1;
2049 }
2050 
2051 void helper_fyl2xp1(CPUX86State *env)
2052 {
2053     uint8_t old_flags = save_exception_flags(env);
2054     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2055     int32_t arg0_exp = extractFloatx80Exp(ST0);
2056     bool arg0_sign = extractFloatx80Sign(ST0);
2057     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2058     int32_t arg1_exp = extractFloatx80Exp(ST1);
2059     bool arg1_sign = extractFloatx80Sign(ST1);
2060 
2061     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2062         float_raise(float_flag_invalid, &env->fp_status);
2063         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2064     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2065         float_raise(float_flag_invalid, &env->fp_status);
2066         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2067     } else if (floatx80_invalid_encoding(ST0) ||
2068                floatx80_invalid_encoding(ST1)) {
2069         float_raise(float_flag_invalid, &env->fp_status);
2070         ST1 = floatx80_default_nan(&env->fp_status);
2071     } else if (floatx80_is_any_nan(ST0)) {
2072         ST1 = ST0;
2073     } else if (floatx80_is_any_nan(ST1)) {
2074         /* Pass this NaN through.  */
2075     } else if (arg0_exp > 0x3ffd ||
2076                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2077                                                   0x95f619980c4336f7ULL :
2078                                                   0xd413cccfe7799211ULL))) {
2079         /*
2080          * Out of range for the instruction (ST0 must have absolute
2081          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2082          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2083          * to sqrt(2) - 1, which we allow here), treat as invalid.
2084          */
2085         float_raise(float_flag_invalid, &env->fp_status);
2086         ST1 = floatx80_default_nan(&env->fp_status);
2087     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2088                arg1_exp == 0x7fff) {
2089         /*
2090          * One argument is zero, or multiplying by infinity; correct
2091          * result is exact and can be obtained by multiplying the
2092          * arguments.
2093          */
2094         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2095     } else if (arg0_exp < 0x3fb0) {
2096         /*
2097          * Multiplying both arguments and an extra-precision version
2098          * of log2(e) is sufficiently precise.
2099          */
2100         uint64_t sig0, sig1, sig2;
2101         int32_t exp;
2102         if (arg0_exp == 0) {
2103             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2104         }
2105         if (arg1_exp == 0) {
2106             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2107         }
2108         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2109                         &sig0, &sig1, &sig2);
2110         exp = arg0_exp + 1;
2111         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2112         exp += arg1_exp - 0x3ffe;
2113         /* This result is inexact.  */
2114         sig1 |= 1;
2115         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2116                                             arg0_sign ^ arg1_sign, exp,
2117                                             sig0, sig1, &env->fp_status);
2118     } else {
2119         int32_t aexp;
2120         uint64_t asig0, asig1, asig2;
2121         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2122         FloatX80RoundPrec save_prec =
2123             env->fp_status.floatx80_rounding_precision;
2124         env->fp_status.float_rounding_mode = float_round_nearest_even;
2125         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2126 
2127         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2128         /*
2129          * Multiply by the second argument to compute the required
2130          * result.
2131          */
2132         if (arg1_exp == 0) {
2133             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2134         }
2135         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2136         aexp += arg1_exp - 0x3ffe;
2137         /* This result is inexact.  */
2138         asig1 |= 1;
2139         env->fp_status.float_rounding_mode = save_mode;
2140         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2141                                             arg0_sign ^ arg1_sign, aexp,
2142                                             asig0, asig1, &env->fp_status);
2143         env->fp_status.floatx80_rounding_precision = save_prec;
2144     }
2145     fpop(env);
2146     merge_exception_flags(env, old_flags);
2147 }
2148 
2149 void helper_fyl2x(CPUX86State *env)
2150 {
2151     uint8_t old_flags = save_exception_flags(env);
2152     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2153     int32_t arg0_exp = extractFloatx80Exp(ST0);
2154     bool arg0_sign = extractFloatx80Sign(ST0);
2155     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2156     int32_t arg1_exp = extractFloatx80Exp(ST1);
2157     bool arg1_sign = extractFloatx80Sign(ST1);
2158 
2159     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2160         float_raise(float_flag_invalid, &env->fp_status);
2161         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2162     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2163         float_raise(float_flag_invalid, &env->fp_status);
2164         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2165     } else if (floatx80_invalid_encoding(ST0) ||
2166                floatx80_invalid_encoding(ST1)) {
2167         float_raise(float_flag_invalid, &env->fp_status);
2168         ST1 = floatx80_default_nan(&env->fp_status);
2169     } else if (floatx80_is_any_nan(ST0)) {
2170         ST1 = ST0;
2171     } else if (floatx80_is_any_nan(ST1)) {
2172         /* Pass this NaN through.  */
2173     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2174         float_raise(float_flag_invalid, &env->fp_status);
2175         ST1 = floatx80_default_nan(&env->fp_status);
2176     } else if (floatx80_is_infinity(ST1)) {
2177         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2178                                              &env->fp_status);
2179         switch (cmp) {
2180         case float_relation_less:
2181             ST1 = floatx80_chs(ST1);
2182             break;
2183         case float_relation_greater:
2184             /* Result is infinity of the same sign as ST1.  */
2185             break;
2186         default:
2187             float_raise(float_flag_invalid, &env->fp_status);
2188             ST1 = floatx80_default_nan(&env->fp_status);
2189             break;
2190         }
2191     } else if (floatx80_is_infinity(ST0)) {
2192         if (floatx80_is_zero(ST1)) {
2193             float_raise(float_flag_invalid, &env->fp_status);
2194             ST1 = floatx80_default_nan(&env->fp_status);
2195         } else if (arg1_sign) {
2196             ST1 = floatx80_chs(ST0);
2197         } else {
2198             ST1 = ST0;
2199         }
2200     } else if (floatx80_is_zero(ST0)) {
2201         if (floatx80_is_zero(ST1)) {
2202             float_raise(float_flag_invalid, &env->fp_status);
2203             ST1 = floatx80_default_nan(&env->fp_status);
2204         } else {
2205             /* Result is infinity with opposite sign to ST1.  */
2206             float_raise(float_flag_divbyzero, &env->fp_status);
2207             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2208                                 0x8000000000000000ULL);
2209         }
2210     } else if (floatx80_is_zero(ST1)) {
2211         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2212             ST1 = floatx80_chs(ST1);
2213         }
2214         /* Otherwise, ST1 is already the correct result.  */
2215     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2216         if (arg1_sign) {
2217             ST1 = floatx80_chs(floatx80_zero);
2218         } else {
2219             ST1 = floatx80_zero;
2220         }
2221     } else {
2222         int32_t int_exp;
2223         floatx80 arg0_m1;
2224         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2225         FloatX80RoundPrec save_prec =
2226             env->fp_status.floatx80_rounding_precision;
2227         env->fp_status.float_rounding_mode = float_round_nearest_even;
2228         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2229 
2230         if (arg0_exp == 0) {
2231             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2232         }
2233         if (arg1_exp == 0) {
2234             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2235         }
2236         int_exp = arg0_exp - 0x3fff;
2237         if (arg0_sig > 0xb504f333f9de6484ULL) {
2238             ++int_exp;
2239         }
2240         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2241                                                &env->fp_status),
2242                                floatx80_one, &env->fp_status);
2243         if (floatx80_is_zero(arg0_m1)) {
2244             /* Exact power of 2; multiply by ST1.  */
2245             env->fp_status.float_rounding_mode = save_mode;
2246             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2247                                ST1, &env->fp_status);
2248         } else {
2249             bool asign = extractFloatx80Sign(arg0_m1);
2250             int32_t aexp;
2251             uint64_t asig0, asig1, asig2;
2252             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2253             if (int_exp != 0) {
2254                 bool isign = (int_exp < 0);
2255                 int32_t iexp;
2256                 uint64_t isig;
2257                 int shift;
2258                 int_exp = isign ? -int_exp : int_exp;
2259                 shift = clz32(int_exp) + 32;
2260                 isig = int_exp;
2261                 isig <<= shift;
2262                 iexp = 0x403e - shift;
2263                 shift128RightJamming(asig0, asig1, iexp - aexp,
2264                                      &asig0, &asig1);
2265                 if (asign == isign) {
2266                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2267                 } else {
2268                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2269                 }
2270                 aexp = iexp;
2271                 asign = isign;
2272             }
2273             /*
2274              * Multiply by the second argument to compute the required
2275              * result.
2276              */
2277             if (arg1_exp == 0) {
2278                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2279             }
2280             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2281             aexp += arg1_exp - 0x3ffe;
2282             /* This result is inexact.  */
2283             asig1 |= 1;
2284             env->fp_status.float_rounding_mode = save_mode;
2285             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2286                                                 asign ^ arg1_sign, aexp,
2287                                                 asig0, asig1, &env->fp_status);
2288         }
2289 
2290         env->fp_status.floatx80_rounding_precision = save_prec;
2291     }
2292     fpop(env);
2293     merge_exception_flags(env, old_flags);
2294 }
2295 
2296 void helper_fsqrt(CPUX86State *env)
2297 {
2298     uint8_t old_flags = save_exception_flags(env);
2299     if (floatx80_is_neg(ST0)) {
2300         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2301         env->fpus |= 0x400;
2302     }
2303     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2304     merge_exception_flags(env, old_flags);
2305 }
2306 
2307 void helper_fsincos(CPUX86State *env)
2308 {
2309     double fptemp = floatx80_to_double(env, ST0);
2310 
2311     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2312         env->fpus |= 0x400;
2313     } else {
2314         ST0 = double_to_floatx80(env, sin(fptemp));
2315         fpush(env);
2316         ST0 = double_to_floatx80(env, cos(fptemp));
2317         env->fpus &= ~0x400;  /* C2 <-- 0 */
2318         /* the above code is for |arg| < 2**63 only */
2319     }
2320 }
2321 
2322 void helper_frndint(CPUX86State *env)
2323 {
2324     uint8_t old_flags = save_exception_flags(env);
2325     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2326     merge_exception_flags(env, old_flags);
2327 }
2328 
2329 void helper_fscale(CPUX86State *env)
2330 {
2331     uint8_t old_flags = save_exception_flags(env);
2332     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2333         float_raise(float_flag_invalid, &env->fp_status);
2334         ST0 = floatx80_default_nan(&env->fp_status);
2335     } else if (floatx80_is_any_nan(ST1)) {
2336         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2337             float_raise(float_flag_invalid, &env->fp_status);
2338         }
2339         ST0 = ST1;
2340         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2341             float_raise(float_flag_invalid, &env->fp_status);
2342             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2343         }
2344     } else if (floatx80_is_infinity(ST1) &&
2345                !floatx80_invalid_encoding(ST0) &&
2346                !floatx80_is_any_nan(ST0)) {
2347         if (floatx80_is_neg(ST1)) {
2348             if (floatx80_is_infinity(ST0)) {
2349                 float_raise(float_flag_invalid, &env->fp_status);
2350                 ST0 = floatx80_default_nan(&env->fp_status);
2351             } else {
2352                 ST0 = (floatx80_is_neg(ST0) ?
2353                        floatx80_chs(floatx80_zero) :
2354                        floatx80_zero);
2355             }
2356         } else {
2357             if (floatx80_is_zero(ST0)) {
2358                 float_raise(float_flag_invalid, &env->fp_status);
2359                 ST0 = floatx80_default_nan(&env->fp_status);
2360             } else {
2361                 ST0 = (floatx80_is_neg(ST0) ?
2362                        floatx80_chs(floatx80_infinity) :
2363                        floatx80_infinity);
2364             }
2365         }
2366     } else {
2367         int n;
2368         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2369         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2370         set_float_exception_flags(0, &env->fp_status);
2371         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2372         set_float_exception_flags(save_flags, &env->fp_status);
2373         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2374         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2375         env->fp_status.floatx80_rounding_precision = save;
2376     }
2377     merge_exception_flags(env, old_flags);
2378 }
2379 
2380 void helper_fsin(CPUX86State *env)
2381 {
2382     double fptemp = floatx80_to_double(env, ST0);
2383 
2384     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2385         env->fpus |= 0x400;
2386     } else {
2387         ST0 = double_to_floatx80(env, sin(fptemp));
2388         env->fpus &= ~0x400;  /* C2 <-- 0 */
2389         /* the above code is for |arg| < 2**53 only */
2390     }
2391 }
2392 
2393 void helper_fcos(CPUX86State *env)
2394 {
2395     double fptemp = floatx80_to_double(env, ST0);
2396 
2397     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2398         env->fpus |= 0x400;
2399     } else {
2400         ST0 = double_to_floatx80(env, cos(fptemp));
2401         env->fpus &= ~0x400;  /* C2 <-- 0 */
2402         /* the above code is for |arg| < 2**63 only */
2403     }
2404 }
2405 
2406 void helper_fxam_ST0(CPUX86State *env)
2407 {
2408     CPU_LDoubleU temp;
2409     int expdif;
2410 
2411     temp.d = ST0;
2412 
2413     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2414     if (SIGND(temp)) {
2415         env->fpus |= 0x200; /* C1 <-- 1 */
2416     }
2417 
2418     if (env->fptags[env->fpstt]) {
2419         env->fpus |= 0x4100; /* Empty */
2420         return;
2421     }
2422 
2423     expdif = EXPD(temp);
2424     if (expdif == MAXEXPD) {
2425         if (MANTD(temp) == 0x8000000000000000ULL) {
2426             env->fpus |= 0x500; /* Infinity */
2427         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2428             env->fpus |= 0x100; /* NaN */
2429         }
2430     } else if (expdif == 0) {
2431         if (MANTD(temp) == 0) {
2432             env->fpus |=  0x4000; /* Zero */
2433         } else {
2434             env->fpus |= 0x4400; /* Denormal */
2435         }
2436     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2437         env->fpus |= 0x400;
2438     }
2439 }
2440 
2441 static void do_fstenv(X86Access *ac, target_ulong ptr, int data32)
2442 {
2443     CPUX86State *env = ac->env;
2444     int fpus, fptag, exp, i;
2445     uint64_t mant;
2446     CPU_LDoubleU tmp;
2447 
2448     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2449     fptag = 0;
2450     for (i = 7; i >= 0; i--) {
2451         fptag <<= 2;
2452         if (env->fptags[i]) {
2453             fptag |= 3;
2454         } else {
2455             tmp.d = env->fpregs[i].d;
2456             exp = EXPD(tmp);
2457             mant = MANTD(tmp);
2458             if (exp == 0 && mant == 0) {
2459                 /* zero */
2460                 fptag |= 1;
2461             } else if (exp == 0 || exp == MAXEXPD
2462                        || (mant & (1LL << 63)) == 0) {
2463                 /* NaNs, infinity, denormal */
2464                 fptag |= 2;
2465             }
2466         }
2467     }
2468     if (data32) {
2469         /* 32 bit */
2470         access_stl(ac, ptr, env->fpuc);
2471         access_stl(ac, ptr + 4, fpus);
2472         access_stl(ac, ptr + 8, fptag);
2473         access_stl(ac, ptr + 12, env->fpip); /* fpip */
2474         access_stl(ac, ptr + 16, env->fpcs); /* fpcs */
2475         access_stl(ac, ptr + 20, env->fpdp); /* fpoo */
2476         access_stl(ac, ptr + 24, env->fpds); /* fpos */
2477     } else {
2478         /* 16 bit */
2479         access_stw(ac, ptr, env->fpuc);
2480         access_stw(ac, ptr + 2, fpus);
2481         access_stw(ac, ptr + 4, fptag);
2482         access_stw(ac, ptr + 6, env->fpip);
2483         access_stw(ac, ptr + 8, env->fpcs);
2484         access_stw(ac, ptr + 10, env->fpdp);
2485         access_stw(ac, ptr + 12, env->fpds);
2486     }
2487 }
2488 
2489 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2490 {
2491     X86Access ac;
2492 
2493     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2494     do_fstenv(&ac, ptr, data32);
2495 }
2496 
2497 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2498 {
2499     env->fpstt = (fpus >> 11) & 7;
2500     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2501     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2502 #if !defined(CONFIG_USER_ONLY)
2503     if (!(env->fpus & FPUS_SE)) {
2504         /*
2505          * Here the processor deasserts FERR#; in response, the chipset deasserts
2506          * IGNNE#.
2507          */
2508         cpu_clear_ignne();
2509     }
2510 #endif
2511 }
2512 
2513 static void do_fldenv(X86Access *ac, target_ulong ptr, int data32)
2514 {
2515     int i, fpus, fptag;
2516     CPUX86State *env = ac->env;
2517 
2518     cpu_set_fpuc(env, access_ldw(ac, ptr));
2519     fpus = access_ldw(ac, ptr + (2 << data32));
2520     fptag = access_ldw(ac, ptr + (4 << data32));
2521 
2522     cpu_set_fpus(env, fpus);
2523     for (i = 0; i < 8; i++) {
2524         env->fptags[i] = ((fptag & 3) == 3);
2525         fptag >>= 2;
2526     }
2527 }
2528 
2529 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2530 {
2531     X86Access ac;
2532 
2533     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2534     do_fldenv(&ac, ptr, data32);
2535 }
2536 
2537 static void do_fsave(X86Access *ac, target_ulong ptr, int data32)
2538 {
2539     CPUX86State *env = ac->env;
2540 
2541     do_fstenv(ac, ptr, data32);
2542     ptr += 14 << data32;
2543 
2544     for (int i = 0; i < 8; i++) {
2545         floatx80 tmp = ST(i);
2546         do_fstt(ac, ptr, tmp);
2547         ptr += 10;
2548     }
2549 
2550     do_fninit(env);
2551 }
2552 
2553 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2554 {
2555     int size = (14 << data32) + 80;
2556     X86Access ac;
2557 
2558     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, GETPC());
2559     do_fsave(&ac, ptr, data32);
2560 }
2561 
2562 static void do_frstor(X86Access *ac, target_ulong ptr, int data32)
2563 {
2564     CPUX86State *env = ac->env;
2565 
2566     do_fldenv(ac, ptr, data32);
2567     ptr += 14 << data32;
2568 
2569     for (int i = 0; i < 8; i++) {
2570         floatx80 tmp = do_fldt(ac, ptr);
2571         ST(i) = tmp;
2572         ptr += 10;
2573     }
2574 }
2575 
2576 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2577 {
2578     int size = (14 << data32) + 80;
2579     X86Access ac;
2580 
2581     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, GETPC());
2582     do_frstor(&ac, ptr, data32);
2583 }
2584 
/* Byte offset of member X within X86XSaveArea; undefined again after do_xrstor(). */
#define XO(X)  offsetof(X86XSaveArea, X)
2586 
2587 static void do_xsave_fpu(X86Access *ac, target_ulong ptr)
2588 {
2589     CPUX86State *env = ac->env;
2590     int fpus, fptag, i;
2591     target_ulong addr;
2592 
2593     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2594     fptag = 0;
2595     for (i = 0; i < 8; i++) {
2596         fptag |= (env->fptags[i] << i);
2597     }
2598 
2599     access_stw(ac, ptr + XO(legacy.fcw), env->fpuc);
2600     access_stw(ac, ptr + XO(legacy.fsw), fpus);
2601     access_stw(ac, ptr + XO(legacy.ftw), fptag ^ 0xff);
2602 
2603     /* In 32-bit mode this is eip, sel, dp, sel.
2604        In 64-bit mode this is rip, rdp.
2605        But in either case we don't write actual data, just zeros.  */
2606     access_stq(ac, ptr + XO(legacy.fpip), 0); /* eip+sel; rip */
2607     access_stq(ac, ptr + XO(legacy.fpdp), 0); /* edp+sel; rdp */
2608 
2609     addr = ptr + XO(legacy.fpregs);
2610 
2611     for (i = 0; i < 8; i++) {
2612         floatx80 tmp = ST(i);
2613         do_fstt(ac, addr, tmp);
2614         addr += 16;
2615     }
2616 }
2617 
2618 static void do_xsave_mxcsr(X86Access *ac, target_ulong ptr)
2619 {
2620     CPUX86State *env = ac->env;
2621 
2622     update_mxcsr_from_sse_status(env);
2623     access_stl(ac, ptr + XO(legacy.mxcsr), env->mxcsr);
2624     access_stl(ac, ptr + XO(legacy.mxcsr_mask), 0x0000ffff);
2625 }
2626 
2627 static void do_xsave_sse(X86Access *ac, target_ulong ptr)
2628 {
2629     CPUX86State *env = ac->env;
2630     int i, nb_xmm_regs;
2631     target_ulong addr;
2632 
2633     if (env->hflags & HF_CS64_MASK) {
2634         nb_xmm_regs = 16;
2635     } else {
2636         nb_xmm_regs = 8;
2637     }
2638 
2639     addr = ptr + XO(legacy.xmm_regs);
2640     for (i = 0; i < nb_xmm_regs; i++) {
2641         access_stq(ac, addr, env->xmm_regs[i].ZMM_Q(0));
2642         access_stq(ac, addr + 8, env->xmm_regs[i].ZMM_Q(1));
2643         addr += 16;
2644     }
2645 }
2646 
2647 static void do_xsave_ymmh(X86Access *ac, target_ulong ptr)
2648 {
2649     CPUX86State *env = ac->env;
2650     int i, nb_xmm_regs;
2651 
2652     if (env->hflags & HF_CS64_MASK) {
2653         nb_xmm_regs = 16;
2654     } else {
2655         nb_xmm_regs = 8;
2656     }
2657 
2658     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2659         access_stq(ac, ptr, env->xmm_regs[i].ZMM_Q(2));
2660         access_stq(ac, ptr + 8, env->xmm_regs[i].ZMM_Q(3));
2661     }
2662 }
2663 
2664 static void do_xsave_bndregs(X86Access *ac, target_ulong ptr)
2665 {
2666     CPUX86State *env = ac->env;
2667     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2668     int i;
2669 
2670     for (i = 0; i < 4; i++, addr += 16) {
2671         access_stq(ac, addr, env->bnd_regs[i].lb);
2672         access_stq(ac, addr + 8, env->bnd_regs[i].ub);
2673     }
2674 }
2675 
2676 static void do_xsave_bndcsr(X86Access *ac, target_ulong ptr)
2677 {
2678     CPUX86State *env = ac->env;
2679 
2680     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2681                env->bndcs_regs.cfgu);
2682     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2683                env->bndcs_regs.sts);
2684 }
2685 
2686 static void do_xsave_pkru(X86Access *ac, target_ulong ptr)
2687 {
2688     access_stq(ac, ptr, ac->env->pkru);
2689 }
2690 
2691 static void do_fxsave(X86Access *ac, target_ulong ptr)
2692 {
2693     CPUX86State *env = ac->env;
2694 
2695     do_xsave_fpu(ac, ptr);
2696     if (env->cr[4] & CR4_OSFXSR_MASK) {
2697         do_xsave_mxcsr(ac, ptr);
2698         /* Fast FXSAVE leaves out the XMM registers */
2699         if (!(env->efer & MSR_EFER_FFXSR)
2700             || (env->hflags & HF_CPL_MASK)
2701             || !(env->hflags & HF_LMA_MASK)) {
2702             do_xsave_sse(ac, ptr);
2703         }
2704     }
2705 }
2706 
2707 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2708 {
2709     uintptr_t ra = GETPC();
2710     X86Access ac;
2711 
2712     /* The operand must be 16 byte aligned */
2713     if (ptr & 0xf) {
2714         raise_exception_ra(env, EXCP0D_GPF, ra);
2715     }
2716 
2717     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2718                    MMU_DATA_STORE, ra);
2719     do_fxsave(&ac, ptr);
2720 }
2721 
2722 static uint64_t get_xinuse(CPUX86State *env)
2723 {
2724     uint64_t inuse = -1;
2725 
2726     /* For the most part, we don't track XINUSE.  We could calculate it
2727        here for all components, but it's probably less work to simply
2728        indicate in use.  That said, the state of BNDREGS is important
2729        enough to track in HFLAGS, so we might as well use that here.  */
2730     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2731        inuse &= ~XSTATE_BNDREGS_MASK;
2732     }
2733     return inuse;
2734 }
2735 
2736 static void do_xsave_access(X86Access *ac, target_ulong ptr, uint64_t rfbm,
2737                             uint64_t inuse, uint64_t opt)
2738 {
2739     uint64_t old_bv, new_bv;
2740 
2741     if (opt & XSTATE_FP_MASK) {
2742         do_xsave_fpu(ac, ptr);
2743     }
2744     if (rfbm & XSTATE_SSE_MASK) {
2745         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2746         do_xsave_mxcsr(ac, ptr);
2747     }
2748     if (opt & XSTATE_SSE_MASK) {
2749         do_xsave_sse(ac, ptr);
2750     }
2751     if (opt & XSTATE_YMM_MASK) {
2752         do_xsave_ymmh(ac, ptr + XO(avx_state));
2753     }
2754     if (opt & XSTATE_BNDREGS_MASK) {
2755         do_xsave_bndregs(ac, ptr + XO(bndreg_state));
2756     }
2757     if (opt & XSTATE_BNDCSR_MASK) {
2758         do_xsave_bndcsr(ac, ptr + XO(bndcsr_state));
2759     }
2760     if (opt & XSTATE_PKRU_MASK) {
2761         do_xsave_pkru(ac, ptr + XO(pkru_state));
2762     }
2763 
2764     /* Update the XSTATE_BV field.  */
2765     old_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2766     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2767     access_stq(ac, ptr + XO(header.xstate_bv), new_bv);
2768 }
2769 
2770 static void do_xsave_chk(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2771 {
2772     /* The OS must have enabled XSAVE.  */
2773     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2774         raise_exception_ra(env, EXCP06_ILLOP, ra);
2775     }
2776 
2777     /* The operand must be 64 byte aligned.  */
2778     if (ptr & 63) {
2779         raise_exception_ra(env, EXCP0D_GPF, ra);
2780     }
2781 }
2782 
2783 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2784                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2785 {
2786     X86Access ac;
2787     unsigned size;
2788 
2789     do_xsave_chk(env, ptr, ra);
2790 
2791     /* Never save anything not enabled by XCR0.  */
2792     rfbm &= env->xcr0;
2793     opt &= rfbm;
2794     size = xsave_area_size(opt, false);
2795 
2796     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, ra);
2797     do_xsave_access(&ac, ptr, rfbm, inuse, opt);
2798 }
2799 
2800 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2801 {
2802     do_xsave(env, ptr, rfbm, get_xinuse(env), rfbm, GETPC());
2803 }
2804 
2805 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2806 {
2807     uint64_t inuse = get_xinuse(env);
2808     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2809 }
2810 
2811 static void do_xrstor_fpu(X86Access *ac, target_ulong ptr)
2812 {
2813     CPUX86State *env = ac->env;
2814     int i, fpuc, fpus, fptag;
2815     target_ulong addr;
2816 
2817     fpuc = access_ldw(ac, ptr + XO(legacy.fcw));
2818     fpus = access_ldw(ac, ptr + XO(legacy.fsw));
2819     fptag = access_ldw(ac, ptr + XO(legacy.ftw));
2820     cpu_set_fpuc(env, fpuc);
2821     cpu_set_fpus(env, fpus);
2822 
2823     fptag ^= 0xff;
2824     for (i = 0; i < 8; i++) {
2825         env->fptags[i] = ((fptag >> i) & 1);
2826     }
2827 
2828     addr = ptr + XO(legacy.fpregs);
2829 
2830     for (i = 0; i < 8; i++) {
2831         floatx80 tmp = do_fldt(ac, addr);
2832         ST(i) = tmp;
2833         addr += 16;
2834     }
2835 }
2836 
2837 static void do_xrstor_mxcsr(X86Access *ac, target_ulong ptr)
2838 {
2839     CPUX86State *env = ac->env;
2840     cpu_set_mxcsr(env, access_ldl(ac, ptr + XO(legacy.mxcsr)));
2841 }
2842 
2843 static void do_xrstor_sse(X86Access *ac, target_ulong ptr)
2844 {
2845     CPUX86State *env = ac->env;
2846     int i, nb_xmm_regs;
2847     target_ulong addr;
2848 
2849     if (env->hflags & HF_CS64_MASK) {
2850         nb_xmm_regs = 16;
2851     } else {
2852         nb_xmm_regs = 8;
2853     }
2854 
2855     addr = ptr + XO(legacy.xmm_regs);
2856     for (i = 0; i < nb_xmm_regs; i++) {
2857         env->xmm_regs[i].ZMM_Q(0) = access_ldq(ac, addr);
2858         env->xmm_regs[i].ZMM_Q(1) = access_ldq(ac, addr + 8);
2859         addr += 16;
2860     }
2861 }
2862 
2863 static void do_clear_sse(CPUX86State *env)
2864 {
2865     int i, nb_xmm_regs;
2866 
2867     if (env->hflags & HF_CS64_MASK) {
2868         nb_xmm_regs = 16;
2869     } else {
2870         nb_xmm_regs = 8;
2871     }
2872 
2873     for (i = 0; i < nb_xmm_regs; i++) {
2874         env->xmm_regs[i].ZMM_Q(0) = 0;
2875         env->xmm_regs[i].ZMM_Q(1) = 0;
2876     }
2877 }
2878 
2879 static void do_xrstor_ymmh(X86Access *ac, target_ulong ptr)
2880 {
2881     CPUX86State *env = ac->env;
2882     int i, nb_xmm_regs;
2883 
2884     if (env->hflags & HF_CS64_MASK) {
2885         nb_xmm_regs = 16;
2886     } else {
2887         nb_xmm_regs = 8;
2888     }
2889 
2890     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2891         env->xmm_regs[i].ZMM_Q(2) = access_ldq(ac, ptr);
2892         env->xmm_regs[i].ZMM_Q(3) = access_ldq(ac, ptr + 8);
2893     }
2894 }
2895 
2896 static void do_clear_ymmh(CPUX86State *env)
2897 {
2898     int i, nb_xmm_regs;
2899 
2900     if (env->hflags & HF_CS64_MASK) {
2901         nb_xmm_regs = 16;
2902     } else {
2903         nb_xmm_regs = 8;
2904     }
2905 
2906     for (i = 0; i < nb_xmm_regs; i++) {
2907         env->xmm_regs[i].ZMM_Q(2) = 0;
2908         env->xmm_regs[i].ZMM_Q(3) = 0;
2909     }
2910 }
2911 
2912 static void do_xrstor_bndregs(X86Access *ac, target_ulong ptr)
2913 {
2914     CPUX86State *env = ac->env;
2915     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2916     int i;
2917 
2918     for (i = 0; i < 4; i++, addr += 16) {
2919         env->bnd_regs[i].lb = access_ldq(ac, addr);
2920         env->bnd_regs[i].ub = access_ldq(ac, addr + 8);
2921     }
2922 }
2923 
2924 static void do_xrstor_bndcsr(X86Access *ac, target_ulong ptr)
2925 {
2926     CPUX86State *env = ac->env;
2927 
2928     /* FIXME: Extend highest implemented bit of linear address.  */
2929     env->bndcs_regs.cfgu
2930         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu));
2931     env->bndcs_regs.sts
2932         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts));
2933 }
2934 
2935 static void do_xrstor_pkru(X86Access *ac, target_ulong ptr)
2936 {
2937     ac->env->pkru = access_ldq(ac, ptr);
2938 }
2939 
2940 static void do_fxrstor(X86Access *ac, target_ulong ptr)
2941 {
2942     CPUX86State *env = ac->env;
2943 
2944     do_xrstor_fpu(ac, ptr);
2945     if (env->cr[4] & CR4_OSFXSR_MASK) {
2946         do_xrstor_mxcsr(ac, ptr);
2947         /* Fast FXRSTOR leaves out the XMM registers */
2948         if (!(env->efer & MSR_EFER_FFXSR)
2949             || (env->hflags & HF_CPL_MASK)
2950             || !(env->hflags & HF_LMA_MASK)) {
2951             do_xrstor_sse(ac, ptr);
2952         }
2953     }
2954 }
2955 
2956 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2957 {
2958     uintptr_t ra = GETPC();
2959     X86Access ac;
2960 
2961     /* The operand must be 16 byte aligned */
2962     if (ptr & 0xf) {
2963         raise_exception_ra(env, EXCP0D_GPF, ra);
2964     }
2965 
2966     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2967                    MMU_DATA_LOAD, ra);
2968     do_fxrstor(&ac, ptr);
2969 }
2970 
2971 static bool valid_xrstor_header(X86Access *ac, uint64_t *pxsbv,
2972                                 target_ulong ptr)
2973 {
2974     uint64_t xstate_bv, xcomp_bv, reserve0;
2975 
2976     xstate_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2977     xcomp_bv = access_ldq(ac, ptr + XO(header.xcomp_bv));
2978     reserve0 = access_ldq(ac, ptr + XO(header.reserve0));
2979     *pxsbv = xstate_bv;
2980 
2981     /*
2982      * XCOMP_BV bit 63 indicates compact form, which we do not support,
2983      * and thus must raise #GP.  That leaves us in standard form.
2984      * In standard form, bytes 23:8 must be zero -- which is both
2985      * XCOMP_BV and the following 64-bit field.
2986      */
2987     if (xcomp_bv || reserve0) {
2988         return false;
2989     }
2990 
2991     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2992     return (xstate_bv & ~ac->env->xcr0) == 0;
2993 }
2994 
2995 static void do_xrstor(X86Access *ac, target_ulong ptr,
2996                       uint64_t rfbm, uint64_t xstate_bv)
2997 {
2998     CPUX86State *env = ac->env;
2999 
3000     if (rfbm & XSTATE_FP_MASK) {
3001         if (xstate_bv & XSTATE_FP_MASK) {
3002             do_xrstor_fpu(ac, ptr);
3003         } else {
3004             do_fninit(env);
3005             memset(env->fpregs, 0, sizeof(env->fpregs));
3006         }
3007     }
3008     if (rfbm & XSTATE_SSE_MASK) {
3009         /* Note that the standard form of XRSTOR loads MXCSR from memory
3010            whether or not the XSTATE_BV bit is set.  */
3011         do_xrstor_mxcsr(ac, ptr);
3012         if (xstate_bv & XSTATE_SSE_MASK) {
3013             do_xrstor_sse(ac, ptr);
3014         } else {
3015             do_clear_sse(env);
3016         }
3017     }
3018     if (rfbm & XSTATE_YMM_MASK) {
3019         if (xstate_bv & XSTATE_YMM_MASK) {
3020             do_xrstor_ymmh(ac, ptr + XO(avx_state));
3021         } else {
3022             do_clear_ymmh(env);
3023         }
3024     }
3025     if (rfbm & XSTATE_BNDREGS_MASK) {
3026         if (xstate_bv & XSTATE_BNDREGS_MASK) {
3027             do_xrstor_bndregs(ac, ptr + XO(bndreg_state));
3028             env->hflags |= HF_MPX_IU_MASK;
3029         } else {
3030             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
3031             env->hflags &= ~HF_MPX_IU_MASK;
3032         }
3033     }
3034     if (rfbm & XSTATE_BNDCSR_MASK) {
3035         if (xstate_bv & XSTATE_BNDCSR_MASK) {
3036             do_xrstor_bndcsr(ac, ptr + XO(bndcsr_state));
3037         } else {
3038             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
3039         }
3040         cpu_sync_bndcs_hflags(env);
3041     }
3042     if (rfbm & XSTATE_PKRU_MASK) {
3043         uint64_t old_pkru = env->pkru;
3044         if (xstate_bv & XSTATE_PKRU_MASK) {
3045             do_xrstor_pkru(ac, ptr + XO(pkru_state));
3046         } else {
3047             env->pkru = 0;
3048         }
3049         if (env->pkru != old_pkru) {
3050             CPUState *cs = env_cpu(env);
3051             tlb_flush(cs);
3052         }
3053     }
3054 }
3055 
3056 #undef XO
3057 
3058 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
3059 {
3060     uintptr_t ra = GETPC();
3061     X86Access ac;
3062     uint64_t xstate_bv;
3063     unsigned size, size_ext;
3064 
3065     do_xsave_chk(env, ptr, ra);
3066 
3067     /* Begin with just the minimum size to validate the header. */
3068     size = sizeof(X86LegacyXSaveArea) + sizeof(X86XSaveHeader);
3069     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, ra);
3070     if (!valid_xrstor_header(&ac, &xstate_bv, ptr)) {
3071         raise_exception_ra(env, EXCP0D_GPF, ra);
3072     }
3073 
3074     rfbm &= env->xcr0;
3075     size_ext = xsave_area_size(rfbm & xstate_bv, false);
3076     if (size < size_ext) {
3077         /* TODO: See if existing page probe has covered extra size. */
3078         access_prepare(&ac, env, ptr, size_ext, MMU_DATA_LOAD, ra);
3079     }
3080 
3081     do_xrstor(&ac, ptr, rfbm, xstate_bv);
3082 }
3083 
3084 #if defined(CONFIG_USER_ONLY)
3085 void cpu_x86_fsave(CPUX86State *env, void *host, size_t len)
3086 {
3087     X86Access ac = {
3088         .haddr1 = host,
3089         .size = 4 * 7 + 8 * 10,
3090         .env = env,
3091     };
3092 
3093     assert(ac.size <= len);
3094     do_fsave(&ac, 0, true);
3095 }
3096 
3097 void cpu_x86_frstor(CPUX86State *env, void *host, size_t len)
3098 {
3099     X86Access ac = {
3100         .haddr1 = host,
3101         .size = 4 * 7 + 8 * 10,
3102         .env = env,
3103     };
3104 
3105     assert(ac.size <= len);
3106     do_frstor(&ac, 0, true);
3107 }
3108 
3109 void cpu_x86_fxsave(CPUX86State *env, void *host, size_t len)
3110 {
3111     X86Access ac = {
3112         .haddr1 = host,
3113         .size = sizeof(X86LegacyXSaveArea),
3114         .env = env,
3115     };
3116 
3117     assert(ac.size <= len);
3118     do_fxsave(&ac, 0);
3119 }
3120 
3121 void cpu_x86_fxrstor(CPUX86State *env, void *host, size_t len)
3122 {
3123     X86Access ac = {
3124         .haddr1 = host,
3125         .size = sizeof(X86LegacyXSaveArea),
3126         .env = env,
3127     };
3128 
3129     assert(ac.size <= len);
3130     do_fxrstor(&ac, 0);
3131 }
3132 
3133 void cpu_x86_xsave(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3134 {
3135     X86Access ac = {
3136         .haddr1 = host,
3137         .env = env,
3138     };
3139 
3140     /*
3141      * Since this is only called from user-level signal handling,
3142      * we should have done the job correctly there.
3143      */
3144     assert((rfbm & ~env->xcr0) == 0);
3145     ac.size = xsave_area_size(rfbm, false);
3146     assert(ac.size <= len);
3147     do_xsave_access(&ac, 0, rfbm, get_xinuse(env), rfbm);
3148 }
3149 
3150 bool cpu_x86_xrstor(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3151 {
3152     X86Access ac = {
3153         .haddr1 = host,
3154         .env = env,
3155     };
3156     uint64_t xstate_bv;
3157 
3158     /*
3159      * Since this is only called from user-level signal handling,
3160      * we should have done the job correctly there.
3161      */
3162     assert((rfbm & ~env->xcr0) == 0);
3163     ac.size = xsave_area_size(rfbm, false);
3164     assert(ac.size <= len);
3165 
3166     if (!valid_xrstor_header(&ac, &xstate_bv, 0)) {
3167         return false;
3168     }
3169     do_xrstor(&ac, 0, rfbm, xstate_bv);
3170     return true;
3171 }
3172 #endif
3173 
3174 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
3175 {
3176     /* The OS must have enabled XSAVE.  */
3177     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3178         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3179     }
3180 
3181     switch (ecx) {
3182     case 0:
3183         return env->xcr0;
3184     case 1:
3185         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
3186             return env->xcr0 & get_xinuse(env);
3187         }
3188         break;
3189     }
3190     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3191 }
3192 
3193 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3194 {
3195     uint32_t dummy, ena_lo, ena_hi;
3196     uint64_t ena;
3197 
3198     /* The OS must have enabled XSAVE.  */
3199     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3200         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3201     }
3202 
3203     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3204     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3205         goto do_gpf;
3206     }
3207 
3208     /* SSE can be disabled, but only if AVX is disabled too.  */
3209     if ((mask & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) == XSTATE_YMM_MASK) {
3210         goto do_gpf;
3211     }
3212 
3213     /* Disallow enabling unimplemented features.  */
3214     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3215     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3216     if (mask & ~ena) {
3217         goto do_gpf;
3218     }
3219 
3220     /* Disallow enabling only half of MPX.  */
3221     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3222         & XSTATE_BNDCSR_MASK) {
3223         goto do_gpf;
3224     }
3225 
3226     env->xcr0 = mask;
3227     cpu_sync_bndcs_hflags(env);
3228     cpu_sync_avx_hflag(env);
3229     return;
3230 
3231  do_gpf:
3232     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3233 }
3234 
3235 /* MMX/SSE */
3236 /* XXX: optimize by storing fptt and fptags in the static cpu state */
3237 
/* MXCSR control fields consumed by update_mxcsr_status(). */
#define SSE_DAZ             0x0040  /* denormals are zero */
#define SSE_RC_SHIFT        13      /* position of the rounding-mode field */
#define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
#define SSE_FZ              0x8000  /* flush to zero */
3242 
3243 void update_mxcsr_status(CPUX86State *env)
3244 {
3245     uint32_t mxcsr = env->mxcsr;
3246     int rnd_type;
3247 
3248     /* set rounding mode */
3249     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3250     set_x86_rounding_mode(rnd_type, &env->sse_status);
3251 
3252     /* Set exception flags.  */
3253     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3254                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3255                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3256                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3257                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3258                               &env->sse_status);
3259 
3260     /* set denormals are zero */
3261     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3262 
3263     /* set flush to zero */
3264     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3265 }
3266 
3267 void update_mxcsr_from_sse_status(CPUX86State *env)
3268 {
3269     uint8_t flags = get_float_exception_flags(&env->sse_status);
3270     /*
3271      * The MXCSR denormal flag has opposite semantics to
3272      * float_flag_input_denormal_flushed (the softfloat code sets that flag
3273      * only when flushing input denormals to zero, but SSE sets it
3274      * only when not flushing them to zero), so is not converted
3275      * here.
3276      */
3277     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3278                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3279                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3280                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3281                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3282                    (flags & float_flag_output_denormal_flushed ? FPUS_UE | FPUS_PE :
3283                     0));
3284 }
3285 
/* TCG helper entry point: refresh env->mxcsr from the softfloat SSE status. */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3290 
/* TCG helper entry point: install @val as the new MXCSR via cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3295 
3296 void helper_enter_mmx(CPUX86State *env)
3297 {
3298     env->fpstt = 0;
3299     *(uint32_t *)(env->fptags) = 0;
3300     *(uint32_t *)(env->fptags + 4) = 0;
3301 }
3302 
3303 void helper_emms(CPUX86State *env)
3304 {
3305     /* set to empty state */
3306     *(uint32_t *)(env->fptags) = 0x01010101;
3307     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3308 }
3309 
3310 #define SHIFT 0
3311 #include "ops_sse.h"
3312 
3313 #define SHIFT 1
3314 #include "ops_sse.h"
3315 
3316 #define SHIFT 2
3317 #include "ops_sse.h"
3318