xref: /qemu/target/i386/tcg/fpu_helper.c (revision 68df8c8dba57f539d24f1a92a8699a179d9bb6fb)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 #include "access.h"
31 
/*
 * x87 register accessors.  Each macro expands to an lvalue and expects a
 * CPU state pointer named `env` to be in scope at the point of use.
 */
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d) /* n-th slot from top */
#define ST0    (env->fpregs[env->fpstt].d)              /* current top slot */
#define ST1    ST(1)
#define FT0    (env->ft0)                               /* operand temporary */
37 
/*
 * Rounding-control field of the x87 control word.  The field is two bits
 * wide and starts at bit FPU_RC_SHIFT; the FPU_RC_* values are the field
 * encodings already shifted into position.
 */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         (0 << FPU_RC_SHIFT) /* round to nearest even */
#define FPU_RC_DOWN         (1 << FPU_RC_SHIFT) /* round down */
#define FPU_RC_UP           (2 << FPU_RC_SHIFT) /* round up */
#define FPU_RC_CHOP         (3 << FPU_RC_SHIFT) /* round toward zero */

/* 2^63 as a double constant. */
#define MAXTAN 9223372036854775808.0
46 
/*
 * The following deal with x86 long double-precision numbers.
 *
 * The accessor macros take an object with CPU_LDoubleU-style fields
 * (l.upper holding sign + exponent, l.lower holding the significand).
 * The argument is fully parenthesised so that any expression (for
 * example `*ptr`) may be passed, not just a plain identifier.
 */
#define MAXEXPD 0x7fff
#define EXPBIAS 16383
/* Biased exponent: low 15 bits of the upper word. */
#define EXPD(fp)        ((fp).l.upper & 0x7fff)
/* Sign: non-zero (0x8000) when the sign bit of the upper word is set. */
#define SIGND(fp)       ((fp).l.upper & 0x8000)
/* 64-bit significand; usable as an lvalue. */
#define MANTD(fp)       ((fp).l.lower)
/*
 * Replace the exponent with EXPBIAS while keeping the remaining bits of the
 * upper word.  Evaluates `fp` twice, so the argument must be free of side
 * effects.
 */
#define BIASEXPONENT(fp) \
    ((fp).l.upper = ((fp).l.upper & ~(0x7fff)) | EXPBIAS)
54 
/* x87 status-word bits. */
#define FPUS_IE 0x0001  /* invalid operation */
#define FPUS_DE 0x0002  /* denormal operand */
#define FPUS_ZE 0x0004  /* divide by zero */
#define FPUS_OE 0x0008  /* overflow */
#define FPUS_UE 0x0010  /* underflow */
#define FPUS_PE 0x0020  /* precision (inexact) */
#define FPUS_SF 0x0040
#define FPUS_SE 0x0080  /* set together with FPUS_B when an unmasked exception is pending */
#define FPUS_B  0x8000

/* Exception-mask bits of the control word, one per FPUS_IE..FPUS_PE flag. */
#define FPUC_EM 0x3f
66 
/*
 * Extended-precision constants loaded by the FLDxx helpers.  A "_d" suffix
 * is the variant whose significand is one unit lower (used for the
 * round-down / chop modes); "_u" is one unit higher (used for round-up).
 */
/* log10(2) */
#define floatx80_lg2   make_floatx80(0x3ffd, 0x9a209a84fbcff799ULL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798ULL)
/* log2(e) */
#define floatx80_l2e   make_floatx80(0x3fff, 0xb8aa3b295c17f0bcULL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbULL)
/* log2(10) */
#define floatx80_l2t   make_floatx80(0x4000, 0xd49a784bcd1b8afeULL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affULL)
/* ln(2), lower variant */
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abULL)
/* pi, lower variant */
#define floatx80_pi_d  make_floatx80(0x4000, 0xc90fdaa22168c234ULL)
75 
76 static inline void fpush(CPUX86State *env)
77 {
78     env->fpstt = (env->fpstt - 1) & 7;
79     env->fptags[env->fpstt] = 0; /* validate stack entry */
80 }
81 
82 static inline void fpop(CPUX86State *env)
83 {
84     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
85     env->fpstt = (env->fpstt + 1) & 7;
86 }
87 
88 static floatx80 do_fldt(X86Access *ac, target_ulong ptr)
89 {
90     CPU_LDoubleU temp;
91 
92     temp.l.lower = access_ldq(ac, ptr);
93     temp.l.upper = access_ldw(ac, ptr + 8);
94     return temp.d;
95 }
96 
97 static void do_fstt(X86Access *ac, target_ulong ptr, floatx80 f)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     access_stq(ac, ptr, temp.l.lower);
103     access_stw(ac, ptr + 8, temp.l.upper);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
129 
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
138 void cpu_init_fp_statuses(CPUX86State *env)
139 {
140     /*
141      * Initialise the non-runtime-varying fields of the various
142      * float_status words to x86 behaviour. This must be called at
143      * CPU reset because the float_status words are in the
144      * "zeroed on reset" portion of the CPU state struct.
145      * Fields in float_status that vary under guest control are set
146      * via the codepath for setting that register, eg cpu_set_fpuc().
147      */
148     /*
149      * Use x87 NaN propagation rules:
150      * SNaN + QNaN => return the QNaN
151      * two SNaNs => return the one with the larger significand, silenced
152      * two QNaNs => return the one with the larger significand
153      * SNaN and a non-NaN => return the SNaN, silenced
154      * QNaN and a non-NaN => return the QNaN
155      *
156      * If we get down to comparing significands and they are the same,
157      * return the NaN with the positive sign bit (if any).
158      */
159     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status);
160     /*
161      * TODO: These are incorrect: the x86 Software Developer's Manual vol 1
162      * section 4.8.3.5 "Operating on SNaNs and QNaNs" says that the
163      * "larger significand" behaviour is only used for x87 FPU operations.
164      * For SSE the required behaviour is to always return the first NaN,
165      * which is float_2nan_prop_ab.
166      *
167      * mmx_status is used only for the AMD 3DNow! instructions, which
168      * are documented in the "3DNow! Technology Manual" as not supporting
169      * NaNs or infinities as inputs. The result of passing two NaNs is
170      * documented as "undefined", so we can do what we choose.
171      * (Strictly there is some behaviour we don't implement correctly
172      * for these "unsupported" NaN and Inf values, like "NaN * 0 == 0".)
173      */
174     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->mmx_status);
175     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->sse_status);
176     /*
177      * Only SSE has multiply-add instructions. In the SDM Section 14.5.2
178      * "Fused-Multiply-ADD (FMA) Numeric Behavior" the NaN handling is
179      * specified -- for 0 * inf + NaN the input NaN is selected, and if
180      * there are multiple input NaNs they are selected in the order a, b, c.
181      */
182     set_float_infzeronan_rule(float_infzeronan_dnan_never, &env->sse_status);
183     set_float_3nan_prop_rule(float_3nan_prop_abc, &env->sse_status);
184     /* Default NaN: sign bit set, most significant frac bit set */
185     set_float_default_nan_pattern(0b11000000, &env->fp_status);
186     set_float_default_nan_pattern(0b11000000, &env->mmx_status);
187     set_float_default_nan_pattern(0b11000000, &env->sse_status);
188 }
189 
190 static inline uint8_t save_exception_flags(CPUX86State *env)
191 {
192     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
193     set_float_exception_flags(0, &env->fp_status);
194     return old_flags;
195 }
196 
197 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
198 {
199     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
200     float_raise(old_flags, &env->fp_status);
201     fpu_set_exception(env,
202                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
203                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
204                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
205                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
206                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
207                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
208 }
209 
210 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
211 {
212     uint8_t old_flags = save_exception_flags(env);
213     floatx80 ret = floatx80_div(a, b, &env->fp_status);
214     merge_exception_flags(env, old_flags);
215     return ret;
216 }
217 
218 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
219 {
220     if (env->cr[0] & CR0_NE_MASK) {
221         raise_exception_ra(env, EXCP10_COPR, retaddr);
222     }
223 #if !defined(CONFIG_USER_ONLY)
224     else {
225         fpu_check_raise_ferr_irq(env);
226     }
227 #endif
228 }
229 
230 void helper_flds_FT0(CPUX86State *env, uint32_t val)
231 {
232     uint8_t old_flags = save_exception_flags(env);
233     union {
234         float32 f;
235         uint32_t i;
236     } u;
237 
238     u.i = val;
239     FT0 = float32_to_floatx80(u.f, &env->fp_status);
240     merge_exception_flags(env, old_flags);
241 }
242 
243 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
244 {
245     uint8_t old_flags = save_exception_flags(env);
246     union {
247         float64 f;
248         uint64_t i;
249     } u;
250 
251     u.i = val;
252     FT0 = float64_to_floatx80(u.f, &env->fp_status);
253     merge_exception_flags(env, old_flags);
254 }
255 
256 void helper_fildl_FT0(CPUX86State *env, int32_t val)
257 {
258     FT0 = int32_to_floatx80(val, &env->fp_status);
259 }
260 
261 void helper_flds_ST0(CPUX86State *env, uint32_t val)
262 {
263     uint8_t old_flags = save_exception_flags(env);
264     int new_fpstt;
265     union {
266         float32 f;
267         uint32_t i;
268     } u;
269 
270     new_fpstt = (env->fpstt - 1) & 7;
271     u.i = val;
272     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
273     env->fpstt = new_fpstt;
274     env->fptags[new_fpstt] = 0; /* validate stack entry */
275     merge_exception_flags(env, old_flags);
276 }
277 
278 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
279 {
280     uint8_t old_flags = save_exception_flags(env);
281     int new_fpstt;
282     union {
283         float64 f;
284         uint64_t i;
285     } u;
286 
287     new_fpstt = (env->fpstt - 1) & 7;
288     u.i = val;
289     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
290     env->fpstt = new_fpstt;
291     env->fptags[new_fpstt] = 0; /* validate stack entry */
292     merge_exception_flags(env, old_flags);
293 }
294 
295 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
296 {
297     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
298     set_floatx80_rounding_precision(floatx80_precision_x, st);
299     return old;
300 }
301 
302 void helper_fildl_ST0(CPUX86State *env, int32_t val)
303 {
304     int new_fpstt;
305     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
306 
307     new_fpstt = (env->fpstt - 1) & 7;
308     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
309     env->fpstt = new_fpstt;
310     env->fptags[new_fpstt] = 0; /* validate stack entry */
311 
312     set_floatx80_rounding_precision(old, &env->fp_status);
313 }
314 
315 void helper_fildll_ST0(CPUX86State *env, int64_t val)
316 {
317     int new_fpstt;
318     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
319 
320     new_fpstt = (env->fpstt - 1) & 7;
321     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
322     env->fpstt = new_fpstt;
323     env->fptags[new_fpstt] = 0; /* validate stack entry */
324 
325     set_floatx80_rounding_precision(old, &env->fp_status);
326 }
327 
328 uint32_t helper_fsts_ST0(CPUX86State *env)
329 {
330     uint8_t old_flags = save_exception_flags(env);
331     union {
332         float32 f;
333         uint32_t i;
334     } u;
335 
336     u.f = floatx80_to_float32(ST0, &env->fp_status);
337     merge_exception_flags(env, old_flags);
338     return u.i;
339 }
340 
341 uint64_t helper_fstl_ST0(CPUX86State *env)
342 {
343     uint8_t old_flags = save_exception_flags(env);
344     union {
345         float64 f;
346         uint64_t i;
347     } u;
348 
349     u.f = floatx80_to_float64(ST0, &env->fp_status);
350     merge_exception_flags(env, old_flags);
351     return u.i;
352 }
353 
354 int32_t helper_fist_ST0(CPUX86State *env)
355 {
356     uint8_t old_flags = save_exception_flags(env);
357     int32_t val;
358 
359     val = floatx80_to_int32(ST0, &env->fp_status);
360     if (val != (int16_t)val) {
361         set_float_exception_flags(float_flag_invalid, &env->fp_status);
362         val = -32768;
363     }
364     merge_exception_flags(env, old_flags);
365     return val;
366 }
367 
368 int32_t helper_fistl_ST0(CPUX86State *env)
369 {
370     uint8_t old_flags = save_exception_flags(env);
371     int32_t val;
372 
373     val = floatx80_to_int32(ST0, &env->fp_status);
374     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
375         val = 0x80000000;
376     }
377     merge_exception_flags(env, old_flags);
378     return val;
379 }
380 
381 int64_t helper_fistll_ST0(CPUX86State *env)
382 {
383     uint8_t old_flags = save_exception_flags(env);
384     int64_t val;
385 
386     val = floatx80_to_int64(ST0, &env->fp_status);
387     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
388         val = 0x8000000000000000ULL;
389     }
390     merge_exception_flags(env, old_flags);
391     return val;
392 }
393 
394 int32_t helper_fistt_ST0(CPUX86State *env)
395 {
396     uint8_t old_flags = save_exception_flags(env);
397     int32_t val;
398 
399     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
400     if (val != (int16_t)val) {
401         set_float_exception_flags(float_flag_invalid, &env->fp_status);
402         val = -32768;
403     }
404     merge_exception_flags(env, old_flags);
405     return val;
406 }
407 
408 int32_t helper_fisttl_ST0(CPUX86State *env)
409 {
410     uint8_t old_flags = save_exception_flags(env);
411     int32_t val;
412 
413     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
414     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
415         val = 0x80000000;
416     }
417     merge_exception_flags(env, old_flags);
418     return val;
419 }
420 
421 int64_t helper_fisttll_ST0(CPUX86State *env)
422 {
423     uint8_t old_flags = save_exception_flags(env);
424     int64_t val;
425 
426     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
427     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
428         val = 0x8000000000000000ULL;
429     }
430     merge_exception_flags(env, old_flags);
431     return val;
432 }
433 
434 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
435 {
436     int new_fpstt;
437     X86Access ac;
438 
439     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
440 
441     new_fpstt = (env->fpstt - 1) & 7;
442     env->fpregs[new_fpstt].d = do_fldt(&ac, ptr);
443     env->fpstt = new_fpstt;
444     env->fptags[new_fpstt] = 0; /* validate stack entry */
445 }
446 
447 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
448 {
449     X86Access ac;
450 
451     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
452     do_fstt(&ac, ptr, ST0);
453 }
454 
455 void helper_fpush(CPUX86State *env)
456 {
457     fpush(env);
458 }
459 
460 void helper_fpop(CPUX86State *env)
461 {
462     fpop(env);
463 }
464 
465 void helper_fdecstp(CPUX86State *env)
466 {
467     env->fpstt = (env->fpstt - 1) & 7;
468     env->fpus &= ~0x4700;
469 }
470 
471 void helper_fincstp(CPUX86State *env)
472 {
473     env->fpstt = (env->fpstt + 1) & 7;
474     env->fpus &= ~0x4700;
475 }
476 
477 /* FPU move */
478 
479 void helper_ffree_STN(CPUX86State *env, int st_index)
480 {
481     env->fptags[(env->fpstt + st_index) & 7] = 1;
482 }
483 
484 void helper_fmov_ST0_FT0(CPUX86State *env)
485 {
486     ST0 = FT0;
487 }
488 
489 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
490 {
491     FT0 = ST(st_index);
492 }
493 
494 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
495 {
496     ST0 = ST(st_index);
497 }
498 
499 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
500 {
501     ST(st_index) = ST0;
502 }
503 
504 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
505 {
506     floatx80 tmp;
507 
508     tmp = ST(st_index);
509     ST(st_index) = ST0;
510     ST0 = tmp;
511 }
512 
513 /* FPU operations */
514 
/*
 * Status-word condition bits for a comparison result, indexed by the
 * FloatRelation value plus one (presumably less, equal, greater, unordered
 * in that order -- confirm against the softfloat FloatRelation enum).
 */
static const int fcom_ccval[4] = {
    0x0100,
    0x4000,
    0x0000,
    0x4500,
};
516 
517 void helper_fcom_ST0_FT0(CPUX86State *env)
518 {
519     uint8_t old_flags = save_exception_flags(env);
520     FloatRelation ret;
521 
522     ret = floatx80_compare(ST0, FT0, &env->fp_status);
523     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
524     merge_exception_flags(env, old_flags);
525 }
526 
527 void helper_fucom_ST0_FT0(CPUX86State *env)
528 {
529     uint8_t old_flags = save_exception_flags(env);
530     FloatRelation ret;
531 
532     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
533     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
534     merge_exception_flags(env, old_flags);
535 }
536 
537 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
538 
539 void helper_fcomi_ST0_FT0(CPUX86State *env)
540 {
541     uint8_t old_flags = save_exception_flags(env);
542     int eflags;
543     FloatRelation ret;
544 
545     ret = floatx80_compare(ST0, FT0, &env->fp_status);
546     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
547     CC_SRC = eflags | fcomi_ccval[ret + 1];
548     CC_OP = CC_OP_EFLAGS;
549     merge_exception_flags(env, old_flags);
550 }
551 
552 void helper_fucomi_ST0_FT0(CPUX86State *env)
553 {
554     uint8_t old_flags = save_exception_flags(env);
555     int eflags;
556     FloatRelation ret;
557 
558     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
559     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
560     CC_SRC = eflags | fcomi_ccval[ret + 1];
561     CC_OP = CC_OP_EFLAGS;
562     merge_exception_flags(env, old_flags);
563 }
564 
565 void helper_fadd_ST0_FT0(CPUX86State *env)
566 {
567     uint8_t old_flags = save_exception_flags(env);
568     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
569     merge_exception_flags(env, old_flags);
570 }
571 
572 void helper_fmul_ST0_FT0(CPUX86State *env)
573 {
574     uint8_t old_flags = save_exception_flags(env);
575     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
576     merge_exception_flags(env, old_flags);
577 }
578 
579 void helper_fsub_ST0_FT0(CPUX86State *env)
580 {
581     uint8_t old_flags = save_exception_flags(env);
582     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
583     merge_exception_flags(env, old_flags);
584 }
585 
586 void helper_fsubr_ST0_FT0(CPUX86State *env)
587 {
588     uint8_t old_flags = save_exception_flags(env);
589     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
590     merge_exception_flags(env, old_flags);
591 }
592 
593 void helper_fdiv_ST0_FT0(CPUX86State *env)
594 {
595     ST0 = helper_fdiv(env, ST0, FT0);
596 }
597 
598 void helper_fdivr_ST0_FT0(CPUX86State *env)
599 {
600     ST0 = helper_fdiv(env, FT0, ST0);
601 }
602 
603 /* fp operations between STN and ST0 */
604 
605 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
606 {
607     uint8_t old_flags = save_exception_flags(env);
608     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
609     merge_exception_flags(env, old_flags);
610 }
611 
612 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
613 {
614     uint8_t old_flags = save_exception_flags(env);
615     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
616     merge_exception_flags(env, old_flags);
617 }
618 
619 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
620 {
621     uint8_t old_flags = save_exception_flags(env);
622     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
623     merge_exception_flags(env, old_flags);
624 }
625 
626 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
627 {
628     uint8_t old_flags = save_exception_flags(env);
629     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
630     merge_exception_flags(env, old_flags);
631 }
632 
633 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
634 {
635     floatx80 *p;
636 
637     p = &ST(st_index);
638     *p = helper_fdiv(env, *p, ST0);
639 }
640 
641 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
642 {
643     floatx80 *p;
644 
645     p = &ST(st_index);
646     *p = helper_fdiv(env, ST0, *p);
647 }
648 
649 /* misc FPU operations */
650 void helper_fchs_ST0(CPUX86State *env)
651 {
652     ST0 = floatx80_chs(ST0);
653 }
654 
655 void helper_fabs_ST0(CPUX86State *env)
656 {
657     ST0 = floatx80_abs(ST0);
658 }
659 
660 void helper_fld1_ST0(CPUX86State *env)
661 {
662     ST0 = floatx80_one;
663 }
664 
665 void helper_fldl2t_ST0(CPUX86State *env)
666 {
667     switch (env->fpuc & FPU_RC_MASK) {
668     case FPU_RC_UP:
669         ST0 = floatx80_l2t_u;
670         break;
671     default:
672         ST0 = floatx80_l2t;
673         break;
674     }
675 }
676 
677 void helper_fldl2e_ST0(CPUX86State *env)
678 {
679     switch (env->fpuc & FPU_RC_MASK) {
680     case FPU_RC_DOWN:
681     case FPU_RC_CHOP:
682         ST0 = floatx80_l2e_d;
683         break;
684     default:
685         ST0 = floatx80_l2e;
686         break;
687     }
688 }
689 
690 void helper_fldpi_ST0(CPUX86State *env)
691 {
692     switch (env->fpuc & FPU_RC_MASK) {
693     case FPU_RC_DOWN:
694     case FPU_RC_CHOP:
695         ST0 = floatx80_pi_d;
696         break;
697     default:
698         ST0 = floatx80_pi;
699         break;
700     }
701 }
702 
703 void helper_fldlg2_ST0(CPUX86State *env)
704 {
705     switch (env->fpuc & FPU_RC_MASK) {
706     case FPU_RC_DOWN:
707     case FPU_RC_CHOP:
708         ST0 = floatx80_lg2_d;
709         break;
710     default:
711         ST0 = floatx80_lg2;
712         break;
713     }
714 }
715 
716 void helper_fldln2_ST0(CPUX86State *env)
717 {
718     switch (env->fpuc & FPU_RC_MASK) {
719     case FPU_RC_DOWN:
720     case FPU_RC_CHOP:
721         ST0 = floatx80_ln2_d;
722         break;
723     default:
724         ST0 = floatx80_ln2;
725         break;
726     }
727 }
728 
729 void helper_fldz_ST0(CPUX86State *env)
730 {
731     ST0 = floatx80_zero;
732 }
733 
734 void helper_fldz_FT0(CPUX86State *env)
735 {
736     FT0 = floatx80_zero;
737 }
738 
739 uint32_t helper_fnstsw(CPUX86State *env)
740 {
741     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
742 }
743 
744 uint32_t helper_fnstcw(CPUX86State *env)
745 {
746     return env->fpuc;
747 }
748 
749 static void set_x86_rounding_mode(unsigned mode, float_status *status)
750 {
751     static FloatRoundMode x86_round_mode[4] = {
752         float_round_nearest_even,
753         float_round_down,
754         float_round_up,
755         float_round_to_zero
756     };
757     assert(mode < ARRAY_SIZE(x86_round_mode));
758     set_float_rounding_mode(x86_round_mode[mode], status);
759 }
760 
761 void update_fp_status(CPUX86State *env)
762 {
763     int rnd_mode;
764     FloatX80RoundPrec rnd_prec;
765 
766     /* set rounding mode */
767     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
768     set_x86_rounding_mode(rnd_mode, &env->fp_status);
769 
770     switch ((env->fpuc >> 8) & 3) {
771     case 0:
772         rnd_prec = floatx80_precision_s;
773         break;
774     case 2:
775         rnd_prec = floatx80_precision_d;
776         break;
777     case 3:
778     default:
779         rnd_prec = floatx80_precision_x;
780         break;
781     }
782     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
783 }
784 
785 void helper_fldcw(CPUX86State *env, uint32_t val)
786 {
787     cpu_set_fpuc(env, val);
788 }
789 
790 void helper_fclex(CPUX86State *env)
791 {
792     env->fpus &= 0x7f00;
793 }
794 
795 void helper_fwait(CPUX86State *env)
796 {
797     if (env->fpus & FPUS_SE) {
798         fpu_raise_exception(env, GETPC());
799     }
800 }
801 
802 static void do_fninit(CPUX86State *env)
803 {
804     env->fpus = 0;
805     env->fpstt = 0;
806     env->fpcs = 0;
807     env->fpds = 0;
808     env->fpip = 0;
809     env->fpdp = 0;
810     cpu_set_fpuc(env, 0x37f);
811     env->fptags[0] = 1;
812     env->fptags[1] = 1;
813     env->fptags[2] = 1;
814     env->fptags[3] = 1;
815     env->fptags[4] = 1;
816     env->fptags[5] = 1;
817     env->fptags[6] = 1;
818     env->fptags[7] = 1;
819 }
820 
821 void helper_fninit(CPUX86State *env)
822 {
823     do_fninit(env);
824 }
825 
826 /* BCD ops */
827 
828 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
829 {
830     X86Access ac;
831     floatx80 tmp;
832     uint64_t val;
833     unsigned int v;
834     int i;
835 
836     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
837 
838     val = 0;
839     for (i = 8; i >= 0; i--) {
840         v = access_ldb(&ac, ptr + i);
841         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
842     }
843     tmp = int64_to_floatx80(val, &env->fp_status);
844     if (access_ldb(&ac, ptr + 9) & 0x80) {
845         tmp = floatx80_chs(tmp);
846     }
847     fpush(env);
848     ST0 = tmp;
849 }
850 
851 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
852 {
853     uint8_t old_flags = save_exception_flags(env);
854     int v;
855     target_ulong mem_ref, mem_end;
856     int64_t val;
857     CPU_LDoubleU temp;
858     X86Access ac;
859 
860     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
861     temp.d = ST0;
862 
863     val = floatx80_to_int64(ST0, &env->fp_status);
864     mem_ref = ptr;
865     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
866         set_float_exception_flags(float_flag_invalid, &env->fp_status);
867         while (mem_ref < ptr + 7) {
868             access_stb(&ac, mem_ref++, 0);
869         }
870         access_stb(&ac, mem_ref++, 0xc0);
871         access_stb(&ac, mem_ref++, 0xff);
872         access_stb(&ac, mem_ref++, 0xff);
873         merge_exception_flags(env, old_flags);
874         return;
875     }
876     mem_end = mem_ref + 9;
877     if (SIGND(temp)) {
878         access_stb(&ac, mem_end, 0x80);
879         val = -val;
880     } else {
881         access_stb(&ac, mem_end, 0x00);
882     }
883     while (mem_ref < mem_end) {
884         if (val == 0) {
885             break;
886         }
887         v = val % 100;
888         val = val / 100;
889         v = ((v / 10) << 4) | (v % 10);
890         access_stb(&ac, mem_ref++, v);
891     }
892     while (mem_ref < mem_end) {
893         access_stb(&ac, mem_ref++, 0);
894     }
895     merge_exception_flags(env, old_flags);
896 }
897 
/*
 * 128-bit significand of log(2), split into two 64-bit halves
 * (most-significant half first).
 */
#define ln2_sig_high    0xb17217f7d1cf79abULL
#define ln2_sig_low     0xc9e3b39803f2f6afULL
901 
/*
 * Coefficients of the polynomial that approximates (2^x - 1) / x on the
 * interval [-1/64, 1/64].  f2xm1_coeff_0_low is a separate, much smaller
 * term accompanying f2xm1_coeff_0.
 */
#define f2xm1_coeff_0       make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low   make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1       make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2       make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3       make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4       make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5       make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6       make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7       make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
915 
916 struct f2xm1_data {
917     /*
918      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
919      * are very close to exact floatx80 values.
920      */
921     floatx80 t;
922     /* The value of 2^t.  */
923     floatx80 exp2;
924     /* The value of 2^t - 1.  */
925     floatx80 exp2m1;
926 };
927 
928 static const struct f2xm1_data f2xm1_table[65] = {
929     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
930       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
931       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
932     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
933       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
934       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
935     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
936       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
937       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
938     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
939       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
940       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
941     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
942       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
943       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
944     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
945       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
946       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
947     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
948       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
949       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
950     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
951       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
952       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
953     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
954       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
955       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
956     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
957       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
958       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
959     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
960       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
961       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
962     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
963       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
964       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
965     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
966       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
967       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
968     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
969       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
970       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
971     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
972       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
973       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
974     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
975       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
976       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
977     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
978       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
979       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
980     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
981       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
982       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
983     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
984       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
985       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
986     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
987       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
988       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
989     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
990       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
991       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
992     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
993       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
994       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
995     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
996       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
997       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
998     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
999       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
1000       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
1001     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
1002       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
1003       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
1004     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
1005       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
1006       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
1007     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
1008       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
1009       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
1010     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
1011       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
1012       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
1013     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
1014       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
1015       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
1016     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
1017       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
1018       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
1019     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
1020       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
1021       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
1022     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
1023       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
1024       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
1025     { floatx80_zero_init,
1026       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1027       floatx80_zero_init },
1028     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
1029       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
1030       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
1031     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
1032       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
1033       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
1034     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
1035       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
1036       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
1037     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
1038       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
1039       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
1040     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
1041       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
1042       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
1043     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
1044       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
1045       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
1046     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
1047       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
1048       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
1049     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
1050       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1051       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1052     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1053       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1054       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1055     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1056       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1057       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1058     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1059       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1060       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1061     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1062       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1063       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1064     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1065       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1066       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1067     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1068       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1069       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1070     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1071       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1072       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1073     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1074       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1075       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1076     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1077       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1078       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1079     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1080       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1081       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1082     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1083       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1084       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1085     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1086       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1087       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1088     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1089       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1090       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1091     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1092       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1093       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1094     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1095       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1096       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1097     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1098       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1099       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1100     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1101       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1102       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1103     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1104       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1105       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1106     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1107       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1108       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1109     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1110       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1111       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1112     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1113       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1114       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1115     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1116       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1117       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1118     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1119       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1120       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1121     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1122       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1123       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1124 };
1125 
1126 void helper_f2xm1(CPUX86State *env)
1127 {
1128     uint8_t old_flags = save_exception_flags(env);
1129     uint64_t sig = extractFloatx80Frac(ST0);
1130     int32_t exp = extractFloatx80Exp(ST0);
1131     bool sign = extractFloatx80Sign(ST0);
1132 
1133     if (floatx80_invalid_encoding(ST0)) {
1134         float_raise(float_flag_invalid, &env->fp_status);
1135         ST0 = floatx80_default_nan(&env->fp_status);
1136     } else if (floatx80_is_any_nan(ST0)) {
1137         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1138             float_raise(float_flag_invalid, &env->fp_status);
1139             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1140         }
1141     } else if (exp > 0x3fff ||
1142                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1143         /* Out of range for the instruction, treat as invalid.  */
1144         float_raise(float_flag_invalid, &env->fp_status);
1145         ST0 = floatx80_default_nan(&env->fp_status);
1146     } else if (exp == 0x3fff) {
1147         /* Argument 1 or -1, exact result 1 or -0.5.  */
1148         if (sign) {
1149             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1150         }
1151     } else if (exp < 0x3fb0) {
1152         if (!floatx80_is_zero(ST0)) {
1153             /*
1154              * Multiplying the argument by an extra-precision version
1155              * of log(2) is sufficiently precise.  Zero arguments are
1156              * returned unchanged.
1157              */
1158             uint64_t sig0, sig1, sig2;
1159             if (exp == 0) {
1160                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1161             }
1162             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1163                             &sig2);
1164             /* This result is inexact.  */
1165             sig1 |= 1;
1166             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1167                                                 sign, exp, sig0, sig1,
1168                                                 &env->fp_status);
1169         }
1170     } else {
1171         floatx80 tmp, y, accum;
1172         bool asign, bsign;
1173         int32_t n, aexp, bexp;
1174         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1175         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1176         FloatX80RoundPrec save_prec =
1177             env->fp_status.floatx80_rounding_precision;
1178         env->fp_status.float_rounding_mode = float_round_nearest_even;
1179         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1180 
1181         /* Find the nearest multiple of 1/32 to the argument.  */
1182         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1183         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1184         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1185 
1186         if (floatx80_is_zero(y)) {
1187             /*
1188              * Use the value of 2^t - 1 from the table, to avoid
1189              * needing to special-case zero as a result of
1190              * multiplication below.
1191              */
1192             ST0 = f2xm1_table[n].t;
1193             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1194             env->fp_status.float_rounding_mode = save_mode;
1195         } else {
1196             /*
1197              * Compute the lower parts of a polynomial expansion for
1198              * (2^y - 1) / y.
1199              */
1200             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1201             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1202             accum = floatx80_mul(accum, y, &env->fp_status);
1203             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1204             accum = floatx80_mul(accum, y, &env->fp_status);
1205             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1206             accum = floatx80_mul(accum, y, &env->fp_status);
1207             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1208             accum = floatx80_mul(accum, y, &env->fp_status);
1209             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1210             accum = floatx80_mul(accum, y, &env->fp_status);
1211             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1212             accum = floatx80_mul(accum, y, &env->fp_status);
1213             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1214 
1215             /*
1216              * The full polynomial expansion is f2xm1_coeff_0 + accum
1217              * (where accum has much lower magnitude, and so, in
1218              * particular, carry out of the addition is not possible).
1219              * (This expansion is only accurate to about 70 bits, not
1220              * 128 bits.)
1221              */
1222             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1223             asign = extractFloatx80Sign(f2xm1_coeff_0);
1224             shift128RightJamming(extractFloatx80Frac(accum), 0,
1225                                  aexp - extractFloatx80Exp(accum),
1226                                  &asig0, &asig1);
1227             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1228             bsig1 = 0;
1229             if (asign == extractFloatx80Sign(accum)) {
1230                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1231             } else {
1232                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1233             }
1234             /* And thus compute an approximation to 2^y - 1.  */
1235             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1236                             &asig0, &asig1, &asig2);
1237             aexp += extractFloatx80Exp(y) - 0x3ffe;
1238             asign ^= extractFloatx80Sign(y);
1239             if (n != 32) {
1240                 /*
1241                  * Multiply this by the precomputed value of 2^t and
1242                  * add that of 2^t - 1.
1243                  */
1244                 mul128By64To192(asig0, asig1,
1245                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1246                                 &asig0, &asig1, &asig2);
1247                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1248                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1249                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1250                 bsig1 = 0;
1251                 if (bexp < aexp) {
1252                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1253                                          &bsig0, &bsig1);
1254                 } else if (aexp < bexp) {
1255                     shift128RightJamming(asig0, asig1, bexp - aexp,
1256                                          &asig0, &asig1);
1257                     aexp = bexp;
1258                 }
1259                 /* The sign of 2^t - 1 is always that of the result.  */
1260                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1261                 if (asign == bsign) {
1262                     /* Avoid possible carry out of the addition.  */
1263                     shift128RightJamming(asig0, asig1, 1,
1264                                          &asig0, &asig1);
1265                     shift128RightJamming(bsig0, bsig1, 1,
1266                                          &bsig0, &bsig1);
1267                     ++aexp;
1268                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1269                 } else {
1270                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1271                     asign = bsign;
1272                 }
1273             }
1274             env->fp_status.float_rounding_mode = save_mode;
1275             /* This result is inexact.  */
1276             asig1 |= 1;
1277             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1278                                                 asign, aexp, asig0, asig1,
1279                                                 &env->fp_status);
1280         }
1281 
1282         env->fp_status.floatx80_rounding_precision = save_prec;
1283     }
1284     merge_exception_flags(env, old_flags);
1285 }
1286 
1287 void helper_fptan(CPUX86State *env)
1288 {
1289     double fptemp = floatx80_to_double(env, ST0);
1290 
1291     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1292         env->fpus |= 0x400;
1293     } else {
1294         fptemp = tan(fptemp);
1295         ST0 = double_to_floatx80(env, fptemp);
1296         fpush(env);
1297         ST0 = floatx80_one;
1298         env->fpus &= ~0x400; /* C2 <-- 0 */
1299         /* the above code is for |arg| < 2**52 only */
1300     }
1301 }
1302 
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  Each
 * value is given as a biased exponent together with a 128-bit
 * significand split into high and low 64-bit halves.
 *
 * pi/4, pi/2 and pi differ only by a power of two, so they share one
 * significand and differ in the exponent alone; 3pi/4 has its own.
 */
#define pi_exp          0x4000
#define pi_sig_high     0xc90fdaa22168c234ULL
#define pi_sig_low      0xc4c6628b80dc1cd1ULL

#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_sig_high
#define pi_2_sig_low    pi_sig_low

#define pi_4_exp        0x3ffe
#define pi_4_sig_high   pi_sig_high
#define pi_4_sig_low    pi_sig_low

#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL
1316 
/*
 * Coefficients of a minimax polynomial approximating atan(x) on the
 * interval [-1/16, 1/16]; the polynomial uses odd powers of x only.
 *
 * Other approximations in this file split their leading coefficient
 * into a high and a low part.  That is not needed here: the leading
 * coefficient of this approximation is so close to exactly 1 that a
 * single floatx80 is accurate enough.
 *
 * The note beside each macro gives the nearby Taylor-series coefficient
 * of atan for orientation; the encoded values differ slightly from it.
 */
#define fpatan_coeff_0 \
    make_floatx80(0x3fff, 0x8000000000000000ULL)    /* 1 */
#define fpatan_coeff_1 \
    make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)    /* ~ -1/3 */
#define fpatan_coeff_2 \
    make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)    /* ~ 1/5 */
#define fpatan_coeff_3 \
    make_floatx80(0xbffc, 0x92492491fbab2e66ULL)    /* ~ -1/7 */
#define fpatan_coeff_4 \
    make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)    /* ~ 1/9 */
#define fpatan_coeff_5 \
    make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)    /* ~ -1/11 */
#define fpatan_coeff_6 \
    make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)    /* ~ 1/13 */
1332 
1333 struct fpatan_data {
1334     /* High and low parts of atan(x).  */
1335     floatx80 atan_high, atan_low;
1336 };
1337 
1338 static const struct fpatan_data fpatan_table[9] = {
1339     { floatx80_zero_init,
1340       floatx80_zero_init },
1341     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1342       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1343     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1344       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1345     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1346       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1347     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1348       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1349     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1350       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1351     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1352       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1353     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1354       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1355     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1356       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1357 };
1358 
1359 void helper_fpatan(CPUX86State *env)
1360 {
1361     uint8_t old_flags = save_exception_flags(env);
1362     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1363     int32_t arg0_exp = extractFloatx80Exp(ST0);
1364     bool arg0_sign = extractFloatx80Sign(ST0);
1365     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1366     int32_t arg1_exp = extractFloatx80Exp(ST1);
1367     bool arg1_sign = extractFloatx80Sign(ST1);
1368 
1369     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1370         float_raise(float_flag_invalid, &env->fp_status);
1371         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1372     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1373         float_raise(float_flag_invalid, &env->fp_status);
1374         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1375     } else if (floatx80_invalid_encoding(ST0) ||
1376                floatx80_invalid_encoding(ST1)) {
1377         float_raise(float_flag_invalid, &env->fp_status);
1378         ST1 = floatx80_default_nan(&env->fp_status);
1379     } else if (floatx80_is_any_nan(ST0)) {
1380         ST1 = ST0;
1381     } else if (floatx80_is_any_nan(ST1)) {
1382         /* Pass this NaN through.  */
1383     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1384         /* Pass this zero through.  */
1385     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1386                  arg0_exp - arg1_exp >= 80) &&
1387                !arg0_sign) {
1388         /*
1389          * Dividing ST1 by ST0 gives the correct result up to
1390          * rounding, and avoids spurious underflow exceptions that
1391          * might result from passing some small values through the
1392          * polynomial approximation, but if a finite nonzero result of
1393          * division is exact, the result of fpatan is still inexact
1394          * (and underflowing where appropriate).
1395          */
1396         FloatX80RoundPrec save_prec =
1397             env->fp_status.floatx80_rounding_precision;
1398         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1399         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1400         env->fp_status.floatx80_rounding_precision = save_prec;
1401         if (!floatx80_is_zero(ST1) &&
1402             !(get_float_exception_flags(&env->fp_status) &
1403               float_flag_inexact)) {
1404             /*
1405              * The mathematical result is very slightly closer to zero
1406              * than this exact result.  Round a value with the
1407              * significand adjusted accordingly to get the correct
1408              * exceptions, and possibly an adjusted result depending
1409              * on the rounding mode.
1410              */
1411             uint64_t sig = extractFloatx80Frac(ST1);
1412             int32_t exp = extractFloatx80Exp(ST1);
1413             bool sign = extractFloatx80Sign(ST1);
1414             if (exp == 0) {
1415                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1416             }
1417             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1418                                                 sign, exp, sig - 1,
1419                                                 -1, &env->fp_status);
1420         }
1421     } else {
1422         /* The result is inexact.  */
1423         bool rsign = arg1_sign;
1424         int32_t rexp;
1425         uint64_t rsig0, rsig1;
1426         if (floatx80_is_zero(ST1)) {
1427             /*
1428              * ST0 is negative.  The result is pi with the sign of
1429              * ST1.
1430              */
1431             rexp = pi_exp;
1432             rsig0 = pi_sig_high;
1433             rsig1 = pi_sig_low;
1434         } else if (floatx80_is_infinity(ST1)) {
1435             if (floatx80_is_infinity(ST0)) {
1436                 if (arg0_sign) {
1437                     rexp = pi_34_exp;
1438                     rsig0 = pi_34_sig_high;
1439                     rsig1 = pi_34_sig_low;
1440                 } else {
1441                     rexp = pi_4_exp;
1442                     rsig0 = pi_4_sig_high;
1443                     rsig1 = pi_4_sig_low;
1444                 }
1445             } else {
1446                 rexp = pi_2_exp;
1447                 rsig0 = pi_2_sig_high;
1448                 rsig1 = pi_2_sig_low;
1449             }
1450         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1451             rexp = pi_2_exp;
1452             rsig0 = pi_2_sig_high;
1453             rsig1 = pi_2_sig_low;
1454         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1455             /* ST0 is negative.  */
1456             rexp = pi_exp;
1457             rsig0 = pi_sig_high;
1458             rsig1 = pi_sig_low;
1459         } else {
1460             /*
1461              * ST0 and ST1 are finite, nonzero and with exponents not
1462              * too far apart.
1463              */
1464             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1465             int32_t azexp, axexp;
1466             bool adj_sub, ysign, zsign;
1467             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1468             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1469             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1470             uint64_t azsig0, azsig1;
1471             uint64_t azsig2, azsig3, axsig0, axsig1;
1472             floatx80 x8;
1473             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1474             FloatX80RoundPrec save_prec =
1475                 env->fp_status.floatx80_rounding_precision;
1476             env->fp_status.float_rounding_mode = float_round_nearest_even;
1477             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1478 
1479             if (arg0_exp == 0) {
1480                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1481             }
1482             if (arg1_exp == 0) {
1483                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1484             }
1485             if (arg0_exp > arg1_exp ||
1486                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1487                 /* Work with abs(ST1) / abs(ST0).  */
1488                 num_exp = arg1_exp;
1489                 num_sig = arg1_sig;
1490                 den_exp = arg0_exp;
1491                 den_sig = arg0_sig;
1492                 if (arg0_sign) {
1493                     /* The result is subtracted from pi.  */
1494                     adj_exp = pi_exp;
1495                     adj_sig0 = pi_sig_high;
1496                     adj_sig1 = pi_sig_low;
1497                     adj_sub = true;
1498                 } else {
1499                     /* The result is used as-is.  */
1500                     adj_exp = 0;
1501                     adj_sig0 = 0;
1502                     adj_sig1 = 0;
1503                     adj_sub = false;
1504                 }
1505             } else {
1506                 /* Work with abs(ST0) / abs(ST1).  */
1507                 num_exp = arg0_exp;
1508                 num_sig = arg0_sig;
1509                 den_exp = arg1_exp;
1510                 den_sig = arg1_sig;
1511                 /* The result is added to or subtracted from pi/2.  */
1512                 adj_exp = pi_2_exp;
1513                 adj_sig0 = pi_2_sig_high;
1514                 adj_sig1 = pi_2_sig_low;
1515                 adj_sub = !arg0_sign;
1516             }
1517 
1518             /*
1519              * Compute x = num/den, where 0 < x <= 1 and x is not too
1520              * small.
1521              */
1522             xexp = num_exp - den_exp + 0x3ffe;
1523             remsig0 = num_sig;
1524             remsig1 = 0;
1525             if (den_sig <= remsig0) {
1526                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1527                 ++xexp;
1528             }
1529             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1530             mul64To128(den_sig, xsig0, &msig0, &msig1);
1531             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1532             while ((int64_t) remsig0 < 0) {
1533                 --xsig0;
1534                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1535             }
1536             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1537             /*
1538              * No need to correct any estimation error in xsig1; even
1539              * with such error, it is accurate enough.
1540              */
1541 
1542             /*
1543              * Split x as x = t + y, where t = n/8 is the nearest
1544              * multiple of 1/8 to x.
1545              */
1546             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1547                                                false, xexp + 3, xsig0,
1548                                                xsig1, &env->fp_status);
1549             n = floatx80_to_int32(x8, &env->fp_status);
1550             if (n == 0) {
1551                 ysign = false;
1552                 yexp = xexp;
1553                 ysig0 = xsig0;
1554                 ysig1 = xsig1;
1555                 texp = 0;
1556                 tsig = 0;
1557             } else {
1558                 int shift = clz32(n) + 32;
1559                 texp = 0x403b - shift;
1560                 tsig = n;
1561                 tsig <<= shift;
1562                 if (texp == xexp) {
1563                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1564                     if ((int64_t) ysig0 >= 0) {
1565                         ysign = false;
1566                         if (ysig0 == 0) {
1567                             if (ysig1 == 0) {
1568                                 yexp = 0;
1569                             } else {
1570                                 shift = clz64(ysig1) + 64;
1571                                 yexp = xexp - shift;
1572                                 shift128Left(ysig0, ysig1, shift,
1573                                              &ysig0, &ysig1);
1574                             }
1575                         } else {
1576                             shift = clz64(ysig0);
1577                             yexp = xexp - shift;
1578                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1579                         }
1580                     } else {
1581                         ysign = true;
1582                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1583                         if (ysig0 == 0) {
1584                             shift = clz64(ysig1) + 64;
1585                         } else {
1586                             shift = clz64(ysig0);
1587                         }
1588                         yexp = xexp - shift;
1589                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1590                     }
1591                 } else {
1592                     /*
1593                      * t's exponent must be greater than x's because t
1594                      * is positive and the nearest multiple of 1/8 to
1595                      * x, and if x has a greater exponent, the power
1596                      * of 2 with that exponent is also a multiple of
1597                      * 1/8.
1598                      */
1599                     uint64_t usig0, usig1;
1600                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1601                                          &usig0, &usig1);
1602                     ysign = true;
1603                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1604                     if (ysig0 == 0) {
1605                         shift = clz64(ysig1) + 64;
1606                     } else {
1607                         shift = clz64(ysig0);
1608                     }
1609                     yexp = texp - shift;
1610                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1611                 }
1612             }
1613 
1614             /*
1615              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1616              * arctan(z).
1617              */
1618             zsign = ysign;
1619             if (texp == 0 || yexp == 0) {
1620                 zexp = yexp;
1621                 zsig0 = ysig0;
1622                 zsig1 = ysig1;
1623             } else {
1624                 /*
1625                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1626                  */
1627                 int32_t dexp = texp + xexp - 0x3ffe;
1628                 uint64_t dsig0, dsig1, dsig2;
1629                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1630                 /*
1631                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1632                  * bit).  Add 1 to produce the denominator 1+tx.
1633                  */
1634                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1635                                      &dsig0, &dsig1);
1636                 dsig0 |= 0x8000000000000000ULL;
1637                 zexp = yexp - 1;
1638                 remsig0 = ysig0;
1639                 remsig1 = ysig1;
1640                 remsig2 = 0;
1641                 if (dsig0 <= remsig0) {
1642                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1643                     ++zexp;
1644                 }
1645                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1646                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1647                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1648                        &remsig0, &remsig1, &remsig2);
1649                 while ((int64_t) remsig0 < 0) {
1650                     --zsig0;
1651                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1652                            &remsig0, &remsig1, &remsig2);
1653                 }
1654                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1655                 /* No need to correct any estimation error in zsig1.  */
1656             }
1657 
1658             if (zexp == 0) {
1659                 azexp = 0;
1660                 azsig0 = 0;
1661                 azsig1 = 0;
1662             } else {
1663                 floatx80 z2, accum;
1664                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1665                 /* Compute z^2.  */
1666                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1667                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1668                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1669                                                    zexp + zexp - 0x3ffe,
1670                                                    z2sig0, z2sig1,
1671                                                    &env->fp_status);
1672 
1673                 /* Compute the lower parts of the polynomial expansion.  */
1674                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1675                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1676                 accum = floatx80_mul(accum, z2, &env->fp_status);
1677                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1678                 accum = floatx80_mul(accum, z2, &env->fp_status);
1679                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1680                 accum = floatx80_mul(accum, z2, &env->fp_status);
1681                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1682                 accum = floatx80_mul(accum, z2, &env->fp_status);
1683                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1684                 accum = floatx80_mul(accum, z2, &env->fp_status);
1685 
1686                 /*
1687                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1688                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1689                  */
1690                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1691                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1692                                      aexp - extractFloatx80Exp(accum),
1693                                      &asig0, &asig1);
1694                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1695                        &asig0, &asig1);
1696                 /* Multiply by z to compute arctan(z).  */
1697                 azexp = aexp + zexp - 0x3ffe;
1698                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1699                             &azsig2, &azsig3);
1700             }
1701 
1702             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1703             if (texp == 0) {
1704                 /* z is positive.  */
1705                 axexp = azexp;
1706                 axsig0 = azsig0;
1707                 axsig1 = azsig1;
1708             } else {
1709                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1710                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1711                 uint64_t low_sig0 =
1712                     extractFloatx80Frac(fpatan_table[n].atan_low);
1713                 uint64_t low_sig1 = 0;
1714                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1715                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1716                 axsig1 = 0;
1717                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1718                                      &low_sig0, &low_sig1);
1719                 if (low_sign) {
1720                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1721                            &axsig0, &axsig1);
1722                 } else {
1723                     add128(axsig0, axsig1, low_sig0, low_sig1,
1724                            &axsig0, &axsig1);
1725                 }
1726                 if (azexp >= axexp) {
1727                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1728                                          &axsig0, &axsig1);
1729                     axexp = azexp + 1;
1730                     shift128RightJamming(azsig0, azsig1, 1,
1731                                          &azsig0, &azsig1);
1732                 } else {
1733                     shift128RightJamming(axsig0, axsig1, 1,
1734                                          &axsig0, &axsig1);
1735                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1736                                          &azsig0, &azsig1);
1737                     ++axexp;
1738                 }
1739                 if (zsign) {
1740                     sub128(axsig0, axsig1, azsig0, azsig1,
1741                            &axsig0, &axsig1);
1742                 } else {
1743                     add128(axsig0, axsig1, azsig0, azsig1,
1744                            &axsig0, &axsig1);
1745                 }
1746             }
1747 
1748             if (adj_exp == 0) {
1749                 rexp = axexp;
1750                 rsig0 = axsig0;
1751                 rsig1 = axsig1;
1752             } else {
1753                 /*
1754                  * Add or subtract arctan(x) (exponent axexp,
1755                  * significand axsig0 and axsig1, positive, not
1756                  * necessarily normalized) to the number given by
1757                  * adj_exp, adj_sig0 and adj_sig1, according to
1758                  * adj_sub.
1759                  */
1760                 if (adj_exp >= axexp) {
1761                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1762                                          &axsig0, &axsig1);
1763                     rexp = adj_exp + 1;
1764                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1765                                          &adj_sig0, &adj_sig1);
1766                 } else {
1767                     shift128RightJamming(axsig0, axsig1, 1,
1768                                          &axsig0, &axsig1);
1769                     shift128RightJamming(adj_sig0, adj_sig1,
1770                                          axexp - adj_exp + 1,
1771                                          &adj_sig0, &adj_sig1);
1772                     rexp = axexp + 1;
1773                 }
1774                 if (adj_sub) {
1775                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1776                            &rsig0, &rsig1);
1777                 } else {
1778                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1779                            &rsig0, &rsig1);
1780                 }
1781             }
1782 
1783             env->fp_status.float_rounding_mode = save_mode;
1784             env->fp_status.floatx80_rounding_precision = save_prec;
1785         }
1786         /* This result is inexact.  */
1787         rsig1 |= 1;
1788         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1789                                             rsig0, rsig1, &env->fp_status);
1790     }
1791 
1792     fpop(env);
1793     merge_exception_flags(env, old_flags);
1794 }
1795 
1796 void helper_fxtract(CPUX86State *env)
1797 {
1798     uint8_t old_flags = save_exception_flags(env);
1799     CPU_LDoubleU temp;
1800 
1801     temp.d = ST0;
1802 
1803     if (floatx80_is_zero(ST0)) {
1804         /* Easy way to generate -inf and raising division by 0 exception */
1805         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1806                            &env->fp_status);
1807         fpush(env);
1808         ST0 = temp.d;
1809     } else if (floatx80_invalid_encoding(ST0)) {
1810         float_raise(float_flag_invalid, &env->fp_status);
1811         ST0 = floatx80_default_nan(&env->fp_status);
1812         fpush(env);
1813         ST0 = ST1;
1814     } else if (floatx80_is_any_nan(ST0)) {
1815         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1816             float_raise(float_flag_invalid, &env->fp_status);
1817             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1818         }
1819         fpush(env);
1820         ST0 = ST1;
1821     } else if (floatx80_is_infinity(ST0)) {
1822         fpush(env);
1823         ST0 = ST1;
1824         ST1 = floatx80_infinity;
1825     } else {
1826         int expdif;
1827 
1828         if (EXPD(temp) == 0) {
1829             int shift = clz64(temp.l.lower);
1830             temp.l.lower <<= shift;
1831             expdif = 1 - EXPBIAS - shift;
1832             float_raise(float_flag_input_denormal, &env->fp_status);
1833         } else {
1834             expdif = EXPD(temp) - EXPBIAS;
1835         }
1836         /* DP exponent bias */
1837         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1838         fpush(env);
1839         BIASEXPONENT(temp);
1840         ST0 = temp.d;
1841     }
1842     merge_exception_flags(env, old_flags);
1843 }
1844 
1845 static void helper_fprem_common(CPUX86State *env, bool mod)
1846 {
1847     uint8_t old_flags = save_exception_flags(env);
1848     uint64_t quotient;
1849     CPU_LDoubleU temp0, temp1;
1850     int exp0, exp1, expdiff;
1851 
1852     temp0.d = ST0;
1853     temp1.d = ST1;
1854     exp0 = EXPD(temp0);
1855     exp1 = EXPD(temp1);
1856 
1857     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1858     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1859         exp0 == 0x7fff || exp1 == 0x7fff ||
1860         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1861         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1862     } else {
1863         if (exp0 == 0) {
1864             exp0 = 1 - clz64(temp0.l.lower);
1865         }
1866         if (exp1 == 0) {
1867             exp1 = 1 - clz64(temp1.l.lower);
1868         }
1869         expdiff = exp0 - exp1;
1870         if (expdiff < 64) {
1871             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1872             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1873             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1874             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1875         } else {
1876             /*
1877              * Partial remainder.  This choice of how many bits to
1878              * process at once is specified in AMD instruction set
1879              * manuals, and empirically is followed by Intel
1880              * processors as well; it ensures that the final remainder
1881              * operation in a loop does produce the correct low three
1882              * bits of the quotient.  AMD manuals specify that the
1883              * flags other than C2 are cleared, and empirically Intel
1884              * processors clear them as well.
1885              */
1886             int n = 32 + (expdiff % 32);
1887             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1888             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1889             env->fpus |= 0x400;  /* C2 <-- 1 */
1890         }
1891     }
1892     merge_exception_flags(env, old_flags);
1893 }
1894 
1895 void helper_fprem1(CPUX86State *env)
1896 {
1897     helper_fprem_common(env, false);
1898 }
1899 
1900 void helper_fprem(CPUX86State *env)
1901 {
1902     helper_fprem_common(env, true);
1903 }
1904 
/*
 * 128-bit significand of log2(e), split into its high and low 64-bit
 * halves.  helper_fyl2xp1 passes both halves to mul128By64To192 when it
 * handles arguments with very small exponents.
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * helper_fyl2x_common evaluates these in the square of its reduced
 * argument, working from fyl2x_coeff_9 down to fyl2x_coeff_1.  The
 * leading term is kept as fyl2x_coeff_0 plus the correction
 * fyl2x_coeff_0_low: the correction is added to the accumulated
 * lower-order terms, and that sum is then combined with fyl2x_coeff_0
 * using 128-bit integer arithmetic.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1926 
1927 /*
1928  * Compute an approximation of log2(1+arg), where 1+arg is in the
1929  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1930  * function is called, rounding precision is set to 80 and the
1931  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1932  * and must not be so close to zero that underflow might occur.
1933  */
1934 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1935                                 uint64_t *sig0, uint64_t *sig1)
1936 {
1937     uint64_t arg0_sig = extractFloatx80Frac(arg);
1938     int32_t arg0_exp = extractFloatx80Exp(arg);
1939     bool arg0_sign = extractFloatx80Sign(arg);
1940     bool asign;
1941     int32_t dexp, texp, aexp;
1942     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1943     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1944     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1945     floatx80 t2, accum;
1946 
1947     /*
1948      * Compute an approximation of arg/(2+arg), with extra precision,
1949      * as the argument to a polynomial approximation.  The extra
1950      * precision is only needed for the first term of the
1951      * approximation, with subsequent terms being significantly
1952      * smaller; the approximation only uses odd exponents, and the
1953      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1954      */
1955     if (arg0_sign) {
1956         dexp = 0x3fff;
1957         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1958         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1959     } else {
1960         dexp = 0x4000;
1961         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1962         dsig0 |= 0x8000000000000000ULL;
1963     }
1964     texp = arg0_exp - dexp + 0x3ffe;
1965     rsig0 = arg0_sig;
1966     rsig1 = 0;
1967     rsig2 = 0;
1968     if (dsig0 <= rsig0) {
1969         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1970         ++texp;
1971     }
1972     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1973     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1974     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1975            &rsig0, &rsig1, &rsig2);
1976     while ((int64_t) rsig0 < 0) {
1977         --tsig0;
1978         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1979                &rsig0, &rsig1, &rsig2);
1980     }
1981     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1982     /*
1983      * No need to correct any estimation error in tsig1; even with
1984      * such error, it is accurate enough.  Now compute the square of
1985      * that approximation.
1986      */
1987     mul128To256(tsig0, tsig1, tsig0, tsig1,
1988                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1989     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1990                                        texp + texp - 0x3ffe,
1991                                        t2sig0, t2sig1, &env->fp_status);
1992 
1993     /* Compute the lower parts of the polynomial expansion.  */
1994     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1995     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1996     accum = floatx80_mul(accum, t2, &env->fp_status);
1997     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1998     accum = floatx80_mul(accum, t2, &env->fp_status);
1999     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
2000     accum = floatx80_mul(accum, t2, &env->fp_status);
2001     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
2002     accum = floatx80_mul(accum, t2, &env->fp_status);
2003     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
2004     accum = floatx80_mul(accum, t2, &env->fp_status);
2005     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
2006     accum = floatx80_mul(accum, t2, &env->fp_status);
2007     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
2008     accum = floatx80_mul(accum, t2, &env->fp_status);
2009     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
2010     accum = floatx80_mul(accum, t2, &env->fp_status);
2011     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
2012 
2013     /*
2014      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
2015      * accum has much lower magnitude, and so, in particular, carry
2016      * out of the addition is not possible), multiplied by t.  (This
2017      * expansion is only accurate to about 70 bits, not 128 bits.)
2018      */
2019     aexp = extractFloatx80Exp(fyl2x_coeff_0);
2020     asign = extractFloatx80Sign(fyl2x_coeff_0);
2021     shift128RightJamming(extractFloatx80Frac(accum), 0,
2022                          aexp - extractFloatx80Exp(accum),
2023                          &asig0, &asig1);
2024     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
2025     bsig1 = 0;
2026     if (asign == extractFloatx80Sign(accum)) {
2027         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2028     } else {
2029         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2030     }
2031     /* Multiply by t to compute the required result.  */
2032     mul128To256(asig0, asig1, tsig0, tsig1,
2033                 &asig0, &asig1, &asig2, &asig3);
2034     aexp += texp - 0x3ffe;
2035     *exp = aexp;
2036     *sig0 = asig0;
2037     *sig1 = asig1;
2038 }
2039 
2040 void helper_fyl2xp1(CPUX86State *env)
2041 {
2042     uint8_t old_flags = save_exception_flags(env);
2043     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2044     int32_t arg0_exp = extractFloatx80Exp(ST0);
2045     bool arg0_sign = extractFloatx80Sign(ST0);
2046     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2047     int32_t arg1_exp = extractFloatx80Exp(ST1);
2048     bool arg1_sign = extractFloatx80Sign(ST1);
2049 
2050     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2051         float_raise(float_flag_invalid, &env->fp_status);
2052         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2053     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2054         float_raise(float_flag_invalid, &env->fp_status);
2055         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2056     } else if (floatx80_invalid_encoding(ST0) ||
2057                floatx80_invalid_encoding(ST1)) {
2058         float_raise(float_flag_invalid, &env->fp_status);
2059         ST1 = floatx80_default_nan(&env->fp_status);
2060     } else if (floatx80_is_any_nan(ST0)) {
2061         ST1 = ST0;
2062     } else if (floatx80_is_any_nan(ST1)) {
2063         /* Pass this NaN through.  */
2064     } else if (arg0_exp > 0x3ffd ||
2065                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2066                                                   0x95f619980c4336f7ULL :
2067                                                   0xd413cccfe7799211ULL))) {
2068         /*
2069          * Out of range for the instruction (ST0 must have absolute
2070          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2071          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2072          * to sqrt(2) - 1, which we allow here), treat as invalid.
2073          */
2074         float_raise(float_flag_invalid, &env->fp_status);
2075         ST1 = floatx80_default_nan(&env->fp_status);
2076     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2077                arg1_exp == 0x7fff) {
2078         /*
2079          * One argument is zero, or multiplying by infinity; correct
2080          * result is exact and can be obtained by multiplying the
2081          * arguments.
2082          */
2083         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2084     } else if (arg0_exp < 0x3fb0) {
2085         /*
2086          * Multiplying both arguments and an extra-precision version
2087          * of log2(e) is sufficiently precise.
2088          */
2089         uint64_t sig0, sig1, sig2;
2090         int32_t exp;
2091         if (arg0_exp == 0) {
2092             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2093         }
2094         if (arg1_exp == 0) {
2095             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2096         }
2097         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2098                         &sig0, &sig1, &sig2);
2099         exp = arg0_exp + 1;
2100         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2101         exp += arg1_exp - 0x3ffe;
2102         /* This result is inexact.  */
2103         sig1 |= 1;
2104         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2105                                             arg0_sign ^ arg1_sign, exp,
2106                                             sig0, sig1, &env->fp_status);
2107     } else {
2108         int32_t aexp;
2109         uint64_t asig0, asig1, asig2;
2110         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2111         FloatX80RoundPrec save_prec =
2112             env->fp_status.floatx80_rounding_precision;
2113         env->fp_status.float_rounding_mode = float_round_nearest_even;
2114         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2115 
2116         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2117         /*
2118          * Multiply by the second argument to compute the required
2119          * result.
2120          */
2121         if (arg1_exp == 0) {
2122             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2123         }
2124         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2125         aexp += arg1_exp - 0x3ffe;
2126         /* This result is inexact.  */
2127         asig1 |= 1;
2128         env->fp_status.float_rounding_mode = save_mode;
2129         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2130                                             arg0_sign ^ arg1_sign, aexp,
2131                                             asig0, asig1, &env->fp_status);
2132         env->fp_status.floatx80_rounding_precision = save_prec;
2133     }
2134     fpop(env);
2135     merge_exception_flags(env, old_flags);
2136 }
2137 
2138 void helper_fyl2x(CPUX86State *env)
2139 {
2140     uint8_t old_flags = save_exception_flags(env);
2141     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2142     int32_t arg0_exp = extractFloatx80Exp(ST0);
2143     bool arg0_sign = extractFloatx80Sign(ST0);
2144     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2145     int32_t arg1_exp = extractFloatx80Exp(ST1);
2146     bool arg1_sign = extractFloatx80Sign(ST1);
2147 
2148     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2149         float_raise(float_flag_invalid, &env->fp_status);
2150         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2151     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2152         float_raise(float_flag_invalid, &env->fp_status);
2153         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2154     } else if (floatx80_invalid_encoding(ST0) ||
2155                floatx80_invalid_encoding(ST1)) {
2156         float_raise(float_flag_invalid, &env->fp_status);
2157         ST1 = floatx80_default_nan(&env->fp_status);
2158     } else if (floatx80_is_any_nan(ST0)) {
2159         ST1 = ST0;
2160     } else if (floatx80_is_any_nan(ST1)) {
2161         /* Pass this NaN through.  */
2162     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2163         float_raise(float_flag_invalid, &env->fp_status);
2164         ST1 = floatx80_default_nan(&env->fp_status);
2165     } else if (floatx80_is_infinity(ST1)) {
2166         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2167                                              &env->fp_status);
2168         switch (cmp) {
2169         case float_relation_less:
2170             ST1 = floatx80_chs(ST1);
2171             break;
2172         case float_relation_greater:
2173             /* Result is infinity of the same sign as ST1.  */
2174             break;
2175         default:
2176             float_raise(float_flag_invalid, &env->fp_status);
2177             ST1 = floatx80_default_nan(&env->fp_status);
2178             break;
2179         }
2180     } else if (floatx80_is_infinity(ST0)) {
2181         if (floatx80_is_zero(ST1)) {
2182             float_raise(float_flag_invalid, &env->fp_status);
2183             ST1 = floatx80_default_nan(&env->fp_status);
2184         } else if (arg1_sign) {
2185             ST1 = floatx80_chs(ST0);
2186         } else {
2187             ST1 = ST0;
2188         }
2189     } else if (floatx80_is_zero(ST0)) {
2190         if (floatx80_is_zero(ST1)) {
2191             float_raise(float_flag_invalid, &env->fp_status);
2192             ST1 = floatx80_default_nan(&env->fp_status);
2193         } else {
2194             /* Result is infinity with opposite sign to ST1.  */
2195             float_raise(float_flag_divbyzero, &env->fp_status);
2196             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2197                                 0x8000000000000000ULL);
2198         }
2199     } else if (floatx80_is_zero(ST1)) {
2200         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2201             ST1 = floatx80_chs(ST1);
2202         }
2203         /* Otherwise, ST1 is already the correct result.  */
2204     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2205         if (arg1_sign) {
2206             ST1 = floatx80_chs(floatx80_zero);
2207         } else {
2208             ST1 = floatx80_zero;
2209         }
2210     } else {
2211         int32_t int_exp;
2212         floatx80 arg0_m1;
2213         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2214         FloatX80RoundPrec save_prec =
2215             env->fp_status.floatx80_rounding_precision;
2216         env->fp_status.float_rounding_mode = float_round_nearest_even;
2217         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2218 
2219         if (arg0_exp == 0) {
2220             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2221         }
2222         if (arg1_exp == 0) {
2223             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2224         }
2225         int_exp = arg0_exp - 0x3fff;
2226         if (arg0_sig > 0xb504f333f9de6484ULL) {
2227             ++int_exp;
2228         }
2229         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2230                                                &env->fp_status),
2231                                floatx80_one, &env->fp_status);
2232         if (floatx80_is_zero(arg0_m1)) {
2233             /* Exact power of 2; multiply by ST1.  */
2234             env->fp_status.float_rounding_mode = save_mode;
2235             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2236                                ST1, &env->fp_status);
2237         } else {
2238             bool asign = extractFloatx80Sign(arg0_m1);
2239             int32_t aexp;
2240             uint64_t asig0, asig1, asig2;
2241             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2242             if (int_exp != 0) {
2243                 bool isign = (int_exp < 0);
2244                 int32_t iexp;
2245                 uint64_t isig;
2246                 int shift;
2247                 int_exp = isign ? -int_exp : int_exp;
2248                 shift = clz32(int_exp) + 32;
2249                 isig = int_exp;
2250                 isig <<= shift;
2251                 iexp = 0x403e - shift;
2252                 shift128RightJamming(asig0, asig1, iexp - aexp,
2253                                      &asig0, &asig1);
2254                 if (asign == isign) {
2255                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2256                 } else {
2257                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2258                 }
2259                 aexp = iexp;
2260                 asign = isign;
2261             }
2262             /*
2263              * Multiply by the second argument to compute the required
2264              * result.
2265              */
2266             if (arg1_exp == 0) {
2267                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2268             }
2269             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2270             aexp += arg1_exp - 0x3ffe;
2271             /* This result is inexact.  */
2272             asig1 |= 1;
2273             env->fp_status.float_rounding_mode = save_mode;
2274             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2275                                                 asign ^ arg1_sign, aexp,
2276                                                 asig0, asig1, &env->fp_status);
2277         }
2278 
2279         env->fp_status.floatx80_rounding_precision = save_prec;
2280     }
2281     fpop(env);
2282     merge_exception_flags(env, old_flags);
2283 }
2284 
2285 void helper_fsqrt(CPUX86State *env)
2286 {
2287     uint8_t old_flags = save_exception_flags(env);
2288     if (floatx80_is_neg(ST0)) {
2289         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2290         env->fpus |= 0x400;
2291     }
2292     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2293     merge_exception_flags(env, old_flags);
2294 }
2295 
2296 void helper_fsincos(CPUX86State *env)
2297 {
2298     double fptemp = floatx80_to_double(env, ST0);
2299 
2300     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2301         env->fpus |= 0x400;
2302     } else {
2303         ST0 = double_to_floatx80(env, sin(fptemp));
2304         fpush(env);
2305         ST0 = double_to_floatx80(env, cos(fptemp));
2306         env->fpus &= ~0x400;  /* C2 <-- 0 */
2307         /* the above code is for |arg| < 2**63 only */
2308     }
2309 }
2310 
2311 void helper_frndint(CPUX86State *env)
2312 {
2313     uint8_t old_flags = save_exception_flags(env);
2314     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2315     merge_exception_flags(env, old_flags);
2316 }
2317 
2318 void helper_fscale(CPUX86State *env)
2319 {
2320     uint8_t old_flags = save_exception_flags(env);
2321     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2322         float_raise(float_flag_invalid, &env->fp_status);
2323         ST0 = floatx80_default_nan(&env->fp_status);
2324     } else if (floatx80_is_any_nan(ST1)) {
2325         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2326             float_raise(float_flag_invalid, &env->fp_status);
2327         }
2328         ST0 = ST1;
2329         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2330             float_raise(float_flag_invalid, &env->fp_status);
2331             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2332         }
2333     } else if (floatx80_is_infinity(ST1) &&
2334                !floatx80_invalid_encoding(ST0) &&
2335                !floatx80_is_any_nan(ST0)) {
2336         if (floatx80_is_neg(ST1)) {
2337             if (floatx80_is_infinity(ST0)) {
2338                 float_raise(float_flag_invalid, &env->fp_status);
2339                 ST0 = floatx80_default_nan(&env->fp_status);
2340             } else {
2341                 ST0 = (floatx80_is_neg(ST0) ?
2342                        floatx80_chs(floatx80_zero) :
2343                        floatx80_zero);
2344             }
2345         } else {
2346             if (floatx80_is_zero(ST0)) {
2347                 float_raise(float_flag_invalid, &env->fp_status);
2348                 ST0 = floatx80_default_nan(&env->fp_status);
2349             } else {
2350                 ST0 = (floatx80_is_neg(ST0) ?
2351                        floatx80_chs(floatx80_infinity) :
2352                        floatx80_infinity);
2353             }
2354         }
2355     } else {
2356         int n;
2357         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2358         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2359         set_float_exception_flags(0, &env->fp_status);
2360         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2361         set_float_exception_flags(save_flags, &env->fp_status);
2362         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2363         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2364         env->fp_status.floatx80_rounding_precision = save;
2365     }
2366     merge_exception_flags(env, old_flags);
2367 }
2368 
2369 void helper_fsin(CPUX86State *env)
2370 {
2371     double fptemp = floatx80_to_double(env, ST0);
2372 
2373     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2374         env->fpus |= 0x400;
2375     } else {
2376         ST0 = double_to_floatx80(env, sin(fptemp));
2377         env->fpus &= ~0x400;  /* C2 <-- 0 */
2378         /* the above code is for |arg| < 2**53 only */
2379     }
2380 }
2381 
2382 void helper_fcos(CPUX86State *env)
2383 {
2384     double fptemp = floatx80_to_double(env, ST0);
2385 
2386     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2387         env->fpus |= 0x400;
2388     } else {
2389         ST0 = double_to_floatx80(env, cos(fptemp));
2390         env->fpus &= ~0x400;  /* C2 <-- 0 */
2391         /* the above code is for |arg| < 2**63 only */
2392     }
2393 }
2394 
2395 void helper_fxam_ST0(CPUX86State *env)
2396 {
2397     CPU_LDoubleU temp;
2398     int expdif;
2399 
2400     temp.d = ST0;
2401 
2402     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2403     if (SIGND(temp)) {
2404         env->fpus |= 0x200; /* C1 <-- 1 */
2405     }
2406 
2407     if (env->fptags[env->fpstt]) {
2408         env->fpus |= 0x4100; /* Empty */
2409         return;
2410     }
2411 
2412     expdif = EXPD(temp);
2413     if (expdif == MAXEXPD) {
2414         if (MANTD(temp) == 0x8000000000000000ULL) {
2415             env->fpus |= 0x500; /* Infinity */
2416         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2417             env->fpus |= 0x100; /* NaN */
2418         }
2419     } else if (expdif == 0) {
2420         if (MANTD(temp) == 0) {
2421             env->fpus |=  0x4000; /* Zero */
2422         } else {
2423             env->fpus |= 0x4400; /* Denormal */
2424         }
2425     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2426         env->fpus |= 0x400;
2427     }
2428 }
2429 
2430 static void do_fstenv(X86Access *ac, target_ulong ptr, int data32)
2431 {
2432     CPUX86State *env = ac->env;
2433     int fpus, fptag, exp, i;
2434     uint64_t mant;
2435     CPU_LDoubleU tmp;
2436 
2437     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2438     fptag = 0;
2439     for (i = 7; i >= 0; i--) {
2440         fptag <<= 2;
2441         if (env->fptags[i]) {
2442             fptag |= 3;
2443         } else {
2444             tmp.d = env->fpregs[i].d;
2445             exp = EXPD(tmp);
2446             mant = MANTD(tmp);
2447             if (exp == 0 && mant == 0) {
2448                 /* zero */
2449                 fptag |= 1;
2450             } else if (exp == 0 || exp == MAXEXPD
2451                        || (mant & (1LL << 63)) == 0) {
2452                 /* NaNs, infinity, denormal */
2453                 fptag |= 2;
2454             }
2455         }
2456     }
2457     if (data32) {
2458         /* 32 bit */
2459         access_stl(ac, ptr, env->fpuc);
2460         access_stl(ac, ptr + 4, fpus);
2461         access_stl(ac, ptr + 8, fptag);
2462         access_stl(ac, ptr + 12, env->fpip); /* fpip */
2463         access_stl(ac, ptr + 16, env->fpcs); /* fpcs */
2464         access_stl(ac, ptr + 20, env->fpdp); /* fpoo */
2465         access_stl(ac, ptr + 24, env->fpds); /* fpos */
2466     } else {
2467         /* 16 bit */
2468         access_stw(ac, ptr, env->fpuc);
2469         access_stw(ac, ptr + 2, fpus);
2470         access_stw(ac, ptr + 4, fptag);
2471         access_stw(ac, ptr + 6, env->fpip);
2472         access_stw(ac, ptr + 8, env->fpcs);
2473         access_stw(ac, ptr + 10, env->fpdp);
2474         access_stw(ac, ptr + 12, env->fpds);
2475     }
2476 }
2477 
2478 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2479 {
2480     X86Access ac;
2481 
2482     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2483     do_fstenv(&ac, ptr, data32);
2484 }
2485 
2486 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2487 {
2488     env->fpstt = (fpus >> 11) & 7;
2489     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2490     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2491 #if !defined(CONFIG_USER_ONLY)
2492     if (!(env->fpus & FPUS_SE)) {
2493         /*
2494          * Here the processor deasserts FERR#; in response, the chipset deasserts
2495          * IGNNE#.
2496          */
2497         cpu_clear_ignne();
2498     }
2499 #endif
2500 }
2501 
2502 static void do_fldenv(X86Access *ac, target_ulong ptr, int data32)
2503 {
2504     int i, fpus, fptag;
2505     CPUX86State *env = ac->env;
2506 
2507     cpu_set_fpuc(env, access_ldw(ac, ptr));
2508     fpus = access_ldw(ac, ptr + (2 << data32));
2509     fptag = access_ldw(ac, ptr + (4 << data32));
2510 
2511     cpu_set_fpus(env, fpus);
2512     for (i = 0; i < 8; i++) {
2513         env->fptags[i] = ((fptag & 3) == 3);
2514         fptag >>= 2;
2515     }
2516 }
2517 
2518 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2519 {
2520     X86Access ac;
2521 
2522     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2523     do_fldenv(&ac, ptr, data32);
2524 }
2525 
2526 static void do_fsave(X86Access *ac, target_ulong ptr, int data32)
2527 {
2528     CPUX86State *env = ac->env;
2529 
2530     do_fstenv(ac, ptr, data32);
2531     ptr += 14 << data32;
2532 
2533     for (int i = 0; i < 8; i++) {
2534         floatx80 tmp = ST(i);
2535         do_fstt(ac, ptr, tmp);
2536         ptr += 10;
2537     }
2538 
2539     do_fninit(env);
2540 }
2541 
2542 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2543 {
2544     int size = (14 << data32) + 80;
2545     X86Access ac;
2546 
2547     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, GETPC());
2548     do_fsave(&ac, ptr, data32);
2549 }
2550 
2551 static void do_frstor(X86Access *ac, target_ulong ptr, int data32)
2552 {
2553     CPUX86State *env = ac->env;
2554 
2555     do_fldenv(ac, ptr, data32);
2556     ptr += 14 << data32;
2557 
2558     for (int i = 0; i < 8; i++) {
2559         floatx80 tmp = do_fldt(ac, ptr);
2560         ST(i) = tmp;
2561         ptr += 10;
2562     }
2563 }
2564 
2565 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2566 {
2567     int size = (14 << data32) + 80;
2568     X86Access ac;
2569 
2570     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, GETPC());
2571     do_frstor(&ac, ptr, data32);
2572 }
2573 
2574 #define XO(X)  offsetof(X86XSaveArea, X)
2575 
2576 static void do_xsave_fpu(X86Access *ac, target_ulong ptr)
2577 {
2578     CPUX86State *env = ac->env;
2579     int fpus, fptag, i;
2580     target_ulong addr;
2581 
2582     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2583     fptag = 0;
2584     for (i = 0; i < 8; i++) {
2585         fptag |= (env->fptags[i] << i);
2586     }
2587 
2588     access_stw(ac, ptr + XO(legacy.fcw), env->fpuc);
2589     access_stw(ac, ptr + XO(legacy.fsw), fpus);
2590     access_stw(ac, ptr + XO(legacy.ftw), fptag ^ 0xff);
2591 
2592     /* In 32-bit mode this is eip, sel, dp, sel.
2593        In 64-bit mode this is rip, rdp.
2594        But in either case we don't write actual data, just zeros.  */
2595     access_stq(ac, ptr + XO(legacy.fpip), 0); /* eip+sel; rip */
2596     access_stq(ac, ptr + XO(legacy.fpdp), 0); /* edp+sel; rdp */
2597 
2598     addr = ptr + XO(legacy.fpregs);
2599 
2600     for (i = 0; i < 8; i++) {
2601         floatx80 tmp = ST(i);
2602         do_fstt(ac, addr, tmp);
2603         addr += 16;
2604     }
2605 }
2606 
2607 static void do_xsave_mxcsr(X86Access *ac, target_ulong ptr)
2608 {
2609     CPUX86State *env = ac->env;
2610 
2611     update_mxcsr_from_sse_status(env);
2612     access_stl(ac, ptr + XO(legacy.mxcsr), env->mxcsr);
2613     access_stl(ac, ptr + XO(legacy.mxcsr_mask), 0x0000ffff);
2614 }
2615 
2616 static void do_xsave_sse(X86Access *ac, target_ulong ptr)
2617 {
2618     CPUX86State *env = ac->env;
2619     int i, nb_xmm_regs;
2620     target_ulong addr;
2621 
2622     if (env->hflags & HF_CS64_MASK) {
2623         nb_xmm_regs = 16;
2624     } else {
2625         nb_xmm_regs = 8;
2626     }
2627 
2628     addr = ptr + XO(legacy.xmm_regs);
2629     for (i = 0; i < nb_xmm_regs; i++) {
2630         access_stq(ac, addr, env->xmm_regs[i].ZMM_Q(0));
2631         access_stq(ac, addr + 8, env->xmm_regs[i].ZMM_Q(1));
2632         addr += 16;
2633     }
2634 }
2635 
2636 static void do_xsave_ymmh(X86Access *ac, target_ulong ptr)
2637 {
2638     CPUX86State *env = ac->env;
2639     int i, nb_xmm_regs;
2640 
2641     if (env->hflags & HF_CS64_MASK) {
2642         nb_xmm_regs = 16;
2643     } else {
2644         nb_xmm_regs = 8;
2645     }
2646 
2647     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2648         access_stq(ac, ptr, env->xmm_regs[i].ZMM_Q(2));
2649         access_stq(ac, ptr + 8, env->xmm_regs[i].ZMM_Q(3));
2650     }
2651 }
2652 
2653 static void do_xsave_bndregs(X86Access *ac, target_ulong ptr)
2654 {
2655     CPUX86State *env = ac->env;
2656     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2657     int i;
2658 
2659     for (i = 0; i < 4; i++, addr += 16) {
2660         access_stq(ac, addr, env->bnd_regs[i].lb);
2661         access_stq(ac, addr + 8, env->bnd_regs[i].ub);
2662     }
2663 }
2664 
2665 static void do_xsave_bndcsr(X86Access *ac, target_ulong ptr)
2666 {
2667     CPUX86State *env = ac->env;
2668 
2669     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2670                env->bndcs_regs.cfgu);
2671     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2672                env->bndcs_regs.sts);
2673 }
2674 
2675 static void do_xsave_pkru(X86Access *ac, target_ulong ptr)
2676 {
2677     access_stq(ac, ptr, ac->env->pkru);
2678 }
2679 
2680 static void do_fxsave(X86Access *ac, target_ulong ptr)
2681 {
2682     CPUX86State *env = ac->env;
2683 
2684     do_xsave_fpu(ac, ptr);
2685     if (env->cr[4] & CR4_OSFXSR_MASK) {
2686         do_xsave_mxcsr(ac, ptr);
2687         /* Fast FXSAVE leaves out the XMM registers */
2688         if (!(env->efer & MSR_EFER_FFXSR)
2689             || (env->hflags & HF_CPL_MASK)
2690             || !(env->hflags & HF_LMA_MASK)) {
2691             do_xsave_sse(ac, ptr);
2692         }
2693     }
2694 }
2695 
2696 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2697 {
2698     uintptr_t ra = GETPC();
2699     X86Access ac;
2700 
2701     /* The operand must be 16 byte aligned */
2702     if (ptr & 0xf) {
2703         raise_exception_ra(env, EXCP0D_GPF, ra);
2704     }
2705 
2706     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2707                    MMU_DATA_STORE, ra);
2708     do_fxsave(&ac, ptr);
2709 }
2710 
2711 static uint64_t get_xinuse(CPUX86State *env)
2712 {
2713     uint64_t inuse = -1;
2714 
2715     /* For the most part, we don't track XINUSE.  We could calculate it
2716        here for all components, but it's probably less work to simply
2717        indicate in use.  That said, the state of BNDREGS is important
2718        enough to track in HFLAGS, so we might as well use that here.  */
2719     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2720        inuse &= ~XSTATE_BNDREGS_MASK;
2721     }
2722     return inuse;
2723 }
2724 
2725 static void do_xsave_access(X86Access *ac, target_ulong ptr, uint64_t rfbm,
2726                             uint64_t inuse, uint64_t opt)
2727 {
2728     uint64_t old_bv, new_bv;
2729 
2730     if (opt & XSTATE_FP_MASK) {
2731         do_xsave_fpu(ac, ptr);
2732     }
2733     if (rfbm & XSTATE_SSE_MASK) {
2734         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2735         do_xsave_mxcsr(ac, ptr);
2736     }
2737     if (opt & XSTATE_SSE_MASK) {
2738         do_xsave_sse(ac, ptr);
2739     }
2740     if (opt & XSTATE_YMM_MASK) {
2741         do_xsave_ymmh(ac, ptr + XO(avx_state));
2742     }
2743     if (opt & XSTATE_BNDREGS_MASK) {
2744         do_xsave_bndregs(ac, ptr + XO(bndreg_state));
2745     }
2746     if (opt & XSTATE_BNDCSR_MASK) {
2747         do_xsave_bndcsr(ac, ptr + XO(bndcsr_state));
2748     }
2749     if (opt & XSTATE_PKRU_MASK) {
2750         do_xsave_pkru(ac, ptr + XO(pkru_state));
2751     }
2752 
2753     /* Update the XSTATE_BV field.  */
2754     old_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2755     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2756     access_stq(ac, ptr + XO(header.xstate_bv), new_bv);
2757 }
2758 
2759 static void do_xsave_chk(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2760 {
2761     /* The OS must have enabled XSAVE.  */
2762     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2763         raise_exception_ra(env, EXCP06_ILLOP, ra);
2764     }
2765 
2766     /* The operand must be 64 byte aligned.  */
2767     if (ptr & 63) {
2768         raise_exception_ra(env, EXCP0D_GPF, ra);
2769     }
2770 }
2771 
2772 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2773                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2774 {
2775     X86Access ac;
2776     unsigned size;
2777 
2778     do_xsave_chk(env, ptr, ra);
2779 
2780     /* Never save anything not enabled by XCR0.  */
2781     rfbm &= env->xcr0;
2782     opt &= rfbm;
2783     size = xsave_area_size(opt, false);
2784 
2785     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, ra);
2786     do_xsave_access(&ac, ptr, rfbm, inuse, opt);
2787 }
2788 
2789 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2790 {
2791     do_xsave(env, ptr, rfbm, get_xinuse(env), rfbm, GETPC());
2792 }
2793 
2794 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2795 {
2796     uint64_t inuse = get_xinuse(env);
2797     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2798 }
2799 
2800 static void do_xrstor_fpu(X86Access *ac, target_ulong ptr)
2801 {
2802     CPUX86State *env = ac->env;
2803     int i, fpuc, fpus, fptag;
2804     target_ulong addr;
2805 
2806     fpuc = access_ldw(ac, ptr + XO(legacy.fcw));
2807     fpus = access_ldw(ac, ptr + XO(legacy.fsw));
2808     fptag = access_ldw(ac, ptr + XO(legacy.ftw));
2809     cpu_set_fpuc(env, fpuc);
2810     cpu_set_fpus(env, fpus);
2811 
2812     fptag ^= 0xff;
2813     for (i = 0; i < 8; i++) {
2814         env->fptags[i] = ((fptag >> i) & 1);
2815     }
2816 
2817     addr = ptr + XO(legacy.fpregs);
2818 
2819     for (i = 0; i < 8; i++) {
2820         floatx80 tmp = do_fldt(ac, addr);
2821         ST(i) = tmp;
2822         addr += 16;
2823     }
2824 }
2825 
2826 static void do_xrstor_mxcsr(X86Access *ac, target_ulong ptr)
2827 {
2828     CPUX86State *env = ac->env;
2829     cpu_set_mxcsr(env, access_ldl(ac, ptr + XO(legacy.mxcsr)));
2830 }
2831 
2832 static void do_xrstor_sse(X86Access *ac, target_ulong ptr)
2833 {
2834     CPUX86State *env = ac->env;
2835     int i, nb_xmm_regs;
2836     target_ulong addr;
2837 
2838     if (env->hflags & HF_CS64_MASK) {
2839         nb_xmm_regs = 16;
2840     } else {
2841         nb_xmm_regs = 8;
2842     }
2843 
2844     addr = ptr + XO(legacy.xmm_regs);
2845     for (i = 0; i < nb_xmm_regs; i++) {
2846         env->xmm_regs[i].ZMM_Q(0) = access_ldq(ac, addr);
2847         env->xmm_regs[i].ZMM_Q(1) = access_ldq(ac, addr + 8);
2848         addr += 16;
2849     }
2850 }
2851 
2852 static void do_clear_sse(CPUX86State *env)
2853 {
2854     int i, nb_xmm_regs;
2855 
2856     if (env->hflags & HF_CS64_MASK) {
2857         nb_xmm_regs = 16;
2858     } else {
2859         nb_xmm_regs = 8;
2860     }
2861 
2862     for (i = 0; i < nb_xmm_regs; i++) {
2863         env->xmm_regs[i].ZMM_Q(0) = 0;
2864         env->xmm_regs[i].ZMM_Q(1) = 0;
2865     }
2866 }
2867 
2868 static void do_xrstor_ymmh(X86Access *ac, target_ulong ptr)
2869 {
2870     CPUX86State *env = ac->env;
2871     int i, nb_xmm_regs;
2872 
2873     if (env->hflags & HF_CS64_MASK) {
2874         nb_xmm_regs = 16;
2875     } else {
2876         nb_xmm_regs = 8;
2877     }
2878 
2879     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2880         env->xmm_regs[i].ZMM_Q(2) = access_ldq(ac, ptr);
2881         env->xmm_regs[i].ZMM_Q(3) = access_ldq(ac, ptr + 8);
2882     }
2883 }
2884 
2885 static void do_clear_ymmh(CPUX86State *env)
2886 {
2887     int i, nb_xmm_regs;
2888 
2889     if (env->hflags & HF_CS64_MASK) {
2890         nb_xmm_regs = 16;
2891     } else {
2892         nb_xmm_regs = 8;
2893     }
2894 
2895     for (i = 0; i < nb_xmm_regs; i++) {
2896         env->xmm_regs[i].ZMM_Q(2) = 0;
2897         env->xmm_regs[i].ZMM_Q(3) = 0;
2898     }
2899 }
2900 
2901 static void do_xrstor_bndregs(X86Access *ac, target_ulong ptr)
2902 {
2903     CPUX86State *env = ac->env;
2904     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2905     int i;
2906 
2907     for (i = 0; i < 4; i++, addr += 16) {
2908         env->bnd_regs[i].lb = access_ldq(ac, addr);
2909         env->bnd_regs[i].ub = access_ldq(ac, addr + 8);
2910     }
2911 }
2912 
2913 static void do_xrstor_bndcsr(X86Access *ac, target_ulong ptr)
2914 {
2915     CPUX86State *env = ac->env;
2916 
2917     /* FIXME: Extend highest implemented bit of linear address.  */
2918     env->bndcs_regs.cfgu
2919         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu));
2920     env->bndcs_regs.sts
2921         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts));
2922 }
2923 
2924 static void do_xrstor_pkru(X86Access *ac, target_ulong ptr)
2925 {
2926     ac->env->pkru = access_ldq(ac, ptr);
2927 }
2928 
2929 static void do_fxrstor(X86Access *ac, target_ulong ptr)
2930 {
2931     CPUX86State *env = ac->env;
2932 
2933     do_xrstor_fpu(ac, ptr);
2934     if (env->cr[4] & CR4_OSFXSR_MASK) {
2935         do_xrstor_mxcsr(ac, ptr);
2936         /* Fast FXRSTOR leaves out the XMM registers */
2937         if (!(env->efer & MSR_EFER_FFXSR)
2938             || (env->hflags & HF_CPL_MASK)
2939             || !(env->hflags & HF_LMA_MASK)) {
2940             do_xrstor_sse(ac, ptr);
2941         }
2942     }
2943 }
2944 
2945 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2946 {
2947     uintptr_t ra = GETPC();
2948     X86Access ac;
2949 
2950     /* The operand must be 16 byte aligned */
2951     if (ptr & 0xf) {
2952         raise_exception_ra(env, EXCP0D_GPF, ra);
2953     }
2954 
2955     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2956                    MMU_DATA_LOAD, ra);
2957     do_fxrstor(&ac, ptr);
2958 }
2959 
2960 static bool valid_xrstor_header(X86Access *ac, uint64_t *pxsbv,
2961                                 target_ulong ptr)
2962 {
2963     uint64_t xstate_bv, xcomp_bv, reserve0;
2964 
2965     xstate_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2966     xcomp_bv = access_ldq(ac, ptr + XO(header.xcomp_bv));
2967     reserve0 = access_ldq(ac, ptr + XO(header.reserve0));
2968     *pxsbv = xstate_bv;
2969 
2970     /*
2971      * XCOMP_BV bit 63 indicates compact form, which we do not support,
2972      * and thus must raise #GP.  That leaves us in standard form.
2973      * In standard form, bytes 23:8 must be zero -- which is both
2974      * XCOMP_BV and the following 64-bit field.
2975      */
2976     if (xcomp_bv || reserve0) {
2977         return false;
2978     }
2979 
2980     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2981     return (xstate_bv & ~ac->env->xcr0) == 0;
2982 }
2983 
2984 static void do_xrstor(X86Access *ac, target_ulong ptr,
2985                       uint64_t rfbm, uint64_t xstate_bv)
2986 {
2987     CPUX86State *env = ac->env;
2988 
2989     if (rfbm & XSTATE_FP_MASK) {
2990         if (xstate_bv & XSTATE_FP_MASK) {
2991             do_xrstor_fpu(ac, ptr);
2992         } else {
2993             do_fninit(env);
2994             memset(env->fpregs, 0, sizeof(env->fpregs));
2995         }
2996     }
2997     if (rfbm & XSTATE_SSE_MASK) {
2998         /* Note that the standard form of XRSTOR loads MXCSR from memory
2999            whether or not the XSTATE_BV bit is set.  */
3000         do_xrstor_mxcsr(ac, ptr);
3001         if (xstate_bv & XSTATE_SSE_MASK) {
3002             do_xrstor_sse(ac, ptr);
3003         } else {
3004             do_clear_sse(env);
3005         }
3006     }
3007     if (rfbm & XSTATE_YMM_MASK) {
3008         if (xstate_bv & XSTATE_YMM_MASK) {
3009             do_xrstor_ymmh(ac, ptr + XO(avx_state));
3010         } else {
3011             do_clear_ymmh(env);
3012         }
3013     }
3014     if (rfbm & XSTATE_BNDREGS_MASK) {
3015         if (xstate_bv & XSTATE_BNDREGS_MASK) {
3016             do_xrstor_bndregs(ac, ptr + XO(bndreg_state));
3017             env->hflags |= HF_MPX_IU_MASK;
3018         } else {
3019             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
3020             env->hflags &= ~HF_MPX_IU_MASK;
3021         }
3022     }
3023     if (rfbm & XSTATE_BNDCSR_MASK) {
3024         if (xstate_bv & XSTATE_BNDCSR_MASK) {
3025             do_xrstor_bndcsr(ac, ptr + XO(bndcsr_state));
3026         } else {
3027             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
3028         }
3029         cpu_sync_bndcs_hflags(env);
3030     }
3031     if (rfbm & XSTATE_PKRU_MASK) {
3032         uint64_t old_pkru = env->pkru;
3033         if (xstate_bv & XSTATE_PKRU_MASK) {
3034             do_xrstor_pkru(ac, ptr + XO(pkru_state));
3035         } else {
3036             env->pkru = 0;
3037         }
3038         if (env->pkru != old_pkru) {
3039             CPUState *cs = env_cpu(env);
3040             tlb_flush(cs);
3041         }
3042     }
3043 }
3044 
3045 #undef XO
3046 
3047 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
3048 {
3049     uintptr_t ra = GETPC();
3050     X86Access ac;
3051     uint64_t xstate_bv;
3052     unsigned size, size_ext;
3053 
3054     do_xsave_chk(env, ptr, ra);
3055 
3056     /* Begin with just the minimum size to validate the header. */
3057     size = sizeof(X86LegacyXSaveArea) + sizeof(X86XSaveHeader);
3058     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, ra);
3059     if (!valid_xrstor_header(&ac, &xstate_bv, ptr)) {
3060         raise_exception_ra(env, EXCP0D_GPF, ra);
3061     }
3062 
3063     rfbm &= env->xcr0;
3064     size_ext = xsave_area_size(rfbm & xstate_bv, false);
3065     if (size < size_ext) {
3066         /* TODO: See if existing page probe has covered extra size. */
3067         access_prepare(&ac, env, ptr, size_ext, MMU_DATA_LOAD, ra);
3068     }
3069 
3070     do_xrstor(&ac, ptr, rfbm, xstate_bv);
3071 }
3072 
3073 #if defined(CONFIG_USER_ONLY)
3074 void cpu_x86_fsave(CPUX86State *env, void *host, size_t len)
3075 {
3076     X86Access ac = {
3077         .haddr1 = host,
3078         .size = 4 * 7 + 8 * 10,
3079         .env = env,
3080     };
3081 
3082     assert(ac.size <= len);
3083     do_fsave(&ac, 0, true);
3084 }
3085 
3086 void cpu_x86_frstor(CPUX86State *env, void *host, size_t len)
3087 {
3088     X86Access ac = {
3089         .haddr1 = host,
3090         .size = 4 * 7 + 8 * 10,
3091         .env = env,
3092     };
3093 
3094     assert(ac.size <= len);
3095     do_frstor(&ac, 0, true);
3096 }
3097 
3098 void cpu_x86_fxsave(CPUX86State *env, void *host, size_t len)
3099 {
3100     X86Access ac = {
3101         .haddr1 = host,
3102         .size = sizeof(X86LegacyXSaveArea),
3103         .env = env,
3104     };
3105 
3106     assert(ac.size <= len);
3107     do_fxsave(&ac, 0);
3108 }
3109 
3110 void cpu_x86_fxrstor(CPUX86State *env, void *host, size_t len)
3111 {
3112     X86Access ac = {
3113         .haddr1 = host,
3114         .size = sizeof(X86LegacyXSaveArea),
3115         .env = env,
3116     };
3117 
3118     assert(ac.size <= len);
3119     do_fxrstor(&ac, 0);
3120 }
3121 
3122 void cpu_x86_xsave(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3123 {
3124     X86Access ac = {
3125         .haddr1 = host,
3126         .env = env,
3127     };
3128 
3129     /*
3130      * Since this is only called from user-level signal handling,
3131      * we should have done the job correctly there.
3132      */
3133     assert((rfbm & ~env->xcr0) == 0);
3134     ac.size = xsave_area_size(rfbm, false);
3135     assert(ac.size <= len);
3136     do_xsave_access(&ac, 0, rfbm, get_xinuse(env), rfbm);
3137 }
3138 
3139 bool cpu_x86_xrstor(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3140 {
3141     X86Access ac = {
3142         .haddr1 = host,
3143         .env = env,
3144     };
3145     uint64_t xstate_bv;
3146 
3147     /*
3148      * Since this is only called from user-level signal handling,
3149      * we should have done the job correctly there.
3150      */
3151     assert((rfbm & ~env->xcr0) == 0);
3152     ac.size = xsave_area_size(rfbm, false);
3153     assert(ac.size <= len);
3154 
3155     if (!valid_xrstor_header(&ac, &xstate_bv, 0)) {
3156         return false;
3157     }
3158     do_xrstor(&ac, 0, rfbm, xstate_bv);
3159     return true;
3160 }
3161 #endif
3162 
3163 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
3164 {
3165     /* The OS must have enabled XSAVE.  */
3166     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3167         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3168     }
3169 
3170     switch (ecx) {
3171     case 0:
3172         return env->xcr0;
3173     case 1:
3174         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
3175             return env->xcr0 & get_xinuse(env);
3176         }
3177         break;
3178     }
3179     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3180 }
3181 
3182 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3183 {
3184     uint32_t dummy, ena_lo, ena_hi;
3185     uint64_t ena;
3186 
3187     /* The OS must have enabled XSAVE.  */
3188     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3189         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3190     }
3191 
3192     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3193     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3194         goto do_gpf;
3195     }
3196 
3197     /* SSE can be disabled, but only if AVX is disabled too.  */
3198     if ((mask & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) == XSTATE_YMM_MASK) {
3199         goto do_gpf;
3200     }
3201 
3202     /* Disallow enabling unimplemented features.  */
3203     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3204     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3205     if (mask & ~ena) {
3206         goto do_gpf;
3207     }
3208 
3209     /* Disallow enabling only half of MPX.  */
3210     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3211         & XSTATE_BNDCSR_MASK) {
3212         goto do_gpf;
3213     }
3214 
3215     env->xcr0 = mask;
3216     cpu_sync_bndcs_hflags(env);
3217     cpu_sync_avx_hflag(env);
3218     return;
3219 
3220  do_gpf:
3221     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3222 }
3223 
3224 /* MMX/SSE */
3225 /* XXX: optimize by storing fptt and fptags in the static cpu state */
3226 
3227 #define SSE_DAZ             0x0040
3228 #define SSE_RC_SHIFT        13
3229 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3230 #define SSE_FZ              0x8000
3231 
3232 void update_mxcsr_status(CPUX86State *env)
3233 {
3234     uint32_t mxcsr = env->mxcsr;
3235     int rnd_type;
3236 
3237     /* set rounding mode */
3238     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3239     set_x86_rounding_mode(rnd_type, &env->sse_status);
3240 
3241     /* Set exception flags.  */
3242     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3243                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3244                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3245                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3246                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3247                               &env->sse_status);
3248 
3249     /* set denormals are zero */
3250     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3251 
3252     /* set flush to zero */
3253     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3254 }
3255 
3256 void update_mxcsr_from_sse_status(CPUX86State *env)
3257 {
3258     uint8_t flags = get_float_exception_flags(&env->sse_status);
3259     /*
3260      * The MXCSR denormal flag has opposite semantics to
3261      * float_flag_input_denormal (the softfloat code sets that flag
3262      * only when flushing input denormals to zero, but SSE sets it
3263      * only when not flushing them to zero), so is not converted
3264      * here.
3265      */
3266     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3267                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3268                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3269                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3270                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3271                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3272                     0));
3273 }
3274 
3275 void helper_update_mxcsr(CPUX86State *env)
3276 {
3277     update_mxcsr_from_sse_status(env);
3278 }
3279 
3280 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3281 {
3282     cpu_set_mxcsr(env, val);
3283 }
3284 
3285 void helper_enter_mmx(CPUX86State *env)
3286 {
3287     env->fpstt = 0;
3288     *(uint32_t *)(env->fptags) = 0;
3289     *(uint32_t *)(env->fptags + 4) = 0;
3290 }
3291 
3292 void helper_emms(CPUX86State *env)
3293 {
3294     /* set to empty state */
3295     *(uint32_t *)(env->fptags) = 0x01010101;
3296     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3297 }
3298 
3299 #define SHIFT 0
3300 #include "ops_sse.h"
3301 
3302 #define SHIFT 1
3303 #include "ops_sse.h"
3304 
3305 #define SHIFT 2
3306 #include "ops_sse.h"
3307