xref: /qemu/target/i386/tcg/fpu_helper.c (revision 476d6e4c9c4965734d6f47ee299ac9f84440a9b3)
1 /*
2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include <math.h>
22 #include "cpu.h"
23 #include "tcg-cpu.h"
24 #include "exec/exec-all.h"
25 #include "exec/cpu_ldst.h"
26 #include "exec/helper-proto.h"
27 #include "fpu/softfloat.h"
28 #include "fpu/softfloat-macros.h"
29 #include "helper-tcg.h"
30 #include "access.h"
31 
32 /* float macros */
33 #define FT0    (env->ft0)
34 #define ST0    (env->fpregs[env->fpstt].d)
35 #define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
36 #define ST1    ST(1)
37 
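/*
 * Rounding control field of the FPU control word (bits 11:10):
 * 00 = nearest even, 01 = down, 10 = up, 11 = truncate toward zero.
 */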
38 #define FPU_RC_SHIFT        10
39 #define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
40 #define FPU_RC_NEAR         0x000
41 #define FPU_RC_DOWN         0x400
42 #define FPU_RC_UP           0x800
43 #define FPU_RC_CHOP         0xc00
44 
45 #define MAXTAN 9223372036854775808.0
46 
47 /* the following deal with x86 long double-precision numbers */
48 #define MAXEXPD 0x7fff
49 #define EXPBIAS 16383
50 #define EXPD(fp)        (fp.l.upper & 0x7fff)
51 #define SIGND(fp)       ((fp.l.upper) & 0x8000)
52 #define MANTD(fp)       (fp.l.lower)
53 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
54 
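/*
 * Status word bits: IE invalid operation, DE denormal operand, ZE divide
 * by zero, OE overflow, UE underflow, PE precision (inexact), SF stack
 * fault, SE error summary, B busy.
 */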
55 #define FPUS_IE (1 << 0)
56 #define FPUS_DE (1 << 1)
57 #define FPUS_ZE (1 << 2)
58 #define FPUS_OE (1 << 3)
59 #define FPUS_UE (1 << 4)
60 #define FPUS_PE (1 << 5)
61 #define FPUS_SF (1 << 6)
62 #define FPUS_SE (1 << 7)
63 #define FPUS_B  (1 << 15)
64 
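/* The six exception mask bits in the control word.  */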
65 #define FPUC_EM 0x3f
66 
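/*
 * Load-constant values and their variants rounded down (_d) or up (_u)
 * in the last significand bit; the FLDxx helpers below pick between them
 * according to the rounding control field.
 */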
67 #define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
68 #define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
69 #define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
70 #define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
71 #define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
72 #define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
73 #define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
74 #define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
75 
76 static inline void fpush(CPUX86State *env)
77 {
78     env->fpstt = (env->fpstt - 1) & 7;
79     env->fptags[env->fpstt] = 0; /* validate stack entry */
80 }
81 
82 static inline void fpop(CPUX86State *env)
83 {
84     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
85     env->fpstt = (env->fpstt + 1) & 7;
86 }
87 
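/*
 * Load/store the ten byte extended-precision format: the 64-bit
 * significand at ptr, the 16-bit sign and exponent word at ptr + 8.
 */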
88 static floatx80 do_fldt(X86Access *ac, target_ulong ptr)
89 {
90     CPU_LDoubleU temp;
91 
92     temp.l.lower = access_ldq(ac, ptr);
93     temp.l.upper = access_ldw(ac, ptr + 8);
94     return temp.d;
95 }
96 
97 static void do_fstt(X86Access *ac, target_ulong ptr, floatx80 f)
98 {
99     CPU_LDoubleU temp;
100 
101     temp.d = f;
102     access_stq(ac, ptr, temp.l.lower);
103     access_stw(ac, ptr + 8, temp.l.upper);
104 }
105 
106 /* x87 FPU helpers */
107 
108 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
109 {
110     union {
111         float64 f64;
112         double d;
113     } u;
114 
115     u.f64 = floatx80_to_float64(a, &env->fp_status);
116     return u.d;
117 }
118 
119 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
120 {
121     union {
122         float64 f64;
123         double d;
124     } u;
125 
126     u.d = a;
127     return float64_to_floatx80(u.f64, &env->fp_status);
128 }
129 
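/*
 * Set exception bits in the status word; if any of them is unmasked in
 * the control word, also set the error summary and busy bits, which is
 * what helper_fwait() later tests to decide whether to deliver a fault.
 */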
130 static void fpu_set_exception(CPUX86State *env, int mask)
131 {
132     env->fpus |= mask;
133     if (env->fpus & (~env->fpuc & FPUC_EM)) {
134         env->fpus |= FPUS_SE | FPUS_B;
135     }
136 }
137 
138 void cpu_init_fp_statuses(CPUX86State *env)
139 {
140     /*
141      * Initialise the non-runtime-varying fields of the various
142      * float_status words to x86 behaviour. This must be called at
143      * CPU reset because the float_status words are in the
144      * "zeroed on reset" portion of the CPU state struct.
145      * Fields in float_status that vary under guest control are set
146      * via the codepath for setting that register, eg cpu_set_fpuc().
147      */
148     /*
149      * Use x87 NaN propagation rules:
150      * SNaN + QNaN => return the QNaN
151      * two SNaNs => return the one with the larger significand, silenced
152      * two QNaNs => return the one with the larger significand
153      * SNaN and a non-NaN => return the SNaN, silenced
154      * QNaN and a non-NaN => return the QNaN
155      *
156      * If we get down to comparing significands and they are the same,
157      * return the NaN with the positive sign bit (if any).
158      */
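    /*
     * For example, adding the QNaNs 0x7fff:c000000000000001 and
     * 0x7fff:c000000000000002 returns the second, since its significand
     * is larger.
     */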
159     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->fp_status);
160     /*
161      * TODO: These are incorrect: the x86 Software Developer's Manual vol 1
162      * section 4.8.3.5 "Operating on SNaNs and QNaNs" says that the
163      * "larger significand" behaviour is only used for x87 FPU operations.
164      * For SSE the required behaviour is to always return the first NaN,
165      * which is float_2nan_prop_ab.
166      *
167      * mmx_status is used only for the AMD 3DNow! instructions, which
168      * are documented in the "3DNow! Technology Manual" as not supporting
169      * NaNs or infinities as inputs. The result of passing two NaNs is
170      * documented as "undefined", so we can do what we choose.
171      * (Strictly there is some behaviour we don't implement correctly
172      * for these "unsupported" NaN and Inf values, like "NaN * 0 == 0".)
173      */
174     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->mmx_status);
175     set_float_2nan_prop_rule(float_2nan_prop_x87, &env->sse_status);
176     /*
177      * Only SSE has multiply-add instructions. In the SDM Section 14.5.2
178      * "Fused-Multiply-ADD (FMA) Numeric Behavior" the NaN handling is
179      * specified -- for 0 * inf + NaN the input NaN is selected, and if
180      * there are multiple input NaNs they are selected in the order a, b, c.
181      * We also do not raise Invalid for the 0 * inf + (Q)NaN case.
182      */
183     set_float_infzeronan_rule(float_infzeronan_dnan_never |
184                               float_infzeronan_suppress_invalid,
185                               &env->sse_status);
186     set_float_3nan_prop_rule(float_3nan_prop_abc, &env->sse_status);
187     /* Default NaN: sign bit set, most significant frac bit set */
188     set_float_default_nan_pattern(0b11000000, &env->fp_status);
189     set_float_default_nan_pattern(0b11000000, &env->mmx_status);
190     set_float_default_nan_pattern(0b11000000, &env->sse_status);
191 }
192 
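/*
 * Helpers used around most FPU operations: save_exception_flags() clears
 * the softfloat flags, and merge_exception_flags() converts whatever the
 * operation raised into x87 status word bits before restoring the saved
 * flags.
 */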
193 static inline uint8_t save_exception_flags(CPUX86State *env)
194 {
195     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
196     set_float_exception_flags(0, &env->fp_status);
197     return old_flags;
198 }
199 
200 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
201 {
202     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
203     float_raise(old_flags, &env->fp_status);
204     fpu_set_exception(env,
205                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
206                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
207                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
208                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
209                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
210                        (new_flags & float_flag_input_denormal_flushed ? FPUS_DE : 0)));
211 }
212 
213 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
214 {
215     uint8_t old_flags = save_exception_flags(env);
216     floatx80 ret = floatx80_div(a, b, &env->fp_status);
217     merge_exception_flags(env, old_flags);
218     return ret;
219 }
220 
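/*
 * Deliver a pending x87 exception: with CR0.NE set, raise #MF directly;
 * otherwise (system emulation only) signal it through the legacy FERR#
 * interrupt path.
 */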
221 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
222 {
223     if (env->cr[0] & CR0_NE_MASK) {
224         raise_exception_ra(env, EXCP10_COPR, retaddr);
225     }
226 #if !defined(CONFIG_USER_ONLY)
227     else {
228         fpu_check_raise_ferr_irq(env);
229     }
230 #endif
231 }
232 
233 void helper_flds_FT0(CPUX86State *env, uint32_t val)
234 {
235     uint8_t old_flags = save_exception_flags(env);
236     union {
237         float32 f;
238         uint32_t i;
239     } u;
240 
241     u.i = val;
242     FT0 = float32_to_floatx80(u.f, &env->fp_status);
243     merge_exception_flags(env, old_flags);
244 }
245 
246 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
247 {
248     uint8_t old_flags = save_exception_flags(env);
249     union {
250         float64 f;
251         uint64_t i;
252     } u;
253 
254     u.i = val;
255     FT0 = float64_to_floatx80(u.f, &env->fp_status);
256     merge_exception_flags(env, old_flags);
257 }
258 
259 void helper_fildl_FT0(CPUX86State *env, int32_t val)
260 {
261     FT0 = int32_to_floatx80(val, &env->fp_status);
262 }
263 
264 void helper_flds_ST0(CPUX86State *env, uint32_t val)
265 {
266     uint8_t old_flags = save_exception_flags(env);
267     int new_fpstt;
268     union {
269         float32 f;
270         uint32_t i;
271     } u;
272 
273     new_fpstt = (env->fpstt - 1) & 7;
274     u.i = val;
275     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
276     env->fpstt = new_fpstt;
277     env->fptags[new_fpstt] = 0; /* validate stack entry */
278     merge_exception_flags(env, old_flags);
279 }
280 
281 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
282 {
283     uint8_t old_flags = save_exception_flags(env);
284     int new_fpstt;
285     union {
286         float64 f;
287         uint64_t i;
288     } u;
289 
290     new_fpstt = (env->fpstt - 1) & 7;
291     u.i = val;
292     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
293     env->fpstt = new_fpstt;
294     env->fptags[new_fpstt] = 0; /* validate stack entry */
295     merge_exception_flags(env, old_flags);
296 }
297 
298 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
299 {
300     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
301     set_floatx80_rounding_precision(floatx80_precision_x, st);
302     return old;
303 }
304 
305 void helper_fildl_ST0(CPUX86State *env, int32_t val)
306 {
307     int new_fpstt;
308     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
309 
310     new_fpstt = (env->fpstt - 1) & 7;
311     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
312     env->fpstt = new_fpstt;
313     env->fptags[new_fpstt] = 0; /* validate stack entry */
314 
315     set_floatx80_rounding_precision(old, &env->fp_status);
316 }
317 
318 void helper_fildll_ST0(CPUX86State *env, int64_t val)
319 {
320     int new_fpstt;
321     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
322 
323     new_fpstt = (env->fpstt - 1) & 7;
324     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
325     env->fpstt = new_fpstt;
326     env->fptags[new_fpstt] = 0; /* validate stack entry */
327 
328     set_floatx80_rounding_precision(old, &env->fp_status);
329 }
330 
331 uint32_t helper_fsts_ST0(CPUX86State *env)
332 {
333     uint8_t old_flags = save_exception_flags(env);
334     union {
335         float32 f;
336         uint32_t i;
337     } u;
338 
339     u.f = floatx80_to_float32(ST0, &env->fp_status);
340     merge_exception_flags(env, old_flags);
341     return u.i;
342 }
343 
344 uint64_t helper_fstl_ST0(CPUX86State *env)
345 {
346     uint8_t old_flags = save_exception_flags(env);
347     union {
348         float64 f;
349         uint64_t i;
350     } u;
351 
352     u.f = floatx80_to_float64(ST0, &env->fp_status);
353     merge_exception_flags(env, old_flags);
354     return u.i;
355 }
356 
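/*
 * FIST with a 16-bit destination: values that do not fit are replaced by
 * the integer indefinite value -32768 (0x8000) and the invalid exception
 * is flagged.
 */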
357 int32_t helper_fist_ST0(CPUX86State *env)
358 {
359     uint8_t old_flags = save_exception_flags(env);
360     int32_t val;
361 
362     val = floatx80_to_int32(ST0, &env->fp_status);
363     if (val != (int16_t)val) {
364         set_float_exception_flags(float_flag_invalid, &env->fp_status);
365         val = -32768;
366     }
367     merge_exception_flags(env, old_flags);
368     return val;
369 }
370 
371 int32_t helper_fistl_ST0(CPUX86State *env)
372 {
373     uint8_t old_flags = save_exception_flags(env);
374     int32_t val;
375 
376     val = floatx80_to_int32(ST0, &env->fp_status);
377     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
378         val = 0x80000000;
379     }
380     merge_exception_flags(env, old_flags);
381     return val;
382 }
383 
384 int64_t helper_fistll_ST0(CPUX86State *env)
385 {
386     uint8_t old_flags = save_exception_flags(env);
387     int64_t val;
388 
389     val = floatx80_to_int64(ST0, &env->fp_status);
390     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
391         val = 0x8000000000000000ULL;
392     }
393     merge_exception_flags(env, old_flags);
394     return val;
395 }
396 
397 int32_t helper_fistt_ST0(CPUX86State *env)
398 {
399     uint8_t old_flags = save_exception_flags(env);
400     int32_t val;
401 
402     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
403     if (val != (int16_t)val) {
404         set_float_exception_flags(float_flag_invalid, &env->fp_status);
405         val = -32768;
406     }
407     merge_exception_flags(env, old_flags);
408     return val;
409 }
410 
411 int32_t helper_fisttl_ST0(CPUX86State *env)
412 {
413     uint8_t old_flags = save_exception_flags(env);
414     int32_t val;
415 
416     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
417     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
418         val = 0x80000000;
419     }
420     merge_exception_flags(env, old_flags);
421     return val;
422 }
423 
424 int64_t helper_fisttll_ST0(CPUX86State *env)
425 {
426     uint8_t old_flags = save_exception_flags(env);
427     int64_t val;
428 
429     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
430     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
431         val = 0x8000000000000000ULL;
432     }
433     merge_exception_flags(env, old_flags);
434     return val;
435 }
436 
437 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
438 {
439     int new_fpstt;
440     X86Access ac;
441 
442     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
443 
444     new_fpstt = (env->fpstt - 1) & 7;
445     env->fpregs[new_fpstt].d = do_fldt(&ac, ptr);
446     env->fpstt = new_fpstt;
447     env->fptags[new_fpstt] = 0; /* validate stack entry */
448 }
449 
450 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
451 {
452     X86Access ac;
453 
454     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
455     do_fstt(&ac, ptr, ST0);
456 }
457 
458 void helper_fpush(CPUX86State *env)
459 {
460     fpush(env);
461 }
462 
463 void helper_fpop(CPUX86State *env)
464 {
465     fpop(env);
466 }
467 
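/*
 * FDECSTP/FINCSTP only rotate the top-of-stack pointer; the tag word is
 * left alone and the condition code bits (mask 0x4700) are cleared.
 */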
468 void helper_fdecstp(CPUX86State *env)
469 {
470     env->fpstt = (env->fpstt - 1) & 7;
471     env->fpus &= ~0x4700;
472 }
473 
474 void helper_fincstp(CPUX86State *env)
475 {
476     env->fpstt = (env->fpstt + 1) & 7;
477     env->fpus &= ~0x4700;
478 }
479 
480 /* FPU move */
481 
482 void helper_ffree_STN(CPUX86State *env, int st_index)
483 {
484     env->fptags[(env->fpstt + st_index) & 7] = 1;
485 }
486 
487 void helper_fmov_ST0_FT0(CPUX86State *env)
488 {
489     ST0 = FT0;
490 }
491 
492 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
493 {
494     FT0 = ST(st_index);
495 }
496 
497 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
498 {
499     ST0 = ST(st_index);
500 }
501 
502 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
503 {
504     ST(st_index) = ST0;
505 }
506 
507 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
508 {
509     floatx80 tmp;
510 
511     tmp = ST(st_index);
512     ST(st_index) = ST0;
513     ST0 = tmp;
514 }
515 
516 /* FPU operations */
517 
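/*
 * Condition code bits to set in the status word, indexed by
 * FloatRelation + 1: less -> C0, equal -> C3, greater -> none,
 * unordered -> C3|C2|C0.
 */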
518 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
519 
520 void helper_fcom_ST0_FT0(CPUX86State *env)
521 {
522     uint8_t old_flags = save_exception_flags(env);
523     FloatRelation ret;
524 
525     ret = floatx80_compare(ST0, FT0, &env->fp_status);
526     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
527     merge_exception_flags(env, old_flags);
528 }
529 
530 void helper_fucom_ST0_FT0(CPUX86State *env)
531 {
532     uint8_t old_flags = save_exception_flags(env);
533     FloatRelation ret;
534 
535     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
536     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
537     merge_exception_flags(env, old_flags);
538 }
539 
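/*
 * FCOMI/FUCOMI report the comparison in EFLAGS instead, again indexed by
 * FloatRelation + 1: less -> CF, equal -> ZF, greater -> none,
 * unordered -> ZF|PF|CF.
 */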
540 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
541 
542 void helper_fcomi_ST0_FT0(CPUX86State *env)
543 {
544     uint8_t old_flags = save_exception_flags(env);
545     int eflags;
546     FloatRelation ret;
547 
548     ret = floatx80_compare(ST0, FT0, &env->fp_status);
549     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
550     CC_SRC = eflags | fcomi_ccval[ret + 1];
551     CC_OP = CC_OP_EFLAGS;
552     merge_exception_flags(env, old_flags);
553 }
554 
555 void helper_fucomi_ST0_FT0(CPUX86State *env)
556 {
557     uint8_t old_flags = save_exception_flags(env);
558     int eflags;
559     FloatRelation ret;
560 
561     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
562     eflags = cpu_cc_compute_all(env) & ~(CC_Z | CC_P | CC_C);
563     CC_SRC = eflags | fcomi_ccval[ret + 1];
564     CC_OP = CC_OP_EFLAGS;
565     merge_exception_flags(env, old_flags);
566 }
567 
568 void helper_fadd_ST0_FT0(CPUX86State *env)
569 {
570     uint8_t old_flags = save_exception_flags(env);
571     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
572     merge_exception_flags(env, old_flags);
573 }
574 
575 void helper_fmul_ST0_FT0(CPUX86State *env)
576 {
577     uint8_t old_flags = save_exception_flags(env);
578     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
579     merge_exception_flags(env, old_flags);
580 }
581 
582 void helper_fsub_ST0_FT0(CPUX86State *env)
583 {
584     uint8_t old_flags = save_exception_flags(env);
585     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
586     merge_exception_flags(env, old_flags);
587 }
588 
589 void helper_fsubr_ST0_FT0(CPUX86State *env)
590 {
591     uint8_t old_flags = save_exception_flags(env);
592     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
593     merge_exception_flags(env, old_flags);
594 }
595 
596 void helper_fdiv_ST0_FT0(CPUX86State *env)
597 {
598     ST0 = helper_fdiv(env, ST0, FT0);
599 }
600 
601 void helper_fdivr_ST0_FT0(CPUX86State *env)
602 {
603     ST0 = helper_fdiv(env, FT0, ST0);
604 }
605 
606 /* fp operations between STN and ST0 */
607 
608 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
609 {
610     uint8_t old_flags = save_exception_flags(env);
611     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
612     merge_exception_flags(env, old_flags);
613 }
614 
615 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
616 {
617     uint8_t old_flags = save_exception_flags(env);
618     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
619     merge_exception_flags(env, old_flags);
620 }
621 
622 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
623 {
624     uint8_t old_flags = save_exception_flags(env);
625     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
626     merge_exception_flags(env, old_flags);
627 }
628 
629 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
630 {
631     uint8_t old_flags = save_exception_flags(env);
632     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
633     merge_exception_flags(env, old_flags);
634 }
635 
636 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
637 {
638     floatx80 *p;
639 
640     p = &ST(st_index);
641     *p = helper_fdiv(env, *p, ST0);
642 }
643 
644 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
645 {
646     floatx80 *p;
647 
648     p = &ST(st_index);
649     *p = helper_fdiv(env, ST0, *p);
650 }
651 
652 /* misc FPU operations */
653 void helper_fchs_ST0(CPUX86State *env)
654 {
655     ST0 = floatx80_chs(ST0);
656 }
657 
658 void helper_fabs_ST0(CPUX86State *env)
659 {
660     ST0 = floatx80_abs(ST0);
661 }
662 
663 void helper_fld1_ST0(CPUX86State *env)
664 {
665     ST0 = floatx80_one;
666 }
667 
668 void helper_fldl2t_ST0(CPUX86State *env)
669 {
670     switch (env->fpuc & FPU_RC_MASK) {
671     case FPU_RC_UP:
672         ST0 = floatx80_l2t_u;
673         break;
674     default:
675         ST0 = floatx80_l2t;
676         break;
677     }
678 }
679 
680 void helper_fldl2e_ST0(CPUX86State *env)
681 {
682     switch (env->fpuc & FPU_RC_MASK) {
683     case FPU_RC_DOWN:
684     case FPU_RC_CHOP:
685         ST0 = floatx80_l2e_d;
686         break;
687     default:
688         ST0 = floatx80_l2e;
689         break;
690     }
691 }
692 
693 void helper_fldpi_ST0(CPUX86State *env)
694 {
695     switch (env->fpuc & FPU_RC_MASK) {
696     case FPU_RC_DOWN:
697     case FPU_RC_CHOP:
698         ST0 = floatx80_pi_d;
699         break;
700     default:
701         ST0 = floatx80_pi;
702         break;
703     }
704 }
705 
706 void helper_fldlg2_ST0(CPUX86State *env)
707 {
708     switch (env->fpuc & FPU_RC_MASK) {
709     case FPU_RC_DOWN:
710     case FPU_RC_CHOP:
711         ST0 = floatx80_lg2_d;
712         break;
713     default:
714         ST0 = floatx80_lg2;
715         break;
716     }
717 }
718 
719 void helper_fldln2_ST0(CPUX86State *env)
720 {
721     switch (env->fpuc & FPU_RC_MASK) {
722     case FPU_RC_DOWN:
723     case FPU_RC_CHOP:
724         ST0 = floatx80_ln2_d;
725         break;
726     default:
727         ST0 = floatx80_ln2;
728         break;
729     }
730 }
731 
732 void helper_fldz_ST0(CPUX86State *env)
733 {
734     ST0 = floatx80_zero;
735 }
736 
737 void helper_fldz_FT0(CPUX86State *env)
738 {
739     FT0 = floatx80_zero;
740 }
741 
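/*
 * FNSTSW: bits 13:11 of the status word hold TOP, which QEMU keeps in
 * env->fpstt rather than in env->fpus, so it is merged in on read.
 */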
742 uint32_t helper_fnstsw(CPUX86State *env)
743 {
744     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
745 }
746 
747 uint32_t helper_fnstcw(CPUX86State *env)
748 {
749     return env->fpuc;
750 }
751 
752 static void set_x86_rounding_mode(unsigned mode, float_status *status)
753 {
754     static FloatRoundMode x86_round_mode[4] = {
755         float_round_nearest_even,
756         float_round_down,
757         float_round_up,
758         float_round_to_zero
759     };
760     assert(mode < ARRAY_SIZE(x86_round_mode));
761     set_float_rounding_mode(x86_round_mode[mode], status);
762 }
763 
764 void update_fp_status(CPUX86State *env)
765 {
766     int rnd_mode;
767     FloatX80RoundPrec rnd_prec;
768 
769     /* set rounding mode */
770     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
771     set_x86_rounding_mode(rnd_mode, &env->fp_status);
772 
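    /*
     * Set rounding precision from the FCW precision control field
     * (bits 9:8): 00 = single (24-bit), 01 = reserved, 10 = double
     * (53-bit), 11 = extended (64-bit) significand precision.
     */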
773     switch ((env->fpuc >> 8) & 3) {
774     case 0:
775         rnd_prec = floatx80_precision_s;
776         break;
777     case 2:
778         rnd_prec = floatx80_precision_d;
779         break;
780     case 3:
781     default:
782         rnd_prec = floatx80_precision_x;
783         break;
784     }
785     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
786 }
787 
788 void helper_fldcw(CPUX86State *env, uint32_t val)
789 {
790     cpu_set_fpuc(env, val);
791 }
792 
793 void helper_fclex(CPUX86State *env)
794 {
795     env->fpus &= 0x7f00;
796 }
797 
798 void helper_fwait(CPUX86State *env)
799 {
800     if (env->fpus & FPUS_SE) {
801         fpu_raise_exception(env, GETPC());
802     }
803 }
804 
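/*
 * FNINIT: reset the control word to 0x37f (all exceptions masked, 64-bit
 * precision, round to nearest), clear the status word and the
 * instruction/data pointers, and mark every stack register empty.
 */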
805 static void do_fninit(CPUX86State *env)
806 {
807     env->fpus = 0;
808     env->fpstt = 0;
809     env->fpcs = 0;
810     env->fpds = 0;
811     env->fpip = 0;
812     env->fpdp = 0;
813     cpu_set_fpuc(env, 0x37f);
814     env->fptags[0] = 1;
815     env->fptags[1] = 1;
816     env->fptags[2] = 1;
817     env->fptags[3] = 1;
818     env->fptags[4] = 1;
819     env->fptags[5] = 1;
820     env->fptags[6] = 1;
821     env->fptags[7] = 1;
822 }
823 
824 void helper_fninit(CPUX86State *env)
825 {
826     do_fninit(env);
827 }
828 
829 /* BCD ops */
830 
831 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
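/*
 * FBLD: the ten byte packed BCD operand holds 18 decimal digits, two per
 * byte in bytes 0..8 (least significant byte first); bit 7 of byte 9 is
 * the sign.  For example +1234 is encoded as 34 12 00 .. 00 with a zero
 * sign byte.
 */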
832 {
833     X86Access ac;
834     floatx80 tmp;
835     uint64_t val;
836     unsigned int v;
837     int i;
838 
839     access_prepare(&ac, env, ptr, 10, MMU_DATA_LOAD, GETPC());
840 
841     val = 0;
842     for (i = 8; i >= 0; i--) {
843         v = access_ldb(&ac, ptr + i);
844         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
845     }
846     tmp = int64_to_floatx80(val, &env->fp_status);
847     if (access_ldb(&ac, ptr + 9) & 0x80) {
848         tmp = floatx80_chs(tmp);
849     }
850     fpush(env);
851     ST0 = tmp;
852 }
853 
854 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
855 {
856     uint8_t old_flags = save_exception_flags(env);
857     int v;
858     target_ulong mem_ref, mem_end;
859     int64_t val;
860     CPU_LDoubleU temp;
861     X86Access ac;
862 
863     access_prepare(&ac, env, ptr, 10, MMU_DATA_STORE, GETPC());
864     temp.d = ST0;
865 
866     val = floatx80_to_int64(ST0, &env->fp_status);
867     mem_ref = ptr;
868     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
869         set_float_exception_flags(float_flag_invalid, &env->fp_status);
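        /*
         * Out of range for packed BCD: store the BCD indefinite
         * encoding, i.e. seven zero bytes followed by 0xc0, 0xff, 0xff.
         */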
870         while (mem_ref < ptr + 7) {
871             access_stb(&ac, mem_ref++, 0);
872         }
873         access_stb(&ac, mem_ref++, 0xc0);
874         access_stb(&ac, mem_ref++, 0xff);
875         access_stb(&ac, mem_ref++, 0xff);
876         merge_exception_flags(env, old_flags);
877         return;
878     }
879     mem_end = mem_ref + 9;
880     if (SIGND(temp)) {
881         access_stb(&ac, mem_end, 0x80);
882         val = -val;
883     } else {
884         access_stb(&ac, mem_end, 0x00);
885     }
886     while (mem_ref < mem_end) {
887         if (val == 0) {
888             break;
889         }
890         v = val % 100;
891         val = val / 100;
892         v = ((v / 10) << 4) | (v % 10);
893         access_stb(&ac, mem_ref++, v);
894     }
895     while (mem_ref < mem_end) {
896         access_stb(&ac, mem_ref++, 0);
897     }
898     merge_exception_flags(env, old_flags);
899 }
900 
901 /* 128-bit significand of log(2).  */
902 #define ln2_sig_high 0xb17217f7d1cf79abULL
903 #define ln2_sig_low 0xc9e3b39803f2f6afULL
904 
905 /*
906  * Polynomial coefficients for an approximation to (2^x - 1) / x, on
907  * the interval [-1/64, 1/64].
908  */
909 #define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
910 #define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
911 #define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
912 #define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
913 #define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
914 #define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
915 #define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
916 #define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
917 #define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
918 
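/*
 * helper_f2xm1 reduces its argument to y = x - t, where t is the table
 * entry nearest to x (each t being very close to a multiple of 1/32),
 * evaluates the polynomial above in y to approximate (2^y - 1) / y, and
 * reconstructs 2^x - 1 as (2^y - 1) * 2^t + (2^t - 1) using the
 * tabulated values of 2^t and 2^t - 1.
 */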
919 struct f2xm1_data {
920     /*
921      * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
922      * are very close to exact floatx80 values.
923      */
924     floatx80 t;
925     /* The value of 2^t.  */
926     floatx80 exp2;
927     /* The value of 2^t - 1.  */
928     floatx80 exp2m1;
929 };
930 
931 static const struct f2xm1_data f2xm1_table[65] = {
932     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
933       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
934       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
935     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
936       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
937       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
938     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
939       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
940       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
941     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
942       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
943       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
944     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
945       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
946       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
947     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
948       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
949       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
950     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
951       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
952       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
953     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
954       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
955       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
956     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
957       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
958       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
959     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
960       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
961       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
962     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
963       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
964       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
965     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
966       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
967       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
968     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
969       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
970       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
971     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
972       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
973       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
974     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
975       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
976       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
977     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
978       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
979       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
980     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
981       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
982       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
983     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
984       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
985       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
986     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
987       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
988       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
989     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
990       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
991       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
992     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
993       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
994       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
995     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
996       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
997       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
998     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
999       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
1000       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
1001     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
1002       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
1003       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
1004     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
1005       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
1006       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
1007     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
1008       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
1009       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
1010     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
1011       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
1012       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
1013     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
1014       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
1015       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
1016     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
1017       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
1018       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
1019     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
1020       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
1021       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
1022     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
1023       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
1024       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
1025     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
1026       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
1027       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
1028     { floatx80_zero_init,
1029       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1030       floatx80_zero_init },
1031     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
1032       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
1033       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
1034     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
1035       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
1036       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
1037     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
1038       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
1039       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
1040     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
1041       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
1042       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
1043     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
1044       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
1045       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
1046     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
1047       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
1048       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
1049     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
1050       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
1051       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
1052     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
1053       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
1054       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
1055     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
1056       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
1057       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
1058     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
1059       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
1060       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
1061     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
1062       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
1063       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
1064     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
1065       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
1066       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1067     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1068       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1069       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1070     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1071       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1072       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1073     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1074       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1075       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1076     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1077       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1078       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1079     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1080       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1081       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1082     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1083       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1084       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1085     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1086       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1087       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1088     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1089       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1090       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1091     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1092       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1093       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1094     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1095       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1096       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1097     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1098       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1099       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1100     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1101       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1102       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1103     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1104       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1105       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1106     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1107       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1108       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1109     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1110       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1111       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1112     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1113       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1114       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1115     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1116       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1117       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1118     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1119       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1120       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1121     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1122       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1123       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1124     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1125       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1126       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1127 };
1128 
1129 void helper_f2xm1(CPUX86State *env)
1130 {
1131     uint8_t old_flags = save_exception_flags(env);
1132     uint64_t sig = extractFloatx80Frac(ST0);
1133     int32_t exp = extractFloatx80Exp(ST0);
1134     bool sign = extractFloatx80Sign(ST0);
1135 
1136     if (floatx80_invalid_encoding(ST0)) {
1137         float_raise(float_flag_invalid, &env->fp_status);
1138         ST0 = floatx80_default_nan(&env->fp_status);
1139     } else if (floatx80_is_any_nan(ST0)) {
1140         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1141             float_raise(float_flag_invalid, &env->fp_status);
1142             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1143         }
1144     } else if (exp > 0x3fff ||
1145                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1146         /* Out of range for the instruction, treat as invalid.  */
1147         float_raise(float_flag_invalid, &env->fp_status);
1148         ST0 = floatx80_default_nan(&env->fp_status);
1149     } else if (exp == 0x3fff) {
1150         /* Argument 1 or -1, exact result 1 or -0.5.  */
1151         if (sign) {
1152             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1153         }
1154     } else if (exp < 0x3fb0) {
1155         if (!floatx80_is_zero(ST0)) {
1156             /*
1157              * Multiplying the argument by an extra-precision version
1158              * of log(2) is sufficiently precise.  Zero arguments are
1159              * returned unchanged.
1160              */
1161             uint64_t sig0, sig1, sig2;
1162             if (exp == 0) {
1163                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1164             }
1165             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1166                             &sig2);
1167             /* This result is inexact.  */
1168             sig1 |= 1;
1169             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1170                                                 sign, exp, sig0, sig1,
1171                                                 &env->fp_status);
1172         }
1173     } else {
1174         floatx80 tmp, y, accum;
1175         bool asign, bsign;
1176         int32_t n, aexp, bexp;
1177         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1178         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1179         FloatX80RoundPrec save_prec =
1180             env->fp_status.floatx80_rounding_precision;
1181         env->fp_status.float_rounding_mode = float_round_nearest_even;
1182         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1183 
1184         /* Find the nearest multiple of 1/32 to the argument.  */
1185         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1186         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1187         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1188 
1189         if (floatx80_is_zero(y)) {
1190             /*
1191              * Use the value of 2^t - 1 from the table, to avoid
1192              * needing to special-case zero as a result of
1193              * multiplication below.
1194              */
1195             ST0 = f2xm1_table[n].exp2m1;
1196             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1197             env->fp_status.float_rounding_mode = save_mode;
1198         } else {
1199             /*
1200              * Compute the lower parts of a polynomial expansion for
1201              * (2^y - 1) / y.
1202              */
1203             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1204             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1205             accum = floatx80_mul(accum, y, &env->fp_status);
1206             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1207             accum = floatx80_mul(accum, y, &env->fp_status);
1208             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1209             accum = floatx80_mul(accum, y, &env->fp_status);
1210             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1211             accum = floatx80_mul(accum, y, &env->fp_status);
1212             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1213             accum = floatx80_mul(accum, y, &env->fp_status);
1214             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1215             accum = floatx80_mul(accum, y, &env->fp_status);
1216             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1217 
1218             /*
1219              * The full polynomial expansion is f2xm1_coeff_0 + accum
1220              * (where accum has much lower magnitude, and so, in
1221              * particular, carry out of the addition is not possible).
1222              * (This expansion is only accurate to about 70 bits, not
1223              * 128 bits.)
1224              */
1225             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1226             asign = extractFloatx80Sign(f2xm1_coeff_0);
1227             shift128RightJamming(extractFloatx80Frac(accum), 0,
1228                                  aexp - extractFloatx80Exp(accum),
1229                                  &asig0, &asig1);
1230             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1231             bsig1 = 0;
1232             if (asign == extractFloatx80Sign(accum)) {
1233                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1234             } else {
1235                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1236             }
1237             /* And thus compute an approximation to 2^y - 1.  */
1238             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1239                             &asig0, &asig1, &asig2);
1240             aexp += extractFloatx80Exp(y) - 0x3ffe;
1241             asign ^= extractFloatx80Sign(y);
1242             if (n != 32) {
1243                 /*
1244                  * Multiply this by the precomputed value of 2^t and
1245                  * add that of 2^t - 1.
1246                  */
1247                 mul128By64To192(asig0, asig1,
1248                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1249                                 &asig0, &asig1, &asig2);
1250                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1251                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1252                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1253                 bsig1 = 0;
1254                 if (bexp < aexp) {
1255                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1256                                          &bsig0, &bsig1);
1257                 } else if (aexp < bexp) {
1258                     shift128RightJamming(asig0, asig1, bexp - aexp,
1259                                          &asig0, &asig1);
1260                     aexp = bexp;
1261                 }
1262                 /* The sign of 2^t - 1 is always that of the result.  */
1263                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1264                 if (asign == bsign) {
1265                     /* Avoid possible carry out of the addition.  */
1266                     shift128RightJamming(asig0, asig1, 1,
1267                                          &asig0, &asig1);
1268                     shift128RightJamming(bsig0, bsig1, 1,
1269                                          &bsig0, &bsig1);
1270                     ++aexp;
1271                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1272                 } else {
1273                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1274                     asign = bsign;
1275                 }
1276             }
1277             env->fp_status.float_rounding_mode = save_mode;
1278             /* This result is inexact.  */
1279             asig1 |= 1;
1280             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1281                                                 asign, aexp, asig0, asig1,
1282                                                 &env->fp_status);
1283         }
1284 
1285         env->fp_status.floatx80_rounding_precision = save_prec;
1286     }
1287     merge_exception_flags(env, old_flags);
1288 }
1289 
1290 void helper_fptan(CPUX86State *env)
1291 {
1292     double fptemp = floatx80_to_double(env, ST0);
1293 
1294     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
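        /*
         * Argument out of range: set C2 and leave ST0 unchanged so the
         * guest can reduce the operand itself.
         */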
1295         env->fpus |= 0x400;
1296     } else {
1297         fptemp = tan(fptemp);
1298         ST0 = double_to_floatx80(env, fptemp);
1299         fpush(env);
1300         ST0 = floatx80_one;
1301         env->fpus &= ~0x400; /* C2 <-- 0 */
1302         /* the above code is for |arg| < 2**52 only */
1303     }
1304 }
1305 
1306 /* Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  */
1307 #define pi_4_exp 0x3ffe
1308 #define pi_4_sig_high 0xc90fdaa22168c234ULL
1309 #define pi_4_sig_low 0xc4c6628b80dc1cd1ULL
1310 #define pi_2_exp 0x3fff
1311 #define pi_2_sig_high 0xc90fdaa22168c234ULL
1312 #define pi_2_sig_low 0xc4c6628b80dc1cd1ULL
1313 #define pi_34_exp 0x4000
1314 #define pi_34_sig_high 0x96cbe3f9990e91a7ULL
1315 #define pi_34_sig_low 0x9394c9e8a0a5159dULL
1316 #define pi_exp 0x4000
1317 #define pi_sig_high 0xc90fdaa22168c234ULL
1318 #define pi_sig_low 0xc4c6628b80dc1cd1ULL
1319 
1320 /*
1321  * Polynomial coefficients for an approximation to atan(x), with only
1322  * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
1323  * for some other approximations, no low part is needed for the first
1324  * coefficient here to achieve a sufficiently accurate result, because
1325  * the coefficient in this minimax approximation is very close to
1326  * exactly 1.)
1327  */
1328 #define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
1329 #define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
1330 #define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
1331 #define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
1332 #define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
1333 #define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
1334 #define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1335 
1336 struct fpatan_data {
1337     /* High and low parts of atan(x).  */
1338     floatx80 atan_high, atan_low;
1339 };
1340 
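/* Entry i holds atan(i / 8); the final entry is pi/4.  */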
1341 static const struct fpatan_data fpatan_table[9] = {
1342     { floatx80_zero_init,
1343       floatx80_zero_init },
1344     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1345       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1346     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1347       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1348     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1349       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1350     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1351       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1352     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1353       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1354     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1355       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1356     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1357       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1358     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1359       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1360 };
1361 
1362 void helper_fpatan(CPUX86State *env)
1363 {
1364     uint8_t old_flags = save_exception_flags(env);
1365     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1366     int32_t arg0_exp = extractFloatx80Exp(ST0);
1367     bool arg0_sign = extractFloatx80Sign(ST0);
1368     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1369     int32_t arg1_exp = extractFloatx80Exp(ST1);
1370     bool arg1_sign = extractFloatx80Sign(ST1);
1371 
1372     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1373         float_raise(float_flag_invalid, &env->fp_status);
1374         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1375     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1376         float_raise(float_flag_invalid, &env->fp_status);
1377         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1378     } else if (floatx80_invalid_encoding(ST0) ||
1379                floatx80_invalid_encoding(ST1)) {
1380         float_raise(float_flag_invalid, &env->fp_status);
1381         ST1 = floatx80_default_nan(&env->fp_status);
1382     } else if (floatx80_is_any_nan(ST0)) {
1383         ST1 = ST0;
1384     } else if (floatx80_is_any_nan(ST1)) {
1385         /* Pass this NaN through.  */
1386     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1387         /* Pass this zero through.  */
1388     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1389                  arg0_exp - arg1_exp >= 80) &&
1390                !arg0_sign) {
1391         /*
1392          * Dividing ST1 by ST0 gives the correct result up to
1393          * rounding, and avoids spurious underflow exceptions that
1394          * might result from passing some small values through the
1395          * polynomial approximation, but if a finite nonzero result of
1396          * division is exact, the result of fpatan is still inexact
1397          * (and underflowing where appropriate).
1398          */
1399         FloatX80RoundPrec save_prec =
1400             env->fp_status.floatx80_rounding_precision;
1401         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1402         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1403         env->fp_status.floatx80_rounding_precision = save_prec;
1404         if (!floatx80_is_zero(ST1) &&
1405             !(get_float_exception_flags(&env->fp_status) &
1406               float_flag_inexact)) {
1407             /*
1408              * The mathematical result is very slightly closer to zero
1409              * than this exact result.  Round a value with the
1410              * significand adjusted accordingly to get the correct
1411              * exceptions, and possibly an adjusted result depending
1412              * on the rounding mode.
1413              */
1414             uint64_t sig = extractFloatx80Frac(ST1);
1415             int32_t exp = extractFloatx80Exp(ST1);
1416             bool sign = extractFloatx80Sign(ST1);
1417             if (exp == 0) {
1418                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1419             }
1420             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1421                                                 sign, exp, sig - 1,
1422                                                 -1, &env->fp_status);
1423         }
1424     } else {
1425         /* The result is inexact.  */
1426         bool rsign = arg1_sign;
1427         int32_t rexp;
1428         uint64_t rsig0, rsig1;
1429         if (floatx80_is_zero(ST1)) {
1430             /*
1431              * ST0 is negative.  The result is pi with the sign of
1432              * ST1.
1433              */
1434             rexp = pi_exp;
1435             rsig0 = pi_sig_high;
1436             rsig1 = pi_sig_low;
1437         } else if (floatx80_is_infinity(ST1)) {
1438             if (floatx80_is_infinity(ST0)) {
1439                 if (arg0_sign) {
1440                     rexp = pi_34_exp;
1441                     rsig0 = pi_34_sig_high;
1442                     rsig1 = pi_34_sig_low;
1443                 } else {
1444                     rexp = pi_4_exp;
1445                     rsig0 = pi_4_sig_high;
1446                     rsig1 = pi_4_sig_low;
1447                 }
1448             } else {
1449                 rexp = pi_2_exp;
1450                 rsig0 = pi_2_sig_high;
1451                 rsig1 = pi_2_sig_low;
1452             }
1453         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1454             rexp = pi_2_exp;
1455             rsig0 = pi_2_sig_high;
1456             rsig1 = pi_2_sig_low;
1457         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1458             /* ST0 is negative.  */
1459             rexp = pi_exp;
1460             rsig0 = pi_sig_high;
1461             rsig1 = pi_sig_low;
1462         } else {
1463             /*
1464              * ST0 and ST1 are finite, nonzero and with exponents not
1465              * too far apart.
1466              */
1467             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1468             int32_t azexp, axexp;
1469             bool adj_sub, ysign, zsign;
1470             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1471             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1472             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1473             uint64_t azsig0, azsig1;
1474             uint64_t azsig2, azsig3, axsig0, axsig1;
1475             floatx80 x8;
1476             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1477             FloatX80RoundPrec save_prec =
1478                 env->fp_status.floatx80_rounding_precision;
1479             env->fp_status.float_rounding_mode = float_round_nearest_even;
1480             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1481 
1482             if (arg0_exp == 0) {
1483                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1484             }
1485             if (arg1_exp == 0) {
1486                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1487             }
1488             if (arg0_exp > arg1_exp ||
1489                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1490                 /* Work with abs(ST1) / abs(ST0).  */
1491                 num_exp = arg1_exp;
1492                 num_sig = arg1_sig;
1493                 den_exp = arg0_exp;
1494                 den_sig = arg0_sig;
1495                 if (arg0_sign) {
1496                     /* The result is subtracted from pi.  */
1497                     adj_exp = pi_exp;
1498                     adj_sig0 = pi_sig_high;
1499                     adj_sig1 = pi_sig_low;
1500                     adj_sub = true;
1501                 } else {
1502                     /* The result is used as-is.  */
1503                     adj_exp = 0;
1504                     adj_sig0 = 0;
1505                     adj_sig1 = 0;
1506                     adj_sub = false;
1507                 }
1508             } else {
1509                 /* Work with abs(ST0) / abs(ST1).  */
1510                 num_exp = arg0_exp;
1511                 num_sig = arg0_sig;
1512                 den_exp = arg1_exp;
1513                 den_sig = arg1_sig;
1514                 /* The result is added to or subtracted from pi/2.  */
1515                 adj_exp = pi_2_exp;
1516                 adj_sig0 = pi_2_sig_high;
1517                 adj_sig1 = pi_2_sig_low;
1518                 adj_sub = !arg0_sign;
1519             }
1520 
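            /*
             * From here on this is atan2(|ST1|, |ST0|) reduced to the
             * first octant: the magnitude of the result is
             * adj - arctan(num/den) if adj_sub is set, and
             * adj + arctan(num/den) otherwise, with num <= den so that
             * the arctangent argument lies in (0, 1].
             */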
1521             /*
1522              * Compute x = num/den, where 0 < x <= 1 and x is not too
1523              * small.
1524              */
1525             xexp = num_exp - den_exp + 0x3ffe;
1526             remsig0 = num_sig;
1527             remsig1 = 0;
1528             if (den_sig <= remsig0) {
1529                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1530                 ++xexp;
1531             }
1532             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1533             mul64To128(den_sig, xsig0, &msig0, &msig1);
1534             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
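            /*
             * Restoring-division correction: estimateDiv128To64() may
             * return a quotient digit that is too large by up to 2, in
             * which case the remainder has gone negative; decrement the
             * digit and add the divisor back until the remainder is
             * nonnegative.
             */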
1535             while ((int64_t) remsig0 < 0) {
1536                 --xsig0;
1537                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1538             }
1539             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1540             /*
1541              * No need to correct any estimation error in xsig1; even
1542              * with such error, it is accurate enough.
1543              */
1544 
1545             /*
1546              * Split x as x = t + y, where t = n/8 is the nearest
1547              * multiple of 1/8 to x.
1548              */
1549             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1550                                                false, xexp + 3, xsig0,
1551                                                xsig1, &env->fp_status);
1552             n = floatx80_to_int32(x8, &env->fp_status);
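            /*
             * Since 0 < x <= 1 and rounding is to nearest, n lies in
             * [0, 8] and selects the precomputed arctan(n/8) entry of
             * fpatan_table used below.
             */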
1553             if (n == 0) {
1554                 ysign = false;
1555                 yexp = xexp;
1556                 ysig0 = xsig0;
1557                 ysig1 = xsig1;
1558                 texp = 0;
1559                 tsig = 0;
1560             } else {
1561                 int shift = clz32(n) + 32;
1562                 texp = 0x403b - shift;
1563                 tsig = n;
1564                 tsig <<= shift;
1565                 if (texp == xexp) {
1566                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1567                     if ((int64_t) ysig0 >= 0) {
1568                         ysign = false;
1569                         if (ysig0 == 0) {
1570                             if (ysig1 == 0) {
1571                                 yexp = 0;
1572                             } else {
1573                                 shift = clz64(ysig1) + 64;
1574                                 yexp = xexp - shift;
1575                                 shift128Left(ysig0, ysig1, shift,
1576                                              &ysig0, &ysig1);
1577                             }
1578                         } else {
1579                             shift = clz64(ysig0);
1580                             yexp = xexp - shift;
1581                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1582                         }
1583                     } else {
1584                         ysign = true;
1585                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1586                         if (ysig0 == 0) {
1587                             shift = clz64(ysig1) + 64;
1588                         } else {
1589                             shift = clz64(ysig0);
1590                         }
1591                         yexp = xexp - shift;
1592                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1593                     }
1594                 } else {
1595                     /*
1596                      * t's exponent must be greater than x's because t
1597                      * is positive and the nearest multiple of 1/8 to
1598                      * x, and if x has a greater exponent, the power
1599                      * of 2 with that exponent is also a multiple of
1600                      * 1/8.
1601                      */
1602                     uint64_t usig0, usig1;
1603                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1604                                          &usig0, &usig1);
1605                     ysign = true;
1606                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1607                     if (ysig0 == 0) {
1608                         shift = clz64(ysig1) + 64;
1609                     } else {
1610                         shift = clz64(ysig0);
1611                     }
1612                     yexp = texp - shift;
1613                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1614                 }
1615             }
1616 
1617             /*
1618              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1619              * arctan(z).
1620              */
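            /*
             * This is the arctangent subtraction formula: with
             * tan(a) = x and tan(b) = t, tan(a - b) = (x - t)/(1 + t*x),
             * and y = x - t, so arctan(x) = arctan(t) + arctan(y/(1+tx)).
             */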
1621             zsign = ysign;
1622             if (texp == 0 || yexp == 0) {
1623                 zexp = yexp;
1624                 zsig0 = ysig0;
1625                 zsig1 = ysig1;
1626             } else {
1627                 /*
1628                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1629                  */
1630                 int32_t dexp = texp + xexp - 0x3ffe;
1631                 uint64_t dsig0, dsig1, dsig2;
1632                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1633                 /*
1634                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1635                  * bit).  Add 1 to produce the denominator 1+tx.
1636                  */
1637                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1638                                      &dsig0, &dsig1);
1639                 dsig0 |= 0x8000000000000000ULL;
1640                 zexp = yexp - 1;
1641                 remsig0 = ysig0;
1642                 remsig1 = ysig1;
1643                 remsig2 = 0;
1644                 if (dsig0 <= remsig0) {
1645                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1646                     ++zexp;
1647                 }
1648                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1649                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1650                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1651                        &remsig0, &remsig1, &remsig2);
1652                 while ((int64_t) remsig0 < 0) {
1653                     --zsig0;
1654                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1655                            &remsig0, &remsig1, &remsig2);
1656                 }
1657                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1658                 /* No need to correct any estimation error in zsig1.  */
1659             }
1660 
1661             if (zexp == 0) {
1662                 azexp = 0;
1663                 azsig0 = 0;
1664                 azsig1 = 0;
1665             } else {
1666                 floatx80 z2, accum;
1667                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1668                 /* Compute z^2.  */
1669                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1670                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1671                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1672                                                    zexp + zexp - 0x3ffe,
1673                                                    z2sig0, z2sig1,
1674                                                    &env->fp_status);
1675 
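                /*
                 * Here |y| <= 1/16 (t is the nearest multiple of 1/8
                 * to x) and 1 + t*x >= 1, so |z| <= 1/16 and z*z is at
                 * most 2**-8, which keeps this short polynomial in z*z
                 * accurate.
                 */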
1676                 /* Compute the lower parts of the polynomial expansion.  */
1677                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1678                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1679                 accum = floatx80_mul(accum, z2, &env->fp_status);
1680                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1681                 accum = floatx80_mul(accum, z2, &env->fp_status);
1682                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1683                 accum = floatx80_mul(accum, z2, &env->fp_status);
1684                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1685                 accum = floatx80_mul(accum, z2, &env->fp_status);
1686                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1687                 accum = floatx80_mul(accum, z2, &env->fp_status);
1688 
1689                 /*
1690                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1691                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1692                  */
1693                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1694                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1695                                      aexp - extractFloatx80Exp(accum),
1696                                      &asig0, &asig1);
1697                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1698                        &asig0, &asig1);
1699                 /* Multiply by z to compute arctan(z).  */
1700                 azexp = aexp + zexp - 0x3ffe;
1701                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1702                             &azsig2, &azsig3);
1703             }
1704 
1705             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1706             if (texp == 0) {
1707                 /* z is positive.  */
1708                 axexp = azexp;
1709                 axsig0 = azsig0;
1710                 axsig1 = azsig1;
1711             } else {
1712                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1713                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1714                 uint64_t low_sig0 =
1715                     extractFloatx80Frac(fpatan_table[n].atan_low);
1716                 uint64_t low_sig1 = 0;
1717                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1718                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1719                 axsig1 = 0;
1720                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1721                                      &low_sig0, &low_sig1);
1722                 if (low_sign) {
1723                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1724                            &axsig0, &axsig1);
1725                 } else {
1726                     add128(axsig0, axsig1, low_sig0, low_sig1,
1727                            &axsig0, &axsig1);
1728                 }
1729                 if (azexp >= axexp) {
1730                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1731                                          &axsig0, &axsig1);
1732                     axexp = azexp + 1;
1733                     shift128RightJamming(azsig0, azsig1, 1,
1734                                          &azsig0, &azsig1);
1735                 } else {
1736                     shift128RightJamming(axsig0, axsig1, 1,
1737                                          &axsig0, &axsig1);
1738                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1739                                          &azsig0, &azsig1);
1740                     ++axexp;
1741                 }
1742                 if (zsign) {
1743                     sub128(axsig0, axsig1, azsig0, azsig1,
1744                            &axsig0, &axsig1);
1745                 } else {
1746                     add128(axsig0, axsig1, azsig0, azsig1,
1747                            &axsig0, &axsig1);
1748                 }
1749             }
1750 
1751             if (adj_exp == 0) {
1752                 rexp = axexp;
1753                 rsig0 = axsig0;
1754                 rsig1 = axsig1;
1755             } else {
1756                 /*
1757                  * Add or subtract arctan(x) (exponent axexp,
1758                  * significand axsig0 and axsig1, positive, not
1759                  * necessarily normalized) to the number given by
1760                  * adj_exp, adj_sig0 and adj_sig1, according to
1761                  * adj_sub.
1762                  */
1763                 if (adj_exp >= axexp) {
1764                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1765                                          &axsig0, &axsig1);
1766                     rexp = adj_exp + 1;
1767                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1768                                          &adj_sig0, &adj_sig1);
1769                 } else {
1770                     shift128RightJamming(axsig0, axsig1, 1,
1771                                          &axsig0, &axsig1);
1772                     shift128RightJamming(adj_sig0, adj_sig1,
1773                                          axexp - adj_exp + 1,
1774                                          &adj_sig0, &adj_sig1);
1775                     rexp = axexp + 1;
1776                 }
1777                 if (adj_sub) {
1778                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1779                            &rsig0, &rsig1);
1780                 } else {
1781                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1782                            &rsig0, &rsig1);
1783                 }
1784             }
1785 
1786             env->fp_status.float_rounding_mode = save_mode;
1787             env->fp_status.floatx80_rounding_precision = save_prec;
1788         }
1789         /* This result is inexact.  */
1790         rsig1 |= 1;
1791         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1792                                             rsig0, rsig1, &env->fp_status);
1793     }
1794 
1795     fpop(env);
1796     merge_exception_flags(env, old_flags);
1797 }
1798 
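/*
 * FXTRACT: split ST0 into exponent and significand, so that afterwards
 * ST1 holds the unbiased exponent (as a floatx80 value) and ST0 holds
 * the significand scaled to [1, 2).  Zero, invalid-encoding, NaN,
 * infinity and denormal operands are given special handling below.
 */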
1799 void helper_fxtract(CPUX86State *env)
1800 {
1801     uint8_t old_flags = save_exception_flags(env);
1802     CPU_LDoubleU temp;
1803 
1804     temp.d = ST0;
1805 
1806     if (floatx80_is_zero(ST0)) {
1807         /* Easy way to generate -inf and raise the divide-by-zero exception */
1808         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1809                            &env->fp_status);
1810         fpush(env);
1811         ST0 = temp.d;
1812     } else if (floatx80_invalid_encoding(ST0)) {
1813         float_raise(float_flag_invalid, &env->fp_status);
1814         ST0 = floatx80_default_nan(&env->fp_status);
1815         fpush(env);
1816         ST0 = ST1;
1817     } else if (floatx80_is_any_nan(ST0)) {
1818         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1819             float_raise(float_flag_invalid, &env->fp_status);
1820             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1821         }
1822         fpush(env);
1823         ST0 = ST1;
1824     } else if (floatx80_is_infinity(ST0)) {
1825         fpush(env);
1826         ST0 = ST1;
1827         ST1 = floatx80_infinity;
1828     } else {
1829         int expdif;
1830 
1831         if (EXPD(temp) == 0) {
1832             int shift = clz64(temp.l.lower);
1833             temp.l.lower <<= shift;
1834             expdif = 1 - EXPBIAS - shift;
1835             float_raise(float_flag_input_denormal_flushed, &env->fp_status);
1836         } else {
1837             expdif = EXPD(temp) - EXPBIAS;
1838         }
1839         /* Result: ST1 = unbiased exponent, ST0 = significand scaled to [1, 2).  */
1840         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1841         fpush(env);
1842         BIASEXPONENT(temp);
1843         ST0 = temp.d;
1844     }
1845     merge_exception_flags(env, old_flags);
1846 }
1847 
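/*
 * Illustrative sketch only (not used by the emulation): the analogous
 * exponent/significand split for a host 'double' via frexp() from
 * <math.h>, as a plain-C reference for what FXTRACT computes on
 * normal, finite, nonzero inputs.  The helper above performs the same
 * split on floatx80 values and also covers the special cases.
 */
static inline void fxtract_double_sketch(double x, double *exp_part,
                                         double *sig_part)
{
    /* frexp() gives x == m * 2**e with |m| in [0.5, 1); FXTRACT's
       significand lies in [1, 2), so shift the split point by one. */
    int e;
    double m = frexp(x, &e);

    *sig_part = m * 2.0;          /* significand, magnitude in [1, 2) */
    *exp_part = (double)(e - 1);  /* unbiased exponent */
}
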
1848 static void helper_fprem_common(CPUX86State *env, bool mod)
1849 {
1850     uint8_t old_flags = save_exception_flags(env);
1851     uint64_t quotient;
1852     CPU_LDoubleU temp0, temp1;
1853     int exp0, exp1, expdiff;
1854 
1855     temp0.d = ST0;
1856     temp1.d = ST1;
1857     exp0 = EXPD(temp0);
1858     exp1 = EXPD(temp1);
1859 
1860     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1861     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1862         exp0 == 0x7fff || exp1 == 0x7fff ||
1863         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1864         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1865     } else {
1866         if (exp0 == 0) {
1867             exp0 = 1 - clz64(temp0.l.lower);
1868         }
1869         if (exp1 == 0) {
1870             exp1 = 1 - clz64(temp1.l.lower);
1871         }
1872         expdiff = exp0 - exp1;
1873         if (expdiff < 64) {
1874             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1875             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1876             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1877             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1878         } else {
1879             /*
1880              * Partial remainder.  This choice of how many bits to
1881              * process at once is specified in AMD instruction set
1882              * manuals, and empirically is followed by Intel
1883              * processors as well; it ensures that the final remainder
1884              * operation in a loop does produce the correct low three
1885              * bits of the quotient.  AMD manuals specify that the
1886              * flags other than C2 are cleared, and empirically Intel
1887              * processors clear them as well.
1888              */
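            /*
             * For example, expdiff == 100 gives n == 36, so ST1 is
             * scaled by 2**64 for this partial step and at most about
             * 64 bits of quotient remain for subsequent iterations.
             */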
1889             int n = 32 + (expdiff % 32);
1890             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1891             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1892             env->fpus |= 0x400;  /* C2 <-- 1 */
1893         }
1894     }
1895     merge_exception_flags(env, old_flags);
1896 }
1897 
1898 void helper_fprem1(CPUX86State *env)
1899 {
1900     helper_fprem_common(env, false);
1901 }
1902 
1903 void helper_fprem(CPUX86State *env)
1904 {
1905     helper_fprem_common(env, true);
1906 }
1907 
1908 /* 128-bit significand of log2(e).  */
1909 #define log2_e_sig_high 0xb8aa3b295c17f0bbULL
1910 #define log2_e_sig_low 0xbe87fed0691d3e89ULL
1911 
1912 /*
1913  * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
1914  * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
1915  * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
1916  * interval [sqrt(2)/2, sqrt(2)].
1917  */
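/*
 * These coefficients are applied via the substitution t = x/(2+x)
 * (computed with extra precision in helper_fyl2x_common below): since
 * (1+t)/(1-t) == 1+x,
 *
 *     log2(1+x) = log2((1+t)/(1-t)) = 2*log2(e) * (t + t**3/3 + ...)
 *
 * and indeed fyl2x_coeff_0 is 2*log2(e) = 2.885390...; the polynomial
 * below is evaluated in powers of t**2 and then multiplied by t.
 */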
1918 #define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
1919 #define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
1920 #define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
1921 #define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
1922 #define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
1923 #define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
1924 #define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
1925 #define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
1926 #define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
1927 #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
1928 #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1929 
1930 /*
1931  * Compute an approximation of log2(1+arg), where 1+arg is in the
1932  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1933  * function is called, rounding precision is set to 80 and the
1934  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1935  * and must not be so close to zero that underflow might occur.
1936  */
1937 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1938                                 uint64_t *sig0, uint64_t *sig1)
1939 {
1940     uint64_t arg0_sig = extractFloatx80Frac(arg);
1941     int32_t arg0_exp = extractFloatx80Exp(arg);
1942     bool arg0_sign = extractFloatx80Sign(arg);
1943     bool asign;
1944     int32_t dexp, texp, aexp;
1945     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1946     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1947     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1948     floatx80 t2, accum;
1949 
1950     /*
1951      * Compute an approximation of arg/(2+arg), with extra precision,
1952      * as the argument to a polynomial approximation.  The extra
1953      * precision is only needed for the first term of the
1954      * approximation, with subsequent terms being significantly
1955      * smaller; the approximation only uses odd exponents, and the
1956      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1957      */
1958     if (arg0_sign) {
1959         dexp = 0x3fff;
1960         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1961         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1962     } else {
1963         dexp = 0x4000;
1964         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1965         dsig0 |= 0x8000000000000000ULL;
1966     }
1967     texp = arg0_exp - dexp + 0x3ffe;
1968     rsig0 = arg0_sig;
1969     rsig1 = 0;
1970     rsig2 = 0;
1971     if (dsig0 <= rsig0) {
1972         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1973         ++texp;
1974     }
1975     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1976     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1977     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1978            &rsig0, &rsig1, &rsig2);
1979     while ((int64_t) rsig0 < 0) {
1980         --tsig0;
1981         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1982                &rsig0, &rsig1, &rsig2);
1983     }
1984     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1985     /*
1986      * No need to correct any estimation error in tsig1; even with
1987      * such error, it is accurate enough.  Now compute the square of
1988      * that approximation.
1989      */
1990     mul128To256(tsig0, tsig1, tsig0, tsig1,
1991                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1992     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1993                                        texp + texp - 0x3ffe,
1994                                        t2sig0, t2sig1, &env->fp_status);
1995 
1996     /* Compute the lower parts of the polynomial expansion.  */
1997     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1998     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1999     accum = floatx80_mul(accum, t2, &env->fp_status);
2000     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
2001     accum = floatx80_mul(accum, t2, &env->fp_status);
2002     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
2003     accum = floatx80_mul(accum, t2, &env->fp_status);
2004     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
2005     accum = floatx80_mul(accum, t2, &env->fp_status);
2006     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
2007     accum = floatx80_mul(accum, t2, &env->fp_status);
2008     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
2009     accum = floatx80_mul(accum, t2, &env->fp_status);
2010     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
2011     accum = floatx80_mul(accum, t2, &env->fp_status);
2012     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
2013     accum = floatx80_mul(accum, t2, &env->fp_status);
2014     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
2015 
2016     /*
2017      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
2018      * accum has much lower magnitude, and so, in particular, carry
2019      * out of the addition is not possible), multiplied by t.  (This
2020      * expansion is only accurate to about 70 bits, not 128 bits.)
2021      */
2022     aexp = extractFloatx80Exp(fyl2x_coeff_0);
2023     asign = extractFloatx80Sign(fyl2x_coeff_0);
2024     shift128RightJamming(extractFloatx80Frac(accum), 0,
2025                          aexp - extractFloatx80Exp(accum),
2026                          &asig0, &asig1);
2027     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
2028     bsig1 = 0;
2029     if (asign == extractFloatx80Sign(accum)) {
2030         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2031     } else {
2032         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
2033     }
2034     /* Multiply by t to compute the required result.  */
2035     mul128To256(asig0, asig1, tsig0, tsig1,
2036                 &asig0, &asig1, &asig2, &asig3);
2037     aexp += texp - 0x3ffe;
2038     *exp = aexp;
2039     *sig0 = asig0;
2040     *sig1 = asig1;
2041 }
2042 
2043 void helper_fyl2xp1(CPUX86State *env)
2044 {
2045     uint8_t old_flags = save_exception_flags(env);
2046     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2047     int32_t arg0_exp = extractFloatx80Exp(ST0);
2048     bool arg0_sign = extractFloatx80Sign(ST0);
2049     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2050     int32_t arg1_exp = extractFloatx80Exp(ST1);
2051     bool arg1_sign = extractFloatx80Sign(ST1);
2052 
2053     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2054         float_raise(float_flag_invalid, &env->fp_status);
2055         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2056     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2057         float_raise(float_flag_invalid, &env->fp_status);
2058         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2059     } else if (floatx80_invalid_encoding(ST0) ||
2060                floatx80_invalid_encoding(ST1)) {
2061         float_raise(float_flag_invalid, &env->fp_status);
2062         ST1 = floatx80_default_nan(&env->fp_status);
2063     } else if (floatx80_is_any_nan(ST0)) {
2064         ST1 = ST0;
2065     } else if (floatx80_is_any_nan(ST1)) {
2066         /* Pass this NaN through.  */
2067     } else if (arg0_exp > 0x3ffd ||
2068                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2069                                                   0x95f619980c4336f7ULL :
2070                                                   0xd413cccfe7799211ULL))) {
2071         /*
2072          * Out of range for the instruction (ST0 must have absolute
2073          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2074          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2075          * to sqrt(2) - 1, which we allow here), treat as invalid.
2076          */
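        /*
         * (The significand bounds tested above encode 1 - sqrt(2)/2 =
         * 0.2928... for negative ST0 and sqrt(2) - 1 = 0.4142... for
         * positive ST0, both at exponent 0x3ffd, i.e. scaled by 2**-2.)
         */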
2077         float_raise(float_flag_invalid, &env->fp_status);
2078         ST1 = floatx80_default_nan(&env->fp_status);
2079     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2080                arg1_exp == 0x7fff) {
2081         /*
2082          * One argument is zero, or multiplying by infinity; correct
2083          * result is exact and can be obtained by multiplying the
2084          * arguments.
2085          */
2086         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2087     } else if (arg0_exp < 0x3fb0) {
2088         /*
2089          * Multiplying both arguments and an extra-precision version
2090          * of log2(e) is sufficiently precise.
2091          */
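        /*
         * Here |ST0| < 2**-79 and log2(1 + x) = x * log2(e) *
         * (1 - x/2 + ...), so keeping only the leading term introduces
         * a relative error below 2**-80, well under one unit in the
         * last place of the 64-bit result.
         */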
2092         uint64_t sig0, sig1, sig2;
2093         int32_t exp;
2094         if (arg0_exp == 0) {
2095             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2096         }
2097         if (arg1_exp == 0) {
2098             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2099         }
2100         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2101                         &sig0, &sig1, &sig2);
2102         exp = arg0_exp + 1;
2103         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2104         exp += arg1_exp - 0x3ffe;
2105         /* This result is inexact.  */
2106         sig1 |= 1;
2107         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2108                                             arg0_sign ^ arg1_sign, exp,
2109                                             sig0, sig1, &env->fp_status);
2110     } else {
2111         int32_t aexp;
2112         uint64_t asig0, asig1, asig2;
2113         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2114         FloatX80RoundPrec save_prec =
2115             env->fp_status.floatx80_rounding_precision;
2116         env->fp_status.float_rounding_mode = float_round_nearest_even;
2117         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2118 
2119         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2120         /*
2121          * Multiply by the second argument to compute the required
2122          * result.
2123          */
2124         if (arg1_exp == 0) {
2125             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2126         }
2127         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2128         aexp += arg1_exp - 0x3ffe;
2129         /* This result is inexact.  */
2130         asig1 |= 1;
2131         env->fp_status.float_rounding_mode = save_mode;
2132         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2133                                             arg0_sign ^ arg1_sign, aexp,
2134                                             asig0, asig1, &env->fp_status);
2135         env->fp_status.floatx80_rounding_precision = save_prec;
2136     }
2137     fpop(env);
2138     merge_exception_flags(env, old_flags);
2139 }
2140 
2141 void helper_fyl2x(CPUX86State *env)
2142 {
2143     uint8_t old_flags = save_exception_flags(env);
2144     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2145     int32_t arg0_exp = extractFloatx80Exp(ST0);
2146     bool arg0_sign = extractFloatx80Sign(ST0);
2147     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2148     int32_t arg1_exp = extractFloatx80Exp(ST1);
2149     bool arg1_sign = extractFloatx80Sign(ST1);
2150 
2151     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2152         float_raise(float_flag_invalid, &env->fp_status);
2153         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2154     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2155         float_raise(float_flag_invalid, &env->fp_status);
2156         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2157     } else if (floatx80_invalid_encoding(ST0) ||
2158                floatx80_invalid_encoding(ST1)) {
2159         float_raise(float_flag_invalid, &env->fp_status);
2160         ST1 = floatx80_default_nan(&env->fp_status);
2161     } else if (floatx80_is_any_nan(ST0)) {
2162         ST1 = ST0;
2163     } else if (floatx80_is_any_nan(ST1)) {
2164         /* Pass this NaN through.  */
2165     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2166         float_raise(float_flag_invalid, &env->fp_status);
2167         ST1 = floatx80_default_nan(&env->fp_status);
2168     } else if (floatx80_is_infinity(ST1)) {
2169         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2170                                              &env->fp_status);
2171         switch (cmp) {
2172         case float_relation_less:
2173             ST1 = floatx80_chs(ST1);
2174             break;
2175         case float_relation_greater:
2176             /* Result is infinity of the same sign as ST1.  */
2177             break;
2178         default:
2179             float_raise(float_flag_invalid, &env->fp_status);
2180             ST1 = floatx80_default_nan(&env->fp_status);
2181             break;
2182         }
2183     } else if (floatx80_is_infinity(ST0)) {
2184         if (floatx80_is_zero(ST1)) {
2185             float_raise(float_flag_invalid, &env->fp_status);
2186             ST1 = floatx80_default_nan(&env->fp_status);
2187         } else if (arg1_sign) {
2188             ST1 = floatx80_chs(ST0);
2189         } else {
2190             ST1 = ST0;
2191         }
2192     } else if (floatx80_is_zero(ST0)) {
2193         if (floatx80_is_zero(ST1)) {
2194             float_raise(float_flag_invalid, &env->fp_status);
2195             ST1 = floatx80_default_nan(&env->fp_status);
2196         } else {
2197             /* Result is infinity with opposite sign to ST1.  */
2198             float_raise(float_flag_divbyzero, &env->fp_status);
2199             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2200                                 0x8000000000000000ULL);
2201         }
2202     } else if (floatx80_is_zero(ST1)) {
2203         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2204             ST1 = floatx80_chs(ST1);
2205         }
2206         /* Otherwise, ST1 is already the correct result.  */
2207     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2208         if (arg1_sign) {
2209             ST1 = floatx80_chs(floatx80_zero);
2210         } else {
2211             ST1 = floatx80_zero;
2212         }
2213     } else {
2214         int32_t int_exp;
2215         floatx80 arg0_m1;
2216         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2217         FloatX80RoundPrec save_prec =
2218             env->fp_status.floatx80_rounding_precision;
2219         env->fp_status.float_rounding_mode = float_round_nearest_even;
2220         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2221 
2222         if (arg0_exp == 0) {
2223             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2224         }
2225         if (arg1_exp == 0) {
2226             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2227         }
2228         int_exp = arg0_exp - 0x3fff;
2229         if (arg0_sig > 0xb504f333f9de6484ULL) {
2230             ++int_exp;
2231         }
2232         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2233                                                &env->fp_status),
2234                                floatx80_one, &env->fp_status);
2235         if (floatx80_is_zero(arg0_m1)) {
2236             /* Exact power of 2; multiply by ST1.  */
2237             env->fp_status.float_rounding_mode = save_mode;
2238             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2239                                ST1, &env->fp_status);
2240         } else {
2241             bool asign = extractFloatx80Sign(arg0_m1);
2242             int32_t aexp;
2243             uint64_t asig0, asig1, asig2;
2244             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2245             if (int_exp != 0) {
2246                 bool isign = (int_exp < 0);
2247                 int32_t iexp;
2248                 uint64_t isig;
2249                 int shift;
2250                 int_exp = isign ? -int_exp : int_exp;
2251                 shift = clz32(int_exp) + 32;
2252                 isig = int_exp;
2253                 isig <<= shift;
2254                 iexp = 0x403e - shift;
2255                 shift128RightJamming(asig0, asig1, iexp - aexp,
2256                                      &asig0, &asig1);
2257                 if (asign == isign) {
2258                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2259                 } else {
2260                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2261                 }
2262                 aexp = iexp;
2263                 asign = isign;
2264             }
2265             /*
2266              * Multiply by the second argument to compute the required
2267              * result.
2268              */
2269             if (arg1_exp == 0) {
2270                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2271             }
2272             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2273             aexp += arg1_exp - 0x3ffe;
2274             /* This result is inexact.  */
2275             asig1 |= 1;
2276             env->fp_status.float_rounding_mode = save_mode;
2277             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2278                                                 asign ^ arg1_sign, aexp,
2279                                                 asig0, asig1, &env->fp_status);
2280         }
2281 
2282         env->fp_status.floatx80_rounding_precision = save_prec;
2283     }
2284     fpop(env);
2285     merge_exception_flags(env, old_flags);
2286 }
2287 
2288 void helper_fsqrt(CPUX86State *env)
2289 {
2290     uint8_t old_flags = save_exception_flags(env);
2291     if (floatx80_is_neg(ST0)) {
2292         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2293         env->fpus |= 0x400;
2294     }
2295     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2296     merge_exception_flags(env, old_flags);
2297 }
2298 
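/*
 * Unlike the softfloat-based helpers above, FSINCOS (and FSIN/FCOS
 * further below) approximate their results by converting to the host
 * 'double' type and calling the libm sin()/cos(), so they are not
 * bit-exact in extended precision.
 */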
2299 void helper_fsincos(CPUX86State *env)
2300 {
2301     double fptemp = floatx80_to_double(env, ST0);
2302 
2303     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2304         env->fpus |= 0x400;
2305     } else {
2306         ST0 = double_to_floatx80(env, sin(fptemp));
2307         fpush(env);
2308         ST0 = double_to_floatx80(env, cos(fptemp));
2309         env->fpus &= ~0x400;  /* C2 <-- 0 */
2310         /* the above code is for |arg| < 2**63 only */
2311     }
2312 }
2313 
2314 void helper_frndint(CPUX86State *env)
2315 {
2316     uint8_t old_flags = save_exception_flags(env);
2317     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2318     merge_exception_flags(env, old_flags);
2319 }
2320 
2321 void helper_fscale(CPUX86State *env)
2322 {
2323     uint8_t old_flags = save_exception_flags(env);
2324     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2325         float_raise(float_flag_invalid, &env->fp_status);
2326         ST0 = floatx80_default_nan(&env->fp_status);
2327     } else if (floatx80_is_any_nan(ST1)) {
2328         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2329             float_raise(float_flag_invalid, &env->fp_status);
2330         }
2331         ST0 = ST1;
2332         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2333             float_raise(float_flag_invalid, &env->fp_status);
2334             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2335         }
2336     } else if (floatx80_is_infinity(ST1) &&
2337                !floatx80_invalid_encoding(ST0) &&
2338                !floatx80_is_any_nan(ST0)) {
2339         if (floatx80_is_neg(ST1)) {
2340             if (floatx80_is_infinity(ST0)) {
2341                 float_raise(float_flag_invalid, &env->fp_status);
2342                 ST0 = floatx80_default_nan(&env->fp_status);
2343             } else {
2344                 ST0 = (floatx80_is_neg(ST0) ?
2345                        floatx80_chs(floatx80_zero) :
2346                        floatx80_zero);
2347             }
2348         } else {
2349             if (floatx80_is_zero(ST0)) {
2350                 float_raise(float_flag_invalid, &env->fp_status);
2351                 ST0 = floatx80_default_nan(&env->fp_status);
2352             } else {
2353                 ST0 = (floatx80_is_neg(ST0) ?
2354                        floatx80_chs(floatx80_infinity) :
2355                        floatx80_infinity);
2356             }
2357         }
2358     } else {
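        /*
         * The truncation of ST1 to an integer below may raise inexact
         * or invalid; those flags are deliberately discarded by
         * restoring save_flags, so only the final scalbn reports
         * exceptions.
         */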
2359         int n;
2360         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2361         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2362         set_float_exception_flags(0, &env->fp_status);
2363         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2364         set_float_exception_flags(save_flags, &env->fp_status);
2365         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2366         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2367         env->fp_status.floatx80_rounding_precision = save;
2368     }
2369     merge_exception_flags(env, old_flags);
2370 }
2371 
2372 void helper_fsin(CPUX86State *env)
2373 {
2374     double fptemp = floatx80_to_double(env, ST0);
2375 
2376     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2377         env->fpus |= 0x400;
2378     } else {
2379         ST0 = double_to_floatx80(env, sin(fptemp));
2380         env->fpus &= ~0x400;  /* C2 <-- 0 */
2381         /* the above code is for |arg| < 2**63 only */
2382     }
2383 }
2384 
2385 void helper_fcos(CPUX86State *env)
2386 {
2387     double fptemp = floatx80_to_double(env, ST0);
2388 
2389     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2390         env->fpus |= 0x400;
2391     } else {
2392         ST0 = double_to_floatx80(env, cos(fptemp));
2393         env->fpus &= ~0x400;  /* C2 <-- 0 */
2394         /* the above code is for |arg| < 2**63 only */
2395     }
2396 }
2397 
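/*
 * FXAM classifies ST0 into C3,C2,C0 (with C1 receiving the sign bit):
 *   000 unsupported encoding, 001 NaN, 010 normal finite,
 *   011 infinity, 100 zero, 101 empty register, 110 denormal.
 */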
2398 void helper_fxam_ST0(CPUX86State *env)
2399 {
2400     CPU_LDoubleU temp;
2401     int expdif;
2402 
2403     temp.d = ST0;
2404 
2405     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2406     if (SIGND(temp)) {
2407         env->fpus |= 0x200; /* C1 <-- 1 */
2408     }
2409 
2410     if (env->fptags[env->fpstt]) {
2411         env->fpus |= 0x4100; /* Empty */
2412         return;
2413     }
2414 
2415     expdif = EXPD(temp);
2416     if (expdif == MAXEXPD) {
2417         if (MANTD(temp) == 0x8000000000000000ULL) {
2418             env->fpus |= 0x500; /* Infinity */
2419         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2420             env->fpus |= 0x100; /* NaN */
2421         }
2422     } else if (expdif == 0) {
2423         if (MANTD(temp) == 0) {
2424             env->fpus |=  0x4000; /* Zero */
2425         } else {
2426             env->fpus |= 0x4400; /* Denormal */
2427         }
2428     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2429         env->fpus |= 0x400; /* Normal */
2430     }
2431 }
2432 
2433 static void do_fstenv(X86Access *ac, target_ulong ptr, int data32)
2434 {
2435     CPUX86State *env = ac->env;
2436     int fpus, fptag, exp, i;
2437     uint64_t mant;
2438     CPU_LDoubleU tmp;
2439 
2440     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2441     fptag = 0;
2442     for (i = 7; i >= 0; i--) {
2443         fptag <<= 2;
2444         if (env->fptags[i]) {
2445             fptag |= 3;
2446         } else {
2447             tmp.d = env->fpregs[i].d;
2448             exp = EXPD(tmp);
2449             mant = MANTD(tmp);
2450             if (exp == 0 && mant == 0) {
2451                 /* zero */
2452                 fptag |= 1;
2453             } else if (exp == 0 || exp == MAXEXPD
2454                        || (mant & (1LL << 63)) == 0) {
2455                 /* NaNs, infinity, denormal */
2456                 fptag |= 2;
2457             }
2458         }
2459     }
2460     if (data32) {
2461         /* 32 bit */
2462         access_stl(ac, ptr, env->fpuc);
2463         access_stl(ac, ptr + 4, fpus);
2464         access_stl(ac, ptr + 8, fptag);
2465         access_stl(ac, ptr + 12, env->fpip); /* fp instruction offset */
2466         access_stl(ac, ptr + 16, env->fpcs); /* fp instruction selector */
2467         access_stl(ac, ptr + 20, env->fpdp); /* fp operand offset */
2468         access_stl(ac, ptr + 24, env->fpds); /* fp operand selector */
2469     } else {
2470         /* 16 bit */
2471         access_stw(ac, ptr, env->fpuc);
2472         access_stw(ac, ptr + 2, fpus);
2473         access_stw(ac, ptr + 4, fptag);
2474         access_stw(ac, ptr + 6, env->fpip);
2475         access_stw(ac, ptr + 8, env->fpcs);
2476         access_stw(ac, ptr + 10, env->fpdp);
2477         access_stw(ac, ptr + 12, env->fpds);
2478     }
2479 }
2480 
2481 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2482 {
2483     X86Access ac;
2484 
2485     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_STORE, GETPC());
2486     do_fstenv(&ac, ptr, data32);
2487 }
2488 
2489 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2490 {
2491     env->fpstt = (fpus >> 11) & 7;
2492     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2493     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2494 #if !defined(CONFIG_USER_ONLY)
2495     if (!(env->fpus & FPUS_SE)) {
2496         /*
2497          * Here the processor deasserts FERR#; in response, the chipset deasserts
2498          * IGNNE#.
2499          */
2500         cpu_clear_ignne();
2501     }
2502 #endif
2503 }
2504 
2505 static void do_fldenv(X86Access *ac, target_ulong ptr, int data32)
2506 {
2507     int i, fpus, fptag;
2508     CPUX86State *env = ac->env;
2509 
2510     cpu_set_fpuc(env, access_ldw(ac, ptr));
2511     fpus = access_ldw(ac, ptr + (2 << data32));
2512     fptag = access_ldw(ac, ptr + (4 << data32));
2513 
2514     cpu_set_fpus(env, fpus);
2515     for (i = 0; i < 8; i++) {
2516         env->fptags[i] = ((fptag & 3) == 3);
2517         fptag >>= 2;
2518     }
2519 }
2520 
2521 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2522 {
2523     X86Access ac;
2524 
2525     access_prepare(&ac, env, ptr, 14 << data32, MMU_DATA_LOAD, GETPC());
2526     do_fldenv(&ac, ptr, data32);
2527 }
2528 
2529 static void do_fsave(X86Access *ac, target_ulong ptr, int data32)
2530 {
2531     CPUX86State *env = ac->env;
2532 
2533     do_fstenv(ac, ptr, data32);
2534     ptr += 14 << data32;
2535 
2536     for (int i = 0; i < 8; i++) {
2537         floatx80 tmp = ST(i);
2538         do_fstt(ac, ptr, tmp);
2539         ptr += 10;
2540     }
2541 
2542     do_fninit(env);
2543 }
2544 
2545 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2546 {
2547     int size = (14 << data32) + 80;
2548     X86Access ac;
2549 
2550     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, GETPC());
2551     do_fsave(&ac, ptr, data32);
2552 }
2553 
2554 static void do_frstor(X86Access *ac, target_ulong ptr, int data32)
2555 {
2556     CPUX86State *env = ac->env;
2557 
2558     do_fldenv(ac, ptr, data32);
2559     ptr += 14 << data32;
2560 
2561     for (int i = 0; i < 8; i++) {
2562         floatx80 tmp = do_fldt(ac, ptr);
2563         ST(i) = tmp;
2564         ptr += 10;
2565     }
2566 }
2567 
2568 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2569 {
2570     int size = (14 << data32) + 80;
2571     X86Access ac;
2572 
2573     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, GETPC());
2574     do_frstor(&ac, ptr, data32);
2575 }
2576 
2577 #define XO(X)  offsetof(X86XSaveArea, X)
2578 
2579 static void do_xsave_fpu(X86Access *ac, target_ulong ptr)
2580 {
2581     CPUX86State *env = ac->env;
2582     int fpus, fptag, i;
2583     target_ulong addr;
2584 
2585     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2586     fptag = 0;
2587     for (i = 0; i < 8; i++) {
2588         fptag |= (env->fptags[i] << i);
2589     }
2590 
2591     access_stw(ac, ptr + XO(legacy.fcw), env->fpuc);
2592     access_stw(ac, ptr + XO(legacy.fsw), fpus);
2593     access_stw(ac, ptr + XO(legacy.ftw), fptag ^ 0xff);
2594 
2595     /* In 32-bit mode this is eip, sel, dp, sel.
2596        In 64-bit mode this is rip, rdp.
2597        But in either case we don't write actual data, just zeros.  */
2598     access_stq(ac, ptr + XO(legacy.fpip), 0); /* eip+sel; rip */
2599     access_stq(ac, ptr + XO(legacy.fpdp), 0); /* edp+sel; rdp */
2600 
2601     addr = ptr + XO(legacy.fpregs);
2602 
2603     for (i = 0; i < 8; i++) {
2604         floatx80 tmp = ST(i);
2605         do_fstt(ac, addr, tmp);
2606         addr += 16;
2607     }
2608 }
2609 
2610 static void do_xsave_mxcsr(X86Access *ac, target_ulong ptr)
2611 {
2612     CPUX86State *env = ac->env;
2613 
2614     update_mxcsr_from_sse_status(env);
2615     access_stl(ac, ptr + XO(legacy.mxcsr), env->mxcsr);
2616     access_stl(ac, ptr + XO(legacy.mxcsr_mask), 0x0000ffff);
2617 }
2618 
2619 static void do_xsave_sse(X86Access *ac, target_ulong ptr)
2620 {
2621     CPUX86State *env = ac->env;
2622     int i, nb_xmm_regs;
2623     target_ulong addr;
2624 
2625     if (env->hflags & HF_CS64_MASK) {
2626         nb_xmm_regs = 16;
2627     } else {
2628         nb_xmm_regs = 8;
2629     }
2630 
2631     addr = ptr + XO(legacy.xmm_regs);
2632     for (i = 0; i < nb_xmm_regs; i++) {
2633         access_stq(ac, addr, env->xmm_regs[i].ZMM_Q(0));
2634         access_stq(ac, addr + 8, env->xmm_regs[i].ZMM_Q(1));
2635         addr += 16;
2636     }
2637 }
2638 
2639 static void do_xsave_ymmh(X86Access *ac, target_ulong ptr)
2640 {
2641     CPUX86State *env = ac->env;
2642     int i, nb_xmm_regs;
2643 
2644     if (env->hflags & HF_CS64_MASK) {
2645         nb_xmm_regs = 16;
2646     } else {
2647         nb_xmm_regs = 8;
2648     }
2649 
2650     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2651         access_stq(ac, ptr, env->xmm_regs[i].ZMM_Q(2));
2652         access_stq(ac, ptr + 8, env->xmm_regs[i].ZMM_Q(3));
2653     }
2654 }
2655 
2656 static void do_xsave_bndregs(X86Access *ac, target_ulong ptr)
2657 {
2658     CPUX86State *env = ac->env;
2659     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2660     int i;
2661 
2662     for (i = 0; i < 4; i++, addr += 16) {
2663         access_stq(ac, addr, env->bnd_regs[i].lb);
2664         access_stq(ac, addr + 8, env->bnd_regs[i].ub);
2665     }
2666 }
2667 
2668 static void do_xsave_bndcsr(X86Access *ac, target_ulong ptr)
2669 {
2670     CPUX86State *env = ac->env;
2671 
2672     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2673                env->bndcs_regs.cfgu);
2674     access_stq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2675                env->bndcs_regs.sts);
2676 }
2677 
2678 static void do_xsave_pkru(X86Access *ac, target_ulong ptr)
2679 {
2680     access_stq(ac, ptr, ac->env->pkru);
2681 }
2682 
2683 static void do_fxsave(X86Access *ac, target_ulong ptr)
2684 {
2685     CPUX86State *env = ac->env;
2686 
2687     do_xsave_fpu(ac, ptr);
2688     if (env->cr[4] & CR4_OSFXSR_MASK) {
2689         do_xsave_mxcsr(ac, ptr);
2690         /* Fast FXSAVE leaves out the XMM registers */
2691         if (!(env->efer & MSR_EFER_FFXSR)
2692             || (env->hflags & HF_CPL_MASK)
2693             || !(env->hflags & HF_LMA_MASK)) {
2694             do_xsave_sse(ac, ptr);
2695         }
2696     }
2697 }
2698 
2699 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2700 {
2701     uintptr_t ra = GETPC();
2702     X86Access ac;
2703 
2704     /* The operand must be 16 byte aligned */
2705     if (ptr & 0xf) {
2706         raise_exception_ra(env, EXCP0D_GPF, ra);
2707     }
2708 
2709     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2710                    MMU_DATA_STORE, ra);
2711     do_fxsave(&ac, ptr);
2712 }
2713 
2714 static uint64_t get_xinuse(CPUX86State *env)
2715 {
2716     uint64_t inuse = -1;
2717 
2718     /* For the most part, we don't track XINUSE.  We could calculate it
2719        here for all components, but it's probably less work to simply say
2720        everything is in use.  That said, the state of BNDREGS is important
2721        enough to track in HFLAGS, so we might as well use that here.  */
2722     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2723         inuse &= ~XSTATE_BNDREGS_MASK;
2724     }
2725     return inuse;
2726 }
2727 
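/*
 * rfbm is the requested-feature bitmap (already masked with XCR0 by
 * the caller), opt is the subset whose data is actually written (equal
 * to rfbm for XSAVE, restricted to in-use components for XSAVEOPT),
 * and inuse supplies the new XSTATE_BV bits for the components covered
 * by rfbm; XSTATE_BV bits outside rfbm are preserved.
 */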
2728 static void do_xsave_access(X86Access *ac, target_ulong ptr, uint64_t rfbm,
2729                             uint64_t inuse, uint64_t opt)
2730 {
2731     uint64_t old_bv, new_bv;
2732 
2733     if (opt & XSTATE_FP_MASK) {
2734         do_xsave_fpu(ac, ptr);
2735     }
2736     if (rfbm & XSTATE_SSE_MASK) {
2737         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2738         do_xsave_mxcsr(ac, ptr);
2739     }
2740     if (opt & XSTATE_SSE_MASK) {
2741         do_xsave_sse(ac, ptr);
2742     }
2743     if (opt & XSTATE_YMM_MASK) {
2744         do_xsave_ymmh(ac, ptr + XO(avx_state));
2745     }
2746     if (opt & XSTATE_BNDREGS_MASK) {
2747         do_xsave_bndregs(ac, ptr + XO(bndreg_state));
2748     }
2749     if (opt & XSTATE_BNDCSR_MASK) {
2750         do_xsave_bndcsr(ac, ptr + XO(bndcsr_state));
2751     }
2752     if (opt & XSTATE_PKRU_MASK) {
2753         do_xsave_pkru(ac, ptr + XO(pkru_state));
2754     }
2755 
2756     /* Update the XSTATE_BV field.  */
2757     old_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2758     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2759     access_stq(ac, ptr + XO(header.xstate_bv), new_bv);
2760 }
2761 
2762 static void do_xsave_chk(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2763 {
2764     /* The OS must have enabled XSAVE.  */
2765     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2766         raise_exception_ra(env, EXCP06_ILLOP, ra);
2767     }
2768 
2769     /* The operand must be 64 byte aligned.  */
2770     if (ptr & 63) {
2771         raise_exception_ra(env, EXCP0D_GPF, ra);
2772     }
2773 }
2774 
2775 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2776                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2777 {
2778     X86Access ac;
2779     unsigned size;
2780 
2781     do_xsave_chk(env, ptr, ra);
2782 
2783     /* Never save anything not enabled by XCR0.  */
2784     rfbm &= env->xcr0;
2785     opt &= rfbm;
2786     size = xsave_area_size(opt, false);
2787 
2788     access_prepare(&ac, env, ptr, size, MMU_DATA_STORE, ra);
2789     do_xsave_access(&ac, ptr, rfbm, inuse, opt);
2790 }
2791 
2792 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2793 {
2794     do_xsave(env, ptr, rfbm, get_xinuse(env), rfbm, GETPC());
2795 }
2796 
2797 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2798 {
2799     uint64_t inuse = get_xinuse(env);
2800     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2801 }
2802 
2803 static void do_xrstor_fpu(X86Access *ac, target_ulong ptr)
2804 {
2805     CPUX86State *env = ac->env;
2806     int i, fpuc, fpus, fptag;
2807     target_ulong addr;
2808 
2809     fpuc = access_ldw(ac, ptr + XO(legacy.fcw));
2810     fpus = access_ldw(ac, ptr + XO(legacy.fsw));
2811     fptag = access_ldw(ac, ptr + XO(legacy.ftw));
2812     cpu_set_fpuc(env, fpuc);
2813     cpu_set_fpus(env, fpus);
2814 
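    /*
     * The abridged tag word in the FXSAVE format has a bit set for
     * each valid register, whereas env->fptags[] stores 1 for empty
     * entries, hence the inversion.
     */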
2815     fptag ^= 0xff;
2816     for (i = 0; i < 8; i++) {
2817         env->fptags[i] = ((fptag >> i) & 1);
2818     }
2819 
2820     addr = ptr + XO(legacy.fpregs);
2821 
2822     for (i = 0; i < 8; i++) {
2823         floatx80 tmp = do_fldt(ac, addr);
2824         ST(i) = tmp;
2825         addr += 16;
2826     }
2827 }
2828 
2829 static void do_xrstor_mxcsr(X86Access *ac, target_ulong ptr)
2830 {
2831     CPUX86State *env = ac->env;
2832     cpu_set_mxcsr(env, access_ldl(ac, ptr + XO(legacy.mxcsr)));
2833 }
2834 
2835 static void do_xrstor_sse(X86Access *ac, target_ulong ptr)
2836 {
2837     CPUX86State *env = ac->env;
2838     int i, nb_xmm_regs;
2839     target_ulong addr;
2840 
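    /* Only XMM0-7 are architecturally accessible outside 64-bit code.  */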
2841     if (env->hflags & HF_CS64_MASK) {
2842         nb_xmm_regs = 16;
2843     } else {
2844         nb_xmm_regs = 8;
2845     }
2846 
2847     addr = ptr + XO(legacy.xmm_regs);
2848     for (i = 0; i < nb_xmm_regs; i++) {
2849         env->xmm_regs[i].ZMM_Q(0) = access_ldq(ac, addr);
2850         env->xmm_regs[i].ZMM_Q(1) = access_ldq(ac, addr + 8);
2851         addr += 16;
2852     }
2853 }
2854 
2855 static void do_clear_sse(CPUX86State *env)
2856 {
2857     int i, nb_xmm_regs;
2858 
2859     if (env->hflags & HF_CS64_MASK) {
2860         nb_xmm_regs = 16;
2861     } else {
2862         nb_xmm_regs = 8;
2863     }
2864 
2865     for (i = 0; i < nb_xmm_regs; i++) {
2866         env->xmm_regs[i].ZMM_Q(0) = 0;
2867         env->xmm_regs[i].ZMM_Q(1) = 0;
2868     }
2869 }
2870 
2871 static void do_xrstor_ymmh(X86Access *ac, target_ulong ptr)
2872 {
2873     CPUX86State *env = ac->env;
2874     int i, nb_xmm_regs;
2875 
2876     if (env->hflags & HF_CS64_MASK) {
2877         nb_xmm_regs = 16;
2878     } else {
2879         nb_xmm_regs = 8;
2880     }
2881 
2882     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2883         env->xmm_regs[i].ZMM_Q(2) = access_ldq(ac, ptr);
2884         env->xmm_regs[i].ZMM_Q(3) = access_ldq(ac, ptr + 8);
2885     }
2886 }
2887 
2888 static void do_clear_ymmh(CPUX86State *env)
2889 {
2890     int i, nb_xmm_regs;
2891 
2892     if (env->hflags & HF_CS64_MASK) {
2893         nb_xmm_regs = 16;
2894     } else {
2895         nb_xmm_regs = 8;
2896     }
2897 
2898     for (i = 0; i < nb_xmm_regs; i++) {
2899         env->xmm_regs[i].ZMM_Q(2) = 0;
2900         env->xmm_regs[i].ZMM_Q(3) = 0;
2901     }
2902 }
2903 
2904 static void do_xrstor_bndregs(X86Access *ac, target_ulong ptr)
2905 {
2906     CPUX86State *env = ac->env;
2907     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2908     int i;
2909 
2910     for (i = 0; i < 4; i++, addr += 16) {
2911         env->bnd_regs[i].lb = access_ldq(ac, addr);
2912         env->bnd_regs[i].ub = access_ldq(ac, addr + 8);
2913     }
2914 }
2915 
2916 static void do_xrstor_bndcsr(X86Access *ac, target_ulong ptr)
2917 {
2918     CPUX86State *env = ac->env;
2919 
2920     /* FIXME: Extend highest implemented bit of linear address.  */
2921     env->bndcs_regs.cfgu
2922         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu));
2923     env->bndcs_regs.sts
2924         = access_ldq(ac, ptr + offsetof(XSaveBNDCSR, bndcsr.sts));
2925 }
2926 
2927 static void do_xrstor_pkru(X86Access *ac, target_ulong ptr)
2928 {
2929     ac->env->pkru = access_ldq(ac, ptr);
2930 }
2931 
2932 static void do_fxrstor(X86Access *ac, target_ulong ptr)
2933 {
2934     CPUX86State *env = ac->env;
2935 
2936     do_xrstor_fpu(ac, ptr);
2937     if (env->cr[4] & CR4_OSFXSR_MASK) {
2938         do_xrstor_mxcsr(ac, ptr);
2939         /* Fast FXRSTOR leaves out the XMM registers */
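        /* (this applies only at CPL 0 with long mode active) */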
2940         if (!(env->efer & MSR_EFER_FFXSR)
2941             || (env->hflags & HF_CPL_MASK)
2942             || !(env->hflags & HF_LMA_MASK)) {
2943             do_xrstor_sse(ac, ptr);
2944         }
2945     }
2946 }
2947 
2948 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2949 {
2950     uintptr_t ra = GETPC();
2951     X86Access ac;
2952 
2953     /* The operand must be 16-byte aligned.  */
2954     if (ptr & 0xf) {
2955         raise_exception_ra(env, EXCP0D_GPF, ra);
2956     }
2957 
2958     access_prepare(&ac, env, ptr, sizeof(X86LegacyXSaveArea),
2959                    MMU_DATA_LOAD, ra);
2960     do_fxrstor(&ac, ptr);
2961 }
2962 
2963 static bool valid_xrstor_header(X86Access *ac, uint64_t *pxsbv,
2964                                 target_ulong ptr)
2965 {
2966     uint64_t xstate_bv, xcomp_bv, reserve0;
2967 
2968     xstate_bv = access_ldq(ac, ptr + XO(header.xstate_bv));
2969     xcomp_bv = access_ldq(ac, ptr + XO(header.xcomp_bv));
2970     reserve0 = access_ldq(ac, ptr + XO(header.reserve0));
2971     *pxsbv = xstate_bv;
2972 
2973     /*
2974      * XCOMP_BV bit 63 indicates compact form, which we do not support,
2975      * and thus must raise #GP.  That leaves us in standard form.
2976      * In standard form, bytes 23:8 must be zero -- which is both
2977      * XCOMP_BV and the following 64-bit field.
2978      */
2979     if (xcomp_bv || reserve0) {
2980         return false;
2981     }
2982 
2983     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2984     return (xstate_bv & ~ac->env->xcr0) == 0;
2985 }
2986 
2987 static void do_xrstor(X86Access *ac, target_ulong ptr,
2988                       uint64_t rfbm, uint64_t xstate_bv)
2989 {
2990     CPUX86State *env = ac->env;
2991 
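    /*
     * For each component selected in RFBM, restore it from memory if
     * its XSTATE_BV bit is set, otherwise reset it to its initial
     * configuration.  MXCSR is the exception, as noted below.
     */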
2992     if (rfbm & XSTATE_FP_MASK) {
2993         if (xstate_bv & XSTATE_FP_MASK) {
2994             do_xrstor_fpu(ac, ptr);
2995         } else {
2996             do_fninit(env);
2997             memset(env->fpregs, 0, sizeof(env->fpregs));
2998         }
2999     }
3000     if (rfbm & XSTATE_SSE_MASK) {
3001         /* Note that the standard form of XRSTOR loads MXCSR from memory
3002            whether or not the XSTATE_BV bit is set.  */
3003         do_xrstor_mxcsr(ac, ptr);
3004         if (xstate_bv & XSTATE_SSE_MASK) {
3005             do_xrstor_sse(ac, ptr);
3006         } else {
3007             do_clear_sse(env);
3008         }
3009     }
3010     if (rfbm & XSTATE_YMM_MASK) {
3011         if (xstate_bv & XSTATE_YMM_MASK) {
3012             do_xrstor_ymmh(ac, ptr + XO(avx_state));
3013         } else {
3014             do_clear_ymmh(env);
3015         }
3016     }
3017     if (rfbm & XSTATE_BNDREGS_MASK) {
3018         if (xstate_bv & XSTATE_BNDREGS_MASK) {
3019             do_xrstor_bndregs(ac, ptr + XO(bndreg_state));
3020             env->hflags |= HF_MPX_IU_MASK;
3021         } else {
3022             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
3023             env->hflags &= ~HF_MPX_IU_MASK;
3024         }
3025     }
3026     if (rfbm & XSTATE_BNDCSR_MASK) {
3027         if (xstate_bv & XSTATE_BNDCSR_MASK) {
3028             do_xrstor_bndcsr(ac, ptr + XO(bndcsr_state));
3029         } else {
3030             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
3031         }
3032         cpu_sync_bndcs_hflags(env);
3033     }
3034     if (rfbm & XSTATE_PKRU_MASK) {
3035         uint64_t old_pkru = env->pkru;
3036         if (xstate_bv & XSTATE_PKRU_MASK) {
3037             do_xrstor_pkru(ac, ptr + XO(pkru_state));
3038         } else {
3039             env->pkru = 0;
3040         }
3041         if (env->pkru != old_pkru) {
3042             CPUState *cs = env_cpu(env);
3043             tlb_flush(cs);
3044         }
3045     }
3046 }
3047 
3048 #undef XO
3049 
3050 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
3051 {
3052     uintptr_t ra = GETPC();
3053     X86Access ac;
3054     uint64_t xstate_bv;
3055     unsigned size, size_ext;
3056 
3057     do_xsave_chk(env, ptr, ra);
3058 
3059     /* Begin with just the minimum size to validate the header. */
3060     size = sizeof(X86LegacyXSaveArea) + sizeof(X86XSaveHeader);
3061     access_prepare(&ac, env, ptr, size, MMU_DATA_LOAD, ra);
3062     if (!valid_xrstor_header(&ac, &xstate_bv, ptr)) {
3063         raise_exception_ra(env, EXCP0D_GPF, ra);
3064     }
3065 
3066     rfbm &= env->xcr0;
3067     size_ext = xsave_area_size(rfbm & xstate_bv, false);
3068     if (size < size_ext) {
3069         /* TODO: See if existing page probe has covered extra size. */
3070         access_prepare(&ac, env, ptr, size_ext, MMU_DATA_LOAD, ra);
3071     }
3072 
3073     do_xrstor(&ac, ptr, rfbm, xstate_bv);
3074 }
3075 
3076 #if defined(CONFIG_USER_ONLY)
3077 void cpu_x86_fsave(CPUX86State *env, void *host, size_t len)
3078 {
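    /* 32-bit FSAVE image: 7 doublewords of environment + 8 * 10-byte regs.  */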
3079     X86Access ac = {
3080         .haddr1 = host,
3081         .size = 4 * 7 + 8 * 10,
3082         .env = env,
3083     };
3084 
3085     assert(ac.size <= len);
3086     do_fsave(&ac, 0, true);
3087 }
3088 
3089 void cpu_x86_frstor(CPUX86State *env, void *host, size_t len)
3090 {
3091     X86Access ac = {
3092         .haddr1 = host,
3093         .size = 4 * 7 + 8 * 10,
3094         .env = env,
3095     };
3096 
3097     assert(ac.size <= len);
3098     do_frstor(&ac, 0, true);
3099 }
3100 
3101 void cpu_x86_fxsave(CPUX86State *env, void *host, size_t len)
3102 {
3103     X86Access ac = {
3104         .haddr1 = host,
3105         .size = sizeof(X86LegacyXSaveArea),
3106         .env = env,
3107     };
3108 
3109     assert(ac.size <= len);
3110     do_fxsave(&ac, 0);
3111 }
3112 
3113 void cpu_x86_fxrstor(CPUX86State *env, void *host, size_t len)
3114 {
3115     X86Access ac = {
3116         .haddr1 = host,
3117         .size = sizeof(X86LegacyXSaveArea),
3118         .env = env,
3119     };
3120 
3121     assert(ac.size <= len);
3122     do_fxrstor(&ac, 0);
3123 }
3124 
3125 void cpu_x86_xsave(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3126 {
3127     X86Access ac = {
3128         .haddr1 = host,
3129         .env = env,
3130     };
3131 
3132     /*
3133      * Since this is only called from user-level signal handling,
3134      * we should have done the job correctly there.
3135      */
3136     assert((rfbm & ~env->xcr0) == 0);
3137     ac.size = xsave_area_size(rfbm, false);
3138     assert(ac.size <= len);
3139     do_xsave_access(&ac, 0, rfbm, get_xinuse(env), rfbm);
3140 }
3141 
3142 bool cpu_x86_xrstor(CPUX86State *env, void *host, size_t len, uint64_t rfbm)
3143 {
3144     X86Access ac = {
3145         .haddr1 = host,
3146         .env = env,
3147     };
3148     uint64_t xstate_bv;
3149 
3150     /*
3151      * Since this is only called from user-level signal handling,
3152      * we should have done the job correctly there.
3153      */
3154     assert((rfbm & ~env->xcr0) == 0);
3155     ac.size = xsave_area_size(rfbm, false);
3156     assert(ac.size <= len);
3157 
3158     if (!valid_xrstor_header(&ac, &xstate_bv, 0)) {
3159         return false;
3160     }
3161     do_xrstor(&ac, 0, rfbm, xstate_bv);
3162     return true;
3163 }
3164 #endif
3165 
3166 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
3167 {
3168     /* The OS must have enabled XSAVE.  */
3169     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3170         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3171     }
3172 
3173     switch (ecx) {
3174     case 0:
3175         return env->xcr0;
3176     case 1:
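        /* XGETBV(1) reports which enabled components hold non-init state.  */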
3177         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
3178             return env->xcr0 & get_xinuse(env);
3179         }
3180         break;
3181     }
3182     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3183 }
3184 
3185 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3186 {
3187     uint32_t dummy, ena_lo, ena_hi;
3188     uint64_t ena;
3189 
3190     /* The OS must have enabled XSAVE.  */
3191     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3192         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3193     }
3194 
3195     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3196     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3197         goto do_gpf;
3198     }
3199 
3200     /* SSE can be disabled, but only if AVX is disabled too.  */
3201     if ((mask & (XSTATE_SSE_MASK | XSTATE_YMM_MASK)) == XSTATE_YMM_MASK) {
3202         goto do_gpf;
3203     }
3204 
3205     /* Disallow enabling unimplemented features.  */
3206     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3207     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3208     if (mask & ~ena) {
3209         goto do_gpf;
3210     }
3211 
3212     /* Disallow enabling only half of MPX.  */
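    /*
     * Multiplying by BNDCSR/BNDREGS shifts the BNDREGS bit into the
     * BNDCSR position, so the XOR leaves BNDCSR set in the result iff
     * exactly one of the two MPX bits is set in MASK.
     */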
3213     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3214         & XSTATE_BNDCSR_MASK) {
3215         goto do_gpf;
3216     }
3217 
3218     env->xcr0 = mask;
3219     cpu_sync_bndcs_hflags(env);
3220     cpu_sync_avx_hflag(env);
3221     return;
3222 
3223  do_gpf:
3224     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3225 }
3226 
3227 /* MMX/SSE */
3228 /* XXX: optimize by storing fptt and fptags in the static cpu state */
3229 /* XXX: optimize by storing fpstt and fptags in the static cpu state */
3230 #define SSE_DAZ             0x0040
3231 #define SSE_RC_SHIFT        13
3232 #define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
3233 #define SSE_FZ              0x8000
3234 
3235 void update_mxcsr_status(CPUX86State *env)
3236 {
3237     uint32_t mxcsr = env->mxcsr;
3238     int rnd_type;
3239 
3240     /* set rounding mode */
3241     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3242     set_x86_rounding_mode(rnd_type, &env->sse_status);
3243 
3244     /* Set exception flags.  */
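    /* MXCSR bits 0-5 share the x87 status word layout, hence FPUS_*.  */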
3245     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3246                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3247                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3248                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3249                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3250                               &env->sse_status);
3251 
3252     /* Set denormals-are-zero (DAZ).  */
3253     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3254 
3255     /* Set flush-to-zero (FZ).  */
3256     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3257 }
3258 
3259 void update_mxcsr_from_sse_status(CPUX86State *env)
3260 {
3261     uint8_t flags = get_float_exception_flags(&env->sse_status);
3262     /*
3263      * The MXCSR denormal flag has opposite semantics to
3264      * float_flag_input_denormal_flushed (the softfloat code sets that flag
3265      * only when flushing input denormals to zero, but SSE sets it
3266      * only when not flushing them to zero), so it is not converted
3267      * here.
3268      */
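    /* MXCSR exception flags are sticky, so OR in the new flags.  */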
3269     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3270                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3271                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3272                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3273                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3274                    (flags & float_flag_output_denormal_flushed ? FPUS_UE | FPUS_PE :
3275                     0));
3276 }
3277 
3278 void helper_update_mxcsr(CPUX86State *env)
3279 {
3280     update_mxcsr_from_sse_status(env);
3281 }
3282 
3283 void helper_ldmxcsr(CPUX86State *env, uint32_t val)
3284 {
3285     cpu_set_mxcsr(env, val);
3286 }
3287 
3288 void helper_enter_mmx(CPUX86State *env)
3289 {
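    /*
     * MMX instructions alias the x87 register file: the top of stack is
     * forced to 0 and all tags are marked valid.  fptags[] holds eight
     * one-byte flags, written here as two 32-bit stores.
     */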
3290     env->fpstt = 0;
3291     *(uint32_t *)(env->fptags) = 0;
3292     *(uint32_t *)(env->fptags + 4) = 0;
3293 }
3294 
3295 void helper_emms(CPUX86State *env)
3296 {
3297     /* set to empty state */
3298     *(uint32_t *)(env->fptags) = 0x01010101;
3299     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3300 }
3301 
3302 #define SHIFT 0
3303 #include "ops_sse.h"
3304 
3305 #define SHIFT 1
3306 #include "ops_sse.h"
3307 
3308 #define SHIFT 2
3309 #include "ops_sse.h"
3310