1edc5c0deSAndrew Turner /*
2edc5c0deSAndrew Turner * Double-precision SVE pow(x, y) function.
3edc5c0deSAndrew Turner *
49d1de259SAndrew Turner * Copyright (c) 2022-2025, Arm Limited.
5edc5c0deSAndrew Turner * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6edc5c0deSAndrew Turner */
7edc5c0deSAndrew Turner
8edc5c0deSAndrew Turner #include "sv_math.h"
99d1de259SAndrew Turner #include "test_sig.h"
109d1de259SAndrew Turner #include "test_defs.h"
11edc5c0deSAndrew Turner
/* This version shares a similar algorithm to AOR scalar pow.
13edc5c0deSAndrew Turner
14edc5c0deSAndrew Turner The core computation consists in computing pow(x, y) as
15edc5c0deSAndrew Turner
16edc5c0deSAndrew Turner exp (y * log (x)).
17edc5c0deSAndrew Turner
18edc5c0deSAndrew Turner The algorithms for exp and log are very similar to scalar exp and log.
19edc5c0deSAndrew Turner The log relies on table lookup for 3 variables and an order 8 polynomial.
20edc5c0deSAndrew Turner It returns a high and a low contribution that are then passed to the exp,
21edc5c0deSAndrew Turner to minimise the loss of accuracy in both routines.
22edc5c0deSAndrew Turner The exp is based on 8-bit table lookup for scale and order-4 polynomial.
23edc5c0deSAndrew Turner The SVE algorithm drops the tail in the exp computation at the price of
24edc5c0deSAndrew Turner a lower accuracy, slightly above 1ULP.
   The SVE algorithm also drops the special treatment of small (< 2^-65) and
269d1de259SAndrew Turner large (> 2^63) finite values of |y|, as they only affect non-round to
279d1de259SAndrew Turner nearest modes.
28edc5c0deSAndrew Turner
29edc5c0deSAndrew Turner Maximum measured error is 1.04 ULPs:
30edc5c0deSAndrew Turner SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
31edc5c0deSAndrew Turner got 0x1.f7116284221fcp-1
32edc5c0deSAndrew Turner want 0x1.f7116284221fdp-1. */
33edc5c0deSAndrew Turner
34edc5c0deSAndrew Turner /* Data is defined in v_pow_log_data.c. */
35edc5c0deSAndrew Turner #define N_LOG (1 << V_POW_LOG_TABLE_BITS)
36edc5c0deSAndrew Turner #define Off 0x3fe6955500000000
37edc5c0deSAndrew Turner
38edc5c0deSAndrew Turner /* Data is defined in v_pow_exp_data.c. */
39edc5c0deSAndrew Turner #define N_EXP (1 << V_POW_EXP_TABLE_BITS)
40edc5c0deSAndrew Turner #define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
41edc5c0deSAndrew Turner #define SmallExp 0x3c9 /* top12(0x1p-54). */
42edc5c0deSAndrew Turner #define BigExp 0x408 /* top12(512.). */
43edc5c0deSAndrew Turner #define ThresExp 0x03f /* BigExp - SmallExp. */
44edc5c0deSAndrew Turner #define HugeExp 0x409 /* top12(1024.). */
45edc5c0deSAndrew Turner
46edc5c0deSAndrew Turner /* Constants associated with pow. */
479d1de259SAndrew Turner #define SmallBoundX 0x1p-126
48edc5c0deSAndrew Turner #define SmallPowX 0x001 /* top12(0x1p-126). */
49edc5c0deSAndrew Turner #define BigPowX 0x7ff /* top12(INFINITY). */
50edc5c0deSAndrew Turner #define ThresPowX 0x7fe /* BigPowX - SmallPowX. */
51edc5c0deSAndrew Turner #define SmallPowY 0x3be /* top12(0x1.e7b6p-65). */
52edc5c0deSAndrew Turner #define BigPowY 0x43e /* top12(0x1.749p62). */
53edc5c0deSAndrew Turner #define ThresPowY 0x080 /* BigPowY - SmallPowY. */
54edc5c0deSAndrew Turner
/* Coefficients and reduction constants for log, exp and pow.  Field order is
   significant: the value pairs consumed through svld1rq + *_lane intrinsics
   below must occupy adjacent 64-bit lanes — (log_c0, log_c2),
   (log_c4, log_c6), (ln2_hi, ln2_lo), (n_over_ln2, exp_c2) and
   (ln2_over_n_hi, ln2_over_n_lo).  */
static const struct data
{
  double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
  double log_c1, log_c3, log_c5, off;
  double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
  double exp_c0, exp_c1;
} data = {
  /* Polynomial coefficients for the log approximation (sv_log_inline).  */
  .log_c0 = -0x1p-1,
  .log_c1 = -0x1.555555555556p-1,
  .log_c2 = 0x1.0000000000006p-1,
  .log_c3 = 0x1.999999959554ep-1,
  .log_c4 = -0x1.555555529a47ap-1,
  .log_c5 = -0x1.2495b9b4845e9p0,
  .log_c6 = 0x1.0002b8b263fc3p0,
  .off = Off,
  /* Polynomial coefficients for exp(r) - 1 (sv_exp_core).  */
  .exp_c0 = 0x1.fffffffffffd4p-2,
  .exp_c1 = 0x1.5555571d6ef9p-3,
  .exp_c2 = 0x1.5555576a5adcep-5,
  /* ln2 split into high and low parts for extra-precision k*ln2.  */
  .ln2_hi = 0x1.62e42fefa3800p-1,
  .ln2_lo = 0x1.ef35793c76730p-45,
  .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
  .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
  .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
};
799d1de259SAndrew Turner
/* Check if x is an integer. */
static inline svbool_t
sv_isint (svbool_t pg, svfloat64_t x)
{
  /* Truncation toward zero leaves integer-valued lanes unchanged.  */
  return svcmpeq (pg, svrintz_z (pg, x), x);
}
86edc5c0deSAndrew Turner
/* Check if x is real not integer valued. */
static inline svbool_t
sv_isnotint (svbool_t pg, svfloat64_t x)
{
  /* Complement of sv_isint: lanes that change under truncation.  */
  return svcmpne (pg, svrintz_z (pg, x), x);
}
93edc5c0deSAndrew Turner
/* Check if x is an odd integer. */
static inline svbool_t
sv_isodd (svbool_t pg, svfloat64_t x)
{
  /* x is an odd integer iff x/2 is not an integer.  Multiplying by 0.5 is
     exact for the integer-valued doubles this is used on.  */
  svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
  return sv_isnotint (pg, y);
}
101edc5c0deSAndrew Turner
/* Classify the bit pattern IY of a non-zero finite double: returns 0 if it
   is not an integer, 1 if it is an odd integer, 2 if it is an even
   integer.  */
static inline int
checkint (uint64_t iy)
{
  int exp = (iy >> 52) & 0x7ff;
  if (exp < 0x3ff)
    /* |y| < 1 and y != 0, so y cannot be an integer.  */
    return 0;
  if (exp > 0x3ff + 52)
    /* The units bit has been shifted out of the mantissa entirely, so the
       value is an even integer.  */
    return 2;
  /* Bit position of the units digit inside the mantissa.  */
  int unit_bit = 0x3ff + 52 - exp;
  uint64_t frac_mask = (1ULL << unit_bit) - 1;
  if (iy & frac_mask)
    /* Non-zero fractional bits: not an integer.  */
    return 0;
  /* Integer: parity is the units bit.  */
  return ((iy >> unit_bit) & 1) ? 1 : 2;
}
118edc5c0deSAndrew Turner
/* Top 12 bits (sign and exponent of each double float lane). */
static inline svuint64_t
sv_top12 (svfloat64_t x)
{
  /* Shift out the 52 mantissa bits, keeping sign + 11-bit exponent.  */
  return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
}
125edc5c0deSAndrew Turner
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline int
zeroinfnan (uint64_t i)
{
  /* 2 * i discards the sign bit; subtracting 1 wraps +-0 around to
     UINT64_MAX, so one unsigned compare catches +-0, +-inf and all nans.  */
  return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
}
132edc5c0deSAndrew Turner
/* Returns 1 if input is the bit representation of 0, infinity or nan. */
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint64_t i)
{
  /* Vector analogue of zeroinfnan: (2*i - 1) >= (2*inf_bits - 1), relying
     on unsigned wrap-around to map +-0 above the threshold.  */
  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
		  2 * asuint64 (INFINITY) - 1);
}
140edc5c0deSAndrew Turner
/* Handle cases that may overflow or underflow when computing the result that
   is scale*(1+TMP) without intermediate rounding.  The bit representation of
   scale is in SBITS, however it has a computed exponent that may have
   overflown into the sign bit so that needs to be adjusted before using it
   as a double.  (int32_t)KI is the k used in the argument reduction and
   exponent adjustment of scale, positive k here means the result may
   overflow and negative k means the result may underflow.  */
static inline double
specialcase (double tmp, uint64_t sbits, uint64_t ki)
{
  if (ki & 0x80000000)
    {
      /* k < 0, need special care in the subnormal range.  */
      sbits += 1022ull << 52;
      /* Note: sbits is signed scale.  */
      double scale = asdouble (sbits);
      double y = scale + scale * tmp;
      return 0x1p-1022 * y;
    }
  /* k > 0, the exponent of scale might have overflowed by <= 460.  */
  sbits -= 1009ull << 52;
  double scale = asdouble (sbits);
  return 0x1p1009 * (scale + scale * tmp);
}
166edc5c0deSAndrew Turner
/* Scalar fallback for special cases of SVE pow's exp: applies specialcase
   to each active lane of CMP in turn and merges the results into Y.  */
static inline svfloat64_t
sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
		     svfloat64_t y, svbool_t cmp)
{
  /* p selects a single active lane of cmp per iteration.  */
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      /* Extract this lane's scalar arguments.  */
      double sx1 = svclastb (p, 0, x1);
      uint64_t su1 = svclastb (p, 0, u1);
      uint64_t su2 = svclastb (p, 0, u2);
      double elem = specialcase (sx1, su1, su2);
      svfloat64_t y2 = sv_f64 (elem);
      /* Write the scalar result back into the selected lane of y.  */
      y = svsel (p, y2, y);
      /* Advance to the next active lane.  */
      p = svpnext_b64 (cmp, p);
    }
  return y;
}
185edc5c0deSAndrew Turner
/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
   additional 15 bits precision.  IX is the bit representation of x, but
   normalized in the subnormal range using the sign bit for the exponent. */
static inline svfloat64_t
sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
	       const struct data *d)
{
  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center. */
  svuint64_t tmp = svsub_x (pg, ix, d->off);
  /* i = table index, taken from the top V_POW_LOG_TABLE_BITS of the
     offset mantissa.  */
  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
			  sv_u64 (N_LOG - 1));
  /* k = signed exponent, extracted with an arithmetic shift.  */
  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
  /* iz = bit pattern of z, i.e. x with exponent replaced.  */
  svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
  svfloat64_t z = svreinterpret_f64 (iz);
  svfloat64_t kd = svcvt_f64_x (pg, k);

  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
  /* SVE lookup requires 3 separate lookup tables, as opposed to scalar
     version that uses array of structures.  We also do the lookup earlier
     in the code to make sure it finishes as early as possible. */
  svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
  svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
  svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);

  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable. */
  svfloat64_t r = svmad_x (pg, z, invc, -1.0);
  /* k*Ln2 + log(c) + r.  ln2_hilo holds (ln2_hi, ln2_lo) in lanes 0/1.  */
  svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
  svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
  svfloat64_t t2 = svadd_x (pg, t1, r);
  svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
  /* lo2 = (t1 - t2) + r: rounding error of t2 = t1 + r.  */
  svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);

  /* Evaluation is optimized assuming superscalar pipelined execution. */
  /* log_c02 holds (log_c0, log_c2) in lanes 0/1.  */
  svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
  svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
  svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
  svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
  /* k*Ln2 + log(c) + r + A[0]*r*r. */
  svfloat64_t hi = svadd_x (pg, t2, ar2);
  /* lo3 = ar2 - ar*r (fused): rounding error of ar2.  */
  svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
  /* lo4 = (t2 - hi) + ar2: rounding error of hi = t2 + ar2.  */
  svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
  /* p = log1p(r) - r - A[0]*r*r. */
  /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
     A[6])))). */
  /* log_c46 holds (log_c4, log_c6) in lanes 0/1.  */
  svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
  svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
  svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
  svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
  svfloat64_t p = svmla_x (pg, a34, ar2, a56);
  p = svmla_x (pg, a12, ar2, p);
  p = svmul_x (svptrue_b64 (), ar3, p);
  /* Sum the compensations, then split the result into hi + tail.  */
  svfloat64_t lo = svadd_x (
      pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
  svfloat64_t y = svadd_x (pg, hi, lo);
  *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
  return y;
}
250edc5c0deSAndrew Turner
/* Core of the exp computation, shared by the fast path and the
   special-case path of sv_exp_inline.  Computes 2^(k/N) * exp(r) for
   x = ln2/N*k + r, and passes back TMP (polynomial term), SBITS (bit
   pattern of scale) and KI (reduction integer k) for use by
   specialcase.  */
static inline svfloat64_t
sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
	     svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
	     svuint64_t *ki, const struct data *d)
{
  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)]. */
  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N]. */
  svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
  svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
  /* z - kd is in [-1, 1] in non-nearest rounding modes. */
  svfloat64_t kd = svrinta_x (pg, z);
  *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));

  /* r = x - k*ln2/N, subtracted in hi/lo parts for accuracy.  */
  svfloat64_t ln2_over_n_hilo
      = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
  svfloat64_t r = x;
  r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
  r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
  /* The code assumes 2^-200 < |xtail| < 2^-8/N. */
  r = svadd_x (pg, r, xtail);
  /* 2^(k/N) ~= scale. */
  svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
  svuint64_t top
      = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
  /* This is only a valid scale when -1023*N < k < 1024*N. */
  *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
  *sbits = svadd_x (pg, *sbits, top);
  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1). */
  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
  /* Polynomial for exp(r) - 1; exp_c2 sits in lane 1 of the rq load.  */
  *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
  *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
  *tmp = svmla_x (pg, r, r2, *tmp);
  svfloat64_t scale = svreinterpret_f64 (*sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
     is no spurious underflow here even without fma. */
  z = svmla_x (pg, scale, scale, *tmp);
  return z;
}
2899d1de259SAndrew Turner
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
   The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1. */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
	       svuint64_t sign_bias, const struct data *d)
{
  /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
     and other cases of large values of x (scale * (1 + TMP) oflow). */
  svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff);
  /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54).  Unsigned wrap
     makes the subtraction catch both ends with one compare.  */
  svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);

  svfloat64_t tmp;
  svuint64_t sbits, ki;
  if (unlikely (svptest_any (pg, uoflow)))
    {
      /* Slow path: run the core, then patch up lanes that under/overflow.  */
      svfloat64_t z
	  = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);

      /* |x| is tiny (|x| <= 0x1p-54). */
      svbool_t uflow
	  = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
      uflow = svand_z (pg, uoflow, uflow);
      /* |x| is huge (|x| >= 1024). */
      svbool_t oflow = svcmpge (pg, abstop, HugeExp);
      oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));

      /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can
	 overflow or underflow. */
      svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));

      /* Update result with special and large cases. */
      z = sv_call_specialcase (tmp, sbits, ki, z, special);

      /* Handle underflow and overflow: +-inf or +-0 depending on the signs
	 of x and the result.  */
      svbool_t x_is_neg = svcmplt (pg, x, 0);
      svuint64_t sign_mask
	  = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
      svfloat64_t res_uoflow
	  = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
      res_uoflow = svreinterpret_f64 (
	  svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
      /* Avoid spurious underflow for tiny x: result is +-1.  */
      svfloat64_t res_spurious_uflow
	  = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));

      z = svsel (oflow, res_uoflow, z);
      z = svsel (uflow, res_spurious_uflow, z);
      return z;
    }

  /* Fast path: no lane needs special treatment.  */
  return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
}
3439d1de259SAndrew Turner
/* Scalar fallback for lanes where |x| or |y| is 0, inf or nan; returns the
   IEEE-754 pow special-case result.  Invoked per-lane via sv_call2_f64.  */
static inline double
pow_sc (double x, double y)
{
  uint64_t ix = asuint64 (x);
  uint64_t iy = asuint64 (y);
  /* Special cases: |x| or |y| is 0, inf or nan. */
  if (unlikely (zeroinfnan (iy)))
    {
      /* pow(x, +-0) is 1 for any x, but signaling nan x must raise.  */
      if (2 * iy == 0)
	return issignaling_inline (x) ? x + y : 1.0;
      /* pow(+1, y) is 1 for any y, including quiet nan.  */
      if (ix == asuint64 (1.0))
	return issignaling_inline (y) ? x + y : 1.0;
      /* Either operand is nan: propagate it.  */
      if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY))
	return x + y;
      /* Here y is +-inf; pow(-1, +-inf) is 1.  */
      if (2 * ix == 2 * asuint64 (1.0))
	return 1.0;
      if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
	return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
      /* Remaining cases give +inf; y*y yields that regardless of sign.  */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      /* x is +-0 or +-inf (nan x was handled above).  x*x collapses it to
	 +0 or +inf; fix up sign for odd-integer y and take the reciprocal
	 for negative y.  */
      double_t x2 = x * x;
      if (ix >> 63 && checkint (iy) == 1)
	x2 = -x2;
      return (iy >> 63) ? 1 / x2 : x2;
    }
  /* Not a special lane: return x unchanged.  */
  return x;
}
373edc5c0deSAndrew Turner
/* SVE double-precision pow(x, y), computed as exp (y * log (x)) with
   special-case handling for negative x, subnormal x, and zero/inf/nan
   operands.  */
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  /* This preamble handles special case conditions used in the final scalar
     fallbacks. It also updates ix and sign_bias, that are used in the core
     computation too, i.e., exp( y * log (x) ). */
  svuint64_t vix0 = svreinterpret_u64 (x);
  svuint64_t viy0 = svreinterpret_u64 (y);

  /* Negative x cases. */
  svbool_t xisneg = svcmplt (pg, x, 0);

  /* Set sign_bias and ix depending on sign of x and nature of y. */
  svbool_t yint_or_xpos = pg;
  svuint64_t sign_bias = sv_u64 (0);
  svuint64_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y. */
      yint_or_xpos = sv_isint (xisneg, y);
      svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
      /* ix set to abs(ix) if y is integer. */
      vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
      /* Set to SignBias if x is negative and y is odd. */
      sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
    }

  /* Small cases of x: |x| < 0x1p-126. */
  svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
  if (unlikely (svptest_any (yint_or_xpos, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative. */
      svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
      svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);

      /* Scale by 2^52 so the value is normal, then subtract 52 from the
	 exponent field to compensate.  */
      svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
      vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
      vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52);
      vix = svsel (topx_is_null, vix_norm, vix);
    }

  /* y_hi = log(ix, &y_lo). */
  svfloat64_t vlo;
  svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);

  /* z = exp(y_hi, y_lo, sign_bias).  The product y*log(x) is formed in
     hi+lo parts: vehi = rounded product, velo = its correction terms.  */
  svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
  svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
  svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
  svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);

  /* Cases of finite y and finite negative x: non-integer y gives nan.  */
  vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));

  /* Special cases of x or y: zero, inf and nan. */
  svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
  svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
  svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);

  /* Cases of zero/inf/nan x or y: defer to the scalar fallback.  */
  if (unlikely (svptest_any (svptrue_b64 (), special)))
    vz = sv_call2_f64 (pow_sc, x, y, vz, special);

  return vz;
}
440edc5c0deSAndrew Turner
TEST_SIG (SV, D, 2, pow)
TEST_ULP (SV_NAME_D2 (pow), 0.55)
TEST_DISABLE_FENV (SV_NAME_D2 (pow))
/* Wide intervals spanning the whole domain but shared between x and y. */
#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n) \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n) \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n) \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n) \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
/* Token-paste nine zeros onto a top12 hex literal to widen it into an
   interval bound.  NOTE(review): nine hex zeros is a 36-bit shift, not 52
   as the SHL52 name suggests — presumably matched to the harness's interval
   parsing; confirm against the test framework.  */
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, inf, 1000)
/* x~1 or y~1. */
SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* around estimated argmaxs of ULP error. */
SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer. */
TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer. */
SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y. */
SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y. */
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
CLOSE_SVE_ATTR
484