xref: /src/contrib/arm-optimized-routines/math/aarch64/sve/pow.c (revision f3087bef11543b42e0d69b708f367097a4118d24)
1edc5c0deSAndrew Turner /*
2edc5c0deSAndrew Turner  * Double-precision SVE pow(x, y) function.
3edc5c0deSAndrew Turner  *
49d1de259SAndrew Turner  * Copyright (c) 2022-2025, Arm Limited.
5edc5c0deSAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6edc5c0deSAndrew Turner  */
7edc5c0deSAndrew Turner 
8edc5c0deSAndrew Turner #include "sv_math.h"
99d1de259SAndrew Turner #include "test_sig.h"
109d1de259SAndrew Turner #include "test_defs.h"
11edc5c0deSAndrew Turner 
12edc5c0deSAndrew Turner /* This version share a similar algorithm as AOR scalar pow.
13edc5c0deSAndrew Turner 
14edc5c0deSAndrew Turner    The core computation consists in computing pow(x, y) as
15edc5c0deSAndrew Turner 
16edc5c0deSAndrew Turner      exp (y * log (x)).
17edc5c0deSAndrew Turner 
18edc5c0deSAndrew Turner    The algorithms for exp and log are very similar to scalar exp and log.
19edc5c0deSAndrew Turner    The log relies on table lookup for 3 variables and an order 8 polynomial.
20edc5c0deSAndrew Turner    It returns a high and a low contribution that are then passed to the exp,
21edc5c0deSAndrew Turner    to minimise the loss of accuracy in both routines.
22edc5c0deSAndrew Turner    The exp is based on 8-bit table lookup for scale and order-4 polynomial.
23edc5c0deSAndrew Turner    The SVE algorithm drops the tail in the exp computation at the price of
24edc5c0deSAndrew Turner    a lower accuracy, slightly above 1ULP.
25edc5c0deSAndrew Turner    The SVE algorithm also drops the special treatement of small (< 2^-65) and
269d1de259SAndrew Turner    large (> 2^63) finite values of |y|, as they only affect non-round to
279d1de259SAndrew Turner    nearest modes.
28edc5c0deSAndrew Turner 
29edc5c0deSAndrew Turner    Maximum measured error is 1.04 ULPs:
30edc5c0deSAndrew Turner    SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
31edc5c0deSAndrew Turner      got 0x1.f7116284221fcp-1
32edc5c0deSAndrew Turner     want 0x1.f7116284221fdp-1.  */
33edc5c0deSAndrew Turner 
34edc5c0deSAndrew Turner /* Data is defined in v_pow_log_data.c.  */
35edc5c0deSAndrew Turner #define N_LOG (1 << V_POW_LOG_TABLE_BITS)
36edc5c0deSAndrew Turner #define Off 0x3fe6955500000000
37edc5c0deSAndrew Turner 
38edc5c0deSAndrew Turner /* Data is defined in v_pow_exp_data.c.  */
39edc5c0deSAndrew Turner #define N_EXP (1 << V_POW_EXP_TABLE_BITS)
40edc5c0deSAndrew Turner #define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
41edc5c0deSAndrew Turner #define SmallExp 0x3c9 /* top12(0x1p-54).  */
42edc5c0deSAndrew Turner #define BigExp 0x408   /* top12(512.).  */
43edc5c0deSAndrew Turner #define ThresExp 0x03f /* BigExp - SmallExp.  */
44edc5c0deSAndrew Turner #define HugeExp 0x409  /* top12(1024.).  */
45edc5c0deSAndrew Turner 
46edc5c0deSAndrew Turner /* Constants associated with pow.  */
479d1de259SAndrew Turner #define SmallBoundX 0x1p-126
48edc5c0deSAndrew Turner #define SmallPowX 0x001 /* top12(0x1p-126).  */
49edc5c0deSAndrew Turner #define BigPowX 0x7ff	/* top12(INFINITY).  */
50edc5c0deSAndrew Turner #define ThresPowX 0x7fe /* BigPowX - SmallPowX.  */
51edc5c0deSAndrew Turner #define SmallPowY 0x3be /* top12(0x1.e7b6p-65).  */
52edc5c0deSAndrew Turner #define BigPowY 0x43e	/* top12(0x1.749p62).  */
53edc5c0deSAndrew Turner #define ThresPowY 0x080 /* BigPowY - SmallPowY.  */
54edc5c0deSAndrew Turner 
559d1de259SAndrew Turner static const struct data
569d1de259SAndrew Turner {
579d1de259SAndrew Turner   double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
589d1de259SAndrew Turner   double log_c1, log_c3, log_c5, off;
599d1de259SAndrew Turner   double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
609d1de259SAndrew Turner   double exp_c0, exp_c1;
619d1de259SAndrew Turner } data = {
629d1de259SAndrew Turner   .log_c0 = -0x1p-1,
639d1de259SAndrew Turner   .log_c1 = -0x1.555555555556p-1,
649d1de259SAndrew Turner   .log_c2 = 0x1.0000000000006p-1,
659d1de259SAndrew Turner   .log_c3 = 0x1.999999959554ep-1,
669d1de259SAndrew Turner   .log_c4 = -0x1.555555529a47ap-1,
679d1de259SAndrew Turner   .log_c5 = -0x1.2495b9b4845e9p0,
689d1de259SAndrew Turner   .log_c6 = 0x1.0002b8b263fc3p0,
699d1de259SAndrew Turner   .off = Off,
709d1de259SAndrew Turner   .exp_c0 = 0x1.fffffffffffd4p-2,
719d1de259SAndrew Turner   .exp_c1 = 0x1.5555571d6ef9p-3,
729d1de259SAndrew Turner   .exp_c2 = 0x1.5555576a5adcep-5,
739d1de259SAndrew Turner   .ln2_hi = 0x1.62e42fefa3800p-1,
749d1de259SAndrew Turner   .ln2_lo = 0x1.ef35793c76730p-45,
759d1de259SAndrew Turner   .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
769d1de259SAndrew Turner   .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
779d1de259SAndrew Turner   .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
789d1de259SAndrew Turner };
799d1de259SAndrew Turner 
80edc5c0deSAndrew Turner /* Check if x is an integer.  */
81edc5c0deSAndrew Turner static inline svbool_t
sv_isint(svbool_t pg,svfloat64_t x)82edc5c0deSAndrew Turner sv_isint (svbool_t pg, svfloat64_t x)
83edc5c0deSAndrew Turner {
84edc5c0deSAndrew Turner   return svcmpeq (pg, svrintz_z (pg, x), x);
85edc5c0deSAndrew Turner }
86edc5c0deSAndrew Turner 
87edc5c0deSAndrew Turner /* Check if x is real not integer valued.  */
88edc5c0deSAndrew Turner static inline svbool_t
sv_isnotint(svbool_t pg,svfloat64_t x)89edc5c0deSAndrew Turner sv_isnotint (svbool_t pg, svfloat64_t x)
90edc5c0deSAndrew Turner {
91edc5c0deSAndrew Turner   return svcmpne (pg, svrintz_z (pg, x), x);
92edc5c0deSAndrew Turner }
93edc5c0deSAndrew Turner 
94edc5c0deSAndrew Turner /* Check if x is an odd integer.  */
95edc5c0deSAndrew Turner static inline svbool_t
sv_isodd(svbool_t pg,svfloat64_t x)96edc5c0deSAndrew Turner sv_isodd (svbool_t pg, svfloat64_t x)
97edc5c0deSAndrew Turner {
989d1de259SAndrew Turner   svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
99edc5c0deSAndrew Turner   return sv_isnotint (pg, y);
100edc5c0deSAndrew Turner }
101edc5c0deSAndrew Turner 
102edc5c0deSAndrew Turner /* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
103edc5c0deSAndrew Turner    the bit representation of a non-zero finite floating-point value.  */
104edc5c0deSAndrew Turner static inline int
checkint(uint64_t iy)105edc5c0deSAndrew Turner checkint (uint64_t iy)
106edc5c0deSAndrew Turner {
107edc5c0deSAndrew Turner   int e = iy >> 52 & 0x7ff;
108edc5c0deSAndrew Turner   if (e < 0x3ff)
109edc5c0deSAndrew Turner     return 0;
110edc5c0deSAndrew Turner   if (e > 0x3ff + 52)
111edc5c0deSAndrew Turner     return 2;
112edc5c0deSAndrew Turner   if (iy & ((1ULL << (0x3ff + 52 - e)) - 1))
113edc5c0deSAndrew Turner     return 0;
114edc5c0deSAndrew Turner   if (iy & (1ULL << (0x3ff + 52 - e)))
115edc5c0deSAndrew Turner     return 1;
116edc5c0deSAndrew Turner   return 2;
117edc5c0deSAndrew Turner }
118edc5c0deSAndrew Turner 
119edc5c0deSAndrew Turner /* Top 12 bits (sign and exponent of each double float lane).  */
120edc5c0deSAndrew Turner static inline svuint64_t
sv_top12(svfloat64_t x)121edc5c0deSAndrew Turner sv_top12 (svfloat64_t x)
122edc5c0deSAndrew Turner {
123edc5c0deSAndrew Turner   return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
124edc5c0deSAndrew Turner }
125edc5c0deSAndrew Turner 
126edc5c0deSAndrew Turner /* Returns 1 if input is the bit representation of 0, infinity or nan.  */
127edc5c0deSAndrew Turner static inline int
zeroinfnan(uint64_t i)128edc5c0deSAndrew Turner zeroinfnan (uint64_t i)
129edc5c0deSAndrew Turner {
130edc5c0deSAndrew Turner   return 2 * i - 1 >= 2 * asuint64 (INFINITY) - 1;
131edc5c0deSAndrew Turner }
132edc5c0deSAndrew Turner 
133edc5c0deSAndrew Turner /* Returns 1 if input is the bit representation of 0, infinity or nan.  */
134edc5c0deSAndrew Turner static inline svbool_t
sv_zeroinfnan(svbool_t pg,svuint64_t i)135edc5c0deSAndrew Turner sv_zeroinfnan (svbool_t pg, svuint64_t i)
136edc5c0deSAndrew Turner {
1379d1de259SAndrew Turner   return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
138edc5c0deSAndrew Turner 		  2 * asuint64 (INFINITY) - 1);
139edc5c0deSAndrew Turner }
140edc5c0deSAndrew Turner 
141edc5c0deSAndrew Turner /* Handle cases that may overflow or underflow when computing the result that
142edc5c0deSAndrew Turner    is scale*(1+TMP) without intermediate rounding.  The bit representation of
143edc5c0deSAndrew Turner    scale is in SBITS, however it has a computed exponent that may have
144edc5c0deSAndrew Turner    overflown into the sign bit so that needs to be adjusted before using it as
145edc5c0deSAndrew Turner    a double.  (int32_t)KI is the k used in the argument reduction and exponent
146edc5c0deSAndrew Turner    adjustment of scale, positive k here means the result may overflow and
147edc5c0deSAndrew Turner    negative k means the result may underflow.  */
148edc5c0deSAndrew Turner static inline double
specialcase(double tmp,uint64_t sbits,uint64_t ki)149edc5c0deSAndrew Turner specialcase (double tmp, uint64_t sbits, uint64_t ki)
150edc5c0deSAndrew Turner {
151edc5c0deSAndrew Turner   double scale;
152edc5c0deSAndrew Turner   if ((ki & 0x80000000) == 0)
153edc5c0deSAndrew Turner     {
154edc5c0deSAndrew Turner       /* k > 0, the exponent of scale might have overflowed by <= 460.  */
155edc5c0deSAndrew Turner       sbits -= 1009ull << 52;
156edc5c0deSAndrew Turner       scale = asdouble (sbits);
157edc5c0deSAndrew Turner       return 0x1p1009 * (scale + scale * tmp);
158edc5c0deSAndrew Turner     }
159edc5c0deSAndrew Turner   /* k < 0, need special care in the subnormal range.  */
160edc5c0deSAndrew Turner   sbits += 1022ull << 52;
161edc5c0deSAndrew Turner   /* Note: sbits is signed scale.  */
162edc5c0deSAndrew Turner   scale = asdouble (sbits);
163edc5c0deSAndrew Turner   double y = scale + scale * tmp;
164edc5c0deSAndrew Turner   return 0x1p-1022 * y;
165edc5c0deSAndrew Turner }
166edc5c0deSAndrew Turner 
167edc5c0deSAndrew Turner /* Scalar fallback for special cases of SVE pow's exp.  */
168edc5c0deSAndrew Turner static inline svfloat64_t
sv_call_specialcase(svfloat64_t x1,svuint64_t u1,svuint64_t u2,svfloat64_t y,svbool_t cmp)169edc5c0deSAndrew Turner sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
170edc5c0deSAndrew Turner 		     svfloat64_t y, svbool_t cmp)
171edc5c0deSAndrew Turner {
172edc5c0deSAndrew Turner   svbool_t p = svpfirst (cmp, svpfalse ());
173edc5c0deSAndrew Turner   while (svptest_any (cmp, p))
174edc5c0deSAndrew Turner     {
175edc5c0deSAndrew Turner       double sx1 = svclastb (p, 0, x1);
176edc5c0deSAndrew Turner       uint64_t su1 = svclastb (p, 0, u1);
177edc5c0deSAndrew Turner       uint64_t su2 = svclastb (p, 0, u2);
178edc5c0deSAndrew Turner       double elem = specialcase (sx1, su1, su2);
179edc5c0deSAndrew Turner       svfloat64_t y2 = sv_f64 (elem);
180edc5c0deSAndrew Turner       y = svsel (p, y2, y);
181edc5c0deSAndrew Turner       p = svpnext_b64 (cmp, p);
182edc5c0deSAndrew Turner     }
183edc5c0deSAndrew Turner   return y;
184edc5c0deSAndrew Turner }
185edc5c0deSAndrew Turner 
186edc5c0deSAndrew Turner /* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
187edc5c0deSAndrew Turner    additional 15 bits precision.  IX is the bit representation of x, but
188edc5c0deSAndrew Turner    normalized in the subnormal range using the sign bit for the exponent.  */
189edc5c0deSAndrew Turner static inline svfloat64_t
sv_log_inline(svbool_t pg,svuint64_t ix,svfloat64_t * tail,const struct data * d)1909d1de259SAndrew Turner sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
1919d1de259SAndrew Turner 	       const struct data *d)
192edc5c0deSAndrew Turner {
193edc5c0deSAndrew Turner   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
194edc5c0deSAndrew Turner      The range is split into N subintervals.
195edc5c0deSAndrew Turner      The ith subinterval contains z and c is near its center.  */
1969d1de259SAndrew Turner   svuint64_t tmp = svsub_x (pg, ix, d->off);
197edc5c0deSAndrew Turner   svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
198edc5c0deSAndrew Turner 			  sv_u64 (N_LOG - 1));
199edc5c0deSAndrew Turner   svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
2009d1de259SAndrew Turner   svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
201edc5c0deSAndrew Turner   svfloat64_t z = svreinterpret_f64 (iz);
202edc5c0deSAndrew Turner   svfloat64_t kd = svcvt_f64_x (pg, k);
203edc5c0deSAndrew Turner 
204edc5c0deSAndrew Turner   /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
205edc5c0deSAndrew Turner   /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
2069d1de259SAndrew Turner      that uses array of structures. We also do the lookup earlier in the code
2079d1de259SAndrew Turner      to make sure it finishes as early as possible.  */
208edc5c0deSAndrew Turner   svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
209edc5c0deSAndrew Turner   svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
210edc5c0deSAndrew Turner   svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
211edc5c0deSAndrew Turner 
212edc5c0deSAndrew Turner   /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
213edc5c0deSAndrew Turner      |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible.  */
214edc5c0deSAndrew Turner   svfloat64_t r = svmad_x (pg, z, invc, -1.0);
215edc5c0deSAndrew Turner   /* k*Ln2 + log(c) + r.  */
2169d1de259SAndrew Turner 
2179d1de259SAndrew Turner   svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
2189d1de259SAndrew Turner   svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
219edc5c0deSAndrew Turner   svfloat64_t t2 = svadd_x (pg, t1, r);
2209d1de259SAndrew Turner   svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
221edc5c0deSAndrew Turner   svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
222edc5c0deSAndrew Turner 
223edc5c0deSAndrew Turner   /* Evaluation is optimized assuming superscalar pipelined execution.  */
2249d1de259SAndrew Turner 
2259d1de259SAndrew Turner   svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
2269d1de259SAndrew Turner   svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
2279d1de259SAndrew Turner   svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
2289d1de259SAndrew Turner   svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
229edc5c0deSAndrew Turner   /* k*Ln2 + log(c) + r + A[0]*r*r.  */
230edc5c0deSAndrew Turner   svfloat64_t hi = svadd_x (pg, t2, ar2);
2319d1de259SAndrew Turner   svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
232edc5c0deSAndrew Turner   svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
233edc5c0deSAndrew Turner   /* p = log1p(r) - r - A[0]*r*r.  */
234edc5c0deSAndrew Turner   /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
235edc5c0deSAndrew Turner      A[6])))).  */
2369d1de259SAndrew Turner 
2379d1de259SAndrew Turner   svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
2389d1de259SAndrew Turner   svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
2399d1de259SAndrew Turner   svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
2409d1de259SAndrew Turner   svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
241edc5c0deSAndrew Turner   svfloat64_t p = svmla_x (pg, a34, ar2, a56);
242edc5c0deSAndrew Turner   p = svmla_x (pg, a12, ar2, p);
2439d1de259SAndrew Turner   p = svmul_x (svptrue_b64 (), ar3, p);
244edc5c0deSAndrew Turner   svfloat64_t lo = svadd_x (
2459d1de259SAndrew Turner       pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
246edc5c0deSAndrew Turner   svfloat64_t y = svadd_x (pg, hi, lo);
247edc5c0deSAndrew Turner   *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
248edc5c0deSAndrew Turner   return y;
249edc5c0deSAndrew Turner }
250edc5c0deSAndrew Turner 
2519d1de259SAndrew Turner static inline svfloat64_t
sv_exp_core(svbool_t pg,svfloat64_t x,svfloat64_t xtail,svuint64_t sign_bias,svfloat64_t * tmp,svuint64_t * sbits,svuint64_t * ki,const struct data * d)2529d1de259SAndrew Turner sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
2539d1de259SAndrew Turner 	     svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
2549d1de259SAndrew Turner 	     svuint64_t *ki, const struct data *d)
2559d1de259SAndrew Turner {
2569d1de259SAndrew Turner   /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
2579d1de259SAndrew Turner   /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
2589d1de259SAndrew Turner   svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
2599d1de259SAndrew Turner   svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
2609d1de259SAndrew Turner   /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
2619d1de259SAndrew Turner   svfloat64_t kd = svrinta_x (pg, z);
2629d1de259SAndrew Turner   *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
2639d1de259SAndrew Turner 
2649d1de259SAndrew Turner   svfloat64_t ln2_over_n_hilo
2659d1de259SAndrew Turner       = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
2669d1de259SAndrew Turner   svfloat64_t r = x;
2679d1de259SAndrew Turner   r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
2689d1de259SAndrew Turner   r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
2699d1de259SAndrew Turner   /* The code assumes 2^-200 < |xtail| < 2^-8/N.  */
2709d1de259SAndrew Turner   r = svadd_x (pg, r, xtail);
2719d1de259SAndrew Turner   /* 2^(k/N) ~= scale.  */
2729d1de259SAndrew Turner   svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
2739d1de259SAndrew Turner   svuint64_t top
2749d1de259SAndrew Turner       = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
2759d1de259SAndrew Turner   /* This is only a valid scale when -1023*N < k < 1024*N.  */
2769d1de259SAndrew Turner   *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
2779d1de259SAndrew Turner   *sbits = svadd_x (pg, *sbits, top);
2789d1de259SAndrew Turner   /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1).  */
2799d1de259SAndrew Turner   svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
2809d1de259SAndrew Turner   *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
2819d1de259SAndrew Turner   *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
2829d1de259SAndrew Turner   *tmp = svmla_x (pg, r, r2, *tmp);
2839d1de259SAndrew Turner   svfloat64_t scale = svreinterpret_f64 (*sbits);
2849d1de259SAndrew Turner   /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
2859d1de259SAndrew Turner      is no spurious underflow here even without fma.  */
2869d1de259SAndrew Turner   z = svmla_x (pg, scale, scale, *tmp);
2879d1de259SAndrew Turner   return z;
2889d1de259SAndrew Turner }
2899d1de259SAndrew Turner 
290edc5c0deSAndrew Turner /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
291edc5c0deSAndrew Turner    The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1.  */
292edc5c0deSAndrew Turner static inline svfloat64_t
sv_exp_inline(svbool_t pg,svfloat64_t x,svfloat64_t xtail,svuint64_t sign_bias,const struct data * d)293edc5c0deSAndrew Turner sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
2949d1de259SAndrew Turner 	       svuint64_t sign_bias, const struct data *d)
295edc5c0deSAndrew Turner {
296edc5c0deSAndrew Turner   /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
297edc5c0deSAndrew Turner      and other cases of large values of x (scale * (1 + TMP) oflow).  */
298edc5c0deSAndrew Turner   svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff);
299edc5c0deSAndrew Turner   /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54).  */
300edc5c0deSAndrew Turner   svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
301edc5c0deSAndrew Turner 
3029d1de259SAndrew Turner   svfloat64_t tmp;
3039d1de259SAndrew Turner   svuint64_t sbits, ki;
304edc5c0deSAndrew Turner   if (unlikely (svptest_any (pg, uoflow)))
305edc5c0deSAndrew Turner     {
3069d1de259SAndrew Turner       svfloat64_t z
3079d1de259SAndrew Turner 	  = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
3089d1de259SAndrew Turner 
309edc5c0deSAndrew Turner       /* |x| is tiny (|x| <= 0x1p-54).  */
3109d1de259SAndrew Turner       svbool_t uflow
3119d1de259SAndrew Turner 	  = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
312edc5c0deSAndrew Turner       uflow = svand_z (pg, uoflow, uflow);
313edc5c0deSAndrew Turner       /* |x| is huge (|x| >= 1024).  */
3149d1de259SAndrew Turner       svbool_t oflow = svcmpge (pg, abstop, HugeExp);
315edc5c0deSAndrew Turner       oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
3169d1de259SAndrew Turner 
317edc5c0deSAndrew Turner       /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
318edc5c0deSAndrew Turner     or underflow.  */
3199d1de259SAndrew Turner       svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
320edc5c0deSAndrew Turner 
321edc5c0deSAndrew Turner       /* Update result with special and large cases.  */
322edc5c0deSAndrew Turner       z = sv_call_specialcase (tmp, sbits, ki, z, special);
323edc5c0deSAndrew Turner 
324edc5c0deSAndrew Turner       /* Handle underflow and overflow.  */
3259d1de259SAndrew Turner       svbool_t x_is_neg = svcmplt (pg, x, 0);
3269d1de259SAndrew Turner       svuint64_t sign_mask
3279d1de259SAndrew Turner 	  = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
3289d1de259SAndrew Turner       svfloat64_t res_uoflow
3299d1de259SAndrew Turner 	  = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
330edc5c0deSAndrew Turner       res_uoflow = svreinterpret_f64 (
331edc5c0deSAndrew Turner 	  svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
332edc5c0deSAndrew Turner       /* Avoid spurious underflow for tiny x.  */
333edc5c0deSAndrew Turner       svfloat64_t res_spurious_uflow
334edc5c0deSAndrew Turner 	  = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
335edc5c0deSAndrew Turner 
3369d1de259SAndrew Turner       z = svsel (oflow, res_uoflow, z);
3379d1de259SAndrew Turner       z = svsel (uflow, res_spurious_uflow, z);
338edc5c0deSAndrew Turner       return z;
339edc5c0deSAndrew Turner     }
340edc5c0deSAndrew Turner 
3419d1de259SAndrew Turner   return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
3429d1de259SAndrew Turner }
3439d1de259SAndrew Turner 
344edc5c0deSAndrew Turner static inline double
pow_sc(double x,double y)345edc5c0deSAndrew Turner pow_sc (double x, double y)
346edc5c0deSAndrew Turner {
347edc5c0deSAndrew Turner   uint64_t ix = asuint64 (x);
348edc5c0deSAndrew Turner   uint64_t iy = asuint64 (y);
349edc5c0deSAndrew Turner   /* Special cases: |x| or |y| is 0, inf or nan.  */
350edc5c0deSAndrew Turner   if (unlikely (zeroinfnan (iy)))
351edc5c0deSAndrew Turner     {
352edc5c0deSAndrew Turner       if (2 * iy == 0)
353edc5c0deSAndrew Turner 	return issignaling_inline (x) ? x + y : 1.0;
354edc5c0deSAndrew Turner       if (ix == asuint64 (1.0))
355edc5c0deSAndrew Turner 	return issignaling_inline (y) ? x + y : 1.0;
356edc5c0deSAndrew Turner       if (2 * ix > 2 * asuint64 (INFINITY) || 2 * iy > 2 * asuint64 (INFINITY))
357edc5c0deSAndrew Turner 	return x + y;
358edc5c0deSAndrew Turner       if (2 * ix == 2 * asuint64 (1.0))
359edc5c0deSAndrew Turner 	return 1.0;
360edc5c0deSAndrew Turner       if ((2 * ix < 2 * asuint64 (1.0)) == !(iy >> 63))
361edc5c0deSAndrew Turner 	return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
362edc5c0deSAndrew Turner       return y * y;
363edc5c0deSAndrew Turner     }
364edc5c0deSAndrew Turner   if (unlikely (zeroinfnan (ix)))
365edc5c0deSAndrew Turner     {
366edc5c0deSAndrew Turner       double_t x2 = x * x;
367edc5c0deSAndrew Turner       if (ix >> 63 && checkint (iy) == 1)
368edc5c0deSAndrew Turner 	x2 = -x2;
3699d1de259SAndrew Turner       return (iy >> 63) ? 1 / x2 : x2;
370edc5c0deSAndrew Turner     }
371edc5c0deSAndrew Turner   return x;
372edc5c0deSAndrew Turner }
373edc5c0deSAndrew Turner 
pow(svfloat64_t x,svfloat64_t y,const svbool_t pg)374edc5c0deSAndrew Turner svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
375edc5c0deSAndrew Turner {
3769d1de259SAndrew Turner   const struct data *d = ptr_barrier (&data);
3779d1de259SAndrew Turner 
378edc5c0deSAndrew Turner   /* This preamble handles special case conditions used in the final scalar
379edc5c0deSAndrew Turner      fallbacks. It also updates ix and sign_bias, that are used in the core
380edc5c0deSAndrew Turner      computation too, i.e., exp( y * log (x) ).  */
381edc5c0deSAndrew Turner   svuint64_t vix0 = svreinterpret_u64 (x);
382edc5c0deSAndrew Turner   svuint64_t viy0 = svreinterpret_u64 (y);
383edc5c0deSAndrew Turner 
384edc5c0deSAndrew Turner   /* Negative x cases.  */
3859d1de259SAndrew Turner   svbool_t xisneg = svcmplt (pg, x, 0);
386edc5c0deSAndrew Turner 
387edc5c0deSAndrew Turner   /* Set sign_bias and ix depending on sign of x and nature of y.  */
3889d1de259SAndrew Turner   svbool_t yint_or_xpos = pg;
389edc5c0deSAndrew Turner   svuint64_t sign_bias = sv_u64 (0);
390edc5c0deSAndrew Turner   svuint64_t vix = vix0;
391edc5c0deSAndrew Turner   if (unlikely (svptest_any (pg, xisneg)))
392edc5c0deSAndrew Turner     {
393edc5c0deSAndrew Turner       /* Determine nature of y.  */
3949d1de259SAndrew Turner       yint_or_xpos = sv_isint (xisneg, y);
395edc5c0deSAndrew Turner       svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
396edc5c0deSAndrew Turner       /* ix set to abs(ix) if y is integer.  */
3979d1de259SAndrew Turner       vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
398edc5c0deSAndrew Turner       /* Set to SignBias if x is negative and y is odd.  */
399edc5c0deSAndrew Turner       sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
400edc5c0deSAndrew Turner     }
401edc5c0deSAndrew Turner 
402edc5c0deSAndrew Turner   /* Small cases of x: |x| < 0x1p-126.  */
4039d1de259SAndrew Turner   svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
4049d1de259SAndrew Turner   if (unlikely (svptest_any (yint_or_xpos, xsmall)))
405edc5c0deSAndrew Turner     {
406edc5c0deSAndrew Turner       /* Normalize subnormal x so exponent becomes negative.  */
4079d1de259SAndrew Turner       svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
4089d1de259SAndrew Turner       svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);
409edc5c0deSAndrew Turner 
410edc5c0deSAndrew Turner       svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
411edc5c0deSAndrew Turner       vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
412edc5c0deSAndrew Turner       vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52);
413edc5c0deSAndrew Turner       vix = svsel (topx_is_null, vix_norm, vix);
414edc5c0deSAndrew Turner     }
415edc5c0deSAndrew Turner 
416edc5c0deSAndrew Turner   /* y_hi = log(ix, &y_lo).  */
417edc5c0deSAndrew Turner   svfloat64_t vlo;
4189d1de259SAndrew Turner   svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);
419edc5c0deSAndrew Turner 
420edc5c0deSAndrew Turner   /* z = exp(y_hi, y_lo, sign_bias).  */
4219d1de259SAndrew Turner   svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
4229d1de259SAndrew Turner   svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
4239d1de259SAndrew Turner   svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
4249d1de259SAndrew Turner   svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);
425edc5c0deSAndrew Turner 
426edc5c0deSAndrew Turner   /* Cases of finite y and finite negative x.  */
4279d1de259SAndrew Turner   vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
4289d1de259SAndrew Turner 
4299d1de259SAndrew Turner   /* Special cases of x or y: zero, inf and nan.  */
4309d1de259SAndrew Turner   svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
4319d1de259SAndrew Turner   svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
4329d1de259SAndrew Turner   svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);
433edc5c0deSAndrew Turner 
434edc5c0deSAndrew Turner   /* Cases of zero/inf/nan x or y.  */
4359d1de259SAndrew Turner   if (unlikely (svptest_any (svptrue_b64 (), special)))
436edc5c0deSAndrew Turner     vz = sv_call2_f64 (pow_sc, x, y, vz, special);
437edc5c0deSAndrew Turner 
438edc5c0deSAndrew Turner   return vz;
439edc5c0deSAndrew Turner }
440edc5c0deSAndrew Turner 
4419d1de259SAndrew Turner TEST_SIG (SV, D, 2, pow)
4429d1de259SAndrew Turner TEST_ULP (SV_NAME_D2 (pow), 0.55)
4439d1de259SAndrew Turner TEST_DISABLE_FENV (SV_NAME_D2 (pow))
444edc5c0deSAndrew Turner /* Wide intervals spanning the whole domain but shared between x and y.  */
445edc5c0deSAndrew Turner #define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n)                               \
4469d1de259SAndrew Turner   TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n)                    \
4479d1de259SAndrew Turner   TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n)                  \
4489d1de259SAndrew Turner   TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n)                  \
4499d1de259SAndrew Turner   TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
450edc5c0deSAndrew Turner #define EXPAND(str) str##000000000
451edc5c0deSAndrew Turner #define SHL52(str) EXPAND (str)
452edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
453edc5c0deSAndrew Turner SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
454edc5c0deSAndrew Turner SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
455edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
456edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
457edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
458edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0, inf, 0, inf, 1000)
459edc5c0deSAndrew Turner /* x~1 or y~1.  */
460edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
461edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
462edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
463edc5c0deSAndrew Turner /* around estimated argmaxs of ULP error.  */
464edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
465edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
466edc5c0deSAndrew Turner /* x is negative, y is odd or even integer, or y is real not integer.  */
4679d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
4689d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
4699d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
4709d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
471edc5c0deSAndrew Turner /* |x| is inf, y is odd or even integer, or y is real not integer.  */
472edc5c0deSAndrew Turner SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
473edc5c0deSAndrew Turner SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
474edc5c0deSAndrew Turner SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
475edc5c0deSAndrew Turner SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
476edc5c0deSAndrew Turner /* 0.0^y.  */
477edc5c0deSAndrew Turner SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
478edc5c0deSAndrew Turner /* 1.0^y.  */
4799d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
4809d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
4819d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
4829d1de259SAndrew Turner TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
4839d1de259SAndrew Turner CLOSE_SVE_ATTR
484