| /src/contrib/arm-optimized-routines/math/include/ |
| H A D | mathlib.h | 114 __vpcs float32x4_t _ZGVnN4v_acosf (float32x4_t); 115 __vpcs float32x4_t _ZGVnN4v_acoshf (float32x4_t); 116 __vpcs float32x4_t _ZGVnN4v_asinf (float32x4_t); 117 __vpcs float32x4_t _ZGVnN4v_asinhf (float32x4_t); 118 __vpcs float32x4_t _ZGVnN4v_atanf (float32x4_t); 119 __vpcs float32x4_t _ZGVnN4v_atanhf (float32x4_t); 120 __vpcs float32x4_t _ZGVnN4v_cbrtf (float32x4_t); 121 __vpcs float32x4_t _ZGVnN4v_cosf (float32x4_t); 122 __vpcs float32x4_t _ZGVnN4v_coshf (float32x4_t); 123 __vpcs float32x4_t _ZGVnN4v_cospif (float32x4_t); [all …]
|
| /src/contrib/arm-optimized-routines/math/aarch64/advsimd/ |
| H A D | exp10f.c | 18 float32x4_t c0, c1, c3; 20 float32x4_t inv_log10_2, special_bound; 23 float32x4_t scale_thresh; 56 static float32x4_t VPCS_ATTR NOINLINE 57 special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) in special_case() 68 static float32x4_t VPCS_ATTR NOINLINE 69 special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, in special_case() 70 float32x4_t scale, const struct data *d) in special_case() 74 float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); in special_case() 75 float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); in special_case() [all …]
|
| H A D | exp2f.c | 14 float32x4_t c1, c3; 17 float32x4_t scale_thresh, special_bound; 42 static float32x4_t VPCS_ATTR NOINLINE 43 special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) in special_case() 52 static float32x4_t VPCS_ATTR NOINLINE 53 special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, in special_case() 54 float32x4_t scale, const struct data *d) in special_case() 58 float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); in special_case() 59 float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); in special_case() 61 float32x4_t r2 = vmulq_f32 (s1, s1); in special_case() [all …]
|
| H A D | erfcf.c | 15 float32x4_t max, shift; 17 float32x4_t third, two_over_five, tenth; 19 float32x4_t uflow_bound; 46 float32x4_t erfc; 47 float32x4_t scale; 62 float32x4_t e1 = vcombine_f32 (t0, t1); in lookup() 63 float32x4_t e2 = vcombine_f32 (t2, t3); in lookup() 70 static float32x4_t VPCS_ATTR NOINLINE 71 special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) in special_case() 94 NOINLINE VPCS_ATTR float32x4_t V_NAME_F1 (erfc) (float32x4_t x) in erfc() [all …]
|
| H A D | expf.c | 13 float32x4_t c1, c3, c4, inv_ln2; 17 float32x4_t special_bound, scale_thresh; 46 static float32x4_t VPCS_ATTR NOINLINE 47 special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) in special_case() 56 static float32x4_t VPCS_ATTR NOINLINE 57 special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1, in special_case() 58 float32x4_t scale, const struct data *d) in special_case() 62 float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); in special_case() 63 float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); in special_case() 65 float32x4_t r2 = vmulq_f32 (s1, s1); in special_case() [all …]
|
| H A D | tanf.c | 15 float32x4_t poly[6]; 17 float32x4_t shift; 19 float32x4_t range_val; 39 static float32x4_t VPCS_ATTR NOINLINE 40 special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp) in special_case() 46 static inline float32x4_t 47 eval_poly (float32x4_t z, const struct data *d) in eval_poly() 49 float32x4_t z2 = vmulq_f32 (z, z); in eval_poly() 59 float32x4_t z4 = vmulq_f32 (z2, z2); in eval_poly() 67 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tan) (float32x4_t x) in tan() [all …]
|
| H A D | atan2f.c | 14 float32x4_t c0, pi_over_2, c4, c6, c2; 31 static float32x4_t VPCS_ATTR NOINLINE 32 special_case (float32x4_t y, float32x4_t x, float32x4_t ret, in special_case() 53 float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x) in atan2() 67 float32x4_t ax = vabsq_f32 (x); in atan2() 68 float32x4_t ay = vabsq_f32 (y); in atan2() 74 float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay); in atan2() 75 float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax); in atan2() 76 float32x4_t z = vdivq_f32 (n, q); in atan2() 79 float32x4_t shift = vreinterpretq_f32_u32 ( in atan2() [all …]
|
| H A D | log2f.c | 14 float32x4_t c0, c2, c4, c6, c8; 41 static float32x4_t VPCS_ATTR NOINLINE 42 special_case (float32x4_t n, uint32x4_t u_off, float32x4_t p, float32x4_t r, in special_case() 55 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2) (float32x4_t x) in log2() 66 float32x4_t n = vcvtq_f32_s32 ( in log2() 73 float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); in log2() 76 float32x4_t r2 = vmulq_f32 (r, r); in log2() 78 float32x4_t c1357 = vld1q_f32 (&d->c1); in log2() 79 float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); in log2() 80 float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); in log2() [all …]
|
| H A D | v_log1pf_inline.h | 20 float32x4_t c4, c6, c1, c2, ln2; 35 static inline float32x4_t 36 eval_poly (float32x4_t m, const struct v_log1pf_data *d) in eval_poly() 39 float32x4_t c0357 = vld1q_f32 (&d->c0); in eval_poly() 40 float32x4_t q = vfmaq_laneq_f32 (v_f32 (-0.5), m, c0357, 0); in eval_poly() 41 float32x4_t m2 = vmulq_f32 (m, m); in eval_poly() 42 float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m, c0357, 3); in eval_poly() 43 float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m, c0357, 2); in eval_poly() 44 float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m, c0357, 1); in eval_poly() 45 float32x4_t p = vfmaq_f32 (p45, m2, p67); in eval_poly() [all …]
|
| H A D | tanpif.c | 14 float32x4_t c0, c2, c4, c6; 27 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x) in tanpi() 31 float32x4_t n = vrndnq_f32 (x); in tanpi() 34 float32x4_t xr = vsubq_f32 (x, n); in tanpi() 35 float32x4_t ar = vabdq_f32 (x, n); in tanpi() 37 float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar); in tanpi() 40 float32x4_t r2 = vmulq_f32 (r, r); in tanpi() 41 float32x4_t r4 = vmulq_f32 (r2, r2); in tanpi() 43 float32x4_t odd_coeffs = vld1q_f32 (&d->c1); in tanpi() 44 float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0); in tanpi() [all …]
|
| H A D | expf_1u.c | 12 float32x4_t shift, inv_ln2; 14 float32x4_t c1, c2, c3, c4; 15 float32x4_t special_bound, scale_thresh; 36 static float32x4_t VPCS_ATTR NOINLINE 37 specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) in specialcase() 41 float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); in specialcase() 42 float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); in specialcase() 44 float32x4_t r1 = vmulq_f32 (s1, s1); in specialcase() 45 float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); in specialcase() 50 float32x4_t VPCS_ATTR [all …]
|
| H A D | log10f.c | 14 float32x4_t c0, c2, c4, c6, inv_ln10, ln2; 41 static float32x4_t VPCS_ATTR NOINLINE 42 special_case (float32x4_t y, uint32x4_t u_off, float32x4_t p, float32x4_t r2, in special_case() 56 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log10) (float32x4_t x) in log10() 59 float32x4_t c1357 = vld1q_f32 (&d->c1); in log10() 67 float32x4_t n = vcvtq_f32_s32 ( in log10() 74 float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); in log10() 77 float32x4_t r2 = vmulq_f32 (r, r); in log10() 79 float32x4_t c01 = vfmaq_laneq_f32 (d->c0, r, c1357, 0); in log10() 80 float32x4_t c23 = vfmaq_laneq_f32 (d->c2, r, c1357, 1); in log10() [all …]
|
| H A D | cbrtf.c | 15 float32x4_t poly[4], one_third; 33 static float32x4_t VPCS_ATTR NOINLINE 34 special_case (float32x4_t x, float32x4_t y, uint16x4_t special) in special_case() 39 static inline float32x4_t 42 return (float32x4_t){ table[i[0] + 2], table[i[1] + 2], table[i[2] + 2], in shifted_lookup() 52 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cbrt) (float32x4_t x) in cbrt() 63 float32x4_t m = vbslq_f32 (MantissaMask, x, v_f32 (0.5)); in cbrt() 70 float32x4_t p = v_pairwise_poly_3_f32 (m, vmulq_f32 (m, m), d->poly); in cbrt() 72 float32x4_t one_third = d->one_third; in cbrt() 73 float32x4_t two_thirds = vaddq_f32 (one_third, one_third); in cbrt() [all …]
|
| H A D | exp2f_1u.c | 13 float32x4_t c0, c1, c2, c3, c4, c5, shift; 15 float32x4_t special_bound, scale_thresh; 33 static float32x4_t VPCS_ATTR NOINLINE 34 specialcase (float32x4_t p, float32x4_t n, uint32x4_t e, const struct data *d) in specialcase() 38 float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias)); in specialcase() 39 float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b)); in specialcase() 41 float32x4_t r1 = vmulq_f32 (s1, s1); in specialcase() 42 float32x4_t r0 = vmulq_f32 (vmulq_f32 (p, s1), s2); in specialcase() 47 float32x4_t VPCS_ATTR 48 _ZGVnN4v_exp2f_1u (float32x4_t x) in _ZGVnN4v_exp2f_1u() [all …]
|
| H A D | acosf.c | 15 float32x4_t poly[5]; 16 float32x4_t pi_over_2f, pif; 32 static float32x4_t VPCS_ATTR NOINLINE 33 special_case (float32x4_t x, float32x4_t y, uint32x4_t special) in special_case() 61 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (acos) (float32x4_t x) in acos() 76 float32x4_t ax = vreinterpretq_f32_u32 (ia); in acos() 82 float32x4_t z2 = vbslq_f32 (a_le_half, vmulq_f32 (x, x), in acos() 84 float32x4_t z = vbslq_f32 (a_le_half, ax, vsqrtq_f32 (z2)); in acos() 87 float32x4_t p = v_horner_4_f32 (z2, d->poly); in acos() 94 float32x4_t y = vbslq_f32 (v_u32 (AbsMask), p, x); in acos() [all …]
|
| H A D | v_sincospif_common.h | 13 float32x4_t poly[6], range_val; 29 v_sincospif_inline (float32x4_t x, const struct v_sincospif_data *d) in v_sincospif_inline() 38 float32x4_t sr = vsubq_f32 (x, vrndaq_f32 (x)); in v_sincospif_inline() 40 float32x4_t cr = vsubq_f32 (v_f32 (0.5f), vabsq_f32 (sr)); in v_sincospif_inline() 43 float32x4_t sr2 = vmulq_f32 (sr, sr); in v_sincospif_inline() 44 float32x4_t sr4 = vmulq_f32 (sr2, sr2); in v_sincospif_inline() 45 float32x4_t cr2 = vmulq_f32 (cr, cr); in v_sincospif_inline() 46 float32x4_t cr4 = vmulq_f32 (cr2, cr2); in v_sincospif_inline() 48 float32x4_t ss = vmulq_f32 (v_pw_horner_5_f32 (sr2, sr4, d->poly), sr); in v_sincospif_inline() 49 float32x4_t cc = vmulq_f32 (v_pw_horner_5_f32 (cr2, cr4, d->poly), cr); in v_sincospif_inline() [all …]
|
| H A D | erff.c | 14 float32x4_t max, shift, third; 16 float32x4_t tiny_bound, scale_minus_one; 32 float32x4_t erf; 33 float32x4_t scale; 44 float32x4_t e1 = vcombine_f32 (t0, t1); in lookup() 45 float32x4_t e2 = vcombine_f32 (t2, t3); in lookup() 64 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x) in erf() 71 float32x4_t xm = x; in erf() 79 float32x4_t a = vabsq_f32 (x); in erf() 84 float32x4_t shift = dat->shift; in erf() [all …]
|
| H A D | v_expf_inline.h | 17 float32x4_t inv_ln2, c1, c3, c4; 31 static inline float32x4_t 32 v_expf_inline (float32x4_t x, const struct v_expf_data *d) in v_expf_inline() 40 float32x4_t ax = vabsq_f32 (x); in v_expf_inline() 41 float32x4_t ln2_c02 = vld1q_f32 (&d->ln2_hi); in v_expf_inline() 42 float32x4_t n = vrndaq_f32 (vmulq_f32 (ax, d->inv_ln2)); in v_expf_inline() 43 float32x4_t r = vfmsq_laneq_f32 (ax, n, ln2_c02, 0); in v_expf_inline() 46 float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias)); in v_expf_inline() 49 float32x4_t r2 = vmulq_f32 (r, r); in v_expf_inline() 50 float32x4_t p = vfmaq_laneq_f32 (d->c1, r, ln2_c02, 2); in v_expf_inline() [all …]
|
| H A D | v_expm1f_inline.h | 16 float32x4_t c0, c2; 32 static inline float32x4_t 33 expm1f_inline (float32x4_t x, const struct v_expm1f_data *d) in expm1f_inline() 38 float32x4_t lane_consts = vld1q_f32 (&d->c1); in expm1f_inline() 41 float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2)); in expm1f_inline() 43 float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0); in expm1f_inline() 47 float32x4_t f2 = vmulq_f32 (f, f); in expm1f_inline() 48 float32x4_t f4 = vmulq_f32 (f2, f2); in expm1f_inline() 49 float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0); in expm1f_inline() 50 float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1); in expm1f_inline() [all …]
|
| H A D | hypotf.c | 31 static float32x4_t VPCS_ATTR NOINLINE 32 special_case (float32x4_t x, float32x4_t y, float32x4_t sqsum, in special_case() 44 float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) in hypot() 48 float32x4_t ax = vabsq_f32 (x); in hypot() 49 float32x4_t ay = vabsq_f32 (y); in hypot() 62 float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (ax, ax), ay, ay); in hypot() 71 float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (hypot) (float32x4_t x, float32x4_t y) in hypot() 75 float32x4_t sqsum = vfmaq_f32 (vmulq_f32 (x, x), y, y); in hypot()
|
| H A D | logf.c | 13 float32x4_t c2, c4, c6, ln2; 36 static float32x4_t VPCS_ATTR NOINLINE 37 special_case (float32x4_t p, uint32x4_t u_off, float32x4_t y, float32x4_t r2, in special_case() 45 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log) (float32x4_t x) in log() 48 float32x4_t c1350 = vld1q_f32 (&d->c1); in log() 56 float32x4_t n = vcvtq_f32_s32 ( in log() 62 float32x4_t r = vsubq_f32 (vreinterpretq_f32_u32 (u), v_f32 (1.0f)); in log() 65 float32x4_t r2 = vmulq_f32 (r, r); in log() 67 float32x4_t p = vfmaq_laneq_f32 (d->c2, r, c1350, 0); in log() 68 float32x4_t q = vfmaq_laneq_f32 (d->c4, r, c1350, 1); in log() [all …]
|
| H A D | atanf.c | 15 float32x4_t poly[8]; 16 float32x4_t pi_over_2; 35 static float32x4_t VPCS_ATTR NOINLINE 36 special_case (float32x4_t x, float32x4_t y, uint32x4_t special) in special_case() 46 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x) in atan() 71 float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x); in atan() 72 float32x4_t shift = vreinterpretq_f32_u32 ( in atan() 75 float32x4_t az = vbslq_f32 ( in atan() 85 float32x4_t z2 = vmulq_f32 (z, z); in atan() 86 float32x4_t z4 = vmulq_f32 (z2, z2); in atan() [all …]
|
| H A D | cospif.c | 16 float32x4_t poly[6]; 17 float32x4_t range_val; 25 static float32x4_t VPCS_ATTR NOINLINE 26 special_case (float32x4_t x, float32x4_t y, uint32x4_t odd, uint32x4_t cmp) in special_case() 36 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (cospi) (float32x4_t x) in cospi() 41 float32x4_t r = vabsq_f32 (x); in cospi() 50 float32x4_t r = x; in cospi() 65 float32x4_t r2 = vmulq_f32 (r, r); in cospi() 66 float32x4_t r4 = vmulq_f32 (r2, r2); in cospi() 67 float32x4_t y = vmulq_f32 (v_pw_horner_5_f32 (r2, r4, d->poly), r); in cospi()
|
| H A D | v_sincosf_common.h | 12 float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; 25 check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d) in check_ge_rangeval() 38 v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d) in v_sincosf_inline() 41 float32x4_t shift = d->shift; in v_sincosf_inline() 42 float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2); in v_sincosf_inline() 47 float32x4_t r = x; in v_sincosf_inline() 53 float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2); in v_sincosf_inline() 54 float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]); in v_sincosf_inline() 59 float32x4_t r4 = vmulq_f32 (r2, r2); in v_sincosf_inline() 60 float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]); in v_sincosf_inline() [all …]
|
| /src/contrib/arm-optimized-routines/math/aarch64/experimental/advsimd/ |
| H A D | erfinvf_5u.c | 28 float32x4_t P29_3, tailshift; 29 float32x4_t P_50[6], Q_50[2]; 30 float32x4_t P_10[3], Q_10[3]; 49 static inline float32x4_t 50 special (float32x4_t x, const struct data *d) in special() 58 float32x4_t t = vdivq_f32 ( in special() 61 float32x4_t ts = vbslq_f32 (v_u32 (0x7fffffff), t, x); in special() 62 float32x4_t q = vfmaq_f32 (d->Q_50[0], vaddq_f32 (t, d->Q_50[1]), t); in special() 66 static inline float32x4_t 67 notails (float32x4_t x, const struct data *d) in notails() [all …]
|