1a04b68e1SRichard Henderson /* 2a04b68e1SRichard Henderson * ARM AdvSIMD / SVE Vector Helpers 3a04b68e1SRichard Henderson * 4a04b68e1SRichard Henderson * Copyright (c) 2020 Linaro 5a04b68e1SRichard Henderson * 6a04b68e1SRichard Henderson * This library is free software; you can redistribute it and/or 7a04b68e1SRichard Henderson * modify it under the terms of the GNU Lesser General Public 8a04b68e1SRichard Henderson * License as published by the Free Software Foundation; either 950f57e09SChetan Pant * version 2.1 of the License, or (at your option) any later version. 10a04b68e1SRichard Henderson * 11a04b68e1SRichard Henderson * This library is distributed in the hope that it will be useful, 12a04b68e1SRichard Henderson * but WITHOUT ANY WARRANTY; without even the implied warranty of 13a04b68e1SRichard Henderson * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14a04b68e1SRichard Henderson * Lesser General Public License for more details. 15a04b68e1SRichard Henderson * 16a04b68e1SRichard Henderson * You should have received a copy of the GNU Lesser General Public 17a04b68e1SRichard Henderson * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18a04b68e1SRichard Henderson */ 19a04b68e1SRichard Henderson 2052581c71SMarkus Armbruster #ifndef TARGET_ARM_VEC_INTERNAL_H 2152581c71SMarkus Armbruster #define TARGET_ARM_VEC_INTERNAL_H 22a04b68e1SRichard Henderson 23*416650acSPeter Maydell #include "fpu/softfloat.h" 24*416650acSPeter Maydell 2593966af1SRichard Henderson /* 2693966af1SRichard Henderson * Note that vector data is stored in host-endian 64-bit chunks, 2793966af1SRichard Henderson * so addressing units smaller than that needs a host-endian fixup. 2893966af1SRichard Henderson * 2993966af1SRichard Henderson * The H<N> macros are used when indexing an array of elements of size N. 3093966af1SRichard Henderson * 3193966af1SRichard Henderson * The H1_<N> macros are used when performing byte arithmetic and then 3293966af1SRichard Henderson * casting the final pointer to a type of size N. 3393966af1SRichard Henderson */ 34e03b5686SMarc-André Lureau #if HOST_BIG_ENDIAN 3593966af1SRichard Henderson #define H1(x) ((x) ^ 7) 3693966af1SRichard Henderson #define H1_2(x) ((x) ^ 6) 3793966af1SRichard Henderson #define H1_4(x) ((x) ^ 4) 3893966af1SRichard Henderson #define H2(x) ((x) ^ 3) 3993966af1SRichard Henderson #define H4(x) ((x) ^ 1) 4093966af1SRichard Henderson #else 4193966af1SRichard Henderson #define H1(x) (x) 4293966af1SRichard Henderson #define H1_2(x) (x) 4393966af1SRichard Henderson #define H1_4(x) (x) 4493966af1SRichard Henderson #define H2(x) (x) 4593966af1SRichard Henderson #define H4(x) (x) 4693966af1SRichard Henderson #endif 476e802db3SPeter Maydell /* 486e802db3SPeter Maydell * Access to 64-bit elements isn't host-endian dependent; we provide H8 496e802db3SPeter Maydell * and H1_8 so that when a function is being generated from a macro we 506e802db3SPeter Maydell * can pass these rather than an empty macro argument, for clarity. 516e802db3SPeter Maydell */ 526e802db3SPeter Maydell #define H8(x) (x) 536e802db3SPeter Maydell #define H1_8(x) (x) 5493966af1SRichard Henderson 55820e0bb9SRichard Henderson /* 56820e0bb9SRichard Henderson * Expand active predicate bits to bytes, for byte elements. 57820e0bb9SRichard Henderson */ 5877f96148SPeter Maydell extern const uint64_t expand_pred_b_data[256]; 59820e0bb9SRichard Henderson static inline uint64_t expand_pred_b(uint8_t byte) 60820e0bb9SRichard Henderson { 61820e0bb9SRichard Henderson return expand_pred_b_data[byte]; 62820e0bb9SRichard Henderson } 6377f96148SPeter Maydell 64a613cf2dSRichard Henderson /* Similarly for half-word elements. */ 65a613cf2dSRichard Henderson extern const uint64_t expand_pred_h_data[0x55 + 1]; 66a613cf2dSRichard Henderson static inline uint64_t expand_pred_h(uint8_t byte) 67a613cf2dSRichard Henderson { 68a613cf2dSRichard Henderson return expand_pred_h_data[byte & 0x55]; 69a613cf2dSRichard Henderson } 70a613cf2dSRichard Henderson 71a04b68e1SRichard Henderson static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) 72a04b68e1SRichard Henderson { 73a04b68e1SRichard Henderson uint64_t *d = vd + opr_sz; 74a04b68e1SRichard Henderson uintptr_t i; 75a04b68e1SRichard Henderson 76a04b68e1SRichard Henderson for (i = opr_sz; i < max_sz; i += 8) { 77a04b68e1SRichard Henderson *d++ = 0; 78a04b68e1SRichard Henderson } 79a04b68e1SRichard Henderson } 80a04b68e1SRichard Henderson 818b3f15b0SRichard Henderson static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits, 828b3f15b0SRichard Henderson bool round, uint32_t *sat) 838b3f15b0SRichard Henderson { 848b3f15b0SRichard Henderson if (shift <= -bits) { 858b3f15b0SRichard Henderson /* Rounding the sign bit always produces 0. */ 868b3f15b0SRichard Henderson if (round) { 878b3f15b0SRichard Henderson return 0; 888b3f15b0SRichard Henderson } 898b3f15b0SRichard Henderson return src >> 31; 908b3f15b0SRichard Henderson } else if (shift < 0) { 918b3f15b0SRichard Henderson if (round) { 928b3f15b0SRichard Henderson src >>= -shift - 1; 938b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 948b3f15b0SRichard Henderson } 958b3f15b0SRichard Henderson return src >> -shift; 968b3f15b0SRichard Henderson } else if (shift < bits) { 978b3f15b0SRichard Henderson int32_t val = src << shift; 988b3f15b0SRichard Henderson if (bits == 32) { 998b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1008b3f15b0SRichard Henderson return val; 1018b3f15b0SRichard Henderson } 1028b3f15b0SRichard Henderson } else { 1038b3f15b0SRichard Henderson int32_t extval = sextract32(val, 0, bits); 1048b3f15b0SRichard Henderson if (!sat || val == extval) { 1058b3f15b0SRichard Henderson return extval; 1068b3f15b0SRichard Henderson } 1078b3f15b0SRichard Henderson } 1088b3f15b0SRichard Henderson } else if (!sat || src == 0) { 1098b3f15b0SRichard Henderson return 0; 1108b3f15b0SRichard Henderson } 1118b3f15b0SRichard Henderson 1128b3f15b0SRichard Henderson *sat = 1; 1138b3f15b0SRichard Henderson return (1u << (bits - 1)) - (src >= 0); 1148b3f15b0SRichard Henderson } 1158b3f15b0SRichard Henderson 1168b3f15b0SRichard Henderson static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits, 1178b3f15b0SRichard Henderson bool round, uint32_t *sat) 1188b3f15b0SRichard Henderson { 1198b3f15b0SRichard Henderson if (shift <= -(bits + round)) { 1208b3f15b0SRichard Henderson return 0; 1218b3f15b0SRichard Henderson } else if (shift < 0) { 1228b3f15b0SRichard Henderson if (round) { 1238b3f15b0SRichard Henderson src >>= -shift - 1; 1248b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 1258b3f15b0SRichard Henderson } 1268b3f15b0SRichard Henderson return src >> -shift; 1278b3f15b0SRichard Henderson } else if (shift < bits) { 1288b3f15b0SRichard Henderson uint32_t val = src << shift; 1298b3f15b0SRichard Henderson if (bits == 32) { 1308b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1318b3f15b0SRichard Henderson return val; 1328b3f15b0SRichard Henderson } 1338b3f15b0SRichard Henderson } else { 1348b3f15b0SRichard Henderson uint32_t extval = extract32(val, 0, bits); 1358b3f15b0SRichard Henderson if (!sat || val == extval) { 1368b3f15b0SRichard Henderson return extval; 1378b3f15b0SRichard Henderson } 1388b3f15b0SRichard Henderson } 1398b3f15b0SRichard Henderson } else if (!sat || src == 0) { 1408b3f15b0SRichard Henderson return 0; 1418b3f15b0SRichard Henderson } 1428b3f15b0SRichard Henderson 1438b3f15b0SRichard Henderson *sat = 1; 1448b3f15b0SRichard Henderson return MAKE_64BIT_MASK(0, bits); 1458b3f15b0SRichard Henderson } 1468b3f15b0SRichard Henderson 1478b3f15b0SRichard Henderson static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits, 1488b3f15b0SRichard Henderson bool round, uint32_t *sat) 1498b3f15b0SRichard Henderson { 1508b3f15b0SRichard Henderson if (sat && src < 0) { 1518b3f15b0SRichard Henderson *sat = 1; 1528b3f15b0SRichard Henderson return 0; 1538b3f15b0SRichard Henderson } 1548b3f15b0SRichard Henderson return do_uqrshl_bhs(src, shift, bits, round, sat); 1558b3f15b0SRichard Henderson } 1568b3f15b0SRichard Henderson 1578b3f15b0SRichard Henderson static inline int64_t do_sqrshl_d(int64_t src, int64_t shift, 1588b3f15b0SRichard Henderson bool round, uint32_t *sat) 1598b3f15b0SRichard Henderson { 1608b3f15b0SRichard Henderson if (shift <= -64) { 1618b3f15b0SRichard Henderson /* Rounding the sign bit always produces 0. */ 1628b3f15b0SRichard Henderson if (round) { 1638b3f15b0SRichard Henderson return 0; 1648b3f15b0SRichard Henderson } 1658b3f15b0SRichard Henderson return src >> 63; 1668b3f15b0SRichard Henderson } else if (shift < 0) { 1678b3f15b0SRichard Henderson if (round) { 1688b3f15b0SRichard Henderson src >>= -shift - 1; 1698b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 1708b3f15b0SRichard Henderson } 1718b3f15b0SRichard Henderson return src >> -shift; 1728b3f15b0SRichard Henderson } else if (shift < 64) { 1738b3f15b0SRichard Henderson int64_t val = src << shift; 1748b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1758b3f15b0SRichard Henderson return val; 1768b3f15b0SRichard Henderson } 1778b3f15b0SRichard Henderson } else if (!sat || src == 0) { 1788b3f15b0SRichard Henderson return 0; 1798b3f15b0SRichard Henderson } 1808b3f15b0SRichard Henderson 1818b3f15b0SRichard Henderson *sat = 1; 1828b3f15b0SRichard Henderson return src < 0 ? INT64_MIN : INT64_MAX; 1838b3f15b0SRichard Henderson } 1848b3f15b0SRichard Henderson 1858b3f15b0SRichard Henderson static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift, 1868b3f15b0SRichard Henderson bool round, uint32_t *sat) 1878b3f15b0SRichard Henderson { 1888b3f15b0SRichard Henderson if (shift <= -(64 + round)) { 1898b3f15b0SRichard Henderson return 0; 1908b3f15b0SRichard Henderson } else if (shift < 0) { 1918b3f15b0SRichard Henderson if (round) { 1928b3f15b0SRichard Henderson src >>= -shift - 1; 1938b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 1948b3f15b0SRichard Henderson } 1958b3f15b0SRichard Henderson return src >> -shift; 1968b3f15b0SRichard Henderson } else if (shift < 64) { 1978b3f15b0SRichard Henderson uint64_t val = src << shift; 1988b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1998b3f15b0SRichard Henderson return val; 2008b3f15b0SRichard Henderson } 2018b3f15b0SRichard Henderson } else if (!sat || src == 0) { 2028b3f15b0SRichard Henderson return 0; 2038b3f15b0SRichard Henderson } 2048b3f15b0SRichard Henderson 2058b3f15b0SRichard Henderson *sat = 1; 2068b3f15b0SRichard Henderson return UINT64_MAX; 2078b3f15b0SRichard Henderson } 2088b3f15b0SRichard Henderson 2098b3f15b0SRichard Henderson static inline int64_t do_suqrshl_d(int64_t src, int64_t shift, 2108b3f15b0SRichard Henderson bool round, uint32_t *sat) 2118b3f15b0SRichard Henderson { 2128b3f15b0SRichard Henderson if (sat && src < 0) { 2138b3f15b0SRichard Henderson *sat = 1; 2148b3f15b0SRichard Henderson return 0; 2158b3f15b0SRichard Henderson } 2168b3f15b0SRichard Henderson return do_uqrshl_d(src, shift, round, sat); 2178b3f15b0SRichard Henderson } 2188b3f15b0SRichard Henderson 219d782d3caSRichard Henderson int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool); 220d782d3caSRichard Henderson int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); 221d782d3caSRichard Henderson int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); 222d782d3caSRichard Henderson int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); 223d782d3caSRichard Henderson 22472db2aa3SRichard Henderson /** 22572db2aa3SRichard Henderson * bfdotadd: 22672db2aa3SRichard Henderson * @sum: addend 22772db2aa3SRichard Henderson * @e1, @e2: multiplicand vectors 22809b0d9e0SPeter Maydell * @fpst: floating-point status to use 22972db2aa3SRichard Henderson * 23072db2aa3SRichard Henderson * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. 23172db2aa3SRichard Henderson * The @e1 and @e2 operands correspond to the 32-bit source vector 23272db2aa3SRichard Henderson * slots and contain two Bfloat16 values each. 23372db2aa3SRichard Henderson * 23409b0d9e0SPeter Maydell * Corresponds to the ARM pseudocode function BFDotAdd, specialized 23509b0d9e0SPeter Maydell * for the FPCR.EBF == 0 case. 23672db2aa3SRichard Henderson */ 23709b0d9e0SPeter Maydell float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst); 23809b0d9e0SPeter Maydell /** 23909b0d9e0SPeter Maydell * bfdotadd_ebf: 24009b0d9e0SPeter Maydell * @sum: addend 24109b0d9e0SPeter Maydell * @e1, @e2: multiplicand vectors 24209b0d9e0SPeter Maydell * @fpst: floating-point status to use 24309b0d9e0SPeter Maydell * @fpst_odd: floating-point status to use for round-to-odd operations 24409b0d9e0SPeter Maydell * 24509b0d9e0SPeter Maydell * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. 24609b0d9e0SPeter Maydell * The @e1 and @e2 operands correspond to the 32-bit source vector 24709b0d9e0SPeter Maydell * slots and contain two Bfloat16 values each. 24809b0d9e0SPeter Maydell * 24909b0d9e0SPeter Maydell * Corresponds to the ARM pseudocode function BFDotAdd, specialized 25009b0d9e0SPeter Maydell * for the FPCR.EBF == 1 case. 25109b0d9e0SPeter Maydell */ 25209b0d9e0SPeter Maydell float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 25309b0d9e0SPeter Maydell float_status *fpst, float_status *fpst_odd); 25409b0d9e0SPeter Maydell 25509b0d9e0SPeter Maydell /** 25609b0d9e0SPeter Maydell * is_ebf: 25709b0d9e0SPeter Maydell * @env: CPU state 25809b0d9e0SPeter Maydell * @statusp: pointer to floating point status to fill in 25909b0d9e0SPeter Maydell * @oddstatusp: pointer to floating point status to fill in for round-to-odd 26009b0d9e0SPeter Maydell * 26109b0d9e0SPeter Maydell * Determine whether a BFDotAdd operation should use FPCR.EBF = 0 26209b0d9e0SPeter Maydell * or FPCR.EBF = 1 semantics. On return, has initialized *statusp 26309b0d9e0SPeter Maydell * and *oddstatusp to suitable float_status arguments to use with either 26409b0d9e0SPeter Maydell * bfdotadd() or bfdotadd_ebf(). 26509b0d9e0SPeter Maydell * Returns true for EBF = 1, false for EBF = 0. (The caller should use this 26609b0d9e0SPeter Maydell * to decide whether to call bfdotadd() or bfdotadd_ebf().) 26709b0d9e0SPeter Maydell */ 26809b0d9e0SPeter Maydell bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp); 26972db2aa3SRichard Henderson 270*416650acSPeter Maydell static inline float16 float16_maybe_ah_chs(float16 a, bool fpcr_ah) 271*416650acSPeter Maydell { 272*416650acSPeter Maydell return fpcr_ah && float16_is_any_nan(a) ? a : float16_chs(a); 273*416650acSPeter Maydell } 274*416650acSPeter Maydell 275*416650acSPeter Maydell static inline float32 float32_maybe_ah_chs(float32 a, bool fpcr_ah) 276*416650acSPeter Maydell { 277*416650acSPeter Maydell return fpcr_ah && float32_is_any_nan(a) ? a : float32_chs(a); 278*416650acSPeter Maydell } 279*416650acSPeter Maydell 280*416650acSPeter Maydell static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah) 281*416650acSPeter Maydell { 282*416650acSPeter Maydell return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a); 283*416650acSPeter Maydell } 284*416650acSPeter Maydell 28552581c71SMarkus Armbruster #endif /* TARGET_ARM_VEC_INTERNAL_H */ 286