1a04b68e1SRichard Henderson /* 2a04b68e1SRichard Henderson * ARM AdvSIMD / SVE Vector Helpers 3a04b68e1SRichard Henderson * 4a04b68e1SRichard Henderson * Copyright (c) 2020 Linaro 5a04b68e1SRichard Henderson * 6a04b68e1SRichard Henderson * This library is free software; you can redistribute it and/or 7a04b68e1SRichard Henderson * modify it under the terms of the GNU Lesser General Public 8a04b68e1SRichard Henderson * License as published by the Free Software Foundation; either 950f57e09SChetan Pant * version 2.1 of the License, or (at your option) any later version. 10a04b68e1SRichard Henderson * 11a04b68e1SRichard Henderson * This library is distributed in the hope that it will be useful, 12a04b68e1SRichard Henderson * but WITHOUT ANY WARRANTY; without even the implied warranty of 13a04b68e1SRichard Henderson * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14a04b68e1SRichard Henderson * Lesser General Public License for more details. 15a04b68e1SRichard Henderson * 16a04b68e1SRichard Henderson * You should have received a copy of the GNU Lesser General Public 17a04b68e1SRichard Henderson * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18a04b68e1SRichard Henderson */ 19a04b68e1SRichard Henderson 2052581c71SMarkus Armbruster #ifndef TARGET_ARM_VEC_INTERNAL_H 2152581c71SMarkus Armbruster #define TARGET_ARM_VEC_INTERNAL_H 22a04b68e1SRichard Henderson 2393966af1SRichard Henderson /* 2493966af1SRichard Henderson * Note that vector data is stored in host-endian 64-bit chunks, 2593966af1SRichard Henderson * so addressing units smaller than that needs a host-endian fixup. 2693966af1SRichard Henderson * 2793966af1SRichard Henderson * The H<N> macros are used when indexing an array of elements of size N. 2893966af1SRichard Henderson * 2993966af1SRichard Henderson * The H1_<N> macros are used when performing byte arithmetic and then 3093966af1SRichard Henderson * casting the final pointer to a type of size N. 3193966af1SRichard Henderson */ 32e03b5686SMarc-André Lureau #if HOST_BIG_ENDIAN 3393966af1SRichard Henderson #define H1(x) ((x) ^ 7) 3493966af1SRichard Henderson #define H1_2(x) ((x) ^ 6) 3593966af1SRichard Henderson #define H1_4(x) ((x) ^ 4) 3693966af1SRichard Henderson #define H2(x) ((x) ^ 3) 3793966af1SRichard Henderson #define H4(x) ((x) ^ 1) 3893966af1SRichard Henderson #else 3993966af1SRichard Henderson #define H1(x) (x) 4093966af1SRichard Henderson #define H1_2(x) (x) 4193966af1SRichard Henderson #define H1_4(x) (x) 4293966af1SRichard Henderson #define H2(x) (x) 4393966af1SRichard Henderson #define H4(x) (x) 4493966af1SRichard Henderson #endif 456e802db3SPeter Maydell /* 466e802db3SPeter Maydell * Access to 64-bit elements isn't host-endian dependent; we provide H8 476e802db3SPeter Maydell * and H1_8 so that when a function is being generated from a macro we 486e802db3SPeter Maydell * can pass these rather than an empty macro argument, for clarity. 496e802db3SPeter Maydell */ 506e802db3SPeter Maydell #define H8(x) (x) 516e802db3SPeter Maydell #define H1_8(x) (x) 5293966af1SRichard Henderson 53820e0bb9SRichard Henderson /* 54820e0bb9SRichard Henderson * Expand active predicate bits to bytes, for byte elements. 55820e0bb9SRichard Henderson */ 5677f96148SPeter Maydell extern const uint64_t expand_pred_b_data[256]; 57820e0bb9SRichard Henderson static inline uint64_t expand_pred_b(uint8_t byte) 58820e0bb9SRichard Henderson { 59820e0bb9SRichard Henderson return expand_pred_b_data[byte]; 60820e0bb9SRichard Henderson } 6177f96148SPeter Maydell 62a613cf2dSRichard Henderson /* Similarly for half-word elements. */ 63a613cf2dSRichard Henderson extern const uint64_t expand_pred_h_data[0x55 + 1]; 64a613cf2dSRichard Henderson static inline uint64_t expand_pred_h(uint8_t byte) 65a613cf2dSRichard Henderson { 66a613cf2dSRichard Henderson return expand_pred_h_data[byte & 0x55]; 67a613cf2dSRichard Henderson } 68a613cf2dSRichard Henderson 69a04b68e1SRichard Henderson static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) 70a04b68e1SRichard Henderson { 71a04b68e1SRichard Henderson uint64_t *d = vd + opr_sz; 72a04b68e1SRichard Henderson uintptr_t i; 73a04b68e1SRichard Henderson 74a04b68e1SRichard Henderson for (i = opr_sz; i < max_sz; i += 8) { 75a04b68e1SRichard Henderson *d++ = 0; 76a04b68e1SRichard Henderson } 77a04b68e1SRichard Henderson } 78a04b68e1SRichard Henderson 798b3f15b0SRichard Henderson static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits, 808b3f15b0SRichard Henderson bool round, uint32_t *sat) 818b3f15b0SRichard Henderson { 828b3f15b0SRichard Henderson if (shift <= -bits) { 838b3f15b0SRichard Henderson /* Rounding the sign bit always produces 0. */ 848b3f15b0SRichard Henderson if (round) { 858b3f15b0SRichard Henderson return 0; 868b3f15b0SRichard Henderson } 878b3f15b0SRichard Henderson return src >> 31; 888b3f15b0SRichard Henderson } else if (shift < 0) { 898b3f15b0SRichard Henderson if (round) { 908b3f15b0SRichard Henderson src >>= -shift - 1; 918b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 928b3f15b0SRichard Henderson } 938b3f15b0SRichard Henderson return src >> -shift; 948b3f15b0SRichard Henderson } else if (shift < bits) { 958b3f15b0SRichard Henderson int32_t val = src << shift; 968b3f15b0SRichard Henderson if (bits == 32) { 978b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 988b3f15b0SRichard Henderson return val; 998b3f15b0SRichard Henderson } 1008b3f15b0SRichard Henderson } else { 1018b3f15b0SRichard Henderson int32_t extval = sextract32(val, 0, bits); 1028b3f15b0SRichard Henderson if (!sat || val == extval) { 1038b3f15b0SRichard Henderson return extval; 1048b3f15b0SRichard Henderson } 1058b3f15b0SRichard Henderson } 1068b3f15b0SRichard Henderson } else if (!sat || src == 0) { 1078b3f15b0SRichard Henderson return 0; 1088b3f15b0SRichard Henderson } 1098b3f15b0SRichard Henderson 1108b3f15b0SRichard Henderson *sat = 1; 1118b3f15b0SRichard Henderson return (1u << (bits - 1)) - (src >= 0); 1128b3f15b0SRichard Henderson } 1138b3f15b0SRichard Henderson 1148b3f15b0SRichard Henderson static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits, 1158b3f15b0SRichard Henderson bool round, uint32_t *sat) 1168b3f15b0SRichard Henderson { 1178b3f15b0SRichard Henderson if (shift <= -(bits + round)) { 1188b3f15b0SRichard Henderson return 0; 1198b3f15b0SRichard Henderson } else if (shift < 0) { 1208b3f15b0SRichard Henderson if (round) { 1218b3f15b0SRichard Henderson src >>= -shift - 1; 1228b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 1238b3f15b0SRichard Henderson } 1248b3f15b0SRichard Henderson return src >> -shift; 1258b3f15b0SRichard Henderson } else if (shift < bits) { 1268b3f15b0SRichard Henderson uint32_t val = src << shift; 1278b3f15b0SRichard Henderson if (bits == 32) { 1288b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1298b3f15b0SRichard Henderson return val; 1308b3f15b0SRichard Henderson } 1318b3f15b0SRichard Henderson } else { 1328b3f15b0SRichard Henderson uint32_t extval = extract32(val, 0, bits); 1338b3f15b0SRichard Henderson if (!sat || val == extval) { 1348b3f15b0SRichard Henderson return extval; 1358b3f15b0SRichard Henderson } 1368b3f15b0SRichard Henderson } 1378b3f15b0SRichard Henderson } else if (!sat || src == 0) { 1388b3f15b0SRichard Henderson return 0; 1398b3f15b0SRichard Henderson } 1408b3f15b0SRichard Henderson 1418b3f15b0SRichard Henderson *sat = 1; 1428b3f15b0SRichard Henderson return MAKE_64BIT_MASK(0, bits); 1438b3f15b0SRichard Henderson } 1448b3f15b0SRichard Henderson 1458b3f15b0SRichard Henderson static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits, 1468b3f15b0SRichard Henderson bool round, uint32_t *sat) 1478b3f15b0SRichard Henderson { 1488b3f15b0SRichard Henderson if (sat && src < 0) { 1498b3f15b0SRichard Henderson *sat = 1; 1508b3f15b0SRichard Henderson return 0; 1518b3f15b0SRichard Henderson } 1528b3f15b0SRichard Henderson return do_uqrshl_bhs(src, shift, bits, round, sat); 1538b3f15b0SRichard Henderson } 1548b3f15b0SRichard Henderson 1558b3f15b0SRichard Henderson static inline int64_t do_sqrshl_d(int64_t src, int64_t shift, 1568b3f15b0SRichard Henderson bool round, uint32_t *sat) 1578b3f15b0SRichard Henderson { 1588b3f15b0SRichard Henderson if (shift <= -64) { 1598b3f15b0SRichard Henderson /* Rounding the sign bit always produces 0. */ 1608b3f15b0SRichard Henderson if (round) { 1618b3f15b0SRichard Henderson return 0; 1628b3f15b0SRichard Henderson } 1638b3f15b0SRichard Henderson return src >> 63; 1648b3f15b0SRichard Henderson } else if (shift < 0) { 1658b3f15b0SRichard Henderson if (round) { 1668b3f15b0SRichard Henderson src >>= -shift - 1; 1678b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 1688b3f15b0SRichard Henderson } 1698b3f15b0SRichard Henderson return src >> -shift; 1708b3f15b0SRichard Henderson } else if (shift < 64) { 1718b3f15b0SRichard Henderson int64_t val = src << shift; 1728b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1738b3f15b0SRichard Henderson return val; 1748b3f15b0SRichard Henderson } 1758b3f15b0SRichard Henderson } else if (!sat || src == 0) { 1768b3f15b0SRichard Henderson return 0; 1778b3f15b0SRichard Henderson } 1788b3f15b0SRichard Henderson 1798b3f15b0SRichard Henderson *sat = 1; 1808b3f15b0SRichard Henderson return src < 0 ? INT64_MIN : INT64_MAX; 1818b3f15b0SRichard Henderson } 1828b3f15b0SRichard Henderson 1838b3f15b0SRichard Henderson static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift, 1848b3f15b0SRichard Henderson bool round, uint32_t *sat) 1858b3f15b0SRichard Henderson { 1868b3f15b0SRichard Henderson if (shift <= -(64 + round)) { 1878b3f15b0SRichard Henderson return 0; 1888b3f15b0SRichard Henderson } else if (shift < 0) { 1898b3f15b0SRichard Henderson if (round) { 1908b3f15b0SRichard Henderson src >>= -shift - 1; 1918b3f15b0SRichard Henderson return (src >> 1) + (src & 1); 1928b3f15b0SRichard Henderson } 1938b3f15b0SRichard Henderson return src >> -shift; 1948b3f15b0SRichard Henderson } else if (shift < 64) { 1958b3f15b0SRichard Henderson uint64_t val = src << shift; 1968b3f15b0SRichard Henderson if (!sat || val >> shift == src) { 1978b3f15b0SRichard Henderson return val; 1988b3f15b0SRichard Henderson } 1998b3f15b0SRichard Henderson } else if (!sat || src == 0) { 2008b3f15b0SRichard Henderson return 0; 2018b3f15b0SRichard Henderson } 2028b3f15b0SRichard Henderson 2038b3f15b0SRichard Henderson *sat = 1; 2048b3f15b0SRichard Henderson return UINT64_MAX; 2058b3f15b0SRichard Henderson } 2068b3f15b0SRichard Henderson 2078b3f15b0SRichard Henderson static inline int64_t do_suqrshl_d(int64_t src, int64_t shift, 2088b3f15b0SRichard Henderson bool round, uint32_t *sat) 2098b3f15b0SRichard Henderson { 2108b3f15b0SRichard Henderson if (sat && src < 0) { 2118b3f15b0SRichard Henderson *sat = 1; 2128b3f15b0SRichard Henderson return 0; 2138b3f15b0SRichard Henderson } 2148b3f15b0SRichard Henderson return do_uqrshl_d(src, shift, round, sat); 2158b3f15b0SRichard Henderson } 2168b3f15b0SRichard Henderson 217d782d3caSRichard Henderson int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool); 218d782d3caSRichard Henderson int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); 219d782d3caSRichard Henderson int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); 220d782d3caSRichard Henderson int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); 221d782d3caSRichard Henderson 222c1bd78cbSPeter Maydell /* 223c1bd78cbSPeter Maydell * 8 x 8 -> 16 vector polynomial multiply where the inputs are 224c1bd78cbSPeter Maydell * in the low 8 bits of each 16-bit element 225c1bd78cbSPeter Maydell */ 226c1bd78cbSPeter Maydell uint64_t pmull_h(uint64_t op1, uint64_t op2); 227c1bd78cbSPeter Maydell /* 228c1bd78cbSPeter Maydell * 16 x 16 -> 32 vector polynomial multiply where the inputs are 229c1bd78cbSPeter Maydell * in the low 16 bits of each 32-bit element 230c1bd78cbSPeter Maydell */ 231c1bd78cbSPeter Maydell uint64_t pmull_w(uint64_t op1, uint64_t op2); 232c1bd78cbSPeter Maydell 233*72db2aa3SRichard Henderson /** 234*72db2aa3SRichard Henderson * bfdotadd: 235*72db2aa3SRichard Henderson * @sum: addend 236*72db2aa3SRichard Henderson * @e1, @e2: multiplicand vectors 237*72db2aa3SRichard Henderson * 238*72db2aa3SRichard Henderson * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum. 239*72db2aa3SRichard Henderson * The @e1 and @e2 operands correspond to the 32-bit source vector 240*72db2aa3SRichard Henderson * slots and contain two Bfloat16 values each. 241*72db2aa3SRichard Henderson * 242*72db2aa3SRichard Henderson * Corresponds to the ARM pseudocode function BFDotAdd. 243*72db2aa3SRichard Henderson */ 244*72db2aa3SRichard Henderson float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2); 245*72db2aa3SRichard Henderson 24652581c71SMarkus Armbruster #endif /* TARGET_ARM_VEC_INTERNAL_H */ 247