/*
 * ARM AdvSIMD / SVE Vector Helpers
 *
 * Copyright (c) 2020 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#ifndef TARGET_ARM_VEC_INTERNAL_H
#define TARGET_ARM_VEC_INTERNAL_H

#include "fpu/softfloat.h"

typedef struct CPUArchState CPUARMState;

/*
 * Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that need a host-endian fixup.
 *
 * The H<N> macros are used when indexing an array of elements of size N.
 *
 * The H1_<N> macros are used when performing byte arithmetic and then
 * casting the final pointer to a type of size N.
 */
#if HOST_BIG_ENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
/*
 * Access to 64-bit elements isn't host-endian dependent; we provide H8
 * and H1_8 so that when a function is being generated from a macro we
 * can pass these rather than an empty macro argument, for clarity.
 */
#define H8(x)   (x)
#define H1_8(x) (x)
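
/*
 * Illustrative use of the convention (a sketch, not an API): to store
 * halfword element i of a vector @vd so that the layout matches a
 * little-endian host:
 *
 *     uint16_t *d = vd;
 *     d[H2(i)] = val;
 *
 * On a big-endian host, H2 flips the low bits of the index so that the
 * element lands at the right offset within its 64-bit chunk.
 */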

/*
 * Expand active predicate bits to bytes, for byte elements.
 */
extern const uint64_t expand_pred_b_data[256];
static inline uint64_t expand_pred_b(uint8_t byte)
{
    return expand_pred_b_data[byte];
}
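
/*
 * For example, expand_pred_b(0x05) == 0x0000000000ff00ff: each set
 * predicate bit <i> becomes the all-ones byte <i> of the result.
 */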

/*
 * Similarly for half-word elements: only the even predicate bits are
 * significant for half-words, hence the "& 0x55" mask.
 */
extern const uint64_t expand_pred_h_data[0x55 + 1];
static inline uint64_t expand_pred_h(uint8_t byte)
{
    return expand_pred_h_data[byte & 0x55];
}
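
/*
 * Zero the bytes of @vd from offset @opr_sz up to @max_sz; both are
 * byte counts and are assumed to be multiples of 8, matching the
 * 64-bit chunk layout described above.
 */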
static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}
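
/*
 * Saturating shift helpers: left shift by a possibly negative amount,
 * where a negative @shift is a right shift and @round selects rounding
 * of the bits shifted out.  do_sqrshl_* are signed, do_uqrshl_* are
 * unsigned, and do_suqrshl_* take a signed input but saturate to the
 * unsigned range.  The _bhs forms handle 8-, 16- and 32-bit element
 * widths via @bits; the _d forms handle 64-bit elements.  If @sat is
 * NULL, saturation is not checked; otherwise overflow saturates the
 * result and sets *sat.
 */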
static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
                                    bool round, uint32_t *sat)
{
    if (shift <= -bits) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 31;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        int32_t val = src << shift;
        if (bits == 32) {
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            int32_t extval = sextract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return (1u << (bits - 1)) - (src >= 0);
}

static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (shift <= -(bits + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        uint32_t val = src << shift;
        if (bits == 32) {
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            uint32_t extval = extract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return MAKE_64BIT_MASK(0, bits);
}

static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (sat && src < 0) {
        *sat = 1;
        return 0;
    }
    return do_uqrshl_bhs(src, shift, bits, round, sat);
}

static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
                                  bool round, uint32_t *sat)
{
    if (shift <= -64) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        return src >> 63;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < 64) {
        int64_t val = src << shift;
        if (!sat || val >> shift == src) {
            return val;
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return src < 0 ? INT64_MIN : INT64_MAX;
}

static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (shift <= -(64 + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < 64) {
        uint64_t val = src << shift;
        if (!sat || val >> shift == src) {
            return val;
        }
    } else if (!sat || src == 0) {
        return 0;
    }

    *sat = 1;
    return UINT64_MAX;
}

static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (sat && src < 0) {
        *sat = 1;
        return 0;
    }
    return do_uqrshl_d(src, shift, round, sat);
}
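
/*
 * Signed saturating rounding doubling multiply-accumulate high half,
 * as for SQRDMLAH/SQRDMLSH.  (A sketch of the contract, inferred from
 * the definitions elsewhere rather than stated by the prototypes: the
 * three value arguments are the two multiplicands and the accumulator,
 * and the two bools select subtraction and rounding respectively.)
 */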
int8_t do_sqrdmlah_b(int8_t, int8_t, int8_t, bool, bool);
int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);

/**
 * bfdotadd:
 * @sum: addend
 * @e1, @e2: multiplicand vectors
 * @fpst: floating-point status to use
 *
 * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
 * The @e1 and @e2 operands correspond to the 32-bit source vector
 * slots and contain two BFloat16 values each.
 *
 * Corresponds to the ARM pseudocode function BFDotAdd, specialized
 * for the FPCR.EBF == 0 case.
 */
float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst);

/**
 * bfdotadd_ebf:
 * @sum: addend
 * @e1, @e2: multiplicand vectors
 * @fpst: floating-point status to use
 * @fpst_odd: floating-point status to use for round-to-odd operations
 *
 * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
 * The @e1 and @e2 operands correspond to the 32-bit source vector
 * slots and contain two BFloat16 values each.
 *
 * Corresponds to the ARM pseudocode function BFDotAdd, specialized
 * for the FPCR.EBF == 1 case.
 */
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd);

/**
 * is_ebf:
 * @env: CPU state
 * @statusp: pointer to floating point status to fill in
 * @oddstatusp: pointer to floating point status to fill in for round-to-odd
 *
 * Determine whether a BFDotAdd operation should use FPCR.EBF = 0
 * or FPCR.EBF = 1 semantics.  On return, *statusp and *oddstatusp
 * have been initialized to suitable float_status arguments for use
 * with either bfdotadd() or bfdotadd_ebf().
 * Returns true for EBF = 1, false for EBF = 0.  (The caller should use
 * this to decide whether to call bfdotadd() or bfdotadd_ebf().)
 */
bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp);
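
/*
 * Typical caller pattern, a sketch following the contract above
 * (variable names are illustrative):
 *
 *     float_status fpst, fpst_odd;
 *     if (is_ebf(env, &fpst, &fpst_odd)) {
 *         sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
 *     } else {
 *         sum = bfdotadd(sum, e1, e2, &fpst);
 *     }
 */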

/*
 * Negate as for FPCR.AH=1 -- do not negate NaNs.
 */
static inline float16 float16_ah_chs(float16 a)
{
    return float16_is_any_nan(a) ? a : float16_chs(a);
}

static inline float32 float32_ah_chs(float32 a)
{
    return float32_is_any_nan(a) ? a : float32_chs(a);
}

static inline float64 float64_ah_chs(float64 a)
{
    return float64_is_any_nan(a) ? a : float64_chs(a);
}
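
/*
 * As above, but conditional: negate @a, except that when @fpcr_ah is
 * set NaNs are passed through unchanged.
 */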
static inline float16 float16_maybe_ah_chs(float16 a, bool fpcr_ah)
{
    return fpcr_ah && float16_is_any_nan(a) ? a : float16_chs(a);
}

static inline float32 float32_maybe_ah_chs(float32 a, bool fpcr_ah)
{
    return fpcr_ah && float32_is_any_nan(a) ? a : float32_chs(a);
}

static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah)
{
    return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a);
}

#endif /* TARGET_ARM_VEC_INTERNAL_H */