xref: /qemu/target/arm/tcg/vec_internal.h (revision 513823e7521a09ed7ad1e32e6454bac3b2cbf52d)
1 /*
2  * ARM AdvSIMD / SVE Vector Helpers
3  *
4  * Copyright (c) 2020 Linaro
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifndef TARGET_ARM_VEC_INTERNAL_H
21 #define TARGET_ARM_VEC_INTERNAL_H
22 
23 #include "fpu/softfloat.h"
24 
25 /*
26  * Note that vector data is stored in host-endian 64-bit chunks,
27  * so addressing units smaller than that needs a host-endian fixup.
28  *
29  * The H<N> macros are used when indexing an array of elements of size N.
30  *
31  * The H1_<N> macros are used when performing byte arithmetic and then
32  * casting the final pointer to a type of size N.
33  */
#if HOST_BIG_ENDIAN
/*
 * Big-endian host: XOR the index so that logical element 0 addresses
 * the least-significant unit of each host 64-bit chunk, matching the
 * little-endian element order the guest expects.
 */
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
/* Little-endian host: element indices need no adjustment. */
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
/*
 * Access to 64-bit elements isn't host-endian dependent; we provide H8
 * and H1_8 so that when a function is being generated from a macro we
 * can pass these rather than an empty macro argument, for clarity.
 */
#define H8(x)   (x)
#define H1_8(x) (x)
54 
/*
 * Expand active predicate bits to bytes, for byte elements.
 */
extern const uint64_t expand_pred_b_data[256];

/*
 * expand_pred_b: expand the low 8 predicate bits of @byte to a 64-bit
 * mask, one result byte per predicate bit, via a 256-entry lookup
 * table defined elsewhere (presumably 0xff per active bit -- the table
 * contents are not visible in this header).
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    return expand_pred_b_data[byte];
}
63 
/* Similarly for half-word elements. */
extern const uint64_t expand_pred_h_data[0x55 + 1];

/*
 * expand_pred_h: as expand_pred_b, but for 16-bit elements.  Only the
 * even predicate bits are significant here, hence the index is masked
 * with 0x55 -- which is also why the table needs only 0x55 + 1 entries.
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    return expand_pred_h_data[byte & 0x55];
}
70 
/*
 * clear_tail: zero the bytes of @vd from offset @opr_sz up to @max_sz.
 * Used to clear the unused tail of a vector register after an operation
 * that wrote only the first @opr_sz bytes.  Both sizes are in bytes and
 * are expected to be multiples of 8, since the tail is cleared one
 * 64-bit chunk at a time (note: vd + opr_sz uses the GCC void-pointer
 * arithmetic extension, as elsewhere in QEMU).
 */
static inline void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *chunk = vd + opr_sz;
    uintptr_t off = opr_sz;

    while (off < max_sz) {
        *chunk = 0;
        chunk++;
        off += 8;
    }
}
80 
/*
 * do_sqrshl_bhs: signed saturating (rounding) shift for elements of
 * 8, 16 or 32 bits held in an int32_t.
 * @src: value to shift (assumed sign-extended into the int32_t for
 *       @bits < 32 -- confirm in callers)
 * @shift: signed shift count; negative values shift right
 * @bits: element width in bits (8, 16 or 32)
 * @round: true to round a right shift to nearest, by adding back the
 *         last bit shifted out
 * @sat: if non-NULL, *@sat is set to 1 when the result saturates;
 *       if NULL, overflowing results are truncated instead
 */
static inline int32_t do_sqrshl_bhs(int32_t src, int32_t shift, int bits,
                                    bool round, uint32_t *sat)
{
    if (shift <= -bits) {
        /* Rounding the sign bit always produces 0. */
        if (round) {
            return 0;
        }
        /* Without rounding, every result bit is a copy of the sign. */
        return src >> 31;
    } else if (shift < 0) {
        if (round) {
            /* Stop one bit short, then use that bit to round to nearest. */
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        int32_t val = src << shift;
        if (bits == 32) {
            /* No overflow iff shifting back recovers the input. */
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            /* Narrow element: overflow iff sign-extending from @bits
             * bits changes the value. */
            int32_t extval = sextract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        /* shift >= bits: zero, unless a nonzero src must saturate. */
        return 0;
    }

    /*
     * Saturate: 2^(bits-1) - 1 for non-negative src, or the
     * 2^(bits-1) bit pattern (the @bits-bit minimum) for negative src.
     */
    *sat = 1;
    return (1u << (bits - 1)) - (src >= 0);
}
115 
/*
 * do_uqrshl_bhs: unsigned saturating (rounding) shift for elements of
 * 8, 16 or 32 bits held in a uint32_t.
 * @src: value to shift
 * @shift: signed shift count; negative values shift right
 * @bits: element width in bits (8, 16 or 32)
 * @round: true to round a right shift to nearest, by adding back the
 *         last bit shifted out
 * @sat: if non-NULL, *@sat is set to 1 when the result saturates;
 *       if NULL, overflowing results are truncated instead
 */
static inline uint32_t do_uqrshl_bhs(uint32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    /*
     * With rounding, a right shift of exactly @bits can still round up
     * from the top bit, so only shifts beyond bits + round are
     * guaranteed to produce zero.
     */
    if (shift <= -(bits + round)) {
        return 0;
    } else if (shift < 0) {
        if (round) {
            /* Stop one bit short, then use that bit to round to nearest. */
            src >>= -shift - 1;
            return (src >> 1) + (src & 1);
        }
        return src >> -shift;
    } else if (shift < bits) {
        uint32_t val = src << shift;
        if (bits == 32) {
            /* No overflow iff shifting back recovers the input. */
            if (!sat || val >> shift == src) {
                return val;
            }
        } else {
            /* Narrow element: overflow iff truncation to @bits bits
             * loses information. */
            uint32_t extval = extract32(val, 0, bits);
            if (!sat || val == extval) {
                return extval;
            }
        }
    } else if (!sat || src == 0) {
        /* shift >= bits: zero, unless a nonzero src must saturate. */
        return 0;
    }

    /* Saturate to the all-ones @bits-bit pattern (unsigned maximum). */
    *sat = 1;
    return MAKE_64BIT_MASK(0, bits);
}
146 
/*
 * do_suqrshl_bhs: signed-input variant of the unsigned saturating
 * shift (SQSHLU-style): a negative @src saturates at zero when
 * saturation is tracked, otherwise the value is handed to
 * do_uqrshl_bhs unchanged.
 */
static inline int32_t do_suqrshl_bhs(int32_t src, int32_t shift, int bits,
                                     bool round, uint32_t *sat)
{
    if (src < 0 && sat) {
        *sat = 1;
        return 0;
    }
    return do_uqrshl_bhs(src, shift, bits, round, sat);
}
156 
/*
 * do_sqrshl_d: signed saturating (rounding) shift for 64-bit elements.
 * @src: value to shift
 * @shift: signed shift count; negative values shift right
 * @round: true to round a right shift to nearest, by adding back the
 *         last bit shifted out
 * @sat: if non-NULL, *@sat is set to 1 when the result saturates;
 *       if NULL, overflowing results are truncated instead
 */
static inline int64_t do_sqrshl_d(int64_t src, int64_t shift,
                                  bool round, uint32_t *sat)
{
    if (shift >= 64) {
        /* Everything shifted out: zero, unless a nonzero src saturates. */
        if (!sat || src == 0) {
            return 0;
        }
    } else if (shift >= 0) {
        int64_t res = src << shift;
        /* No overflow iff shifting back recovers the input. */
        if (!sat || res >> shift == src) {
            return res;
        }
    } else if (shift > -64) {
        if (!round) {
            return src >> -shift;
        }
        /* Stop one bit short, then use that bit to round to nearest. */
        src >>= -shift - 1;
        return (src >> 1) + (src & 1);
    } else {
        /* Rounding the sign bit always produces 0. */
        return round ? 0 : src >> 63;
    }

    *sat = 1;
    return src < 0 ? INT64_MIN : INT64_MAX;
}
184 
/*
 * do_uqrshl_d: unsigned saturating (rounding) shift for 64-bit elements.
 * @src: value to shift
 * @shift: signed shift count; negative values shift right
 * @round: true to round a right shift to nearest, by adding back the
 *         last bit shifted out
 * @sat: if non-NULL, *@sat is set to 1 when the result saturates;
 *       if NULL, overflowing results are truncated instead
 */
static inline uint64_t do_uqrshl_d(uint64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (shift >= 64) {
        /* Everything shifted out: zero, unless a nonzero src saturates. */
        if (!sat || src == 0) {
            return 0;
        }
    } else if (shift >= 0) {
        uint64_t res = src << shift;
        /* No overflow iff shifting back recovers the input. */
        if (!sat || res >> shift == src) {
            return res;
        }
    } else if (shift > -(64 + round)) {
        /*
         * With rounding, a right shift of exactly 64 can still round
         * up from the top bit, hence the (64 + round) bound above.
         */
        if (!round) {
            return src >> -shift;
        }
        src >>= -shift - 1;
        return (src >> 1) + (src & 1);
    } else {
        return 0;
    }

    *sat = 1;
    return UINT64_MAX;
}
208 
/*
 * do_suqrshl_d: signed-input variant of the unsigned saturating shift
 * for 64-bit elements: a negative @src saturates at zero when
 * saturation is tracked, otherwise the value is handed to
 * do_uqrshl_d unchanged.
 */
static inline int64_t do_suqrshl_d(int64_t src, int64_t shift,
                                   bool round, uint32_t *sat)
{
    if (src < 0 && sat) {
        *sat = 1;
        return 0;
    }
    return do_uqrshl_d(src, shift, round, sat);
}
218 
/*
 * Signed saturating rounding doubling multiply-accumulate helpers,
 * one per element size; definitions live outside this header.
 * NOTE(review): by analogy with the instruction they implement,
 * @src1/@src2 are presumably the multiplicands, @src3 the addend,
 * @neg selects the subtracting form and @round the rounding form;
 * the h/s variants report saturation through *@sat -- confirm
 * against the definitions.
 */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round);
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat);
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat);
int64_t do_sqrdmlah_d(int64_t src1, int64_t src2, int64_t src3,
                      bool neg, bool round);
223 
224 /**
225  * bfdotadd:
226  * @sum: addend
227  * @e1, @e2: multiplicand vectors
228  * @fpst: floating-point status to use
229  *
230  * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
231  * The @e1 and @e2 operands correspond to the 32-bit source vector
232  * slots and contain two Bfloat16 values each.
233  *
234  * Corresponds to the ARM pseudocode function BFDotAdd, specialized
235  * for the FPCR.EBF == 0 case.
236  */
237 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst);
238 /**
239  * bfdotadd_ebf:
240  * @sum: addend
241  * @e1, @e2: multiplicand vectors
242  * @fpst: floating-point status to use
243  * @fpst_odd: floating-point status to use for round-to-odd operations
244  *
245  * BFloat16 2-way dot product of @e1 & @e2, accumulating with @sum.
246  * The @e1 and @e2 operands correspond to the 32-bit source vector
247  * slots and contain two Bfloat16 values each.
248  *
249  * Corresponds to the ARM pseudocode function BFDotAdd, specialized
250  * for the FPCR.EBF == 1 case.
251  */
252 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
253                      float_status *fpst, float_status *fpst_odd);
254 
255 /**
256  * is_ebf:
257  * @env: CPU state
258  * @statusp: pointer to floating point status to fill in
259  * @oddstatusp: pointer to floating point status to fill in for round-to-odd
260  *
261  * Determine whether a BFDotAdd operation should use FPCR.EBF = 0
262  * or FPCR.EBF = 1 semantics. On return, has initialized *statusp
263  * and *oddstatusp to suitable float_status arguments to use with either
264  * bfdotadd() or bfdotadd_ebf().
265  * Returns true for EBF = 1, false for EBF = 0. (The caller should use this
266  * to decide whether to call bfdotadd() or bfdotadd_ebf().)
267  */
268 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp);
269 
270 /*
271  * Negate as for FPCR.AH=1 -- do not negate NaNs.
272  */
273 static inline float16 float16_ah_chs(float16 a)
274 {
275     return float16_is_any_nan(a) ? a : float16_chs(a);
276 }
277 
278 static inline float32 float32_ah_chs(float32 a)
279 {
280     return float32_is_any_nan(a) ? a : float32_chs(a);
281 }
282 
283 static inline float64 float64_ah_chs(float64 a)
284 {
285     return float64_is_any_nan(a) ? a : float64_chs(a);
286 }
287 
288 static inline float16 float16_maybe_ah_chs(float16 a, bool fpcr_ah)
289 {
290     return fpcr_ah && float16_is_any_nan(a) ? a : float16_chs(a);
291 }
292 
293 static inline float32 float32_maybe_ah_chs(float32 a, bool fpcr_ah)
294 {
295     return fpcr_ah && float32_is_any_nan(a) ? a : float32_chs(a);
296 }
297 
298 static inline float64 float64_maybe_ah_chs(float64 a, bool fpcr_ah)
299 {
300     return fpcr_ah && float64_is_any_nan(a) ? a : float64_chs(a);
301 }
302 
303 #endif /* TARGET_ARM_VEC_INTERNAL_H */
304