xref: /qemu/accel/tcg/tcg-runtime-gvec.c (revision db432672dc50ed86dda17ac821b7eb07411a90af)
/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
19*db432672SRichard Henderson 
20*db432672SRichard Henderson #include "qemu/osdep.h"
21*db432672SRichard Henderson #include "qemu/host-utils.h"
22*db432672SRichard Henderson #include "cpu.h"
23*db432672SRichard Henderson #include "exec/helper-proto.h"
24*db432672SRichard Henderson #include "tcg-gvec-desc.h"
25*db432672SRichard Henderson 
26*db432672SRichard Henderson 
/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#if defined(CONFIG_VECTOR16)
/*
 * Vector build: every vecN / svecN type is a 16-byte GCC generic vector
 * whose lanes are N-bit unsigned / signed integers.
 */
typedef uint8_t  vec8   __attribute__((vector_size(16)));
typedef int8_t   svec8  __attribute__((vector_size(16)));
typedef uint16_t vec16  __attribute__((vector_size(16)));
typedef int16_t  svec16 __attribute__((vector_size(16)));
typedef uint32_t vec32  __attribute__((vector_size(16)));
typedef int32_t  svec32 __attribute__((vector_size(16)));
typedef uint64_t vec64  __attribute__((vector_size(16)));
typedef int64_t  svec64 __attribute__((vector_size(16)));

/* DUPn(X) expands to a brace initializer holding n copies of X. */
#define DUP2(X)   { X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#else
/*
 * Scalar fallback: every vecN / svecN type is just the N-bit base integer
 * type, so sizeof(vecN) is the size of one element.
 */
typedef uint8_t  vec8;
typedef int8_t   svec8;
typedef uint16_t vec16;
typedef int16_t  svec16;
typedef uint32_t vec32;
typedef int32_t  svec32;
typedef uint64_t vec64;
typedef int64_t  svec64;

/* With a single element per "vector", DUPn(X) is simply X. */
#define DUP2(X)   X
#define DUP4(X)   X
#define DUP8(X)   X
#define DUP16(X)  X
#endif /* CONFIG_VECTOR16 */
68*db432672SRichard Henderson 
/*
 * Zero the tail of the destination that lies beyond the operation size.
 *
 * @d:     destination buffer
 * @oprsz: byte offset at which clearing starts
 * @desc:  descriptor from which simd_maxsz() yields the end offset
 *
 * Nothing is written unless simd_maxsz(desc) exceeds @oprsz.  Otherwise
 * 64-bit zeros are stored at offsets oprsz, oprsz + 8, ... while the offset
 * is below simd_maxsz(desc).
 */
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    const intptr_t limit = simd_maxsz(desc);
    intptr_t ofs;

    if (!unlikely(limit > oprsz)) {
        return;
    }

    ofs = oprsz;
    while (ofs < limit) {
        uint64_t *slot = d + ofs;

        *slot = 0;
        ofs += sizeof(uint64_t);
    }
}
80*db432672SRichard Henderson 
81*db432672SRichard Henderson void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
82*db432672SRichard Henderson {
83*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
84*db432672SRichard Henderson     intptr_t i;
85*db432672SRichard Henderson 
86*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec8)) {
87*db432672SRichard Henderson         *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
88*db432672SRichard Henderson     }
89*db432672SRichard Henderson     clear_high(d, oprsz, desc);
90*db432672SRichard Henderson }
91*db432672SRichard Henderson 
92*db432672SRichard Henderson void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
93*db432672SRichard Henderson {
94*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
95*db432672SRichard Henderson     intptr_t i;
96*db432672SRichard Henderson 
97*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec16)) {
98*db432672SRichard Henderson         *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
99*db432672SRichard Henderson     }
100*db432672SRichard Henderson     clear_high(d, oprsz, desc);
101*db432672SRichard Henderson }
102*db432672SRichard Henderson 
103*db432672SRichard Henderson void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
104*db432672SRichard Henderson {
105*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
106*db432672SRichard Henderson     intptr_t i;
107*db432672SRichard Henderson 
108*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec32)) {
109*db432672SRichard Henderson         *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
110*db432672SRichard Henderson     }
111*db432672SRichard Henderson     clear_high(d, oprsz, desc);
112*db432672SRichard Henderson }
113*db432672SRichard Henderson 
114*db432672SRichard Henderson void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
115*db432672SRichard Henderson {
116*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
117*db432672SRichard Henderson     intptr_t i;
118*db432672SRichard Henderson 
119*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
120*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
121*db432672SRichard Henderson     }
122*db432672SRichard Henderson     clear_high(d, oprsz, desc);
123*db432672SRichard Henderson }
124*db432672SRichard Henderson 
125*db432672SRichard Henderson void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
126*db432672SRichard Henderson {
127*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
128*db432672SRichard Henderson     intptr_t i;
129*db432672SRichard Henderson 
130*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec8)) {
131*db432672SRichard Henderson         *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
132*db432672SRichard Henderson     }
133*db432672SRichard Henderson     clear_high(d, oprsz, desc);
134*db432672SRichard Henderson }
135*db432672SRichard Henderson 
136*db432672SRichard Henderson void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
137*db432672SRichard Henderson {
138*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
139*db432672SRichard Henderson     intptr_t i;
140*db432672SRichard Henderson 
141*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec16)) {
142*db432672SRichard Henderson         *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
143*db432672SRichard Henderson     }
144*db432672SRichard Henderson     clear_high(d, oprsz, desc);
145*db432672SRichard Henderson }
146*db432672SRichard Henderson 
147*db432672SRichard Henderson void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
148*db432672SRichard Henderson {
149*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
150*db432672SRichard Henderson     intptr_t i;
151*db432672SRichard Henderson 
152*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec32)) {
153*db432672SRichard Henderson         *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
154*db432672SRichard Henderson     }
155*db432672SRichard Henderson     clear_high(d, oprsz, desc);
156*db432672SRichard Henderson }
157*db432672SRichard Henderson 
158*db432672SRichard Henderson void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
159*db432672SRichard Henderson {
160*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
161*db432672SRichard Henderson     intptr_t i;
162*db432672SRichard Henderson 
163*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
164*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
165*db432672SRichard Henderson     }
166*db432672SRichard Henderson     clear_high(d, oprsz, desc);
167*db432672SRichard Henderson }
168*db432672SRichard Henderson 
169*db432672SRichard Henderson void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
170*db432672SRichard Henderson {
171*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
172*db432672SRichard Henderson     intptr_t i;
173*db432672SRichard Henderson 
174*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec8)) {
175*db432672SRichard Henderson         *(vec8 *)(d + i) = -*(vec8 *)(a + i);
176*db432672SRichard Henderson     }
177*db432672SRichard Henderson     clear_high(d, oprsz, desc);
178*db432672SRichard Henderson }
179*db432672SRichard Henderson 
180*db432672SRichard Henderson void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
181*db432672SRichard Henderson {
182*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
183*db432672SRichard Henderson     intptr_t i;
184*db432672SRichard Henderson 
185*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec16)) {
186*db432672SRichard Henderson         *(vec16 *)(d + i) = -*(vec16 *)(a + i);
187*db432672SRichard Henderson     }
188*db432672SRichard Henderson     clear_high(d, oprsz, desc);
189*db432672SRichard Henderson }
190*db432672SRichard Henderson 
191*db432672SRichard Henderson void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
192*db432672SRichard Henderson {
193*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
194*db432672SRichard Henderson     intptr_t i;
195*db432672SRichard Henderson 
196*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec32)) {
197*db432672SRichard Henderson         *(vec32 *)(d + i) = -*(vec32 *)(a + i);
198*db432672SRichard Henderson     }
199*db432672SRichard Henderson     clear_high(d, oprsz, desc);
200*db432672SRichard Henderson }
201*db432672SRichard Henderson 
202*db432672SRichard Henderson void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
203*db432672SRichard Henderson {
204*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
205*db432672SRichard Henderson     intptr_t i;
206*db432672SRichard Henderson 
207*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
208*db432672SRichard Henderson         *(vec64 *)(d + i) = -*(vec64 *)(a + i);
209*db432672SRichard Henderson     }
210*db432672SRichard Henderson     clear_high(d, oprsz, desc);
211*db432672SRichard Henderson }
212*db432672SRichard Henderson 
213*db432672SRichard Henderson void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
214*db432672SRichard Henderson {
215*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
216*db432672SRichard Henderson 
217*db432672SRichard Henderson     memcpy(d, a, oprsz);
218*db432672SRichard Henderson     clear_high(d, oprsz, desc);
219*db432672SRichard Henderson }
220*db432672SRichard Henderson 
221*db432672SRichard Henderson void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
222*db432672SRichard Henderson {
223*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
224*db432672SRichard Henderson     intptr_t i;
225*db432672SRichard Henderson 
226*db432672SRichard Henderson     if (c == 0) {
227*db432672SRichard Henderson         oprsz = 0;
228*db432672SRichard Henderson     } else {
229*db432672SRichard Henderson         for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
230*db432672SRichard Henderson             *(uint64_t *)(d + i) = c;
231*db432672SRichard Henderson         }
232*db432672SRichard Henderson     }
233*db432672SRichard Henderson     clear_high(d, oprsz, desc);
234*db432672SRichard Henderson }
235*db432672SRichard Henderson 
236*db432672SRichard Henderson void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
237*db432672SRichard Henderson {
238*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
239*db432672SRichard Henderson     intptr_t i;
240*db432672SRichard Henderson 
241*db432672SRichard Henderson     if (c == 0) {
242*db432672SRichard Henderson         oprsz = 0;
243*db432672SRichard Henderson     } else {
244*db432672SRichard Henderson         for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
245*db432672SRichard Henderson             *(uint32_t *)(d + i) = c;
246*db432672SRichard Henderson         }
247*db432672SRichard Henderson     }
248*db432672SRichard Henderson     clear_high(d, oprsz, desc);
249*db432672SRichard Henderson }
250*db432672SRichard Henderson 
251*db432672SRichard Henderson void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
252*db432672SRichard Henderson {
253*db432672SRichard Henderson     HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
254*db432672SRichard Henderson }
255*db432672SRichard Henderson 
256*db432672SRichard Henderson void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
257*db432672SRichard Henderson {
258*db432672SRichard Henderson     HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
259*db432672SRichard Henderson }
260*db432672SRichard Henderson 
261*db432672SRichard Henderson void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
262*db432672SRichard Henderson {
263*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
264*db432672SRichard Henderson     intptr_t i;
265*db432672SRichard Henderson 
266*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
267*db432672SRichard Henderson         *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
268*db432672SRichard Henderson     }
269*db432672SRichard Henderson     clear_high(d, oprsz, desc);
270*db432672SRichard Henderson }
271*db432672SRichard Henderson 
272*db432672SRichard Henderson void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
273*db432672SRichard Henderson {
274*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
275*db432672SRichard Henderson     intptr_t i;
276*db432672SRichard Henderson 
277*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
278*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
279*db432672SRichard Henderson     }
280*db432672SRichard Henderson     clear_high(d, oprsz, desc);
281*db432672SRichard Henderson }
282*db432672SRichard Henderson 
283*db432672SRichard Henderson void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
284*db432672SRichard Henderson {
285*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
286*db432672SRichard Henderson     intptr_t i;
287*db432672SRichard Henderson 
288*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
289*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
290*db432672SRichard Henderson     }
291*db432672SRichard Henderson     clear_high(d, oprsz, desc);
292*db432672SRichard Henderson }
293*db432672SRichard Henderson 
294*db432672SRichard Henderson void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
295*db432672SRichard Henderson {
296*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
297*db432672SRichard Henderson     intptr_t i;
298*db432672SRichard Henderson 
299*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
300*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
301*db432672SRichard Henderson     }
302*db432672SRichard Henderson     clear_high(d, oprsz, desc);
303*db432672SRichard Henderson }
304*db432672SRichard Henderson 
305*db432672SRichard Henderson void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
306*db432672SRichard Henderson {
307*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
308*db432672SRichard Henderson     intptr_t i;
309*db432672SRichard Henderson 
310*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
311*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
312*db432672SRichard Henderson     }
313*db432672SRichard Henderson     clear_high(d, oprsz, desc);
314*db432672SRichard Henderson }
315*db432672SRichard Henderson 
316*db432672SRichard Henderson void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
317*db432672SRichard Henderson {
318*db432672SRichard Henderson     intptr_t oprsz = simd_oprsz(desc);
319*db432672SRichard Henderson     intptr_t i;
320*db432672SRichard Henderson 
321*db432672SRichard Henderson     for (i = 0; i < oprsz; i += sizeof(vec64)) {
322*db432672SRichard Henderson         *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
323*db432672SRichard Henderson     }
324*db432672SRichard Henderson     clear_high(d, oprsz, desc);
325*db432672SRichard Henderson }
326