1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3 * QEMU LoongArch vector helper functions.
4 *
5 * Copyright (c) 2022-2023 Loongson Technology Corporation Limited
6 */
7
8 #include "qemu/osdep.h"
9 #include "cpu.h"
10 #include "exec/helper-proto.h"
11 #include "fpu/softfloat.h"
12 #include "internals.h"
13 #include "tcg/tcg.h"
14 #include "vec.h"
15 #include "tcg/tcg-gvec-desc.h"
16
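/*
 * DO_ODD_EVEN: apply DO_OP to the odd-numbered element of Vj and the
 * even-numbered element of Vk, widening both to the destination element
 * type (used by vhaddw/vhsubw).
 */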
17 #define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP) \
18 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
19 { \
20 int i; \
21 VReg *Vd = (VReg *)vd; \
22 VReg *Vj = (VReg *)vj; \
23 VReg *Vk = (VReg *)vk; \
24 typedef __typeof(Vd->E1(0)) TD; \
25 int oprsz = simd_oprsz(desc); \
26 \
27 for (i = 0; i < oprsz / (BIT / 8); i++) { \
28 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i)); \
29 } \
30 }
31
32 DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD)
33 DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD)
34 DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD)
35
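/* The *_q_d and *_qu_du forms produce 128-bit elements and use Int128 arithmetic. */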
36 void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
37 {
38 int i;
39 VReg *Vd = (VReg *)vd;
40 VReg *Vj = (VReg *)vj;
41 VReg *Vk = (VReg *)vk;
42 int oprsz = simd_oprsz(desc);
43
44 for (i = 0; i < oprsz / 16; i++) {
45 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
46 int128_makes64(Vk->D(2 * i)));
47 }
48 }
49
50 DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB)
51 DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB)
52 DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB)
53
54 void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
55 {
56 int i;
57 VReg *Vd = (VReg *)vd;
58 VReg *Vj = (VReg *)vj;
59 VReg *Vk = (VReg *)vk;
60 int oprsz = simd_oprsz(desc);
61
62 for (i = 0; i < oprsz / 16; i++) {
63 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
64 int128_makes64(Vk->D(2 * i)));
65 }
66 }
67
68 DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD)
69 DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD)
70 DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD)
71
72 void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
73 {
74 int i;
75 VReg *Vd = (VReg *)vd;
76 VReg *Vj = (VReg *)vj;
77 VReg *Vk = (VReg *)vk;
78 int oprsz = simd_oprsz(desc);
79
80 for (i = 0; i < oprsz / 16; i++) {
81 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
82 int128_make64(Vk->UD(2 * i)));
83 }
84 }
85
86 DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB)
87 DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB)
88 DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB)
89
90 void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc)
91 {
92 int i;
93 VReg *Vd = (VReg *)vd;
94 VReg *Vj = (VReg *)vj;
95 VReg *Vk = (VReg *)vk;
96 int oprsz = simd_oprsz(desc);
97
98 for (i = 0; i < oprsz / 16; i++) {
99 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
100 int128_make64(Vk->UD(2 * i)));
101 }
102 }
103
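/*
 * DO_EVEN/DO_ODD: apply DO_OP to the even- or odd-numbered elements of both
 * sources, widening them to the destination element type.
 */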
104 #define DO_EVEN(NAME, BIT, E1, E2, DO_OP) \
105 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
106 { \
107 int i; \
108 VReg *Vd = (VReg *)vd; \
109 VReg *Vj = (VReg *)vj; \
110 VReg *Vk = (VReg *)vk; \
111 typedef __typeof(Vd->E1(0)) TD; \
112 int oprsz = simd_oprsz(desc); \
113 \
114 for (i = 0; i < oprsz / (BIT / 8); i++) { \
115 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
116 } \
117 }
118
119 #define DO_ODD(NAME, BIT, E1, E2, DO_OP) \
120 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
121 { \
122 int i; \
123 VReg *Vd = (VReg *)vd; \
124 VReg *Vj = (VReg *)vj; \
125 VReg *Vk = (VReg *)vk; \
126 typedef __typeof(Vd->E1(0)) TD; \
127 int oprsz = simd_oprsz(desc); \
128 \
129 for (i = 0; i < oprsz / (BIT / 8); i++) { \
130 Vd->E1(i) = DO_OP((TD)Vj->E2(2 * i + 1), (TD)Vk->E2(2 * i + 1)); \
131 } \
132 }
133
134 void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
135 {
136 int i;
137 VReg *Vd = (VReg *)vd;
138 VReg *Vj = (VReg *)vj;
139 VReg *Vk = (VReg *)vk;
140 int oprsz = simd_oprsz(desc);
141
142 for (i = 0; i < oprsz / 16; i++) {
143 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)),
144 int128_makes64(Vk->D(2 * i)));
145 }
146 }
147
148 DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD)
149 DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD)
150 DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD)
151
152 void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
153 {
154 int i;
155 VReg *Vd = (VReg *)vd;
156 VReg *Vj = (VReg *)vj;
157 VReg *Vk = (VReg *)vk;
158 int oprsz = simd_oprsz(desc);
159
160 for (i = 0; i < oprsz / 16; i++) {
161 Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)),
162 int128_makes64(Vk->D(2 * i + 1)));
163 }
164 }
165
166 DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD)
167 DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD)
168 DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD)
169
170 void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
171 {
172 int i;
173 VReg *Vd = (VReg *)vd;
174 VReg *Vj = (VReg *)vj;
175 VReg *Vk = (VReg *)vk;
176 int oprsz = simd_oprsz(desc);
177
178 for (i = 0; i < oprsz / 16; i++) {
179 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)),
180 int128_makes64(Vk->D(2 * i)));
181 }
182 }
183
184 DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB)
185 DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB)
186 DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB)
187
188 void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc)
189 {
190 int i;
191 VReg *Vd = (VReg *)vd;
192 VReg *Vj = (VReg *)vj;
193 VReg *Vk = (VReg *)vk;
194 int oprsz = simd_oprsz(desc);
195
196 for (i = 0; i < oprsz / 16; i++) {
197 Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)),
198 int128_makes64(Vk->D(2 * i + 1)));
199 }
200 }
201
202 DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB)
203 DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB)
204 DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB)
205
206 void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
207 {
208 int i;
209 VReg *Vd = (VReg *)vd;
210 VReg *Vj = (VReg *)vj;
211 VReg *Vk = (VReg *)vk;
212 int oprsz = simd_oprsz(desc);
213
214 for (i = 0; i < oprsz / 16; i++) {
215 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
216 int128_make64(Vk->UD(2 * i)));
217 }
218 }
219
220 DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD)
221 DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD)
222 DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD)
223
224 void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
225 {
226 int i;
227 VReg *Vd = (VReg *)vd;
228 VReg *Vj = (VReg *)vj;
229 VReg *Vk = (VReg *)vk;
230 int oprsz = simd_oprsz(desc);
231
232 for (i = 0; i < oprsz / 16; i++) {
233 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
234 int128_make64(Vk->UD(2 * i + 1)));
235 }
236 }
237
238 DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD)
239 DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD)
240 DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD)
241
242 void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
243 {
244 int i;
245 VReg *Vd = (VReg *)vd;
246 VReg *Vj = (VReg *)vj;
247 VReg *Vk = (VReg *)vk;
248 int oprsz = simd_oprsz(desc);
249
250 for (i = 0; i < oprsz / 16; i++) {
251 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)),
252 int128_make64(Vk->UD(2 * i)));
253 }
254 }
255
256 DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB)
257 DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB)
258 DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB)
259
260 void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc)
261 {
262 int i;
263 VReg *Vd = (VReg *)vd;
264 VReg *Vj = (VReg *)vj;
265 VReg *Vk = (VReg *)vk;
266 int oprsz = simd_oprsz(desc);
267
268 for (i = 0; i < oprsz / 16; i++) {
269 Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)),
270 int128_make64(Vk->UD(2 * i + 1)));
271 }
272 }
273
274 DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB)
275 DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB)
276 DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB)
277
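/*
 * DO_EVEN_U_S/DO_ODD_U_S: as above, but the Vj element is widened as
 * unsigned and the Vk element as signed (the *_bu_b/_hu_h/_wu_w forms).
 */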
278 #define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
279 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
280 { \
281 int i; \
282 VReg *Vd = (VReg *)vd; \
283 VReg *Vj = (VReg *)vj; \
284 VReg *Vk = (VReg *)vk; \
285 typedef __typeof(Vd->ES1(0)) TDS; \
286 typedef __typeof(Vd->EU1(0)) TDU; \
287 int oprsz = simd_oprsz(desc); \
288 \
289 for (i = 0; i < oprsz / (BIT / 8); i++) { \
290 Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i), (TDS)Vk->ES2(2 * i)); \
291 } \
292 }
293
294 #define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
295 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
296 { \
297 int i; \
298 VReg *Vd = (VReg *)vd; \
299 VReg *Vj = (VReg *)vj; \
300 VReg *Vk = (VReg *)vk; \
301 typedef __typeof(Vd->ES1(0)) TDS; \
302 typedef __typeof(Vd->EU1(0)) TDU; \
303 int oprsz = simd_oprsz(desc); \
304 \
305 for (i = 0; i < oprsz / (BIT / 8); i++) { \
306 Vd->ES1(i) = DO_OP((TDU)Vj->EU2(2 * i + 1), (TDS)Vk->ES2(2 * i + 1)); \
307 } \
308 }
309
310 void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
311 {
312 int i;
313 VReg *Vd = (VReg *)vd;
314 VReg *Vj = (VReg *)vj;
315 VReg *Vk = (VReg *)vk;
316 int oprsz = simd_oprsz(desc);
317
318 for (i = 0; i < oprsz / 16; i++) {
319 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)),
320 int128_makes64(Vk->D(2 * i)));
321 }
322 }
323
324 DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD)
325 DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD)
326 DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD)
327
328 void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc)
329 {
330 int i;
331 VReg *Vd = (VReg *)vd;
332 VReg *Vj = (VReg *)vj;
333 VReg *Vk = (VReg *)vk;
334 int oprsz = simd_oprsz(desc);
335
336 for (i = 0; i < oprsz / 16; i++) {
337 Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)),
338 int128_makes64(Vk->D(2 * i + 1)));
339 }
340 }
341
342 DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD)
343 DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD)
344 DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD)
345
346 #define DO_3OP(NAME, BIT, E, DO_OP) \
347 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
348 { \
349 int i; \
350 VReg *Vd = (VReg *)vd; \
351 VReg *Vj = (VReg *)vj; \
352 VReg *Vk = (VReg *)vk; \
353 int oprsz = simd_oprsz(desc); \
354 \
355 for (i = 0; i < oprsz / (BIT / 8); i++) { \
356 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \
357 } \
358 }
359
360 DO_3OP(vavg_b, 8, B, DO_VAVG)
361 DO_3OP(vavg_h, 16, H, DO_VAVG)
362 DO_3OP(vavg_w, 32, W, DO_VAVG)
363 DO_3OP(vavg_d, 64, D, DO_VAVG)
364 DO_3OP(vavgr_b, 8, B, DO_VAVGR)
365 DO_3OP(vavgr_h, 16, H, DO_VAVGR)
366 DO_3OP(vavgr_w, 32, W, DO_VAVGR)
367 DO_3OP(vavgr_d, 64, D, DO_VAVGR)
368 DO_3OP(vavg_bu, 8, UB, DO_VAVG)
369 DO_3OP(vavg_hu, 16, UH, DO_VAVG)
370 DO_3OP(vavg_wu, 32, UW, DO_VAVG)
371 DO_3OP(vavg_du, 64, UD, DO_VAVG)
372 DO_3OP(vavgr_bu, 8, UB, DO_VAVGR)
373 DO_3OP(vavgr_hu, 16, UH, DO_VAVGR)
374 DO_3OP(vavgr_wu, 32, UW, DO_VAVGR)
375 DO_3OP(vavgr_du, 64, UD, DO_VAVGR)
376
377 DO_3OP(vabsd_b, 8, B, DO_VABSD)
378 DO_3OP(vabsd_h, 16, H, DO_VABSD)
379 DO_3OP(vabsd_w, 32, W, DO_VABSD)
380 DO_3OP(vabsd_d, 64, D, DO_VABSD)
381 DO_3OP(vabsd_bu, 8, UB, DO_VABSD)
382 DO_3OP(vabsd_hu, 16, UH, DO_VABSD)
383 DO_3OP(vabsd_wu, 32, UW, DO_VABSD)
384 DO_3OP(vabsd_du, 64, UD, DO_VABSD)
385
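/* DO_VADDA: per-element sum of absolute values. */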
386 #define DO_VADDA(NAME, BIT, E) \
387 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
388 { \
389 int i; \
390 VReg *Vd = (VReg *)vd; \
391 VReg *Vj = (VReg *)vj; \
392 VReg *Vk = (VReg *)vk; \
393 int oprsz = simd_oprsz(desc); \
394 \
395 for (i = 0; i < oprsz / (BIT / 8); i++) { \
396 Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \
397 } \
398 }
399
400 DO_VADDA(vadda_b, 8, B)
401 DO_VADDA(vadda_h, 16, H)
402 DO_VADDA(vadda_w, 32, W)
403 DO_VADDA(vadda_d, 64, D)
404
405 #define VMINMAXI(NAME, BIT, E, DO_OP) \
406 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
407 { \
408 int i; \
409 VReg *Vd = (VReg *)vd; \
410 VReg *Vj = (VReg *)vj; \
411 typedef __typeof(Vd->E(0)) TD; \
412 int oprsz = simd_oprsz(desc); \
413 \
414 for (i = 0; i < oprsz / (BIT / 8); i++) { \
415 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \
416 } \
417 }
418
419 VMINMAXI(vmini_b, 8, B, DO_MIN)
420 VMINMAXI(vmini_h, 16, H, DO_MIN)
421 VMINMAXI(vmini_w, 32, W, DO_MIN)
422 VMINMAXI(vmini_d, 64, D, DO_MIN)
423 VMINMAXI(vmaxi_b, 8, B, DO_MAX)
424 VMINMAXI(vmaxi_h, 16, H, DO_MAX)
425 VMINMAXI(vmaxi_w, 32, W, DO_MAX)
426 VMINMAXI(vmaxi_d, 64, D, DO_MAX)
427 VMINMAXI(vmini_bu, 8, UB, DO_MIN)
428 VMINMAXI(vmini_hu, 16, UH, DO_MIN)
429 VMINMAXI(vmini_wu, 32, UW, DO_MIN)
430 VMINMAXI(vmini_du, 64, UD, DO_MIN)
431 VMINMAXI(vmaxi_bu, 8, UB, DO_MAX)
432 VMINMAXI(vmaxi_hu, 16, UH, DO_MAX)
433 VMINMAXI(vmaxi_wu, 32, UW, DO_MAX)
434 VMINMAXI(vmaxi_du, 64, UD, DO_MAX)
435
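/*
 * DO_VMUH: multiply in the double-width type T and keep the high BIT bits
 * of the product.  The 64-bit cases are handled out of line with
 * muls64/mulu64.
 */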
436 #define DO_VMUH(NAME, BIT, E1, E2, DO_OP) \
437 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
438 { \
439 int i; \
440 VReg *Vd = (VReg *)vd; \
441 VReg *Vj = (VReg *)vj; \
442 VReg *Vk = (VReg *)vk; \
443 typedef __typeof(Vd->E1(0)) T; \
444 int oprsz = simd_oprsz(desc); \
445 \
446 for (i = 0; i < oprsz / (BIT / 8); i++) { \
447 Vd->E2(i) = ((T)Vj->E2(i)) * ((T)Vk->E2(i)) >> BIT; \
448 } \
449 }
450
451 void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc)
452 {
453 int i;
454 uint64_t l, h;
455 VReg *Vd = (VReg *)vd;
456 VReg *Vj = (VReg *)vj;
457 VReg *Vk = (VReg *)vk;
458 int oprsz = simd_oprsz(desc);
459
460 for (i = 0; i < oprsz / 8; i++) {
461 muls64(&l, &h, Vj->D(i), Vk->D(i));
462 Vd->D(i) = h;
463 }
464 }
465
466 DO_VMUH(vmuh_b, 8, H, B, DO_MUH)
467 DO_VMUH(vmuh_h, 16, W, H, DO_MUH)
468 DO_VMUH(vmuh_w, 32, D, W, DO_MUH)
469
470 void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc)
471 {
472 int i;
473 uint64_t l, h;
474 VReg *Vd = (VReg *)vd;
475 VReg *Vj = (VReg *)vj;
476 VReg *Vk = (VReg *)vk;
477 int oprsz = simd_oprsz(desc);
478
479 for (i = 0; i < oprsz / 8; i++) {
480 mulu64(&l, &h, Vj->D(i), Vk->D(i));
481 Vd->D(i) = h;
482 }
483 }
484
485 DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH)
486 DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH)
487 DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH)
488
489 DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL)
490 DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL)
491 DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL)
492
493 DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL)
494 DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL)
495 DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL)
496
497 DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL)
498 DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL)
499 DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL)
500
501 DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL)
502 DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL)
503 DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL)
504
505 DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
506 DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
507 DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
508
509 DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
510 DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
511 DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
512
513 #define VMADDSUB(NAME, BIT, E, DO_OP) \
514 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
515 { \
516 int i; \
517 VReg *Vd = (VReg *)vd; \
518 VReg *Vj = (VReg *)vj; \
519 VReg *Vk = (VReg *)vk; \
520 int oprsz = simd_oprsz(desc); \
521 \
522 for (i = 0; i < oprsz / (BIT / 8); i++) { \
523 Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i), Vk->E(i)); \
524 } \
525 }
526
527 VMADDSUB(vmadd_b, 8, B, DO_MADD)
528 VMADDSUB(vmadd_h, 16, H, DO_MADD)
529 VMADDSUB(vmadd_w, 32, W, DO_MADD)
530 VMADDSUB(vmadd_d, 64, D, DO_MADD)
531 VMADDSUB(vmsub_b, 8, B, DO_MSUB)
532 VMADDSUB(vmsub_h, 16, H, DO_MSUB)
533 VMADDSUB(vmsub_w, 32, W, DO_MSUB)
534 VMADDSUB(vmsub_d, 64, D, DO_MSUB)
535
536 #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \
537 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
538 { \
539 int i; \
540 VReg *Vd = (VReg *)vd; \
541 VReg *Vj = (VReg *)vj; \
542 VReg *Vk = (VReg *)vk; \
543 typedef __typeof(Vd->E1(0)) TD; \
544 int oprsz = simd_oprsz(desc); \
545 \
546 for (i = 0; i < oprsz / (BIT / 8); i++) { \
547 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \
548 } \
549 }
550
551 VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL)
552 VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL)
553 VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL)
554 VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL)
555 VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL)
556 VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL)
557
558 #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \
559 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
560 { \
561 int i; \
562 VReg *Vd = (VReg *)vd; \
563 VReg *Vj = (VReg *)vj; \
564 VReg *Vk = (VReg *)vk; \
565 typedef __typeof(Vd->E1(0)) TD; \
566 int oprsz = simd_oprsz(desc); \
567 \
568 for (i = 0; i < oprsz / (BIT / 8); i++) { \
569 Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \
570 (TD)Vk->E2(2 * i + 1)); \
571 } \
572 }
573
574 VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL)
575 VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL)
576 VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL)
577 VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL)
578 VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL)
579 VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL)
580
581 #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
582 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
583 { \
584 int i; \
585 VReg *Vd = (VReg *)vd; \
586 VReg *Vj = (VReg *)vj; \
587 VReg *Vk = (VReg *)vk; \
588 typedef __typeof(Vd->ES1(0)) TS1; \
589 typedef __typeof(Vd->EU1(0)) TU1; \
590 int oprsz = simd_oprsz(desc); \
591 \
592 for (i = 0; i < oprsz / (BIT / 8); i++) { \
593 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \
594 (TS1)Vk->ES2(2 * i)); \
595 } \
596 }
597
598 VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL)
599 VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL)
600 VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL)
601
602 #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \
603 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
604 { \
605 int i; \
606 VReg *Vd = (VReg *)vd; \
607 VReg *Vj = (VReg *)vj; \
608 VReg *Vk = (VReg *)vk; \
609 typedef __typeof(Vd->ES1(0)) TS1; \
610 typedef __typeof(Vd->EU1(0)) TU1; \
611 int oprsz = simd_oprsz(desc); \
612 \
613 for (i = 0; i < oprsz / (BIT / 8); i++) { \
614 Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \
615 (TS1)Vk->ES2(2 * i + 1)); \
616 } \
617 }
618
619 VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL)
620 VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL)
621 VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL)
622
623 #define VDIV(NAME, BIT, E, DO_OP) \
624 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
625 { \
626 int i; \
627 VReg *Vd = (VReg *)vd; \
628 VReg *Vj = (VReg *)vj; \
629 VReg *Vk = (VReg *)vk; \
630 int oprsz = simd_oprsz(desc); \
631 \
632 for (i = 0; i < oprsz / (BIT / 8); i++) { \
633 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \
634 } \
635 }
636
637 VDIV(vdiv_b, 8, B, DO_DIV)
638 VDIV(vdiv_h, 16, H, DO_DIV)
639 VDIV(vdiv_w, 32, W, DO_DIV)
640 VDIV(vdiv_d, 64, D, DO_DIV)
641 VDIV(vdiv_bu, 8, UB, DO_DIVU)
642 VDIV(vdiv_hu, 16, UH, DO_DIVU)
643 VDIV(vdiv_wu, 32, UW, DO_DIVU)
644 VDIV(vdiv_du, 64, UD, DO_DIVU)
645 VDIV(vmod_b, 8, B, DO_REM)
646 VDIV(vmod_h, 16, H, DO_REM)
647 VDIV(vmod_w, 32, W, DO_REM)
648 VDIV(vmod_d, 64, D, DO_REM)
649 VDIV(vmod_bu, 8, UB, DO_REMU)
650 VDIV(vmod_hu, 16, UH, DO_REMU)
651 VDIV(vmod_wu, 32, UW, DO_REMU)
652 VDIV(vmod_du, 64, UD, DO_REMU)
653
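/* VSAT_S clamps each element to the signed range [~max, max]; VSAT_U clamps to max. */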
654 #define VSAT_S(NAME, BIT, E) \
655 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
656 { \
657 int i; \
658 VReg *Vd = (VReg *)vd; \
659 VReg *Vj = (VReg *)vj; \
660 typedef __typeof(Vd->E(0)) TD; \
661 int oprsz = simd_oprsz(desc); \
662 \
663 for (i = 0; i < oprsz / (BIT / 8); i++) { \
664 Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : \
665 Vj->E(i) < (TD)~max ? (TD)~max : Vj->E(i); \
666 } \
667 }
668
669 VSAT_S(vsat_b, 8, B)
670 VSAT_S(vsat_h, 16, H)
671 VSAT_S(vsat_w, 32, W)
672 VSAT_S(vsat_d, 64, D)
673
674 #define VSAT_U(NAME, BIT, E) \
675 void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \
676 { \
677 int i; \
678 VReg *Vd = (VReg *)vd; \
679 VReg *Vj = (VReg *)vj; \
680 typedef __typeof(Vd->E(0)) TD; \
681 int oprsz = simd_oprsz(desc); \
682 \
683 for (i = 0; i < oprsz / (BIT / 8); i++) { \
684 Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i); \
685 } \
686 }
687
688 VSAT_U(vsat_bu, 8, UB)
689 VSAT_U(vsat_hu, 16, UH)
690 VSAT_U(vsat_wu, 32, UW)
691 VSAT_U(vsat_du, 64, UD)
692
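/* VEXTH: widen the elements from the high half of each 128-bit lane. */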
693 #define VEXTH(NAME, BIT, E1, E2) \
694 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
695 { \
696 int i, j, ofs; \
697 VReg *Vd = (VReg *)vd; \
698 VReg *Vj = (VReg *)vj; \
699 int oprsz = simd_oprsz(desc); \
700 \
701 ofs = LSX_LEN / BIT; \
702 for (i = 0; i < oprsz / 16; i++) { \
703 for (j = 0; j < ofs; j++) { \
704 Vd->E1(j + i * ofs) = Vj->E2(j + ofs + ofs * 2 * i); \
705 } \
706 } \
707 }
708
709 void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc)
710 {
711 int i;
712 VReg *Vd = (VReg *)vd;
713 VReg *Vj = (VReg *)vj;
714 int oprsz = simd_oprsz(desc);
715
716 for (i = 0; i < oprsz / 16; i++) {
717 Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1));
718 }
719 }
720
721 void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc)
722 {
723 int i;
724 VReg *Vd = (VReg *)vd;
725 VReg *Vj = (VReg *)vj;
726 int oprsz = simd_oprsz(desc);
727
728 for (i = 0; i < oprsz / 16; i++) {
729 Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1));
730 }
731 }
732
733 VEXTH(vexth_h_b, 16, H, B)
734 VEXTH(vexth_w_h, 32, W, H)
735 VEXTH(vexth_d_w, 64, D, W)
736 VEXTH(vexth_hu_bu, 16, UH, UB)
737 VEXTH(vexth_wu_hu, 32, UW, UH)
738 VEXTH(vexth_du_wu, 64, UD, UW)
739
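/* VEXT2XV: widen the low-numbered elements of Vj to fill the whole destination. */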
740 #define VEXT2XV(NAME, BIT, E1, E2) \
741 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
742 { \
743 int i; \
744 VReg temp = {}; \
745 VReg *Vd = (VReg *)vd; \
746 VReg *Vj = (VReg *)vj; \
747 int oprsz = simd_oprsz(desc); \
748 \
749 for (i = 0; i < oprsz / (BIT / 8); i++) { \
750 temp.E1(i) = Vj->E2(i); \
751 } \
752 *Vd = temp; \
753 }
754
755 VEXT2XV(vext2xv_h_b, 16, H, B)
756 VEXT2XV(vext2xv_w_b, 32, W, B)
757 VEXT2XV(vext2xv_d_b, 64, D, B)
758 VEXT2XV(vext2xv_w_h, 32, W, H)
759 VEXT2XV(vext2xv_d_h, 64, D, H)
760 VEXT2XV(vext2xv_d_w, 64, D, W)
761 VEXT2XV(vext2xv_hu_bu, 16, UH, UB)
762 VEXT2XV(vext2xv_wu_bu, 32, UW, UB)
763 VEXT2XV(vext2xv_du_bu, 64, UD, UB)
764 VEXT2XV(vext2xv_wu_hu, 32, UW, UH)
765 VEXT2XV(vext2xv_du_hu, 64, UD, UH)
766 VEXT2XV(vext2xv_du_wu, 64, UD, UW)
767
768 DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV)
769 DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV)
770 DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV)
771 DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV)
772
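/* Gather the sign bit of each byte of 'val' into the low 8 bits of the result. */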
773 static uint64_t do_vmskltz_b(int64_t val)
774 {
775 uint64_t m = 0x8080808080808080ULL;
776 uint64_t c = val & m;
777 c |= c << 7;
778 c |= c << 14;
779 c |= c << 28;
780 return c >> 56;
781 }
782
783 void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc)
784 {
785 int i;
786 uint16_t temp = 0;
787 VReg *Vd = (VReg *)vd;
788 VReg *Vj = (VReg *)vj;
789 int oprsz = simd_oprsz(desc);
790
791 for (i = 0; i < oprsz / 16; i++) {
792 temp = 0;
793 temp = do_vmskltz_b(Vj->D(2 * i));
794 temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8);
795 Vd->D(2 * i) = temp;
796 Vd->D(2 * i + 1) = 0;
797 }
798 }
799
800 static uint64_t do_vmskltz_h(int64_t val)
801 {
802 uint64_t m = 0x8000800080008000ULL;
803 uint64_t c = val & m;
804 c |= c << 15;
805 c |= c << 30;
806 return c >> 60;
807 }
808
809 void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc)
810 {
811 int i;
812 uint16_t temp = 0;
813 VReg *Vd = (VReg *)vd;
814 VReg *Vj = (VReg *)vj;
815 int oprsz = simd_oprsz(desc);
816
817 for (i = 0; i < oprsz / 16; i++) {
818 temp = 0;
819 temp = do_vmskltz_h(Vj->D(2 * i));
820 temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4);
821 Vd->D(2 * i) = temp;
822 Vd->D(2 * i + 1) = 0;
823 }
824 }
825
826 static uint64_t do_vmskltz_w(int64_t val)
827 {
828 uint64_t m = 0x8000000080000000ULL;
829 uint64_t c = val & m;
830 c |= c << 31;
831 return c >> 62;
832 }
833
834 void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc)
835 {
836 int i;
837 uint16_t temp = 0;
838 VReg *Vd = (VReg *)vd;
839 VReg *Vj = (VReg *)vj;
840 int oprsz = simd_oprsz(desc);
841
842 for (i = 0; i < oprsz / 16; i++) {
843 temp = 0;
844 temp = do_vmskltz_w(Vj->D(2 * i));
845 temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2);
846 Vd->D(2 * i) = temp;
847 Vd->D(2 * i + 1) = 0;
848 }
849 }
850
851 static uint64_t do_vmskltz_d(int64_t val)
852 {
853 return (uint64_t)val >> 63;
854 }
855 void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc)
856 {
857 int i;
858 uint16_t temp = 0;
859 VReg *Vd = (VReg *)vd;
860 VReg *Vj = (VReg *)vj;
861 int oprsz = simd_oprsz(desc);
862
863 for (i = 0; i < oprsz / 16; i++) {
864 temp = 0;
865 temp = do_vmskltz_d(Vj->D(2 * i));
866 temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1);
867 Vd->D(2 * i) = temp;
868 Vd->D(2 * i + 1) = 0;
869 }
870 }
871
872 void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc)
873 {
874 int i;
875 uint16_t temp = 0;
876 VReg *Vd = (VReg *)vd;
877 VReg *Vj = (VReg *)vj;
878 int oprsz = simd_oprsz(desc);
879
880 for (i = 0; i < oprsz / 16; i++) {
881 temp = 0;
882 temp = do_vmskltz_b(Vj->D(2 * i));
883 temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8);
884 Vd->D(2 * i) = (uint16_t)(~temp);
885 Vd->D(2 * i + 1) = 0;
886 }
887 }
888
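/* Return one bit per byte of 'a', set when that byte is zero. */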
889 static uint64_t do_vmskez_b(uint64_t a)
890 {
891 uint64_t m = 0x7f7f7f7f7f7f7f7fULL;
892 uint64_t c = ~(((a & m) + m) | a | m);
893 c |= c << 7;
894 c |= c << 14;
895 c |= c << 28;
896 return c >> 56;
897 }
898
899 void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc)
900 {
901 int i;
902 uint16_t temp = 0;
903 VReg *Vd = (VReg *)vd;
904 VReg *Vj = (VReg *)vj;
905 int oprsz = simd_oprsz(desc);
906
907 for (i = 0; i < oprsz / 16; i++) {
908 temp = 0;
909 temp = do_vmskez_b(Vj->D(2 * i));
910 temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8);
911 Vd->D(2 * i) = (uint16_t)(~temp);
912 Vd->D(2 * i + 1) = 0;
913 }
914 }
915
916 void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc)
917 {
918 int i;
919 VReg *Vd = (VReg *)vd;
920 VReg *Vj = (VReg *)vj;
921
922 for (i = 0; i < simd_oprsz(desc); i++) {
923 Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm);
924 }
925 }
926
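/*
 * VSLLWIL: widen the elements from the low half of each 128-bit lane and
 * shift them left by imm.
 */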
927 #define VSLLWIL(NAME, BIT, E1, E2) \
928 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
929 { \
930 int i, j, ofs; \
931 VReg temp = {}; \
932 VReg *Vd = (VReg *)vd; \
933 VReg *Vj = (VReg *)vj; \
934 int oprsz = simd_oprsz(desc); \
935 typedef __typeof(temp.E1(0)) TD; \
936 \
937 ofs = LSX_LEN / BIT; \
938 for (i = 0; i < oprsz / 16; i++) { \
939 for (j = 0; j < ofs; j++) { \
940 temp.E1(j + ofs * i) = (TD)Vj->E2(j + ofs * 2 * i) << (imm % BIT); \
941 } \
942 } \
943 *Vd = temp; \
944 }
945
946
947 void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc)
948 {
949 int i;
950 VReg *Vd = (VReg *)vd;
951 VReg *Vj = (VReg *)vj;
952 int oprsz = simd_oprsz(desc);
953
954 for (i = 0; i < oprsz / 16; i++) {
955 Vd->Q(i) = int128_makes64(Vj->D(2 * i));
956 }
957 }
958
959 void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc)
960 {
961 int i;
962 VReg *Vd = (VReg *)vd;
963 VReg *Vj = (VReg *)vj;
964 int oprsz = simd_oprsz(desc);
965
966 for (i = 0; i < oprsz / 16; i++) {
967 Vd->Q(i) = int128_make64(Vj->UD(2 * i));
968 }
969 }
970
971 VSLLWIL(vsllwil_h_b, 16, H, B)
972 VSLLWIL(vsllwil_w_h, 32, W, H)
973 VSLLWIL(vsllwil_d_w, 64, D, W)
974 VSLLWIL(vsllwil_hu_bu, 16, UH, UB)
975 VSLLWIL(vsllwil_wu_hu, 32, UW, UH)
976 VSLLWIL(vsllwil_du_wu, 64, UD, UW)
977
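/*
 * do_vsrlr/do_vsrar: logical/arithmetic shift right with rounding, i.e. the
 * last bit shifted out is added back into the result.
 */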
978 #define do_vsrlr(E, T) \
979 static T do_vsrlr_ ##E(T s1, int sh) \
980 { \
981 if (sh == 0) { \
982 return s1; \
983 } else { \
984 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \
985 } \
986 }
987
988 do_vsrlr(B, uint8_t)
989 do_vsrlr(H, uint16_t)
990 do_vsrlr(W, uint32_t)
991 do_vsrlr(D, uint64_t)
992
993 #define VSRLR(NAME, BIT, T, E) \
994 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
995 { \
996 int i; \
997 VReg *Vd = (VReg *)vd; \
998 VReg *Vj = (VReg *)vj; \
999 VReg *Vk = (VReg *)vk; \
1000 int oprsz = simd_oprsz(desc); \
1001 \
1002 for (i = 0; i < oprsz / (BIT / 8); i++) { \
1003 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i)) % BIT); \
1004 } \
1005 }
1006
1007 VSRLR(vsrlr_b, 8, uint8_t, B)
1008 VSRLR(vsrlr_h, 16, uint16_t, H)
1009 VSRLR(vsrlr_w, 32, uint32_t, W)
1010 VSRLR(vsrlr_d, 64, uint64_t, D)
1011
1012 #define VSRLRI(NAME, BIT, E) \
1013 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1014 { \
1015 int i; \
1016 VReg *Vd = (VReg *)vd; \
1017 VReg *Vj = (VReg *)vj; \
1018 int oprsz = simd_oprsz(desc); \
1019 \
1020 for (i = 0; i < oprsz / (BIT / 8); i++) { \
1021 Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \
1022 } \
1023 }
1024
1025 VSRLRI(vsrlri_b, 8, B)
1026 VSRLRI(vsrlri_h, 16, H)
1027 VSRLRI(vsrlri_w, 32, W)
1028 VSRLRI(vsrlri_d, 64, D)
1029
1030 #define do_vsrar(E, T) \
1031 static T do_vsrar_ ##E(T s1, int sh) \
1032 { \
1033 if (sh == 0) { \
1034 return s1; \
1035 } else { \
1036 return (s1 >> sh) + ((s1 >> (sh - 1)) & 0x1); \
1037 } \
1038 }
1039
1040 do_vsrar(B, int8_t)
1041 do_vsrar(H, int16_t)
1042 do_vsrar(W, int32_t)
1043 do_vsrar(D, int64_t)
1044
1045 #define VSRAR(NAME, BIT, T, E) \
1046 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1047 { \
1048 int i; \
1049 VReg *Vd = (VReg *)vd; \
1050 VReg *Vj = (VReg *)vj; \
1051 VReg *Vk = (VReg *)vk; \
1052 int oprsz = simd_oprsz(desc); \
1053 \
1054 for (i = 0; i < oprsz / (BIT / 8); i++) { \
1055 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i)) % BIT); \
1056 } \
1057 }
1058
1059 VSRAR(vsrar_b, 8, uint8_t, B)
1060 VSRAR(vsrar_h, 16, uint16_t, H)
1061 VSRAR(vsrar_w, 32, uint32_t, W)
1062 VSRAR(vsrar_d, 64, uint64_t, D)
1063
1064 #define VSRARI(NAME, BIT, E) \
1065 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1066 { \
1067 int i; \
1068 VReg *Vd = (VReg *)vd; \
1069 VReg *Vj = (VReg *)vj; \
1070 int oprsz = simd_oprsz(desc); \
1071 \
1072 for (i = 0; i < oprsz / (BIT / 8); i++) { \
1073 Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \
1074 } \
1075 }
1076
1077 VSRARI(vsrari_b, 8, B)
1078 VSRARI(vsrari_h, 16, H)
1079 VSRARI(vsrari_w, 32, W)
1080 VSRARI(vsrari_d, 64, D)
1081
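/*
 * VSRLN/VSRAN: shift right and narrow; the results are packed into the low
 * half of each 128-bit lane of Vd and the high half is zeroed.
 */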
1082 #define VSRLN(NAME, BIT, E1, E2) \
1083 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1084 { \
1085 int i, j, ofs; \
1086 VReg *Vd = (VReg *)vd; \
1087 VReg *Vj = (VReg *)vj; \
1088 VReg *Vk = (VReg *)vk; \
1089 int oprsz = simd_oprsz(desc); \
1090 \
1091 ofs = LSX_LEN / BIT; \
1092 for (i = 0; i < oprsz / 16; i++) { \
1093 for (j = 0; j < ofs; j++) { \
1094 Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \
1095 Vk->E2(j + ofs * i) % BIT); \
1096 } \
1097 Vd->D(2 * i + 1) = 0; \
1098 } \
1099 }
1100
1101 VSRLN(vsrln_b_h, 16, B, UH)
1102 VSRLN(vsrln_h_w, 32, H, UW)
1103 VSRLN(vsrln_w_d, 64, W, UD)
1104
1105 #define VSRAN(NAME, BIT, E1, E2, E3) \
1106 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1107 { \
1108 int i, j, ofs; \
1109 VReg *Vd = (VReg *)vd; \
1110 VReg *Vj = (VReg *)vj; \
1111 VReg *Vk = (VReg *)vk; \
1112 int oprsz = simd_oprsz(desc); \
1113 \
1114 ofs = LSX_LEN / BIT; \
1115 for (i = 0; i < oprsz / 16; i++) { \
1116 for (j = 0; j < ofs; j++) { \
1117 Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \
1118 Vk->E3(j + ofs * i) % BIT); \
1119 } \
1120 Vd->D(2 * i + 1) = 0; \
1121 } \
1122 }
1123
1124 VSRAN(vsran_b_h, 16, B, H, UH)
1125 VSRAN(vsran_h_w, 32, H, W, UW)
1126 VSRAN(vsran_w_d, 64, W, D, UD)
1127
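/*
 * VSRLNI/VSRANI: immediate narrowing shifts; within each 128-bit lane the
 * low half of the result comes from Vj and the high half from the old Vd.
 */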
1128 #define VSRLNI(NAME, BIT, E1, E2) \
1129 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1130 { \
1131 int i, j, ofs; \
1132 VReg temp = {}; \
1133 VReg *Vd = (VReg *)vd; \
1134 VReg *Vj = (VReg *)vj; \
1135 int oprsz = simd_oprsz(desc); \
1136 \
1137 ofs = LSX_LEN / BIT; \
1138 for (i = 0; i < oprsz / 16; i++) { \
1139 for (j = 0; j < ofs; j++) { \
1140 temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \
1141 temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \
1142 imm); \
1143 } \
1144 } \
1145 *Vd = temp; \
1146 }
1147
1148 void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1149 {
1150 int i;
1151 VReg temp = {};
1152 VReg *Vd = (VReg *)vd;
1153 VReg *Vj = (VReg *)vj;
1154
1155 for (i = 0; i < 2; i++) {
1156 temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128));
1157 temp.D(2 * i + 1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128));
1158 }
1159 *Vd = temp;
1160 }
1161
1162 VSRLNI(vsrlni_b_h, 16, B, UH)
1163 VSRLNI(vsrlni_h_w, 32, H, UW)
1164 VSRLNI(vsrlni_w_d, 64, W, UD)
1165
1166 #define VSRANI(NAME, BIT, E1, E2) \
1167 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1168 { \
1169 int i, j, ofs; \
1170 VReg temp = {}; \
1171 VReg *Vd = (VReg *)vd; \
1172 VReg *Vj = (VReg *)vj; \
1173 int oprsz = simd_oprsz(desc); \
1174 \
1175 ofs = LSX_LEN / BIT; \
1176 for (i = 0; i < oprsz / 16; i++) { \
1177 for (j = 0; j < ofs; j++) { \
1178 temp.E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), imm); \
1179 temp.E1(j + ofs * (2 * i + 1)) = R_SHIFT(Vd->E2(j + ofs * i), \
1180 imm); \
1181 } \
1182 } \
1183 *Vd = temp; \
1184 }
1185
1186 void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1187 {
1188 int i;
1189 VReg temp = {};
1190 VReg *Vd = (VReg *)vd;
1191 VReg *Vj = (VReg *)vj;
1192
1193 for (i = 0; i < 2; i++) {
1194 temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128));
1195 temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128));
1196 }
1197 *Vd = temp;
1198 }
1199
1200 VSRANI(vsrani_b_h, 16, B, H)
1201 VSRANI(vsrani_h_w, 32, H, W)
1202 VSRANI(vsrani_w_d, 64, W, D)
1203
1204 #define VSRLRN(NAME, BIT, E1, E2, E3) \
1205 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1206 { \
1207 int i, j, ofs; \
1208 VReg *Vd = (VReg *)vd; \
1209 VReg *Vj = (VReg *)vj; \
1210 VReg *Vk = (VReg *)vk; \
1211 int oprsz = simd_oprsz(desc); \
1212 \
1213 ofs = LSX_LEN / BIT; \
1214 for (i = 0; i < oprsz / 16; i++) { \
1215 for (j = 0; j < ofs; j++) { \
1216 Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i), \
1217 Vk->E3(j + ofs * i) % BIT); \
1218 } \
1219 Vd->D(2 * i + 1) = 0; \
1220 } \
1221 }
1222
1223 VSRLRN(vsrlrn_b_h, 16, B, H, UH)
1224 VSRLRN(vsrlrn_h_w, 32, H, W, UW)
1225 VSRLRN(vsrlrn_w_d, 64, W, D, UD)
1226
1227 #define VSRARN(NAME, BIT, E1, E2, E3) \
1228 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1229 { \
1230 int i, j, ofs; \
1231 VReg *Vd = (VReg *)vd; \
1232 VReg *Vj = (VReg *)vj; \
1233 VReg *Vk = (VReg *)vk; \
1234 int oprsz = simd_oprsz(desc); \
1235 \
1236 ofs = LSX_LEN / BIT; \
1237 for (i = 0; i < oprsz / 16; i++) { \
1238 for (j = 0; j < ofs; j++) { \
1239 Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), \
1240 Vk->E3(j + ofs * i) % BIT); \
1241 } \
1242 Vd->D(2 * i + 1) = 0; \
1243 } \
1244 }
1245
1246 VSRARN(vsrarn_b_h, 16, B, H, UH)
1247 VSRARN(vsrarn_h_w, 32, H, W, UW)
1248 VSRARN(vsrarn_w_d, 64, W, D, UD)
1249
1250 #define VSRLRNI(NAME, BIT, E1, E2) \
1251 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1252 { \
1253 int i, j, ofs; \
1254 VReg temp = {}; \
1255 VReg *Vd = (VReg *)vd; \
1256 VReg *Vj = (VReg *)vj; \
1257 int oprsz = simd_oprsz(desc); \
1258 \
1259 ofs = LSX_LEN / BIT; \
1260 for (i = 0; i < oprsz / 16; i++) { \
1261 for (j = 0; j < ofs; j++) { \
1262 temp.E1(j + ofs * 2 * i) = do_vsrlr_ ## E2(Vj->E2(j + ofs * i), imm); \
1263 temp.E1(j + ofs * (2 * i + 1)) = do_vsrlr_ ## E2(Vd->E2(j + ofs * i), \
1264 imm); \
1265 } \
1266 } \
1267 *Vd = temp; \
1268 }
1269
1270 void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1271 {
1272 int i;
1273 VReg temp = {};
1274 VReg *Vd = (VReg *)vd;
1275 VReg *Vj = (VReg *)vj;
1276 Int128 r[4];
1277 int oprsz = simd_oprsz(desc);
1278
1279 for (i = 0; i < oprsz / 16; i++) {
1280 if (imm == 0) {
1281 temp.D(2 * i) = int128_getlo(Vj->Q(i));
1282 temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1283 } else {
1284 r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)),
1285 int128_one());
1286 r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)),
1287 int128_one());
1288 temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i),
1289 imm), r[2 * i]));
1290 temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i),
1291 imm), r[2 * i + 1]));
1292 }
1293 }
1294 *Vd = temp;
1295 }
1296
1297 VSRLRNI(vsrlrni_b_h, 16, B, H)
1298 VSRLRNI(vsrlrni_h_w, 32, H, W)
1299 VSRLRNI(vsrlrni_w_d, 64, W, D)
1300
1301 #define VSRARNI(NAME, BIT, E1, E2) \
1302 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1303 { \
1304 int i, j, ofs; \
1305 VReg temp = {}; \
1306 VReg *Vd = (VReg *)vd; \
1307 VReg *Vj = (VReg *)vj; \
1308 int oprsz = simd_oprsz(desc); \
1309 \
1310 ofs = LSX_LEN / BIT; \
1311 for (i = 0; i < oprsz / 16; i++) { \
1312 for (j = 0; j < ofs; j++) { \
1313 temp.E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), imm); \
1314 temp.E1(j + ofs * (2 * i + 1)) = do_vsrar_ ## E2(Vd->E2(j + ofs * i), \
1315 imm); \
1316 } \
1317 } \
1318 *Vd = temp; \
1319 }
1320
1321 void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1322 {
1323 int i;
1324 VReg temp = {};
1325 VReg *Vd = (VReg *)vd;
1326 VReg *Vj = (VReg *)vj;
1327 Int128 r[4];
1328 int oprsz = simd_oprsz(desc);
1329
1330 for (i = 0; i < oprsz / 16; i++) {
1331 if (imm == 0) {
1332 temp.D(2 * i) = int128_getlo(Vj->Q(i));
1333 temp.D(2 * i + 1) = int128_getlo(Vd->Q(i));
1334 } else {
1335 r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)),
1336 int128_one());
1337 r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)),
1338 int128_one());
1339 temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i),
1340 imm), r[2 * i]));
1341 temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i),
1342 imm), r[2 * i + 1]));
1343 }
1344 }
1345 *Vd = temp;
1346 }
1347
1348 VSRARNI(vsrarni_b_h, 16, B, H)
1349 VSRARNI(vsrarni_h_w, 32, H, W)
1350 VSRARNI(vsrarni_w_d, 64, W, D)
1351
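/*
 * do_ssrlns_*: logical shift right by 'sa', then saturate the result to an
 * unsigned 'sh'-bit maximum.  The remaining do_ssr* helpers below follow the
 * same pattern for the arithmetic, unsigned and rounding variants.
 */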
1352 #define SSRLNS(NAME, T1, T2, T3) \
1353 static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \
1354 { \
1355 T1 shft_res; \
1356 if (sa == 0) { \
1357 shft_res = e2; \
1358 } else { \
1359 shft_res = (((T1)e2) >> sa); \
1360 } \
1361 T3 mask; \
1362 mask = (1ull << sh) - 1; \
1363 if (shft_res > mask) { \
1364 return mask; \
1365 } else { \
1366 return shft_res; \
1367 } \
1368 }
1369
1370 SSRLNS(B, uint16_t, int16_t, uint8_t)
1371 SSRLNS(H, uint32_t, int32_t, uint16_t)
1372 SSRLNS(W, uint64_t, int64_t, uint32_t)
1373
1374 #define VSSRLN(NAME, BIT, E1, E2, E3) \
1375 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1376 { \
1377 int i, j, ofs; \
1378 VReg *Vd = (VReg *)vd; \
1379 VReg *Vj = (VReg *)vj; \
1380 VReg *Vk = (VReg *)vk; \
1381 int oprsz = simd_oprsz(desc); \
1382 \
1383 ofs = LSX_LEN / BIT; \
1384 for (i = 0; i < oprsz / 16; i++) { \
1385 for (j = 0; j < ofs; j++) { \
1386 Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \
1387 Vk->E3(j + ofs * i) % BIT, \
1388 BIT / 2 - 1); \
1389 } \
1390 Vd->D(2 * i + 1) = 0; \
1391 } \
1392 }
1393
1394 VSSRLN(vssrln_b_h, 16, B, H, UH)
1395 VSSRLN(vssrln_h_w, 32, H, W, UW)
1396 VSSRLN(vssrln_w_d, 64, W, D, UD)
1397
1398 #define SSRANS(E, T1, T2) \
1399 static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \
1400 { \
1401 T1 shft_res; \
1402 if (sa == 0) { \
1403 shft_res = e2; \
1404 } else { \
1405 shft_res = e2 >> sa; \
1406 } \
1407 T2 mask; \
1408 mask = (1ll << sh) - 1; \
1409 if (shft_res > mask) { \
1410 return mask; \
1411 } else if (shft_res < -(mask + 1)) { \
1412 return ~mask; \
1413 } else { \
1414 return shft_res; \
1415 } \
1416 }
1417
1418 SSRANS(B, int16_t, int8_t)
1419 SSRANS(H, int32_t, int16_t)
1420 SSRANS(W, int64_t, int32_t)
1421
1422 #define VSSRAN(NAME, BIT, E1, E2, E3) \
1423 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1424 { \
1425 int i, j, ofs; \
1426 VReg *Vd = (VReg *)vd; \
1427 VReg *Vj = (VReg *)vj; \
1428 VReg *Vk = (VReg *)vk; \
1429 int oprsz = simd_oprsz(desc); \
1430 \
1431 ofs = LSX_LEN / BIT; \
1432 for (i = 0; i < oprsz / 16; i++) { \
1433 for (j = 0; j < ofs; j++) { \
1434 Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \
1435 Vk->E3(j + ofs * i) % BIT, \
1436 BIT / 2 - 1); \
1437 } \
1438 Vd->D(2 * i + 1) = 0; \
1439 } \
1440 }
1441
1442 VSSRAN(vssran_b_h, 16, B, H, UH)
1443 VSSRAN(vssran_h_w, 32, H, W, UW)
1444 VSSRAN(vssran_w_d, 64, W, D, UD)
1445
1446 #define SSRLNU(E, T1, T2, T3) \
1447 static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \
1448 { \
1449 T1 shft_res; \
1450 if (sa == 0) { \
1451 shft_res = e2; \
1452 } else { \
1453 shft_res = (((T1)e2) >> sa); \
1454 } \
1455 T2 mask; \
1456 mask = (1ull << sh) - 1; \
1457 if (shft_res > mask) { \
1458 return mask; \
1459 } else { \
1460 return shft_res; \
1461 } \
1462 }
1463
1464 SSRLNU(B, uint16_t, uint8_t, int16_t)
1465 SSRLNU(H, uint32_t, uint16_t, int32_t)
1466 SSRLNU(W, uint64_t, uint32_t, int64_t)
1467
1468 #define VSSRLNU(NAME, BIT, E1, E2, E3) \
1469 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1470 { \
1471 int i, j, ofs; \
1472 VReg *Vd = (VReg *)vd; \
1473 VReg *Vj = (VReg *)vj; \
1474 VReg *Vk = (VReg *)vk; \
1475 int oprsz = simd_oprsz(desc); \
1476 \
1477 ofs = LSX_LEN / BIT; \
1478 for (i = 0; i < oprsz / 16; i++) { \
1479 for (j = 0; j < ofs; j++) { \
1480 Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \
1481 Vk->E3(j + ofs * i) % BIT, \
1482 BIT / 2); \
1483 } \
1484 Vd->D(2 * i + 1) = 0; \
1485 } \
1486 }
1487
1488 VSSRLNU(vssrln_bu_h, 16, B, H, UH)
1489 VSSRLNU(vssrln_hu_w, 32, H, W, UW)
1490 VSSRLNU(vssrln_wu_d, 64, W, D, UD)
1491
1492 #define SSRANU(E, T1, T2, T3) \
1493 static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \
1494 { \
1495 T1 shft_res; \
1496 if (sa == 0) { \
1497 shft_res = e2; \
1498 } else { \
1499 shft_res = e2 >> sa; \
1500 } \
1501 if (e2 < 0) { \
1502 shft_res = 0; \
1503 } \
1504 T2 mask; \
1505 mask = (1ull << sh) - 1; \
1506 if (shft_res > mask) { \
1507 return mask; \
1508 } else { \
1509 return shft_res; \
1510 } \
1511 }
1512
1513 SSRANU(B, uint16_t, uint8_t, int16_t)
1514 SSRANU(H, uint32_t, uint16_t, int32_t)
1515 SSRANU(W, uint64_t, uint32_t, int64_t)
1516
1517 #define VSSRANU(NAME, BIT, E1, E2, E3) \
1518 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1519 { \
1520 int i, j, ofs; \
1521 VReg *Vd = (VReg *)vd; \
1522 VReg *Vj = (VReg *)vj; \
1523 VReg *Vk = (VReg *)vk; \
1524 int oprsz = simd_oprsz(desc); \
1525 \
1526 ofs = LSX_LEN / BIT; \
1527 for (i = 0; i < oprsz / 16; i++) { \
1528 for (j = 0; j < ofs; j++) { \
1529 Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \
1530 Vk->E3(j + ofs * i) % BIT, \
1531 BIT / 2); \
1532 } \
1533 Vd->D(2 * i + 1) = 0; \
1534 } \
1535 }
1536
1537 VSSRANU(vssran_bu_h, 16, B, H, UH)
1538 VSSRANU(vssran_hu_w, 32, H, W, UW)
1539 VSSRANU(vssran_wu_d, 64, W, D, UD)
1540
1541 #define VSSRLNI(NAME, BIT, E1, E2) \
1542 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1543 { \
1544 int i, j, ofs; \
1545 VReg temp = {}; \
1546 VReg *Vd = (VReg *)vd; \
1547 VReg *Vj = (VReg *)vj; \
1548 int oprsz = simd_oprsz(desc); \
1549 \
1550 ofs = LSX_LEN / BIT; \
1551 for (i = 0; i < oprsz / 16; i++) { \
1552 for (j = 0; j < ofs; j++) { \
1553 temp.E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \
1554 imm, BIT / 2 - 1); \
1555 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlns_ ## E1(Vd->E2(j + ofs * i), \
1556 imm, BIT / 2 - 1); \
1557 } \
1558 } \
1559 *Vd = temp; \
1560 }
1561
1562 static void do_vssrlni_q(VReg *Vd, VReg *Vj,
1563 uint64_t imm, int idx, Int128 mask)
1564 {
1565 Int128 shft_res1, shft_res2;
1566
1567 if (imm == 0) {
1568 shft_res1 = Vj->Q(idx);
1569 shft_res2 = Vd->Q(idx);
1570 } else {
1571 shft_res1 = int128_urshift(Vj->Q(idx), imm);
1572 shft_res2 = int128_urshift(Vd->Q(idx), imm);
1573 }
1574
1575 if (int128_ult(mask, shft_res1)) {
1576 Vd->D(idx * 2) = int128_getlo(mask);
1577 } else {
1578 Vd->D(idx * 2) = int128_getlo(shft_res1);
1579 }
1580
1581 if (int128_ult(mask, shft_res2)) {
1582 Vd->D(idx * 2 + 1) = int128_getlo(mask);
1583 } else {
1584 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
1585 }
1586 }
1587
1588 void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1589 {
1590 int i;
1591 Int128 mask;
1592 VReg *Vd = (VReg *)vd;
1593 VReg *Vj = (VReg *)vj;
1594 int oprsz = simd_oprsz(desc);
1595
1596 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1597
1598 for (i = 0; i < oprsz / 16; i++) {
1599 do_vssrlni_q(Vd, Vj, imm, i, mask);
1600 }
1601 }
1602
1603 VSSRLNI(vssrlni_b_h, 16, B, H)
1604 VSSRLNI(vssrlni_h_w, 32, H, W)
1605 VSSRLNI(vssrlni_w_d, 64, W, D)
1606
1607 #define VSSRANI(NAME, BIT, E1, E2) \
1608 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1609 { \
1610 int i, j, ofs; \
1611 VReg temp = {}; \
1612 VReg *Vd = (VReg *)vd; \
1613 VReg *Vj = (VReg *)vj; \
1614 int oprsz = simd_oprsz(desc); \
1615 \
1616 ofs = LSX_LEN / BIT; \
1617 for (i = 0; i < oprsz / 16; i++) { \
1618 for (j = 0; j < ofs; j++) { \
1619 temp.E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \
1620 imm, BIT / 2 - 1); \
1621 temp.E1(j + ofs * (2 * i + 1)) = do_ssrans_ ## E1(Vd->E2(j + ofs * i), \
1622 imm, BIT / 2 - 1); \
1623 } \
1624 } \
1625 *Vd = temp; \
1626 }
1627
1628 static void do_vssrani_d_q(VReg *Vd, VReg *Vj,
1629 uint64_t imm, int idx, Int128 mask, Int128 min)
1630 {
1631 Int128 shft_res1, shft_res2;
1632
1633 if (imm == 0) {
1634 shft_res1 = Vj->Q(idx);
1635 shft_res2 = Vd->Q(idx);
1636 } else {
1637 shft_res1 = int128_rshift(Vj->Q(idx), imm);
1638 shft_res2 = int128_rshift(Vd->Q(idx), imm);
1639 }
1640
1641 if (int128_gt(shft_res1, mask)) {
1642 Vd->D(idx * 2) = int128_getlo(mask);
1643 } else if (int128_lt(shft_res1, int128_neg(min))) {
1644 Vd->D(idx * 2) = int128_getlo(min);
1645 } else {
1646 Vd->D(idx * 2) = int128_getlo(shft_res1);
1647 }
1648
1649 if (int128_gt(shft_res2, mask)) {
1650 Vd->D(idx * 2 + 1) = int128_getlo(mask);
1651 } else if (int128_lt(shft_res2, int128_neg(min))) {
1652 Vd->D(idx * 2 + 1) = int128_getlo(min);
1653 } else {
1654 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
1655 }
1656 }
1657
1658 void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1659 {
1660 int i;
1661 Int128 mask, min;
1662 VReg *Vd = (VReg *)vd;
1663 VReg *Vj = (VReg *)vj;
1664 int oprsz = simd_oprsz(desc);
1665
1666 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
1667 min = int128_lshift(int128_one(), 63);
1668
1669 for (i = 0; i < oprsz / 16; i++) {
1670 do_vssrani_d_q(Vd, Vj, imm, i, mask, min);
1671 }
1672 }
1673
1674
1675 VSSRANI(vssrani_b_h, 16, B, H)
1676 VSSRANI(vssrani_h_w, 32, H, W)
1677 VSSRANI(vssrani_w_d, 64, W, D)
1678
1679 #define VSSRLNUI(NAME, BIT, E1, E2) \
1680 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1681 { \
1682 int i, j, ofs; \
1683 VReg temp = {}; \
1684 VReg *Vd = (VReg *)vd; \
1685 VReg *Vj = (VReg *)vj; \
1686 int oprsz = simd_oprsz(desc); \
1687 \
1688 ofs = LSX_LEN / BIT; \
1689 for (i = 0; i < oprsz / 16; i++) { \
1690 for (j = 0; j < ofs; j++) { \
1691 temp.E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \
1692 imm, BIT / 2); \
1693 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlnu_ ## E1(Vd->E2(j + ofs * i), \
1694 imm, BIT / 2); \
1695 } \
1696 } \
1697 *Vd = temp; \
1698 }
1699
1700 void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1701 {
1702 int i;
1703 Int128 mask;
1704 VReg *Vd = (VReg *)vd;
1705 VReg *Vj = (VReg *)vj;
1706 int oprsz = simd_oprsz(desc);
1707
1708 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1709
1710 for (i = 0; i < oprsz / 16; i++) {
1711 do_vssrlni_q(Vd, Vj, imm, i, mask);
1712 }
1713 }
1714
1715 VSSRLNUI(vssrlni_bu_h, 16, B, H)
1716 VSSRLNUI(vssrlni_hu_w, 32, H, W)
1717 VSSRLNUI(vssrlni_wu_d, 64, W, D)
1718
1719 #define VSSRANUI(NAME, BIT, E1, E2) \
1720 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1721 { \
1722 int i, j, ofs; \
1723 VReg temp = {}; \
1724 VReg *Vd = (VReg *)vd; \
1725 VReg *Vj = (VReg *)vj; \
1726 int oprsz = simd_oprsz(desc); \
1727 \
1728 ofs = LSX_LEN / BIT; \
1729 for (i = 0; i < oprsz / 16; i++) { \
1730 for (j = 0; j < ofs; j++) { \
1731 temp.E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \
1732 imm, BIT / 2); \
1733 temp.E1(j + ofs * (2 * i + 1)) = do_ssranu_ ## E1(Vd->E2(j + ofs * i), \
1734 imm, BIT / 2); \
1735 } \
1736 } \
1737 *Vd = temp; \
1738 }
1739
1740 static void do_vssrani_du_q(VReg *Vd, VReg *Vj,
1741 uint64_t imm, int idx, Int128 mask)
1742 {
1743 Int128 shft_res1, shft_res2;
1744
1745 if (imm == 0) {
1746 shft_res1 = Vj->Q(idx);
1747 shft_res2 = Vd->Q(idx);
1748 } else {
1749 shft_res1 = int128_rshift(Vj->Q(idx), imm);
1750 shft_res2 = int128_rshift(Vd->Q(idx), imm);
1751 }
1752
1753 if (int128_lt(Vj->Q(idx), int128_zero())) {
1754 shft_res1 = int128_zero();
1755 }
1756
1757 if (int128_lt(Vd->Q(idx), int128_zero())) {
1758 shft_res2 = int128_zero();
1759 }
1760 if (int128_ult(mask, shft_res1)) {
1761 Vd->D(idx * 2) = int128_getlo(mask);
1762 } else {
1763 Vd->D(idx * 2) = int128_getlo(shft_res1);
1764 }
1765
1766 if (int128_ult(mask, shft_res2)) {
1767 Vd->D(idx * 2 + 1) = int128_getlo(mask);
1768 } else {
1769 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
1770 }
1771
1772 }
1773
1774 void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
1775 {
1776 int i;
1777 Int128 mask;
1778 VReg *Vd = (VReg *)vd;
1779 VReg *Vj = (VReg *)vj;
1780 int oprsz = simd_oprsz(desc);
1781
1782 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
1783
1784 for (i = 0; i < oprsz / 16; i++) {
1785 do_vssrani_du_q(Vd, Vj, imm, i, mask);
1786 }
1787 }
1788
1789 VSSRANUI(vssrani_bu_h, 16, B, H)
1790 VSSRANUI(vssrani_hu_w, 32, H, W)
1791 VSSRANUI(vssrani_wu_d, 64, W, D)
1792
1793 #define SSRLRNS(E1, E2, T1, T2, T3) \
1794 static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \
1795 { \
1796 T1 shft_res; \
1797 \
1798 shft_res = do_vsrlr_ ## E2(e2, sa); \
1799 T1 mask; \
1800 mask = (1ull << sh) - 1; \
1801 if (shft_res > mask) { \
1802 return mask; \
1803 } else { \
1804 return shft_res; \
1805 } \
1806 }
1807
1808 SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
1809 SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
1810 SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1811
1812 #define VSSRLRN(NAME, BIT, E1, E2, E3) \
1813 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1814 { \
1815 int i, j, ofs; \
1816 VReg *Vd = (VReg *)vd; \
1817 VReg *Vj = (VReg *)vj; \
1818 VReg *Vk = (VReg *)vk; \
1819 int oprsz = simd_oprsz(desc); \
1820 \
1821 ofs = LSX_LEN / BIT; \
1822 for (i = 0; i < oprsz / 16; i++) { \
1823 for (j = 0; j < ofs; j++) { \
1824 Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \
1825 Vk->E3(j + ofs * i) % BIT, \
1826 BIT / 2 - 1); \
1827 } \
1828 Vd->D(2 * i + 1) = 0; \
1829 } \
1830 }
1831
1832 VSSRLRN(vssrlrn_b_h, 16, B, H, UH)
1833 VSSRLRN(vssrlrn_h_w, 32, H, W, UW)
1834 VSSRLRN(vssrlrn_w_d, 64, W, D, UD)
1835
1836 #define SSRARNS(E1, E2, T1, T2) \
1837 static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \
1838 { \
1839 T1 shft_res; \
1840 \
1841 shft_res = do_vsrar_ ## E2(e2, sa); \
1842 T2 mask; \
1843 mask = (1ll << sh) - 1; \
1844 if (shft_res > mask) { \
1845 return mask; \
1846 } else if (shft_res < -(mask + 1)) { \
1847 return ~mask; \
1848 } else { \
1849 return shft_res; \
1850 } \
1851 }
1852
1853 SSRARNS(B, H, int16_t, int8_t)
1854 SSRARNS(H, W, int32_t, int16_t)
1855 SSRARNS(W, D, int64_t, int32_t)
1856
1857 #define VSSRARN(NAME, BIT, E1, E2, E3) \
1858 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1859 { \
1860 int i, j, ofs; \
1861 VReg *Vd = (VReg *)vd; \
1862 VReg *Vj = (VReg *)vj; \
1863 VReg *Vk = (VReg *)vk; \
1864 int oprsz = simd_oprsz(desc); \
1865 \
1866 ofs = LSX_LEN / BIT; \
1867 for (i = 0; i < oprsz / 16; i++) { \
1868 for (j = 0; j < ofs; j++) { \
1869 Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \
1870 Vk->E3(j + ofs * i) % BIT, \
1871 BIT / 2 - 1); \
1872 } \
1873 Vd->D(2 * i + 1) = 0; \
1874 } \
1875 }
1876
1877 VSSRARN(vssrarn_b_h, 16, B, H, UH)
1878 VSSRARN(vssrarn_h_w, 32, H, W, UW)
1879 VSSRARN(vssrarn_w_d, 64, W, D, UD)
1880
1881 #define SSRLRNU(E1, E2, T1, T2, T3) \
1882 static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh) \
1883 { \
1884 T1 shft_res; \
1885 \
1886 shft_res = do_vsrlr_ ## E2(e2, sa); \
1887 \
1888 T2 mask; \
1889 mask = (1ull << sh) - 1; \
1890 if (shft_res > mask) { \
1891 return mask; \
1892 } else { \
1893 return shft_res; \
1894 } \
1895 }
1896
1897 SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
1898 SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
1899 SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1900
1901 #define VSSRLRNU(NAME, BIT, E1, E2, E3) \
1902 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1903 { \
1904 int i, j, ofs; \
1905 VReg *Vd = (VReg *)vd; \
1906 VReg *Vj = (VReg *)vj; \
1907 VReg *Vk = (VReg *)vk; \
1908 int oprsz = simd_oprsz(desc); \
1909 \
1910 ofs = LSX_LEN / BIT; \
1911 for (i = 0; i < oprsz / 16; i++) { \
1912 for (j = 0; j < ofs; j++) { \
1913 Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \
1914 Vk->E3(j + ofs * i) % BIT, \
1915 BIT / 2); \
1916 } \
1917 Vd->D(2 * i + 1) = 0; \
1918 } \
1919 }
1920
1921 VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH)
1922 VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW)
1923 VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD)
1924
1925 #define SSRARNU(E1, E2, T1, T2, T3) \
1926 static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh) \
1927 { \
1928 T1 shft_res; \
1929 \
1930 if (e2 < 0) { \
1931 shft_res = 0; \
1932 } else { \
1933 shft_res = do_vsrar_ ## E2(e2, sa); \
1934 } \
1935 T2 mask; \
1936 mask = (1ull << sh) - 1; \
1937 if (shft_res > mask) { \
1938 return mask; \
1939 } else { \
1940 return shft_res; \
1941 } \
1942 }
1943
1944 SSRARNU(B, H, uint16_t, uint8_t, int16_t)
1945 SSRARNU(H, W, uint32_t, uint16_t, int32_t)
1946 SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1947
1948 #define VSSRARNU(NAME, BIT, E1, E2, E3) \
1949 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
1950 { \
1951 int i, j, ofs; \
1952 VReg *Vd = (VReg *)vd; \
1953 VReg *Vj = (VReg *)vj; \
1954 VReg *Vk = (VReg *)vk; \
1955 int oprsz = simd_oprsz(desc); \
1956 \
1957 ofs = LSX_LEN / BIT; \
1958 for (i = 0; i < oprsz / 16; i++) { \
1959 for (j = 0; j < ofs; j++) { \
1960 Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \
1961 Vk->E3(j + ofs * i) % BIT, \
1962 BIT / 2); \
1963 } \
1964 Vd->D(2 * i + 1) = 0; \
1965 } \
1966 }
1967
1968 VSSRARNU(vssrarn_bu_h, 16, B, H, UH)
1969 VSSRARNU(vssrarn_hu_w, 32, H, W, UW)
1970 VSSRARNU(vssrarn_wu_d, 64, W, D, UD)
1971
1972 #define VSSRLRNI(NAME, BIT, E1, E2) \
1973 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
1974 { \
1975 int i, j, ofs; \
1976 VReg temp = {}; \
1977 VReg *Vd = (VReg *)vd; \
1978 VReg *Vj = (VReg *)vj; \
1979 int oprsz = simd_oprsz(desc); \
1980 \
1981 ofs = LSX_LEN / BIT; \
1982 for (i = 0; i < oprsz / 16; i++) { \
1983 for (j = 0; j < ofs; j++) { \
1984 temp.E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \
1985 imm, BIT / 2 - 1); \
1986 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrns_ ## E1(Vd->E2(j + ofs * i), \
1987 imm, BIT / 2 - 1); \
1988 } \
1989 } \
1990 *Vd = temp; \
1991 }
1992
1993 static void do_vssrlrni_q(VReg *Vd, VReg *Vj,
1994 uint64_t imm, int idx, Int128 mask)
1995 {
1996 Int128 shft_res1, shft_res2, r1, r2;
1997 if (imm == 0) {
1998 shft_res1 = Vj->Q(idx);
1999 shft_res2 = Vd->Q(idx);
2000 } else {
2001 r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one());
2002 r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one());
2003 shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1));
2004 shft_res2 = (int128_add(int128_urshift(Vd->Q(idx), imm), r2));
2005 }
2006
2007 if (int128_ult(mask, shft_res1)) {
2008 Vd->D(idx * 2) = int128_getlo(mask);
2009 } else {
2010 Vd->D(idx * 2) = int128_getlo(shft_res1);
2011 }
2012
2013 if (int128_ult(mask, shft_res2)) {
2014 Vd->D(idx * 2 + 1) = int128_getlo(mask);
2015 } else {
2016 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
2017 }
2018 }
2019
2020 void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2021 {
2022 int i;
2023 Int128 mask;
2024 VReg *Vd = (VReg *)vd;
2025 VReg *Vj = (VReg *)vj;
2026 int oprsz = simd_oprsz(desc);
2027
2028 mask = int128_sub(int128_lshift(int128_one(), 63), int128_one());
2029
2030 for (i = 0; i < oprsz / 16; i++) {
2031 do_vssrlrni_q(Vd, Vj, imm, i, mask);
2032 }
2033 }
2034
2035 VSSRLRNI(vssrlrni_b_h, 16, B, H)
2036 VSSRLRNI(vssrlrni_h_w, 32, H, W)
2037 VSSRLRNI(vssrlrni_w_d, 64, W, D)
2038
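/* Arithmetic-shift counterpart of VSSRLRNI above. */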
2039 #define VSSRARNI(NAME, BIT, E1, E2) \
2040 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2041 { \
2042 int i, j, ofs; \
2043 VReg temp = {}; \
2044 VReg *Vd = (VReg *)vd; \
2045 VReg *Vj = (VReg *)vj; \
2046 int oprsz = simd_oprsz(desc); \
2047 \
2048 ofs = LSX_LEN / BIT; \
2049 for (i = 0; i < oprsz / 16; i++) { \
2050 for (j = 0; j < ofs; j++) { \
2051 temp.E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \
2052 imm, BIT / 2 - 1); \
2053 temp.E1(j + ofs * (2 * i + 1)) = do_ssrarns_ ## E1(Vd->E2(j + ofs * i), \
2054 imm, BIT / 2 - 1); \
2055 } \
2056 } \
2057 *Vd = temp; \
2058 }
2059
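/*
 * Signed 128-bit case: the caller passes mask1/mask2 so results are
 * clamped to the signed 64-bit range.
 */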
2060 static void do_vssrarni_d_q(VReg *Vd, VReg *Vj,
2061 uint64_t imm, int idx, Int128 mask1, Int128 mask2)
2062 {
2063 Int128 shft_res1, shft_res2, r1, r2;
2064
2065 if (imm == 0) {
2066 shft_res1 = Vj->Q(idx);
2067 shft_res2 = Vd->Q(idx);
2068 } else {
2069 r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one());
2070 r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one());
2071 shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1);
2072 shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2);
2073 }
2074 if (int128_gt(shft_res1, mask1)) {
2075 Vd->D(idx * 2) = int128_getlo(mask1);
2076 } else if (int128_lt(shft_res1, int128_neg(mask2))) {
2077 Vd->D(idx * 2) = int128_getlo(mask2);
2078 } else {
2079 Vd->D(idx * 2) = int128_getlo(shft_res1);
2080 }
2081
2082 if (int128_gt(shft_res2, mask1)) {
2083 Vd->D(idx * 2 + 1) = int128_getlo(mask1);
2084 } else if (int128_lt(shft_res2, int128_neg(mask2))) {
2085 Vd->D(idx * 2 + 1) = int128_getlo(mask2);
2086 } else {
2087 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
2088 }
2089 }
2090
2091 void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2092 {
2093 int i;
2094 Int128 mask1, mask2;
2095 VReg *Vd = (VReg *)vd;
2096 VReg *Vj = (VReg *)vj;
2097 int oprsz = simd_oprsz(desc);
2098
2099 mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one());
2100 mask2 = int128_lshift(int128_one(), 63);
2101
2102 for (i = 0; i < oprsz / 16; i++) {
2103 do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2);
2104 }
2105 }
2106
2107 VSSRARNI(vssrarni_b_h, 16, B, H)
2108 VSSRARNI(vssrarni_h_w, 32, H, W)
2109 VSSRARNI(vssrarni_w_d, 64, W, D)
2110
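/* Immediate-count variants with unsigned saturation to BIT/2 bits. */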
2111 #define VSSRLRNUI(NAME, BIT, E1, E2) \
2112 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2113 { \
2114 int i, j, ofs; \
2115 VReg temp = {}; \
2116 VReg *Vd = (VReg *)vd; \
2117 VReg *Vj = (VReg *)vj; \
2118 int oprsz = simd_oprsz(desc); \
2119 \
2120 ofs = LSX_LEN / BIT; \
2121 for (i = 0; i < oprsz / 16; i++) { \
2122 for (j = 0; j < ofs; j++) { \
2123 temp.E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \
2124 imm, BIT / 2); \
2125 temp.E1(j + ofs * (2 * i + 1)) = do_ssrlrnu_ ## E1(Vd->E2(j + ofs * i), \
2126 imm, BIT / 2); \
2127 } \
2128 } \
2129 *Vd = temp; \
2130 }
2131
2132 void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2133 {
2134 int i;
2135 Int128 mask;
2136 VReg *Vd = (VReg *)vd;
2137 VReg *Vj = (VReg *)vj;
2138 int oprsz = simd_oprsz(desc);
2139
2140 mask = int128_sub(int128_lshift(int128_one(), 64), int128_one());
2141
2142 for (i = 0; i < oprsz / 16; i++) {
2143 do_vssrlrni_q(Vd, Vj, imm, i, mask);
2144 }
2145 }
2146
2147 VSSRLRNUI(vssrlrni_bu_h, 16, B, H)
2148 VSSRLRNUI(vssrlrni_hu_w, 32, H, W)
2149 VSSRLRNUI(vssrlrni_wu_d, 64, W, D)
2150
2151 #define VSSRARNUI(NAME, BIT, E1, E2) \
2152 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2153 { \
2154 int i, j, ofs; \
2155 VReg temp = {}; \
2156 VReg *Vd = (VReg *)vd; \
2157 VReg *Vj = (VReg *)vj; \
2158 int oprsz = simd_oprsz(desc); \
2159 \
2160 ofs = LSX_LEN / BIT; \
2161 for (i = 0; i < oprsz / 16; i++) { \
2162 for (j = 0; j < ofs; j++) { \
2163 temp.E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \
2164 imm, BIT / 2); \
2165 temp.E1(j + ofs * (2 * i + 1)) = do_ssrarnu_ ## E1(Vd->E2(j + ofs * i), \
2166 imm, BIT / 2); \
2167 } \
2168 } \
2169 *Vd = temp; \
2170 }
2171
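/*
 * Unsigned 128-bit case: negative sources clamp to zero before the
 * result is saturated to 2^64 - 1.
 */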
2172 static void do_vssrarni_du_q(VReg *Vd, VReg *Vj,
2173 uint64_t imm, int idx, Int128 mask1, Int128 mask2)
2174 {
2175 Int128 shft_res1, shft_res2, r1, r2;
2176
2177 if (imm == 0) {
2178 shft_res1 = Vj->Q(idx);
2179 shft_res2 = Vd->Q(idx);
2180 } else {
2181 r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one());
2182 r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one());
2183 shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1);
2184 shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2);
2185 }
2186
2187 if (int128_lt(Vj->Q(idx), int128_zero())) {
2188 shft_res1 = int128_zero();
2189 }
2190 if (int128_lt(Vd->Q(idx), int128_zero())) {
2191 shft_res2 = int128_zero();
2192 }
2193
2194 if (int128_gt(shft_res1, mask1)) {
2195 Vd->D(idx * 2) = int128_getlo(mask1);
2196 } else if (int128_lt(shft_res1, int128_neg(mask2))) {
2197 Vd->D(idx * 2) = int128_getlo(mask2);
2198 } else {
2199 Vd->D(idx * 2) = int128_getlo(shft_res1);
2200 }
2201
2202 if (int128_gt(shft_res2, mask1)) {
2203 Vd->D(idx * 2 + 1) = int128_getlo(mask1);
2204 } else if (int128_lt(shft_res2, int128_neg(mask2))) {
2205 Vd->D(idx * 2 + 1) = int128_getlo(mask2);
2206 } else {
2207 Vd->D(idx * 2 + 1) = int128_getlo(shft_res2);
2208 }
2209 }
2210
2211 void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
2212 {
2213 int i;
2214 Int128 mask1, mask2;
2215 VReg *Vd = (VReg *)vd;
2216 VReg *Vj = (VReg *)vj;
2217 int oprsz = simd_oprsz(desc);
2218
2219 mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one());
2220 mask2 = int128_lshift(int128_one(), 64);
2221
2222 for (i = 0; i < oprsz / 16; i++) {
2223 do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2);
2224 }
2225 }
2226
2227 VSSRARNUI(vssrarni_bu_h, 16, B, H)
2228 VSSRARNUI(vssrarni_hu_w, 32, H, W)
2229 VSSRARNUI(vssrarni_wu_d, 64, W, D)
2230
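/* Element-wise unary ops, instantiated for count-leading-ones/zeros. */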
2231 #define DO_2OP(NAME, BIT, E, DO_OP) \
2232 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
2233 { \
2234 int i; \
2235 VReg *Vd = (VReg *)vd; \
2236 VReg *Vj = (VReg *)vj; \
2237 int oprsz = simd_oprsz(desc); \
2238 \
2239 for (i = 0; i < oprsz / (BIT / 8); i++) \
2240 { \
2241 Vd->E(i) = DO_OP(Vj->E(i)); \
2242 } \
2243 }
2244
2245 DO_2OP(vclo_b, 8, UB, DO_CLO_B)
2246 DO_2OP(vclo_h, 16, UH, DO_CLO_H)
2247 DO_2OP(vclo_w, 32, UW, DO_CLO_W)
2248 DO_2OP(vclo_d, 64, UD, DO_CLO_D)
2249 DO_2OP(vclz_b, 8, UB, DO_CLZ_B)
2250 DO_2OP(vclz_h, 16, UH, DO_CLZ_H)
2251 DO_2OP(vclz_w, 32, UW, DO_CLZ_W)
2252 DO_2OP(vclz_d, 64, UD, DO_CLZ_D)
2253
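/* Per-element population count via the generic ctpop helpers. */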
2254 #define VPCNT(NAME, BIT, E, FN) \
2255 void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \
2256 { \
2257 int i; \
2258 VReg *Vd = (VReg *)vd; \
2259 VReg *Vj = (VReg *)vj; \
2260 int oprsz = simd_oprsz(desc); \
2261 \
2262 for (i = 0; i < oprsz / (BIT / 8); i++) \
2263 { \
2264 Vd->E(i) = FN(Vj->E(i)); \
2265 } \
2266 }
2267
2268 VPCNT(vpcnt_b, 8, UB, ctpop8)
2269 VPCNT(vpcnt_h, 16, UH, ctpop16)
2270 VPCNT(vpcnt_w, 32, UW, ctpop32)
2271 VPCNT(vpcnt_d, 64, UD, ctpop64)
2272
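/*
 * Single-bit clear/set/flip: the bit index is the matching Vk element
 * taken modulo the element width.
 */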
2273 #define DO_BIT(NAME, BIT, E, DO_OP) \
2274 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2275 { \
2276 int i; \
2277 VReg *Vd = (VReg *)vd; \
2278 VReg *Vj = (VReg *)vj; \
2279 VReg *Vk = (VReg *)vk; \
2280 int oprsz = simd_oprsz(desc); \
2281 \
2282 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2283 Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i) % BIT); \
2284 } \
2285 }
2286
2287 DO_BIT(vbitclr_b, 8, UB, DO_BITCLR)
2288 DO_BIT(vbitclr_h, 16, UH, DO_BITCLR)
2289 DO_BIT(vbitclr_w, 32, UW, DO_BITCLR)
2290 DO_BIT(vbitclr_d, 64, UD, DO_BITCLR)
2291 DO_BIT(vbitset_b, 8, UB, DO_BITSET)
2292 DO_BIT(vbitset_h, 16, UH, DO_BITSET)
2293 DO_BIT(vbitset_w, 32, UW, DO_BITSET)
2294 DO_BIT(vbitset_d, 64, UD, DO_BITSET)
2295 DO_BIT(vbitrev_b, 8, UB, DO_BITREV)
2296 DO_BIT(vbitrev_h, 16, UH, DO_BITREV)
2297 DO_BIT(vbitrev_w, 32, UW, DO_BITREV)
2298 DO_BIT(vbitrev_d, 64, UD, DO_BITREV)
2299
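/* Immediate-index variants of the single-bit operations above. */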
2300 #define DO_BITI(NAME, BIT, E, DO_OP) \
2301 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2302 { \
2303 int i; \
2304 VReg *Vd = (VReg *)vd; \
2305 VReg *Vj = (VReg *)vj; \
2306 int oprsz = simd_oprsz(desc); \
2307 \
2308 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2309 Vd->E(i) = DO_OP(Vj->E(i), imm); \
2310 } \
2311 }
2312
2313 DO_BITI(vbitclri_b, 8, UB, DO_BITCLR)
2314 DO_BITI(vbitclri_h, 16, UH, DO_BITCLR)
2315 DO_BITI(vbitclri_w, 32, UW, DO_BITCLR)
2316 DO_BITI(vbitclri_d, 64, UD, DO_BITCLR)
2317 DO_BITI(vbitseti_b, 8, UB, DO_BITSET)
2318 DO_BITI(vbitseti_h, 16, UH, DO_BITSET)
2319 DO_BITI(vbitseti_w, 32, UW, DO_BITSET)
2320 DO_BITI(vbitseti_d, 64, UD, DO_BITSET)
2321 DO_BITI(vbitrevi_b, 8, UB, DO_BITREV)
2322 DO_BITI(vbitrevi_h, 16, UH, DO_BITREV)
2323 DO_BITI(vbitrevi_w, 32, UW, DO_BITREV)
2324 DO_BITI(vbitrevi_d, 64, UD, DO_BITREV)
2325
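/*
 * Store the index of the first negative element in each 128-bit lane of
 * Vj (or the lane's element count if none) into the Vd element selected
 * by Vk (masked) or by the immediate.
 */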
2326 #define VFRSTP(NAME, BIT, MASK, E) \
2327 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
2328 { \
2329 int i, j, m, ofs; \
2330 VReg *Vd = (VReg *)vd; \
2331 VReg *Vj = (VReg *)vj; \
2332 VReg *Vk = (VReg *)vk; \
2333 int oprsz = simd_oprsz(desc); \
2334 \
2335 ofs = LSX_LEN / BIT; \
2336 for (i = 0; i < oprsz / 16; i++) { \
2337 m = Vk->E(i * ofs) & MASK; \
2338 for (j = 0; j < ofs; j++) { \
2339 if (Vj->E(j + ofs * i) < 0) { \
2340 break; \
2341 } \
2342 } \
2343 Vd->E(m + i * ofs) = j; \
2344 } \
2345 }
2346
2347 VFRSTP(vfrstp_b, 8, 0xf, B)
2348 VFRSTP(vfrstp_h, 16, 0x7, H)
2349
2350 #define VFRSTPI(NAME, BIT, E) \
2351 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
2352 { \
2353 int i, j, m, ofs; \
2354 VReg *Vd = (VReg *)vd; \
2355 VReg *Vj = (VReg *)vj; \
2356 int oprsz = simd_oprsz(desc); \
2357 \
2358 ofs = LSX_LEN / BIT; \
2359 m = imm % ofs; \
2360 for (i = 0; i < oprsz / 16; i++) { \
2361 for (j = 0; j < ofs; j++) { \
2362 if (Vj->E(j + ofs * i) < 0) { \
2363 break; \
2364 } \
2365 } \
2366 Vd->E(m + i * ofs) = j; \
2367 } \
2368 }
2369
2370 VFRSTPI(vfrstpi_b, 8, B)
2371 VFRSTPI(vfrstpi_h, 16, H)
2372
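/*
 * Fold softfloat exception flags into FCSR0, ignoring the bits in
 * 'mask'; an enabled pending exception raises EXCCODE_FPE at 'pc'.
 */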
2373 static void vec_update_fcsr0_mask(CPULoongArchState *env,
2374 uintptr_t pc, int mask)
2375 {
2376 int flags = get_float_exception_flags(&env->fp_status);
2377
2378 set_float_exception_flags(0, &env->fp_status);
2379
2380 flags &= ~mask;
2381
2382 if (flags) {
2383 flags = ieee_ex_to_loongarch(flags);
2384 UPDATE_FP_CAUSE(env->fcsr0, flags);
2385 }
2386
2387 if (GET_FP_ENABLES(env->fcsr0) & flags) {
2388 do_raise_exception(env, EXCCODE_FPE, pc);
2389 } else {
2390 UPDATE_FP_FLAGS(env->fcsr0, flags);
2391 }
2392 }
2393
2394 static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc)
2395 {
2396 vec_update_fcsr0_mask(env, pc, 0);
2397 }
2398
2399 static inline void vec_clear_cause(CPULoongArchState *env)
2400 {
2401 SET_FP_CAUSE(env->fcsr0, 0);
2402 }
2403
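/* Element-wise binary float ops; FCSR0 is updated after every element. */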
2404 #define DO_3OP_F(NAME, BIT, E, FN) \
2405 void HELPER(NAME)(void *vd, void *vj, void *vk, \
2406 CPULoongArchState *env, uint32_t desc) \
2407 { \
2408 int i; \
2409 VReg *Vd = (VReg *)vd; \
2410 VReg *Vj = (VReg *)vj; \
2411 VReg *Vk = (VReg *)vk; \
2412 int oprsz = simd_oprsz(desc); \
2413 \
2414 vec_clear_cause(env); \
2415 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2416 Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
2417 vec_update_fcsr0(env, GETPC()); \
2418 } \
2419 }
2420
2421 DO_3OP_F(vfadd_s, 32, UW, float32_add)
2422 DO_3OP_F(vfadd_d, 64, UD, float64_add)
2423 DO_3OP_F(vfsub_s, 32, UW, float32_sub)
2424 DO_3OP_F(vfsub_d, 64, UD, float64_sub)
2425 DO_3OP_F(vfmul_s, 32, UW, float32_mul)
2426 DO_3OP_F(vfmul_d, 64, UD, float64_mul)
2427 DO_3OP_F(vfdiv_s, 32, UW, float32_div)
2428 DO_3OP_F(vfdiv_d, 64, UD, float64_div)
2429 DO_3OP_F(vfmax_s, 32, UW, float32_maxnum)
2430 DO_3OP_F(vfmax_d, 64, UD, float64_maxnum)
2431 DO_3OP_F(vfmin_s, 32, UW, float32_minnum)
2432 DO_3OP_F(vfmin_d, 64, UD, float64_minnum)
2433 DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag)
2434 DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag)
2435 DO_3OP_F(vfmina_s, 32, UW, float32_minnummag)
2436 DO_3OP_F(vfmina_d, 64, UD, float64_minnummag)
2437
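/* Fused multiply-add family; 'flags' selects the muladd negation variant. */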
2438 #define DO_4OP_F(NAME, BIT, E, FN, flags) \
2439 void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \
2440 CPULoongArchState *env, uint32_t desc) \
2441 { \
2442 int i; \
2443 VReg *Vd = (VReg *)vd; \
2444 VReg *Vj = (VReg *)vj; \
2445 VReg *Vk = (VReg *)vk; \
2446 VReg *Va = (VReg *)va; \
2447 int oprsz = simd_oprsz(desc); \
2448 \
2449 vec_clear_cause(env); \
2450 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2451 Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \
2452 vec_update_fcsr0(env, GETPC()); \
2453 } \
2454 }
2455
2456 DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0)
2457 DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0)
2458 DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c)
2459 DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c)
2460 DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result)
2461 DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result)
2462 DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd,
2463 float_muladd_negate_c | float_muladd_negate_result)
2464 DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd,
2465 float_muladd_negate_c | float_muladd_negate_result)
2466
2467 #define DO_2OP_F(NAME, BIT, E, FN) \
2468 void HELPER(NAME)(void *vd, void *vj, \
2469 CPULoongArchState *env, uint32_t desc) \
2470 { \
2471 int i; \
2472 VReg *Vd = (VReg *)vd; \
2473 VReg *Vj = (VReg *)vj; \
2474 int oprsz = simd_oprsz(desc); \
2475 \
2476 vec_clear_cause(env); \
2477 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2478 Vd->E(i) = FN(env, Vj->E(i)); \
2479 } \
2480 }
2481
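/*
 * flogb: log2 computed and truncated toward minus infinity, with the
 * inexact flag masked out when updating FCSR0.
 */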
2482 #define FLOGB(BIT, T) \
2483 static T do_flogb_## BIT(CPULoongArchState *env, T fj) \
2484 { \
2485 T fp, fd; \
2486 float_status *status = &env->fp_status; \
2487 FloatRoundMode old_mode = get_float_rounding_mode(status); \
2488 \
2489 set_float_rounding_mode(float_round_down, status); \
2490 fp = float ## BIT ##_log2(fj, status); \
2491 fd = float ## BIT ##_round_to_int(fp, status); \
2492 set_float_rounding_mode(old_mode, status); \
2493 vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \
2494 return fd; \
2495 }
2496
2497 FLOGB(32, uint32_t)
2498 FLOGB(64, uint64_t)
2499
2500 #define FCLASS(NAME, BIT, E, FN) \
2501 void HELPER(NAME)(void *vd, void *vj, \
2502 CPULoongArchState *env, uint32_t desc) \
2503 { \
2504 int i; \
2505 VReg *Vd = (VReg *)vd; \
2506 VReg *Vj = (VReg *)vj; \
2507 int oprsz = simd_oprsz(desc); \
2508 \
2509 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2510 Vd->E(i) = FN(env, Vj->E(i)); \
2511 } \
2512 }
2513
2514 FCLASS(vfclass_s, 32, UW, helper_fclass_s)
2515 FCLASS(vfclass_d, 64, UD, helper_fclass_d)
2516
2517 #define FSQRT(BIT, T) \
2518 static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \
2519 { \
2520 T fd; \
2521 fd = float ## BIT ##_sqrt(fj, &env->fp_status); \
2522 vec_update_fcsr0(env, GETPC()); \
2523 return fd; \
2524 }
2525
2526 FSQRT(32, uint32_t)
2527 FSQRT(64, uint64_t)
2528
2529 #define FRECIP(BIT, T) \
2530 static T do_frecip_## BIT(CPULoongArchState *env, T fj) \
2531 { \
2532 T fd; \
2533 fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \
2534 vec_update_fcsr0(env, GETPC()); \
2535 return fd; \
2536 }
2537
2538 FRECIP(32, uint32_t)
2539 FRECIP(64, uint64_t)
2540
2541 #define FRSQRT(BIT, T) \
2542 static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \
2543 { \
2544 T fd, fp; \
2545 fp = float ## BIT ##_sqrt(fj, &env->fp_status); \
2546 fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \
2547 vec_update_fcsr0(env, GETPC()); \
2548 return fd; \
2549 }
2550
2551 FRSQRT(32, uint32_t)
2552 FRSQRT(64, uint64_t)
2553
2554 DO_2OP_F(vflogb_s, 32, UW, do_flogb_32)
2555 DO_2OP_F(vflogb_d, 64, UD, do_flogb_64)
2556 DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32)
2557 DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64)
2558 DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32)
2559 DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64)
2560 DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32)
2561 DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64)
2562
2563 static uint32_t float16_cvt_float32(uint16_t h, float_status *status)
2564 {
2565 return float16_to_float32(h, true, status);
2566 }
2567 static uint64_t float32_cvt_float64(uint32_t s, float_status *status)
2568 {
2569 return float32_to_float64(s, status);
2570 }
2571
2572 static uint16_t float32_cvt_float16(uint32_t s, float_status *status)
2573 {
2574 return float32_to_float16(s, true, status);
2575 }
2576 static uint32_t float64_cvt_float32(uint64_t d, float_status *status)
2577 {
2578 return float64_to_float32(d, status);
2579 }
2580
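/*
 * Widening FP conversions: the *l helpers convert the low half of each
 * 128-bit source lane, the *h helpers the high half.
 */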
2581 void HELPER(vfcvtl_s_h)(void *vd, void *vj,
2582 CPULoongArchState *env, uint32_t desc)
2583 {
2584 int i, j, ofs;
2585 VReg temp = {};
2586 VReg *Vd = (VReg *)vd;
2587 VReg *Vj = (VReg *)vj;
2588 int oprsz = simd_oprsz(desc);
2589
2590 ofs = LSX_LEN / 32;
2591 vec_clear_cause(env);
2592 for (i = 0; i < oprsz / 16; i++) {
2593 for (j = 0; j < ofs; j++) {
2594 temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * 2 * i),
2595 &env->fp_status);
2596 }
2597 vec_update_fcsr0(env, GETPC());
2598 }
2599 *Vd = temp;
2600 }
2601
2602 void HELPER(vfcvtl_d_s)(void *vd, void *vj,
2603 CPULoongArchState *env, uint32_t desc)
2604 {
2605 int i, j, ofs;
2606 VReg temp = {};
2607 VReg *Vd = (VReg *)vd;
2608 VReg *Vj = (VReg *)vj;
2609 int oprsz = simd_oprsz(desc);
2610
2611 ofs = LSX_LEN / 64;
2612 vec_clear_cause(env);
2613 for (i = 0; i < oprsz / 16; i++) {
2614 for (j = 0; j < ofs; j++) {
2615 temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i),
2616 &env->fp_status);
2617 }
2618 vec_update_fcsr0(env, GETPC());
2619 }
2620 *Vd = temp;
2621 }
2622
2623 void HELPER(vfcvth_s_h)(void *vd, void *vj,
2624 CPULoongArchState *env, uint32_t desc)
2625 {
2626 int i, j, ofs;
2627 VReg temp = {};
2628 VReg *Vd = (VReg *)vd;
2629 VReg *Vj = (VReg *)vj;
2630 int oprsz = simd_oprsz(desc);
2631
2632 ofs = LSX_LEN / 32;
2633 vec_clear_cause(env);
2634 for (i = 0; i < oprsz / 16; i++) {
2635 for (j = 0; j < ofs; j++) {
2636 temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)),
2637 &env->fp_status);
2638 }
2639 vec_update_fcsr0(env, GETPC());
2640 }
2641 *Vd = temp;
2642 }
2643
2644 void HELPER(vfcvth_d_s)(void *vd, void *vj,
2645 CPULoongArchState *env, uint32_t desc)
2646 {
2647 int i, j, ofs;
2648 VReg temp = {};
2649 VReg *Vd = (VReg *)vd;
2650 VReg *Vj = (VReg *)vj;
2651 int oprsz = simd_oprsz(desc);
2652
2653 ofs = LSX_LEN / 64;
2654 vec_clear_cause(env);
2655 for (i = 0; i < oprsz / 16; i++) {
2656 for (j = 0; j < ofs; j++) {
2657 temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)),
2658 &env->fp_status);
2659 }
2660 vec_update_fcsr0(env, GETPC());
2661 }
2662 *Vd = temp;
2663 }
2664
2665 void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk,
2666 CPULoongArchState *env, uint32_t desc)
2667 {
2668 int i, j, ofs;
2669 VReg temp = {};
2670 VReg *Vd = (VReg *)vd;
2671 VReg *Vj = (VReg *)vj;
2672 VReg *Vk = (VReg *)vk;
2673 int oprsz = simd_oprsz(desc);
2674
2675 ofs = LSX_LEN / 32;
2676 vec_clear_cause(env);
2677 for (i = 0; i < oprsz / 16; i++) {
2678 for (j = 0; j < ofs; j++) {
2679 temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i),
2680 &env->fp_status);
2681 temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i),
2682 &env->fp_status);
2683 }
2684 vec_update_fcsr0(env, GETPC());
2685 }
2686 *Vd = temp;
2687 }
2688
2689 void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk,
2690 CPULoongArchState *env, uint32_t desc)
2691 {
2692 int i, j, ofs;
2693 VReg temp = {};
2694 VReg *Vd = (VReg *)vd;
2695 VReg *Vj = (VReg *)vj;
2696 VReg *Vk = (VReg *)vk;
2697 int oprsz = simd_oprsz(desc);
2698
2699 ofs = LSX_LEN / 64;
2700 vec_clear_cause(env);
2701 for (i = 0; i < oprsz / 16; i++) {
2702 for (j = 0; j < ofs; j++) {
2703 temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i),
2704 &env->fp_status);
2705 temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i),
2706 &env->fp_status);
2707 }
2708 vec_update_fcsr0(env, GETPC());
2709 }
2710 *Vd = temp;
2711 }
2712
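/*
 * vfrint uses the current rounding mode; the FCVT_2OP variants below
 * temporarily install an explicit mode.
 */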
2713 void HELPER(vfrint_s)(void *vd, void *vj,
2714 CPULoongArchState *env, uint32_t desc)
2715 {
2716 int i;
2717 VReg *Vd = (VReg *)vd;
2718 VReg *Vj = (VReg *)vj;
2719 int oprsz = simd_oprsz(desc);
2720
2721 vec_clear_cause(env);
2722 for (i = 0; i < oprsz / 4; i++) {
2723 Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status);
2724 vec_update_fcsr0(env, GETPC());
2725 }
2726 }
2727
2728 void HELPER(vfrint_d)(void *vd, void *vj,
2729 CPULoongArchState *env, uint32_t desc)
2730 {
2731 int i;
2732 VReg *Vd = (VReg *)vd;
2733 VReg *Vj = (VReg *)vj;
2734 int oprsz = simd_oprsz(desc);
2735
2736 vec_clear_cause(env);
2737 for (i = 0; i < oprsz / 8; i++) {
2738 Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status);
2739 vec_update_fcsr0(env, GETPC());
2740 }
2741 }
2742
2743 #define FCVT_2OP(NAME, BIT, E, MODE) \
2744 void HELPER(NAME)(void *vd, void *vj, \
2745 CPULoongArchState *env, uint32_t desc) \
2746 { \
2747 int i; \
2748 VReg *Vd = (VReg *)vd; \
2749 VReg *Vj = (VReg *)vj; \
2750 int oprsz = simd_oprsz(desc); \
2751 \
2752 vec_clear_cause(env); \
2753 for (i = 0; i < oprsz / (BIT / 8); i++) { \
2754 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2755 set_float_rounding_mode(MODE, &env->fp_status); \
2756 Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \
2757 set_float_rounding_mode(old_mode, &env->fp_status); \
2758 vec_update_fcsr0(env, GETPC()); \
2759 } \
2760 }
2761
2762 FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even)
2763 FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even)
2764 FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero)
2765 FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero)
2766 FCVT_2OP(vfrintrp_s, 32, UW, float_round_up)
2767 FCVT_2OP(vfrintrp_d, 64, UD, float_round_up)
2768 FCVT_2OP(vfrintrm_s, 32, UW, float_round_down)
2769 FCVT_2OP(vfrintrm_d, 64, UD, float_round_down)
2770
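/*
 * Float-to-integer conversion under an explicit rounding mode; DO_FTINT
 * forces the result to 0 when the source is a NaN (invalid raised).
 */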
2771 #define FTINT(NAME, FMT1, FMT2, T1, T2, MODE) \
2772 static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj) \
2773 { \
2774 T2 fd; \
2775 FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \
2776 \
2777 set_float_rounding_mode(MODE, &env->fp_status); \
2778 fd = do_## FMT1 ##_to_## FMT2(env, fj); \
2779 set_float_rounding_mode(old_mode, &env->fp_status); \
2780 return fd; \
2781 }
2782
2783 #define DO_FTINT(FMT1, FMT2, T1, T2) \
2784 static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \
2785 { \
2786 T2 fd; \
2787 \
2788 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \
2789 if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \
2790 if (FMT1 ##_is_any_nan(fj)) { \
2791 fd = 0; \
2792 } \
2793 } \
2794 vec_update_fcsr0(env, GETPC()); \
2795 return fd; \
2796 }
2797
2798 DO_FTINT(float32, int32, uint32_t, uint32_t)
2799 DO_FTINT(float64, int64, uint64_t, uint64_t)
2800 DO_FTINT(float32, uint32, uint32_t, uint32_t)
2801 DO_FTINT(float64, uint64, uint64_t, uint64_t)
2802 DO_FTINT(float64, int32, uint64_t, uint32_t)
2803 DO_FTINT(float32, int64, uint32_t, uint64_t)
2804
2805 FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even)
2806 FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even)
2807 FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up)
2808 FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up)
2809 FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero)
2810 FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero)
2811 FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down)
2812 FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down)
2813
2814 DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s)
2815 DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d)
2816 DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s)
2817 DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d)
2818 DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s)
2819 DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d)
2820 DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s)
2821 DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d)
2822 DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32)
2823 DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64)
2824
2825 FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero)
2826 FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero)
2827
2828 DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s)
2829 DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d)
2830 DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32)
2831 DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64)
2832
2833 FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down)
2834 FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up)
2835 FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero)
2836 FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even)
2837
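/*
 * Narrowing double->word conversions: Vk results fill the low half of
 * each destination lane, Vj results the high half.
 */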
2838 #define FTINT_W_D(NAME, FN) \
2839 void HELPER(NAME)(void *vd, void *vj, void *vk, \
2840 CPULoongArchState *env, uint32_t desc) \
2841 { \
2842 int i, j, ofs; \
2843 VReg temp = {}; \
2844 VReg *Vd = (VReg *)vd; \
2845 VReg *Vj = (VReg *)vj; \
2846 VReg *Vk = (VReg *)vk; \
2847 int oprsz = simd_oprsz(desc); \
2848 \
2849 ofs = LSX_LEN / 64; \
2850 vec_clear_cause(env); \
2851 for (i = 0; i < oprsz / 16; i++) { \
2852 for (j = 0; j < ofs; j++) { \
2853 temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \
2854 temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i)); \
2855 } \
2856 } \
2857 *Vd = temp; \
2858 }
2859
2860 FTINT_W_D(vftint_w_d, do_float64_to_int32)
2861 FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d)
2862 FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d)
2863 FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d)
2864 FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d)
2865
2866 FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2867 FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2868 FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2869 FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2870 FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down)
2871 FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up)
2872 FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero)
2873 FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even)
2874
2875 #define FTINTL_L_S(NAME, FN) \
2876 void HELPER(NAME)(void *vd, void *vj, \
2877 CPULoongArchState *env, uint32_t desc) \
2878 { \
2879 int i, j, ofs; \
2880 VReg temp = {}; \
2881 VReg *Vd = (VReg *)vd; \
2882 VReg *Vj = (VReg *)vj; \
2883 int oprsz = simd_oprsz(desc); \
2884 \
2885 ofs = LSX_LEN / 64; \
2886 vec_clear_cause(env); \
2887 for (i = 0; i < oprsz / 16; i++) { \
2888 for (j = 0; j < ofs; j++) { \
2889 temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \
2890 } \
2891 } \
2892 *Vd = temp; \
2893 }
2894
2895 FTINTL_L_S(vftintl_l_s, do_float32_to_int64)
2896 FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s)
2897 FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s)
2898 FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s)
2899 FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s)
2900
2901 #define FTINTH_L_S(NAME, FN) \
2902 void HELPER(NAME)(void *vd, void *vj, \
2903 CPULoongArchState *env, uint32_t desc) \
2904 { \
2905 int i, j, ofs; \
2906 VReg temp = {}; \
2907 VReg *Vd = (VReg *)vd; \
2908 VReg *Vj = (VReg *)vj; \
2909 int oprsz = simd_oprsz(desc); \
2910 \
2911 ofs = LSX_LEN / 64; \
2912 vec_clear_cause(env); \
2913 for (i = 0; i < oprsz / 16; i++) { \
2914 for (j = 0; j < ofs; j++) { \
2915 temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \
2916 } \
2917 } \
2918 *Vd = temp; \
2919 }
2920
2921 FTINTH_L_S(vftinth_l_s, do_float32_to_int64)
2922 FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s)
2923 FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s)
2924 FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s)
2925 FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s)
2926
2927 #define FFINT(NAME, FMT1, FMT2, T1, T2) \
2928 static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \
2929 { \
2930 T2 fd; \
2931 \
2932 fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \
2933 vec_update_fcsr0(env, GETPC()); \
2934 return fd; \
2935 }
2936
2937 FFINT(s_w, int32, float32, int32_t, uint32_t)
2938 FFINT(d_l, int64, float64, int64_t, uint64_t)
2939 FFINT(s_wu, uint32, float32, uint32_t, uint32_t)
2940 FFINT(d_lu, uint64, float64, uint64_t, uint64_t)
2941
2942 DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w)
2943 DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l)
2944 DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu)
2945 DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu)
2946
2947 void HELPER(vffintl_d_w)(void *vd, void *vj,
2948 CPULoongArchState *env, uint32_t desc)
2949 {
2950 int i, j, ofs;
2951 VReg temp = {};
2952 VReg *Vd = (VReg *)vd;
2953 VReg *Vj = (VReg *)vj;
2954 int oprsz = simd_oprsz(desc);
2955
2956 ofs = LSX_LEN / 64;
2957 vec_clear_cause(env);
2958 for (i = 0; i < oprsz / 16; i++) {
2959 for (j = 0; j < ofs; j++) {
2960 temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i),
2961 &env->fp_status);
2962 }
2963 vec_update_fcsr0(env, GETPC());
2964 }
2965 *Vd = temp;
2966 }
2967
2968 void HELPER(vffinth_d_w)(void *vd, void *vj,
2969 CPULoongArchState *env, uint32_t desc)
2970 {
2971 int i, j, ofs;
2972 VReg temp = {};
2973 VReg *Vd = (VReg *)vd;
2974 VReg *Vj = (VReg *)vj;
2975 int oprsz = simd_oprsz(desc);
2976
2977 ofs = LSX_LEN / 64;
2978 vec_clear_cause(env);
2979 for (i = 0; i < oprsz / 16; i++) {
2980 for (j = 0; j < ofs; j++) {
2981 temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)),
2982 &env->fp_status);
2983 }
2984 vec_update_fcsr0(env, GETPC());
2985 }
2986 *Vd = temp;
2987 }
2988
2989 void HELPER(vffint_s_l)(void *vd, void *vj, void *vk,
2990 CPULoongArchState *env, uint32_t desc)
2991 {
2992 int i, j, ofs;
2993 VReg temp = {};
2994 VReg *Vd = (VReg *)vd;
2995 VReg *Vj = (VReg *)vj;
2996 VReg *Vk = (VReg *)vk;
2997 int oprsz = simd_oprsz(desc);
2998
2999 ofs = LSX_LEN / 64;
3000 vec_clear_cause(env);
3001 for (i = 0; i < oprsz / 16; i++) {
3002 for (j = 0; j < ofs; j++) {
3003 temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i),
3004 &env->fp_status);
3005 temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i),
3006 &env->fp_status);
3007 }
3008 vec_update_fcsr0(env, GETPC());
3009 }
3010 *Vd = temp;
3011 }
3012
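/*
 * Per-element compare against the (element-width-cast) immediate; DO_OP
 * (VSEQ/VSLE/VSLT) yields all-ones when the relation holds, else zero.
 */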
3013 #define VCMPI(NAME, BIT, E, DO_OP) \
3014 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3015 { \
3016 int i; \
3017 VReg *Vd = (VReg *)vd; \
3018 VReg *Vj = (VReg *)vj; \
3019 typedef __typeof(Vd->E(0)) TD; \
3020 int oprsz = simd_oprsz(desc); \
3021 \
3022 for (i = 0; i < oprsz / (BIT / 8); i++) { \
3023 Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \
3024 } \
3025 }
3026
3027 VCMPI(vseqi_b, 8, B, VSEQ)
3028 VCMPI(vseqi_h, 16, H, VSEQ)
3029 VCMPI(vseqi_w, 32, W, VSEQ)
3030 VCMPI(vseqi_d, 64, D, VSEQ)
3031 VCMPI(vslei_b, 8, B, VSLE)
3032 VCMPI(vslei_h, 16, H, VSLE)
3033 VCMPI(vslei_w, 32, W, VSLE)
3034 VCMPI(vslei_d, 64, D, VSLE)
3035 VCMPI(vslei_bu, 8, UB, VSLE)
3036 VCMPI(vslei_hu, 16, UH, VSLE)
3037 VCMPI(vslei_wu, 32, UW, VSLE)
3038 VCMPI(vslei_du, 64, UD, VSLE)
3039 VCMPI(vslti_b, 8, B, VSLT)
3040 VCMPI(vslti_h, 16, H, VSLT)
3041 VCMPI(vslti_w, 32, W, VSLT)
3042 VCMPI(vslti_d, 64, D, VSLT)
3043 VCMPI(vslti_bu, 8, UB, VSLT)
3044 VCMPI(vslti_hu, 16, UH, VSLT)
3045 VCMPI(vslti_wu, 32, UW, VSLT)
3046 VCMPI(vslti_du, 64, UD, VSLT)
3047
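/*
 * Translate a softfloat comparison result into an all-ones/all-zeros
 * element according to the condition bits requested in 'flags'.
 */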
3048 static uint64_t vfcmp_common(CPULoongArchState *env,
3049 FloatRelation cmp, uint32_t flags)
3050 {
3051 uint64_t ret = 0;
3052
3053 switch (cmp) {
3054 case float_relation_less:
3055 ret = (flags & FCMP_LT);
3056 break;
3057 case float_relation_equal:
3058 ret = (flags & FCMP_EQ);
3059 break;
3060 case float_relation_greater:
3061 ret = (flags & FCMP_GT);
3062 break;
3063 case float_relation_unordered:
3064 ret = (flags & FCMP_UN);
3065 break;
3066 default:
3067 g_assert_not_reached();
3068 }
3069
3070 if (ret) {
3071 ret = -1;
3072 }
3073
3074 return ret;
3075 }
3076
3077 #define VFCMP(NAME, BIT, E, FN) \
3078 void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \
3079 uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \
3080 { \
3081 int i; \
3082 VReg t; \
3083 VReg *Vd = &(env->fpr[vd].vreg); \
3084 VReg *Vj = &(env->fpr[vj].vreg); \
3085 VReg *Vk = &(env->fpr[vk].vreg); \
3086 \
3087 vec_clear_cause(env); \
3088 for (i = 0; i < oprsz / (BIT / 8); i++) { \
3089 FloatRelation cmp; \
3090 cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \
3091 t.E(i) = vfcmp_common(env, cmp, flags); \
3092 vec_update_fcsr0(env, GETPC()); \
3093 } \
3094 *Vd = t; \
3095 }
3096
3097 VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet)
3098 VFCMP(vfcmp_s_s, 32, UW, float32_compare)
3099 VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet)
3100 VFCMP(vfcmp_s_d, 64, UD, float64_compare)
3101
3102 void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3103 {
3104 int i;
3105 VReg *Vd = (VReg *)vd;
3106 VReg *Vj = (VReg *)vj;
3107
3108 for (i = 0; i < simd_oprsz(desc); i++) {
3109 Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm);
3110 }
3111 }
3112
3113 /* Copy from target/arm/tcg/sve_helper.c */
3114 static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
3115 {
3116 int bits = 8 << esz;
3117 uint64_t ones = dup_const(esz, 1);
3118 uint64_t signs = ones << (bits - 1);
3119 uint64_t cmp0, cmp1;
3120
3121 cmp1 = dup_const(esz, n);
3122 cmp0 = cmp1 ^ m0;
3123 cmp1 = cmp1 ^ m1;
3124 cmp0 = (cmp0 - ones) & ~cmp0;
3125 cmp1 = (cmp1 - ones) & ~cmp1;
3126 return (cmp0 | cmp1) & signs;
3127 }
3128
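/*
 * vsetanyeqz: set the condition flag if any element is zero;
 * vsetallnez is the complement.
 */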
3129 #define SETANYEQZ(NAME, MO) \
3130 void HELPER(NAME)(CPULoongArchState *env, \
3131 uint32_t oprsz, uint32_t cd, uint32_t vj) \
3132 { \
3133 VReg *Vj = &(env->fpr[vj].vreg); \
3134 \
3135 env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \
3136 if (oprsz == 32) { \
3137 env->cf[cd & 0x7] = env->cf[cd & 0x7] || \
3138 do_match2(0, Vj->D(2), Vj->D(3), MO); \
3139 } \
3140 }
3141
3142 SETANYEQZ(vsetanyeqz_b, MO_8)
3143 SETANYEQZ(vsetanyeqz_h, MO_16)
3144 SETANYEQZ(vsetanyeqz_w, MO_32)
3145 SETANYEQZ(vsetanyeqz_d, MO_64)
3146
3147 #define SETALLNEZ(NAME, MO) \
3148 void HELPER(NAME)(CPULoongArchState *env, \
3149 uint32_t oprsz, uint32_t cd, uint32_t vj) \
3150 { \
3151 VReg *Vj = &(env->fpr[vj].vreg); \
3152 \
3153 env->cf[cd & 0x7] = !do_match2(0, Vj->D(0), Vj->D(1), MO); \
3154 if (oprsz == 32) { \
3155 env->cf[cd & 0x7] = env->cf[cd & 0x7] && \
3156 !do_match2(0, Vj->D(2), Vj->D(3), MO); \
3157 } \
3158 }
3159
3160 SETALLNEZ(vsetallnez_b, MO_8)
3161 SETALLNEZ(vsetallnez_h, MO_16)
3162 SETALLNEZ(vsetallnez_w, MO_32)
3163 SETALLNEZ(vsetallnez_d, MO_64)
3164
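/*
 * xvinsve0 inserts element 0 of Vj at the immediate-selected position;
 * xvpickve extracts the selected Vj element and clears the rest of Vd.
 */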
3165 #define XVINSVE0(NAME, E, MASK) \
3166 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3167 { \
3168 VReg *Vd = (VReg *)vd; \
3169 VReg *Vj = (VReg *)vj; \
3170 Vd->E(imm & MASK) = Vj->E(0); \
3171 }
3172
3173 XVINSVE0(xvinsve0_w, W, 0x7)
3174 XVINSVE0(xvinsve0_d, D, 0x3)
3175
3176 #define XVPICKVE(NAME, E, BIT, MASK) \
3177 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3178 { \
3179 int i; \
3180 VReg *Vd = (VReg *)vd; \
3181 VReg *Vj = (VReg *)vj; \
3182 int oprsz = simd_oprsz(desc); \
3183 \
3184 Vd->E(0) = Vj->E(imm & MASK); \
3185 for (i = 1; i < oprsz / (BIT / 8); i++) { \
3186 Vd->E(i) = 0; \
3187 } \
3188 }
3189
3190 XVPICKVE(xvpickve_w, W, 32, 0x7)
3191 XVPICKVE(xvpickve_d, D, 64, 0x3)
3192
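/*
 * Pack even/odd: interleave the even- (VPACKEV) or odd-indexed
 * (VPACKOD) elements, Vk filling the even destination slots and Vj the
 * odd ones.
 */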
3193 #define VPACKEV(NAME, BIT, E) \
3194 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3195 { \
3196 int i; \
3197 VReg temp = {}; \
3198 VReg *Vd = (VReg *)vd; \
3199 VReg *Vj = (VReg *)vj; \
3200 VReg *Vk = (VReg *)vk; \
3201 int oprsz = simd_oprsz(desc); \
3202 \
3203 for (i = 0; i < oprsz / (BIT / 8); i++) { \
3204 temp.E(2 * i + 1) = Vj->E(2 * i); \
3205 temp.E(2 * i) = Vk->E(2 * i); \
3206 } \
3207 *Vd = temp; \
3208 }
3209
3210 VPACKEV(vpackev_b, 16, B)
3211 VPACKEV(vpackev_h, 32, H)
3212 VPACKEV(vpackev_w, 64, W)
3213 VPACKEV(vpackev_d, 128, D)
3214
3215 #define VPACKOD(NAME, BIT, E) \
3216 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3217 { \
3218 int i; \
3219 VReg temp = {}; \
3220 VReg *Vd = (VReg *)vd; \
3221 VReg *Vj = (VReg *)vj; \
3222 VReg *Vk = (VReg *)vk; \
3223 int oprsz = simd_oprsz(desc); \
3224 \
3225 for (i = 0; i < oprsz / (BIT / 8); i++) { \
3226 temp.E(2 * i + 1) = Vj->E(2 * i + 1); \
3227 temp.E(2 * i) = Vk->E(2 * i + 1); \
3228 } \
3229 *Vd = temp; \
3230 }
3231
3232 VPACKOD(vpackod_b, 16, B)
3233 VPACKOD(vpackod_h, 32, H)
3234 VPACKOD(vpackod_w, 64, W)
3235 VPACKOD(vpackod_d, 128, D)
3236
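/*
 * Pick even/odd: gather the even- (or odd-) indexed elements of Vk into
 * the low half of each lane and those of Vj into the high half.
 */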
3237 #define VPICKEV(NAME, BIT, E) \
3238 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3239 { \
3240 int i, j, ofs; \
3241 VReg temp = {}; \
3242 VReg *Vd = (VReg *)vd; \
3243 VReg *Vj = (VReg *)vj; \
3244 VReg *Vk = (VReg *)vk; \
3245 int oprsz = simd_oprsz(desc); \
3246 \
3247 ofs = LSX_LEN / BIT; \
3248 for (i = 0; i < oprsz / 16; i++) { \
3249 for (j = 0; j < ofs; j++) { \
3250 temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \
3251 temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i)); \
3252 } \
3253 } \
3254 *Vd = temp; \
3255 }
3256
3257 VPICKEV(vpickev_b, 16, B)
3258 VPICKEV(vpickev_h, 32, H)
3259 VPICKEV(vpickev_w, 64, W)
3260 VPICKEV(vpickev_d, 128, D)
3261
3262 #define VPICKOD(NAME, BIT, E) \
3263 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3264 { \
3265 int i, j, ofs; \
3266 VReg temp = {}; \
3267 VReg *Vd = (VReg *)vd; \
3268 VReg *Vj = (VReg *)vj; \
3269 VReg *Vk = (VReg *)vk; \
3270 int oprsz = simd_oprsz(desc); \
3271 \
3272 ofs = LSX_LEN / BIT; \
3273 for (i = 0; i < oprsz / 16; i++) { \
3274 for (j = 0; j < ofs; j++) { \
3275 temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \
3276 temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1); \
3277 } \
3278 } \
3279 *Vd = temp; \
3280 }
3281
3282 VPICKOD(vpickod_b, 16, B)
3283 VPICKOD(vpickod_h, 32, H)
3284 VPICKOD(vpickod_w, 64, W)
3285 VPICKOD(vpickod_d, 128, D)
3286
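/*
 * Interleave low/high: zip the low (VILVL) or high (VILVH) halves of Vk
 * and Vj, with Vk again supplying the even destination elements.
 */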
3287 #define VILVL(NAME, BIT, E) \
3288 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3289 { \
3290 int i, j, ofs; \
3291 VReg temp = {}; \
3292 VReg *Vd = (VReg *)vd; \
3293 VReg *Vj = (VReg *)vj; \
3294 VReg *Vk = (VReg *)vk; \
3295 int oprsz = simd_oprsz(desc); \
3296 \
3297 ofs = LSX_LEN / BIT; \
3298 for (i = 0; i < oprsz / 16; i++) { \
3299 for (j = 0; j < ofs; j++) { \
3300 temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \
3301 temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i); \
3302 } \
3303 } \
3304 *Vd = temp; \
3305 }
3306
3307 VILVL(vilvl_b, 16, B)
3308 VILVL(vilvl_h, 32, H)
3309 VILVL(vilvl_w, 64, W)
3310 VILVL(vilvl_d, 128, D)
3311
3312 #define VILVH(NAME, BIT, E) \
3313 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3314 { \
3315 int i, j, ofs; \
3316 VReg temp = {}; \
3317 VReg *Vd = (VReg *)vd; \
3318 VReg *Vj = (VReg *)vj; \
3319 VReg *Vk = (VReg *)vk; \
3320 int oprsz = simd_oprsz(desc); \
3321 \
3322 ofs = LSX_LEN / BIT; \
3323 for (i = 0; i < oprsz / 16; i++) { \
3324 for (j = 0; j < ofs; j++) { \
3325 temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \
3326 temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1)); \
3327 } \
3328 } \
3329 *Vd = temp; \
3330 }
3331
3332 VILVH(vilvh_b, 16, B)
3333 VILVH(vilvh_h, 32, H)
3334 VILVH(vilvh_w, 64, W)
3335 VILVH(vilvh_d, 128, D)
3336
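/*
 * Byte shuffle per 128-bit lane: each Va byte, modulo 2 * m, selects a
 * byte from Vk (indices 0..m-1) or Vj (indices m..2m-1); VSHUF below
 * does the same with the indices taken from Vd.
 */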
3337 void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
3338 {
3339 int i, j, m;
3340 VReg temp = {};
3341 VReg *Vd = (VReg *)vd;
3342 VReg *Vj = (VReg *)vj;
3343 VReg *Vk = (VReg *)vk;
3344 VReg *Va = (VReg *)va;
3345 int oprsz = simd_oprsz(desc);
3346
3347 m = LSX_LEN / 8;
3348 for (i = 0; i < (oprsz / 16) * m; i++) {
3349 j = i < m ? 0 : 1;
3350 uint64_t k = (uint8_t)Va->B(i) % (2 * m);
3351 temp.B(i) = k < m ? Vk->B(k + j * m) : Vj->B(k + (j - 1) * m);
3352 }
3353 *Vd = temp;
3354 }
3355
3356 #define VSHUF(NAME, BIT, E) \
3357 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
3358 { \
3359 int i, j, m; \
3360 VReg temp = {}; \
3361 VReg *Vd = (VReg *)vd; \
3362 VReg *Vj = (VReg *)vj; \
3363 VReg *Vk = (VReg *)vk; \
3364 int oprsz = simd_oprsz(desc); \
3365 \
3366 m = LSX_LEN / BIT; \
3367 for (i = 0; i < (oprsz / 16) * m; i++) { \
3368 j = i < m ? 0 : 1; \
3369 uint64_t k = ((uint8_t)Vd->E(i)) % (2 * m); \
3370 temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \
3371 } \
3372 *Vd = temp; \
3373 }
3374
3375 VSHUF(vshuf_h, 16, H)
3376 VSHUF(vshuf_w, 32, W)
3377 VSHUF(vshuf_d, 64, D)
3378
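/*
 * vshuf4i: permute each group of four elements by the 2-bit fields of
 * the immediate (SHF_POS); vshuf4i_d handles the 2-element case below.
 */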
3379 #define VSHUF4I(NAME, BIT, E) \
3380 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3381 { \
3382 int i, j, max; \
3383 VReg temp = {}; \
3384 VReg *Vd = (VReg *)vd; \
3385 VReg *Vj = (VReg *)vj; \
3386 int oprsz = simd_oprsz(desc); \
3387 \
3388 max = LSX_LEN / BIT; \
3389 for (i = 0; i < oprsz / (BIT / 8); i++) { \
3390 j = i < max ? 1 : 2; \
3391 temp.E(i) = Vj->E(SHF_POS(i - ((j - 1) * max), imm) + (j - 1) * max); \
3392 } \
3393 *Vd = temp; \
3394 }
3395
3396 VSHUF4I(vshuf4i_b, 8, B)
3397 VSHUF4I(vshuf4i_h, 16, H)
3398 VSHUF4I(vshuf4i_w, 32, W)
3399
3400 void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3401 {
3402 int i;
3403 VReg temp = {};
3404 VReg *Vd = (VReg *)vd;
3405 VReg *Vj = (VReg *)vj;
3406 int oprsz = simd_oprsz(desc);
3407
3408 for (i = 0; i < oprsz / 16; i++) {
3409 temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i);
3410 temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
3411 }
3412 *Vd = temp;
3413 }
3414
3415 void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
3416 {
3417 int i, m;
3418 VReg temp = {};
3419 VReg *Vd = (VReg *)vd;
3420 VReg *Vj = (VReg *)vj;
3421 VReg *Vk = (VReg *)vk;
3422
3423 m = LASX_LEN / 32;
3424 for (i = 0; i < m; i++) {
3425 uint64_t k = (uint8_t)Vk->W(i) % 8;
3426 temp.W(i) = Vj->W(k);
3427 }
3428 *Vd = temp;
3429 }
3430
3431 void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3432 {
3433 int i;
3434 VReg temp = {};
3435 VReg *Vd = (VReg *)vd;
3436 VReg *Vj = (VReg *)vj;
3437 int oprsz = simd_oprsz(desc);
3438
3439 for (i = 0; i < oprsz / 16; i++) {
3440 temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
3441 temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
3442 temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
3443 temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
3444 }
3445 *Vd = temp;
3446 }
3447
3448 void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3449 {
3450 VReg temp = {};
3451 VReg *Vd = (VReg *)vd;
3452 VReg *Vj = (VReg *)vj;
3453
3454 temp.D(0) = Vj->D(imm & 0x3);
3455 temp.D(1) = Vj->D((imm >> 2) & 0x3);
3456 temp.D(2) = Vj->D((imm >> 4) & 0x3);
3457 temp.D(3) = Vj->D((imm >> 6) & 0x3);
3458 *Vd = temp;
3459 }
3460
3461 void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
3462 {
3463 int i;
3464 VReg temp;
3465 VReg *Vd = (VReg *)vd;
3466 VReg *Vj = (VReg *)vj;
3467
3468 for (i = 0; i < 2; i++, imm >>= 4) {
3469 temp.Q(i) = (imm & 2 ? Vd : Vj)->Q(imm & 1);
3470 }
3471 *Vd = temp;
3472 }
3473
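/*
 * vextrins: per 128-bit lane, copy the source element selected by the
 * low nibble of the immediate into the destination element selected by
 * the high nibble.
 */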
3474 #define VEXTRINS(NAME, BIT, E, MASK) \
3475 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
3476 { \
3477 int i, ins, extr, max; \
3478 VReg *Vd = (VReg *)vd; \
3479 VReg *Vj = (VReg *)vj; \
3480 int oprsz = simd_oprsz(desc); \
3481 \
3482 max = LSX_LEN / BIT; \
3483 ins = (imm >> 4) & MASK; \
3484 extr = imm & MASK; \
3485 for (i = 0; i < oprsz / 16; i++) { \
3486 Vd->E(ins + i * max) = Vj->E(extr + i * max); \
3487 } \
3488 }
3489
3490 VEXTRINS(vextrins_b, 8, B, 0xf)
3491 VEXTRINS(vextrins_h, 16, H, 0x7)
3492 VEXTRINS(vextrins_w, 32, W, 0x3)
3493 VEXTRINS(vextrins_d, 64, D, 0x1)
3494