/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
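
/*
 * A minimal illustration (added for clarity; this call is hypothetical
 * and not part of the original source):
 *
 *   int8_t r = do_sqrdmlah_b(INT8_MIN, INT8_MIN, 0, false, true);
 *
 * Here the doubled product does not fit in the 8-bit result:
 * (16384 + 64) >> 7 == 128, which saturates to INT8_MAX, i.e. the
 * well-known SQRDMULH(-128, -128) == 127 case.
 */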

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
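
/*
 * Illustrative note (added for clarity): with round set, the constant
 * 1 << 62 followed by the arithmetic shift right by 63 plays the same
 * role as the "round << (N - 2)" / ">> (N - 1)" steps in
 * do_sqrdmlah_b/_h/_s above; only the width of the intermediate
 * (here Int128) changes.
 */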

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
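
/*
 * Illustrative expansion (added for clarity, not a new helper): for
 * each TYPED-wide lane i, e.g. in gvec_sdot_b,
 *
 *   d[i] = a[i] + n[4*i]*m[4*i]     + n[4*i+1]*m[4*i+1]
 *               + n[4*i+2]*m[4*i+2] + n[4*i+3]*m[4*i+3]
 *
 * with each narrow product widened to the accumulator type TYPED
 * before the sums are formed.
 */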

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment. But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
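
/*
 * Illustrative example of the per-segment indexing (added for clarity):
 * for 32-bit elements, segment == 4, so with idx == 1 and a 32-byte SVE
 * vector the first 128-bit segment multiplies by m[1] and the second by
 * m[5] (via the H4() host-endian index adjustment).
 */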

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}
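
/*
 * Clarifying note on the WTYPE argument used below: the widened type is
 * chosen so that OP cannot overflow before the range check, e.g. the
 * 32-bit variants use int64_t so that uint32_t + uint32_t and
 * int32_t - int32_t are computed exactly and then clamped.
 */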
INT32_MAX) 1708 1709 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1710 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1711 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1712 1713 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1714 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1715 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1716 1717 #undef DO_SAT 1718 1719 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1720 void *vm, uint32_t desc) 1721 { 1722 intptr_t i, oprsz = simd_oprsz(desc); 1723 uint64_t *d = vd, *n = vn, *m = vm; 1724 bool q = false; 1725 1726 for (i = 0; i < oprsz / 8; i++) { 1727 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1728 if (dd < nn) { 1729 dd = UINT64_MAX; 1730 q = true; 1731 } 1732 d[i] = dd; 1733 } 1734 if (q) { 1735 uint32_t *qc = vq; 1736 qc[0] = 1; 1737 } 1738 clear_tail(d, oprsz, simd_maxsz(desc)); 1739 } 1740 1741 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1742 void *vm, uint32_t desc) 1743 { 1744 intptr_t i, oprsz = simd_oprsz(desc); 1745 uint64_t *d = vd, *n = vn, *m = vm; 1746 bool q = false; 1747 1748 for (i = 0; i < oprsz / 8; i++) { 1749 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1750 if (nn < mm) { 1751 dd = 0; 1752 q = true; 1753 } 1754 d[i] = dd; 1755 } 1756 if (q) { 1757 uint32_t *qc = vq; 1758 qc[0] = 1; 1759 } 1760 clear_tail(d, oprsz, simd_maxsz(desc)); 1761 } 1762 1763 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1764 void *vm, uint32_t desc) 1765 { 1766 intptr_t i, oprsz = simd_oprsz(desc); 1767 int64_t *d = vd, *n = vn, *m = vm; 1768 bool q = false; 1769 1770 for (i = 0; i < oprsz / 8; i++) { 1771 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1772 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1773 dd = (nn >> 63) ^ ~INT64_MIN; 1774 q = true; 1775 } 1776 d[i] = dd; 1777 } 1778 if (q) { 1779 uint32_t *qc = vq; 1780 qc[0] = 1; 1781 } 1782 clear_tail(d, oprsz, simd_maxsz(desc)); 1783 } 1784 1785 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1786 void *vm, uint32_t desc) 1787 { 1788 intptr_t i, oprsz = simd_oprsz(desc); 1789 int64_t *d = vd, *n = vn, *m = vm; 1790 bool q = false; 1791 1792 for (i = 0; i < oprsz / 8; i++) { 1793 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1794 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1795 dd = (nn >> 63) ^ ~INT64_MIN; 1796 q = true; 1797 } 1798 d[i] = dd; 1799 } 1800 if (q) { 1801 uint32_t *qc = vq; 1802 qc[0] = 1; 1803 } 1804 clear_tail(d, oprsz, simd_maxsz(desc)); 1805 } 1806 1807 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1808 void *vm, uint32_t desc) 1809 { 1810 intptr_t i, oprsz = simd_oprsz(desc); 1811 uint64_t *d = vd, *n = vn, *m = vm; 1812 bool q = false; 1813 1814 for (i = 0; i < oprsz / 8; i++) { 1815 uint64_t nn = n[i]; 1816 int64_t mm = m[i]; 1817 uint64_t dd = nn + mm; 1818 1819 if (mm < 0) { 1820 if (nn < (uint64_t)-mm) { 1821 dd = 0; 1822 q = true; 1823 } 1824 } else { 1825 if (dd < nn) { 1826 dd = UINT64_MAX; 1827 q = true; 1828 } 1829 } 1830 d[i] = dd; 1831 } 1832 if (q) { 1833 uint32_t *qc = vq; 1834 qc[0] = 1; 1835 } 1836 clear_tail(d, oprsz, simd_maxsz(desc)); 1837 } 1838 1839 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1840 void *vm, uint32_t desc) 1841 { 1842 intptr_t i, oprsz = simd_oprsz(desc); 1843 uint64_t *d = vd, *n = vn, *m = vm; 1844 bool q = false; 1845 1846 for (i = 0; i < oprsz / 8; i++) { 1847 int64_t nn = n[i]; 1848 uint64_t mm = m[i]; 1849 int64_t dd = nn + mm; 1850 1851 if (mm > 
(uint64_t)(INT64_MAX - nn)) { 1852 dd = INT64_MAX; 1853 q = true; 1854 } 1855 d[i] = dd; 1856 } 1857 if (q) { 1858 uint32_t *qc = vq; 1859 qc[0] = 1; 1860 } 1861 clear_tail(d, oprsz, simd_maxsz(desc)); 1862 } 1863 1864 #define DO_SRA(NAME, TYPE) \ 1865 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1866 { \ 1867 intptr_t i, oprsz = simd_oprsz(desc); \ 1868 int shift = simd_data(desc); \ 1869 TYPE *d = vd, *n = vn; \ 1870 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1871 d[i] += n[i] >> shift; \ 1872 } \ 1873 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1874 } 1875 1876 DO_SRA(gvec_ssra_b, int8_t) 1877 DO_SRA(gvec_ssra_h, int16_t) 1878 DO_SRA(gvec_ssra_s, int32_t) 1879 DO_SRA(gvec_ssra_d, int64_t) 1880 1881 DO_SRA(gvec_usra_b, uint8_t) 1882 DO_SRA(gvec_usra_h, uint16_t) 1883 DO_SRA(gvec_usra_s, uint32_t) 1884 DO_SRA(gvec_usra_d, uint64_t) 1885 1886 #undef DO_SRA 1887 1888 #define DO_RSHR(NAME, TYPE) \ 1889 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1890 { \ 1891 intptr_t i, oprsz = simd_oprsz(desc); \ 1892 int shift = simd_data(desc); \ 1893 TYPE *d = vd, *n = vn; \ 1894 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1895 TYPE tmp = n[i] >> (shift - 1); \ 1896 d[i] = (tmp >> 1) + (tmp & 1); \ 1897 } \ 1898 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1899 } 1900 1901 DO_RSHR(gvec_srshr_b, int8_t) 1902 DO_RSHR(gvec_srshr_h, int16_t) 1903 DO_RSHR(gvec_srshr_s, int32_t) 1904 DO_RSHR(gvec_srshr_d, int64_t) 1905 1906 DO_RSHR(gvec_urshr_b, uint8_t) 1907 DO_RSHR(gvec_urshr_h, uint16_t) 1908 DO_RSHR(gvec_urshr_s, uint32_t) 1909 DO_RSHR(gvec_urshr_d, uint64_t) 1910 1911 #undef DO_RSHR 1912 1913 #define DO_RSRA(NAME, TYPE) \ 1914 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1915 { \ 1916 intptr_t i, oprsz = simd_oprsz(desc); \ 1917 int shift = simd_data(desc); \ 1918 TYPE *d = vd, *n = vn; \ 1919 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1920 TYPE tmp = n[i] >> (shift - 1); \ 1921 d[i] += (tmp >> 1) + (tmp & 1); \ 1922 } \ 1923 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1924 } 1925 1926 DO_RSRA(gvec_srsra_b, int8_t) 1927 DO_RSRA(gvec_srsra_h, int16_t) 1928 DO_RSRA(gvec_srsra_s, int32_t) 1929 DO_RSRA(gvec_srsra_d, int64_t) 1930 1931 DO_RSRA(gvec_ursra_b, uint8_t) 1932 DO_RSRA(gvec_ursra_h, uint16_t) 1933 DO_RSRA(gvec_ursra_s, uint32_t) 1934 DO_RSRA(gvec_ursra_d, uint64_t) 1935 1936 #undef DO_RSRA 1937 1938 #define DO_SRI(NAME, TYPE) \ 1939 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1940 { \ 1941 intptr_t i, oprsz = simd_oprsz(desc); \ 1942 int shift = simd_data(desc); \ 1943 TYPE *d = vd, *n = vn; \ 1944 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1945 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1946 } \ 1947 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1948 } 1949 1950 DO_SRI(gvec_sri_b, uint8_t) 1951 DO_SRI(gvec_sri_h, uint16_t) 1952 DO_SRI(gvec_sri_s, uint32_t) 1953 DO_SRI(gvec_sri_d, uint64_t) 1954 1955 #undef DO_SRI 1956 1957 #define DO_SLI(NAME, TYPE) \ 1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1959 { \ 1960 intptr_t i, oprsz = simd_oprsz(desc); \ 1961 int shift = simd_data(desc); \ 1962 TYPE *d = vd, *n = vn; \ 1963 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1964 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1965 } \ 1966 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1967 } 1968 1969 DO_SLI(gvec_sli_b, uint8_t) 1970 DO_SLI(gvec_sli_h, uint16_t) 1971 DO_SLI(gvec_sli_s, uint32_t) 1972 DO_SLI(gvec_sli_d, uint64_t) 1973 1974 #undef DO_SLI 1975 1976 /* 1977 * Convert float16 to float32, raising 
no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.
*/ 2052 if (is_s) { 2053 n_4 ^= 0x8000800080008000ull; 2054 } 2055 2056 for (i = 0; i < oprsz / 4; i++) { 2057 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2058 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2059 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2060 } 2061 clear_tail(d, oprsz, simd_maxsz(desc)); 2062 } 2063 2064 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2065 void *venv, uint32_t desc) 2066 { 2067 CPUARMState *env = venv; 2068 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2069 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2070 } 2071 2072 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2073 void *venv, uint32_t desc) 2074 { 2075 CPUARMState *env = venv; 2076 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 2077 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2078 } 2079 2080 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2081 void *venv, uint32_t desc) 2082 { 2083 intptr_t i, oprsz = simd_oprsz(desc); 2084 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2085 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2086 CPUARMState *env = venv; 2087 float_status *status = &env->vfp.fp_status; 2088 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2089 2090 for (i = 0; i < oprsz; i += sizeof(float32)) { 2091 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2092 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2093 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2094 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2095 float32 aa = *(float32 *)(va + H1_4(i)); 2096 2097 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2098 } 2099 } 2100 2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2102 uint32_t desc, bool fz16) 2103 { 2104 intptr_t i, oprsz = simd_oprsz(desc); 2105 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2106 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2107 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2108 int is_q = oprsz == 16; 2109 uint64_t n_4; 2110 float32 m_1; 2111 2112 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2113 n_4 = load4_f16(vn, is_q, is_2); 2114 2115 /* Negate all inputs for FMLSL at once. 
*/ 2116 if (is_s) { 2117 n_4 ^= 0x8000800080008000ull; 2118 } 2119 2120 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2121 2122 for (i = 0; i < oprsz / 4; i++) { 2123 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2124 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2125 } 2126 clear_tail(d, oprsz, simd_maxsz(desc)); 2127 } 2128 2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2130 void *venv, uint32_t desc) 2131 { 2132 CPUARMState *env = venv; 2133 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2134 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2135 } 2136 2137 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2138 void *venv, uint32_t desc) 2139 { 2140 CPUARMState *env = venv; 2141 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 2142 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2143 } 2144 2145 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2146 void *venv, uint32_t desc) 2147 { 2148 intptr_t i, j, oprsz = simd_oprsz(desc); 2149 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2150 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2151 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2152 CPUARMState *env = venv; 2153 float_status *status = &env->vfp.fp_status; 2154 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2155 2156 for (i = 0; i < oprsz; i += 16) { 2157 float16 mm_16 = *(float16 *)(vm + i + idx); 2158 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2159 2160 for (j = 0; j < 16; j += sizeof(float32)) { 2161 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2162 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2163 float32 aa = *(float32 *)(va + H1_4(i + j)); 2164 2165 *(float32 *)(vd + H1_4(i + j)) = 2166 float32_muladd(nn, mm, aa, 0, status); 2167 } 2168 } 2169 } 2170 2171 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2172 { 2173 intptr_t i, opr_sz = simd_oprsz(desc); 2174 int8_t *d = vd, *n = vn, *m = vm; 2175 2176 for (i = 0; i < opr_sz; ++i) { 2177 int8_t mm = m[i]; 2178 int8_t nn = n[i]; 2179 int8_t res = 0; 2180 if (mm >= 0) { 2181 if (mm < 8) { 2182 res = nn << mm; 2183 } 2184 } else { 2185 res = nn >> (mm > -8 ? -mm : 7); 2186 } 2187 d[i] = res; 2188 } 2189 clear_tail(d, opr_sz, simd_maxsz(desc)); 2190 } 2191 2192 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2193 { 2194 intptr_t i, opr_sz = simd_oprsz(desc); 2195 int16_t *d = vd, *n = vn, *m = vm; 2196 2197 for (i = 0; i < opr_sz / 2; ++i) { 2198 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2199 int16_t nn = n[i]; 2200 int16_t res = 0; 2201 if (mm >= 0) { 2202 if (mm < 16) { 2203 res = nn << mm; 2204 } 2205 } else { 2206 res = nn >> (mm > -16 ? 
-mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
2283 */ 2284 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2285 { 2286 intptr_t i, opr_sz = simd_oprsz(desc); 2287 intptr_t hi = simd_data(desc); 2288 uint64_t *d = vd, *n = vn, *m = vm; 2289 2290 for (i = 0; i < opr_sz / 8; i += 2) { 2291 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2292 d[i] = int128_getlo(r); 2293 d[i + 1] = int128_gethi(r); 2294 } 2295 clear_tail(d, opr_sz, simd_maxsz(desc)); 2296 } 2297 2298 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2299 { 2300 int hi = simd_data(desc); 2301 uint64_t *d = vd, *n = vn, *m = vm; 2302 uint64_t nn = n[hi], mm = m[hi]; 2303 2304 d[0] = clmul_8x4_packed(nn, mm); 2305 nn >>= 32; 2306 mm >>= 32; 2307 d[1] = clmul_8x4_packed(nn, mm); 2308 2309 clear_tail(d, 16, simd_maxsz(desc)); 2310 } 2311 2312 #ifdef TARGET_AARCH64 2313 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2314 { 2315 int shift = simd_data(desc) * 8; 2316 intptr_t i, opr_sz = simd_oprsz(desc); 2317 uint64_t *d = vd, *n = vn, *m = vm; 2318 2319 for (i = 0; i < opr_sz / 8; ++i) { 2320 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2321 } 2322 } 2323 2324 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2325 { 2326 intptr_t sel = H4(simd_data(desc)); 2327 intptr_t i, opr_sz = simd_oprsz(desc); 2328 uint32_t *n = vn, *m = vm; 2329 uint64_t *d = vd; 2330 2331 for (i = 0; i < opr_sz / 8; ++i) { 2332 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2333 } 2334 } 2335 #endif 2336 2337 #define DO_CMP0(NAME, TYPE, OP) \ 2338 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2339 { \ 2340 intptr_t i, opr_sz = simd_oprsz(desc); \ 2341 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2342 TYPE nn = *(TYPE *)(vn + i); \ 2343 *(TYPE *)(vd + i) = -(nn OP 0); \ 2344 } \ 2345 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2346 } 2347 2348 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2349 DO_CMP0(gvec_clt0_b, int8_t, <) 2350 DO_CMP0(gvec_cle0_b, int8_t, <=) 2351 DO_CMP0(gvec_cgt0_b, int8_t, >) 2352 DO_CMP0(gvec_cge0_b, int8_t, >=) 2353 2354 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2355 DO_CMP0(gvec_clt0_h, int16_t, <) 2356 DO_CMP0(gvec_cle0_h, int16_t, <=) 2357 DO_CMP0(gvec_cgt0_h, int16_t, >) 2358 DO_CMP0(gvec_cge0_h, int16_t, >=) 2359 2360 #undef DO_CMP0 2361 2362 #define DO_ABD(NAME, TYPE) \ 2363 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2364 { \ 2365 intptr_t i, opr_sz = simd_oprsz(desc); \ 2366 TYPE *d = vd, *n = vn, *m = vm; \ 2367 \ 2368 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2369 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2370 } \ 2371 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2372 } 2373 2374 DO_ABD(gvec_sabd_b, int8_t) 2375 DO_ABD(gvec_sabd_h, int16_t) 2376 DO_ABD(gvec_sabd_s, int32_t) 2377 DO_ABD(gvec_sabd_d, int64_t) 2378 2379 DO_ABD(gvec_uabd_b, uint8_t) 2380 DO_ABD(gvec_uabd_h, uint16_t) 2381 DO_ABD(gvec_uabd_s, uint32_t) 2382 DO_ABD(gvec_uabd_d, uint64_t) 2383 2384 #undef DO_ABD 2385 2386 #define DO_ABA(NAME, TYPE) \ 2387 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2388 { \ 2389 intptr_t i, opr_sz = simd_oprsz(desc); \ 2390 TYPE *d = vd, *n = vn, *m = vm; \ 2391 \ 2392 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2393 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2394 } \ 2395 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2396 } 2397 2398 DO_ABA(gvec_saba_b, int8_t) 2399 DO_ABA(gvec_saba_h, int16_t) 2400 DO_ABA(gvec_saba_s, int32_t) 2401 DO_ABA(gvec_saba_d, int64_t) 2402 2403 DO_ABA(gvec_uaba_b, uint8_t) 2404 DO_ABA(gvec_uaba_h, uint16_t) 2405 DO_ABA(gvec_uaba_s, uint32_t) 2406 DO_ABA(gvec_uaba_d, uint64_t) 2407 2408 #undef DO_ABA 2409 2410 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2411 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 2412 { \ 2413 ARMVectorReg scratch; \ 2414 intptr_t oprsz = simd_oprsz(desc); \ 2415 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2416 TYPE *d = vd, *n = vn, *m = vm; \ 2417 if (unlikely(d == m)) { \ 2418 m = memcpy(&scratch, m, oprsz); \ 2419 } \ 2420 for (intptr_t i = 0; i < half; ++i) { \ 2421 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2422 } \ 2423 for (intptr_t i = 0; i < half; ++i) { \ 2424 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2425 } \ 2426 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2427 } 2428 2429 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2430 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2431 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2432 2433 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2434 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2435 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2436 2437 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2438 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2439 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2440 2441 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2442 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2443 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2444 2445 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2446 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2447 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2448 2449 #undef DO_3OP_PAIR 2450 2451 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2452 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2453 { \ 2454 ARMVectorReg scratch; \ 2455 intptr_t oprsz = simd_oprsz(desc); \ 2456 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2457 TYPE *d = vd, *n = vn, *m = vm; \ 2458 if (unlikely(d == m)) { \ 2459 m = memcpy(&scratch, m, oprsz); \ 2460 } \ 2461 for (intptr_t i = 0; i < half; ++i) { \ 2462 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2463 } \ 2464 for (intptr_t i = 0; i < half; ++i) { \ 2465 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2466 } \ 2467 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2468 } 2469 2470 #define ADD(A, B) (A + B) 2471 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2472 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2473 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2474 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2475 #undef ADD 2476 2477 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2478 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2479 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2480 2481 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2482 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2483 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2484 2485 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2486 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2487 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2488 2489 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2490 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2491 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2492 2493 #undef DO_3OP_PAIR 2494 2495 #define 
DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2496 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2497 { \ 2498 intptr_t i, oprsz = simd_oprsz(desc); \ 2499 int shift = simd_data(desc); \ 2500 TYPE *d = vd, *n = vn; \ 2501 float_status *fpst = stat; \ 2502 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2503 d[i] = FUNC(n[i], shift, fpst); \ 2504 } \ 2505 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2506 } 2507 2508 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2509 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2510 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2511 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2512 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2513 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2514 2515 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2516 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2517 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2518 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2519 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2520 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2521 2522 #undef DO_VCVT_FIXED 2523 2524 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2525 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2526 { \ 2527 float_status *fpst = stat; \ 2528 intptr_t i, oprsz = simd_oprsz(desc); \ 2529 uint32_t rmode = simd_data(desc); \ 2530 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2531 TYPE *d = vd, *n = vn; \ 2532 set_float_rounding_mode(rmode, fpst); \ 2533 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2534 d[i] = FUNC(n[i], 0, fpst); \ 2535 } \ 2536 set_float_rounding_mode(prev_rmode, fpst); \ 2537 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2538 } 2539 2540 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2541 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2542 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2543 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2544 2545 #undef DO_VCVT_RMODE 2546 2547 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2548 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2549 { \ 2550 float_status *fpst = stat; \ 2551 intptr_t i, oprsz = simd_oprsz(desc); \ 2552 uint32_t rmode = simd_data(desc); \ 2553 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2554 TYPE *d = vd, *n = vn; \ 2555 set_float_rounding_mode(rmode, fpst); \ 2556 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2557 d[i] = FUNC(n[i], fpst); \ 2558 } \ 2559 set_float_rounding_mode(prev_rmode, fpst); \ 2560 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2561 } 2562 2563 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2564 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2565 2566 #undef DO_VRINT_RMODE 2567 2568 #ifdef TARGET_AARCH64 2569 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2570 { 2571 const uint8_t *indices = vm; 2572 CPUARMState *env = venv; 2573 size_t oprsz = simd_oprsz(desc); 2574 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2575 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2576 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2577 union { 2578 uint8_t b[16]; 2579 uint64_t d[2]; 2580 } result; 2581 2582 /* 2583 * We must construct the final result in a temp, lest the output 2584 * overlaps the input table. 
For TBL, begin with zero; for TBX, 2585 * begin with the original register contents. Note that we always 2586 * copy 16 bytes here to avoid an extra branch; clearing the high 2587 * bits of the register for oprsz == 8 is handled below. 2588 */ 2589 if (is_tbx) { 2590 memcpy(&result, vd, 16); 2591 } else { 2592 memset(&result, 0, 16); 2593 } 2594 2595 for (size_t i = 0; i < oprsz; ++i) { 2596 uint32_t index = indices[H1(i)]; 2597 2598 if (index < table_len) { 2599 /* 2600 * Convert index (a byte offset into the virtual table 2601 * which is a series of 128-bit vectors concatenated) 2602 * into the correct register element, bearing in mind 2603 * that the table can wrap around from V31 to V0. 2604 */ 2605 const uint8_t *table = (const uint8_t *) 2606 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2607 result.b[H1(i)] = table[H1(index % 16)]; 2608 } 2609 } 2610 2611 memcpy(vd, &result, 16); 2612 clear_tail(vd, oprsz, simd_maxsz(desc)); 2613 } 2614 #endif 2615 2616 /* 2617 * NxN -> N highpart multiply 2618 * 2619 * TODO: expose this as a generic vector operation. 2620 */ 2621 2622 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2623 { 2624 intptr_t i, opr_sz = simd_oprsz(desc); 2625 int8_t *d = vd, *n = vn, *m = vm; 2626 2627 for (i = 0; i < opr_sz; ++i) { 2628 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2629 } 2630 clear_tail(d, opr_sz, simd_maxsz(desc)); 2631 } 2632 2633 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2634 { 2635 intptr_t i, opr_sz = simd_oprsz(desc); 2636 int16_t *d = vd, *n = vn, *m = vm; 2637 2638 for (i = 0; i < opr_sz / 2; ++i) { 2639 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2640 } 2641 clear_tail(d, opr_sz, simd_maxsz(desc)); 2642 } 2643 2644 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2645 { 2646 intptr_t i, opr_sz = simd_oprsz(desc); 2647 int32_t *d = vd, *n = vn, *m = vm; 2648 2649 for (i = 0; i < opr_sz / 4; ++i) { 2650 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2651 } 2652 clear_tail(d, opr_sz, simd_maxsz(desc)); 2653 } 2654 2655 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2656 { 2657 intptr_t i, opr_sz = simd_oprsz(desc); 2658 uint64_t *d = vd, *n = vn, *m = vm; 2659 uint64_t discard; 2660 2661 for (i = 0; i < opr_sz / 8; ++i) { 2662 muls64(&discard, &d[i], n[i], m[i]); 2663 } 2664 clear_tail(d, opr_sz, simd_maxsz(desc)); 2665 } 2666 2667 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2668 { 2669 intptr_t i, opr_sz = simd_oprsz(desc); 2670 uint8_t *d = vd, *n = vn, *m = vm; 2671 2672 for (i = 0; i < opr_sz; ++i) { 2673 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2674 } 2675 clear_tail(d, opr_sz, simd_maxsz(desc)); 2676 } 2677 2678 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2679 { 2680 intptr_t i, opr_sz = simd_oprsz(desc); 2681 uint16_t *d = vd, *n = vn, *m = vm; 2682 2683 for (i = 0; i < opr_sz / 2; ++i) { 2684 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2685 } 2686 clear_tail(d, opr_sz, simd_maxsz(desc)); 2687 } 2688 2689 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2690 { 2691 intptr_t i, opr_sz = simd_oprsz(desc); 2692 uint32_t *d = vd, *n = vn, *m = vm; 2693 2694 for (i = 0; i < opr_sz / 4; ++i) { 2695 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2696 } 2697 clear_tail(d, opr_sz, simd_maxsz(desc)); 2698 } 2699 2700 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2701 { 2702 intptr_t i, opr_sz = simd_oprsz(desc); 2703 uint64_t *d = vd, *n = vn, *m = vm; 2704 uint64_t discard; 2705 2706 for (i = 0; i 
< opr_sz / 8; ++i) { 2707 mulu64(&discard, &d[i], n[i], m[i]); 2708 } 2709 clear_tail(d, opr_sz, simd_maxsz(desc)); 2710 } 2711 2712 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2713 { 2714 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2715 int shr = simd_data(desc); 2716 uint64_t *d = vd, *n = vn, *m = vm; 2717 2718 for (i = 0; i < opr_sz; ++i) { 2719 d[i] = ror64(n[i] ^ m[i], shr); 2720 } 2721 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2722 } 2723 2724 /* 2725 * Integer matrix-multiply accumulate 2726 */ 2727 2728 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2729 { 2730 int8_t *n = vn, *m = vm; 2731 2732 for (intptr_t k = 0; k < 8; ++k) { 2733 sum += n[H1(k)] * m[H1(k)]; 2734 } 2735 return sum; 2736 } 2737 2738 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2739 { 2740 uint8_t *n = vn, *m = vm; 2741 2742 for (intptr_t k = 0; k < 8; ++k) { 2743 sum += n[H1(k)] * m[H1(k)]; 2744 } 2745 return sum; 2746 } 2747 2748 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2749 { 2750 uint8_t *n = vn; 2751 int8_t *m = vm; 2752 2753 for (intptr_t k = 0; k < 8; ++k) { 2754 sum += n[H1(k)] * m[H1(k)]; 2755 } 2756 return sum; 2757 } 2758 2759 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2760 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2761 { 2762 intptr_t seg, opr_sz = simd_oprsz(desc); 2763 2764 for (seg = 0; seg < opr_sz; seg += 16) { 2765 uint32_t *d = vd + seg; 2766 uint32_t *a = va + seg; 2767 uint32_t sum0, sum1, sum2, sum3; 2768 2769 /* 2770 * Process the entire segment at once, writing back the 2771 * results only after we've consumed all of the inputs. 2772 * 2773 * Key to indices by column: 2774 * i j i j 2775 */ 2776 sum0 = a[H4(0 + 0)]; 2777 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2778 sum1 = a[H4(0 + 1)]; 2779 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2780 sum2 = a[H4(2 + 0)]; 2781 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2782 sum3 = a[H4(2 + 1)]; 2783 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2784 2785 d[H4(0)] = sum0; 2786 d[H4(1)] = sum1; 2787 d[H4(2)] = sum2; 2788 d[H4(3)] = sum3; 2789 } 2790 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2791 } 2792 2793 #define DO_MMLA_B(NAME, INNER) \ 2794 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2795 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2796 2797 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2798 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2799 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2800 2801 /* 2802 * BFloat16 Dot Product 2803 */ 2804 2805 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2806 { 2807 /* 2808 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2809 * For EBF = 0, we ignore the FPCR bits which determine rounding 2810 * mode and denormal-flushing, and we do unfused multiplies and 2811 * additions with intermediate rounding of all products and sums. 2812 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2813 * and we perform a fused two-way sum-of-products without intermediate 2814 * rounding of the products. 2815 * In either case, we don't set fp exception flags. 2816 * 2817 * EBF is AArch64 only, so even if it's set in the FPCR it has 2818 * no effect on AArch32 instructions. 
2819 */ 2820 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2821 2822 *statusp = env->vfp.fp_status; 2823 set_default_nan_mode(true, statusp); 2824 2825 if (ebf) { 2826 /* EBF=1 needs to do a step with round-to-odd semantics */ 2827 *oddstatusp = *statusp; 2828 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2829 } else { 2830 set_flush_to_zero(true, statusp); 2831 set_flush_inputs_to_zero(true, statusp); 2832 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2833 } 2834 return ebf; 2835 } 2836 2837 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2838 { 2839 float32 t1, t2; 2840 2841 /* 2842 * Extract each BFloat16 from the element pair, and shift 2843 * them such that they become float32. 2844 */ 2845 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2846 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2847 t1 = float32_add(t1, t2, fpst); 2848 t1 = float32_add(sum, t1, fpst); 2849 2850 return t1; 2851 } 2852 2853 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2854 float_status *fpst, float_status *fpst_odd) 2855 { 2856 /* 2857 * Compare f16_dotadd() in sme_helper.c, but here we have 2858 * bfloat16 inputs. In particular that means that we do not 2859 * want the FPCR.FZ16 flush semantics, so we use the normal 2860 * float_status for the input handling here. 2861 */ 2862 float64 e1r = float32_to_float64(e1 << 16, fpst); 2863 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2864 float64 e2r = float32_to_float64(e2 << 16, fpst); 2865 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2866 float64 t64; 2867 float32 t32; 2868 2869 /* 2870 * The ARM pseudocode function FPDot performs both multiplies 2871 * and the add with a single rounding operation. Emulate this 2872 * by performing the first multiply in round-to-odd, then doing 2873 * the second multiply as fused multiply-add, and rounding to 2874 * float32 all in one step. 2875 */ 2876 t64 = float64_mul(e1r, e2r, fpst_odd); 2877 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2878 2879 /* This conversion is exact, because we've already rounded. */ 2880 t32 = float64_to_float32(t64, fpst); 2881 2882 /* The final accumulation step is not fused. 
*/ 2883 return float32_add(sum, t32, fpst); 2884 } 2885 2886 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2887 CPUARMState *env, uint32_t desc) 2888 { 2889 intptr_t i, opr_sz = simd_oprsz(desc); 2890 float32 *d = vd, *a = va; 2891 uint32_t *n = vn, *m = vm; 2892 float_status fpst, fpst_odd; 2893 2894 if (is_ebf(env, &fpst, &fpst_odd)) { 2895 for (i = 0; i < opr_sz / 4; ++i) { 2896 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2897 } 2898 } else { 2899 for (i = 0; i < opr_sz / 4; ++i) { 2900 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2901 } 2902 } 2903 clear_tail(d, opr_sz, simd_maxsz(desc)); 2904 } 2905 2906 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2907 void *va, CPUARMState *env, uint32_t desc) 2908 { 2909 intptr_t i, j, opr_sz = simd_oprsz(desc); 2910 intptr_t index = simd_data(desc); 2911 intptr_t elements = opr_sz / 4; 2912 intptr_t eltspersegment = MIN(16 / 4, elements); 2913 float32 *d = vd, *a = va; 2914 uint32_t *n = vn, *m = vm; 2915 float_status fpst, fpst_odd; 2916 2917 if (is_ebf(env, &fpst, &fpst_odd)) { 2918 for (i = 0; i < elements; i += eltspersegment) { 2919 uint32_t m_idx = m[i + H4(index)]; 2920 2921 for (j = i; j < i + eltspersegment; j++) { 2922 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2923 } 2924 } 2925 } else { 2926 for (i = 0; i < elements; i += eltspersegment) { 2927 uint32_t m_idx = m[i + H4(index)]; 2928 2929 for (j = i; j < i + eltspersegment; j++) { 2930 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2931 } 2932 } 2933 } 2934 clear_tail(d, opr_sz, simd_maxsz(desc)); 2935 } 2936 2937 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2938 CPUARMState *env, uint32_t desc) 2939 { 2940 intptr_t s, opr_sz = simd_oprsz(desc); 2941 float32 *d = vd, *a = va; 2942 uint32_t *n = vn, *m = vm; 2943 float_status fpst, fpst_odd; 2944 2945 if (is_ebf(env, &fpst, &fpst_odd)) { 2946 for (s = 0; s < opr_sz / 4; s += 4) { 2947 float32 sum00, sum01, sum10, sum11; 2948 2949 /* 2950 * Process the entire segment at once, writing back the 2951 * results only after we've consumed all of the inputs. 2952 * 2953 * Key to indices by column: 2954 * i j i k j k 2955 */ 2956 sum00 = a[s + H4(0 + 0)]; 2957 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2958 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2959 2960 sum01 = a[s + H4(0 + 1)]; 2961 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2962 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2963 2964 sum10 = a[s + H4(2 + 0)]; 2965 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2966 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2967 2968 sum11 = a[s + H4(2 + 1)]; 2969 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2970 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2971 2972 d[s + H4(0 + 0)] = sum00; 2973 d[s + H4(0 + 1)] = sum01; 2974 d[s + H4(2 + 0)] = sum10; 2975 d[s + H4(2 + 1)] = sum11; 2976 } 2977 } else { 2978 for (s = 0; s < opr_sz / 4; s += 4) { 2979 float32 sum00, sum01, sum10, sum11; 2980 2981 /* 2982 * Process the entire segment at once, writing back the 2983 * results only after we've consumed all of the inputs. 
2984 * 2985 * Key to indices by column: 2986 * i j i k j k 2987 */ 2988 sum00 = a[s + H4(0 + 0)]; 2989 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 2990 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 2991 2992 sum01 = a[s + H4(0 + 1)]; 2993 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 2994 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 2995 2996 sum10 = a[s + H4(2 + 0)]; 2997 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 2998 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 2999 3000 sum11 = a[s + H4(2 + 1)]; 3001 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3002 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3003 3004 d[s + H4(0 + 0)] = sum00; 3005 d[s + H4(0 + 1)] = sum01; 3006 d[s + H4(2 + 0)] = sum10; 3007 d[s + H4(2 + 1)] = sum11; 3008 } 3009 } 3010 clear_tail(d, opr_sz, simd_maxsz(desc)); 3011 } 3012 3013 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3014 void *stat, uint32_t desc) 3015 { 3016 intptr_t i, opr_sz = simd_oprsz(desc); 3017 intptr_t sel = simd_data(desc); 3018 float32 *d = vd, *a = va; 3019 bfloat16 *n = vn, *m = vm; 3020 3021 for (i = 0; i < opr_sz / 4; ++i) { 3022 float32 nn = n[H2(i * 2 + sel)] << 16; 3023 float32 mm = m[H2(i * 2 + sel)] << 16; 3024 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3025 } 3026 clear_tail(d, opr_sz, simd_maxsz(desc)); 3027 } 3028 3029 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3030 void *va, void *stat, uint32_t desc) 3031 { 3032 intptr_t i, j, opr_sz = simd_oprsz(desc); 3033 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3034 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3035 intptr_t elements = opr_sz / 4; 3036 intptr_t eltspersegment = MIN(16 / 4, elements); 3037 float32 *d = vd, *a = va; 3038 bfloat16 *n = vn, *m = vm; 3039 3040 for (i = 0; i < elements; i += eltspersegment) { 3041 float32 m_idx = m[H2(2 * i + index)] << 16; 3042 3043 for (j = i; j < i + eltspersegment; j++) { 3044 float32 n_j = n[H2(2 * j + sel)] << 16; 3045 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3046 } 3047 } 3048 clear_tail(d, opr_sz, simd_maxsz(desc)); 3049 } 3050 3051 #define DO_CLAMP(NAME, TYPE) \ 3052 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3053 { \ 3054 intptr_t i, opr_sz = simd_oprsz(desc); \ 3055 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3056 TYPE aa = *(TYPE *)(a + i); \ 3057 TYPE nn = *(TYPE *)(n + i); \ 3058 TYPE mm = *(TYPE *)(m + i); \ 3059 TYPE dd = MIN(MAX(aa, nn), mm); \ 3060 *(TYPE *)(d + i) = dd; \ 3061 } \ 3062 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3063 } 3064 3065 DO_CLAMP(gvec_sclamp_b, int8_t) 3066 DO_CLAMP(gvec_sclamp_h, int16_t) 3067 DO_CLAMP(gvec_sclamp_s, int32_t) 3068 DO_CLAMP(gvec_sclamp_d, int64_t) 3069 3070 DO_CLAMP(gvec_uclamp_b, uint8_t) 3071 DO_CLAMP(gvec_uclamp_h, uint16_t) 3072 DO_CLAMP(gvec_uclamp_s, uint32_t) 3073 DO_CLAMP(gvec_uclamp_d, uint64_t) 3074 3075 /* Bit count in each 8-bit word. 
 */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
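
/*
 * Illustrative sketch (not part of the helpers above): how the indexed
 * integer multiply (DO_MUL_IDX) walks 128-bit segments.  The scalar is
 * re-read from each 16-byte segment at element 'idx', so an SVE vector
 * of N segments uses N (possibly different) scalars; AdvSIMD has only
 * one segment.  The name is hypothetical and the host-endian H4()
 * adjustment is omitted for clarity.
 */
static void sketch_mul_idx_u32(uint32_t *d, const uint32_t *n,
                               const uint32_t *m, int idx, int oprsz)
{
    const int segment = 16 / (int)sizeof(uint32_t);  /* 4 elements / 128 bits */

    for (int i = 0; i < oprsz / (int)sizeof(uint32_t); i += segment) {
        uint32_t mm = m[i + idx];                    /* scalar for this segment */
        for (int j = 0; j < segment; j++) {
            d[i + j] = n[i + j] * mm;
        }
    }
}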
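
/*
 * Illustrative sketch: the unsigned 64-bit saturating add used by
 * gvec_uqadd_d, for one lane.  Unsigned overflow is detected by the
 * wrapped sum being smaller than an operand.  The 'qc' flag stands in
 * for the sticky FPSR.QC bit; the name is hypothetical.
 */
static uint64_t sketch_uqadd64(uint64_t n, uint64_t m, bool *qc)
{
    uint64_t d = n + m;

    if (d < n) {            /* wrapped around: saturate */
        d = UINT64_MAX;
        *qc = true;
    }
    return d;
}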
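
/*
 * Illustrative sketch: the signed overflow test used by gvec_sqadd_d.
 * Signed addition overflows only when both operands have the same sign
 * and the wrapped result has the opposite sign; the saturated value is
 * then INT64_MAX or INT64_MIN according to the sign of the first
 * operand.  Like the helper, this relies on the arithmetic right shift
 * and two's-complement conversions that QEMU already assumes.
 */
static int64_t sketch_sqadd64(int64_t n, int64_t m, bool *qc)
{
    uint64_t un = n, um = m, ud = un + um;          /* wrapping add */

    if (((ud ^ un) & ~(un ^ um)) & INT64_MIN) {
        /* n >= 0: saturate to INT64_MAX;  n < 0: saturate to INT64_MIN */
        ud = (n >> 63) ^ ~(uint64_t)INT64_MIN;
        *qc = true;
    }
    return ud;
}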
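
/*
 * Illustrative sketch: the rounding-shift trick used by DO_RSHR and
 * DO_RSRA, for one signed 8-bit lane.  Adding the rounding constant to
 * the operand first ("(n + (1 << (sh - 1))) >> sh") can overflow the
 * element type; shifting by (sh - 1) first and folding the low bit back
 * in gives the same result without a wider intermediate.  Valid for
 * sh in [1, element bits].  For example, sh = 1 gives 64 for n = 127
 * and -1 for n = -3.
 */
static int8_t sketch_srshr8(int8_t n, int sh)
{
    int8_t tmp = n >> (sh - 1);

    return (tmp >> 1) + (tmp & 1);
}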
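
/*
 * Illustrative sketch: what the deposit64() call in DO_SRI does, written
 * directly for an 8-bit lane.  deposit64(x, pos, len, y) replaces bits
 * [pos, pos+len) of x with the low 'len' bits of y, so SRI keeps the top
 * 'shift' bits of the destination and inserts the shifted source below
 * them (SLI is the mirror image).  Assumes shift in [1, 8], matching the
 * instruction's immediate range.
 */
static uint8_t sketch_sri8(uint8_t d, uint8_t n, int shift)
{
    uint8_t ins_mask = (uint8_t)(0xffu >> shift);   /* bits written by SRI */

    return (d & ~ins_mask) | ((n >> shift) & ins_mask);
}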
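
/*
 * Illustrative spot checks for float16_to_float32_by_bits() above.
 * The expected float32 encodings are hand-computed and are my own
 * assumption, not taken from the helpers: 1.0 and +Inf pass through
 * with only a bias adjustment, the smallest f16 denormal (2^-24)
 * becomes a normal float32, and FZ16 squashes it to zero.
 */
static bool sketch_f16_to_f32_by_bits_selftest(void)
{
    return float16_to_float32_by_bits(0x3c00, false) == 0x3f800000u  /* 1.0 */
        && float16_to_float32_by_bits(0x7c00, false) == 0x7f800000u  /* +Inf */
        && float16_to_float32_by_bits(0x0001, false) == 0x33800000u  /* 2^-24 */
        && float16_to_float32_by_bits(0x0001, true)  == 0x00000000u; /* FZ16 */
}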
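
/*
 * Illustrative sketch: a branchy equivalent of load4_f16() above, to
 * make the bit-twiddling explicit.  For the q (128-bit) form the
 * requested half is a whole 64-bit word; for the 64-bit form it is one
 * 32-bit half of the first word, with the upper bits of the return
 * value then being don't-care, exactly as in the branchless version.
 * Assumes is_q and is_2 are 0 or 1.
 */
static uint64_t sketch_load4_f16_branchy(const uint64_t *ptr, int is_q, int is_2)
{
    if (is_q) {
        return ptr[is_2];                     /* u64[0] or u64[1] */
    }
    return is_2 ? ptr[0] >> 32 : ptr[0];      /* u32[1] or u32[0] */
}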
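
/*
 * Illustrative sketch: the NEON SSHL rules implemented by gvec_sshl_b,
 * for one 8-bit lane.  A non-negative count shifts left, giving 0 once
 * the count reaches the element width; a negative count shifts right
 * arithmetically, clamped so that counts of -8 and below still produce
 * the sign-fill result.
 */
static int8_t sketch_sshl8(int8_t nn, int8_t mm)
{
    if (mm >= 0) {
        return mm < 8 ? (int8_t)(nn << mm) : 0;
    }
    return nn >> (mm > -8 ? -mm : 7);
}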
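
/*
 * Illustrative sketch: one lane of the 8x8->8 polynomial multiply used
 * by gvec_pmul_b (the low half of a carry-less product).  Partial
 * products are XORed instead of added; clmul_8x8_low() performs eight
 * such lanes at once on packed 64-bit operands.
 */
static uint8_t sketch_pmul8_low(uint8_t n, uint8_t m)
{
    uint8_t r = 0;

    for (int bit = 0; bit < 8; bit++) {
        if (m & (1u << bit)) {
            r ^= (uint8_t)(n << bit);    /* XOR, not add */
        }
    }
    return r;
}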
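
/*
 * Illustrative sketch: the pairwise layout used by DO_3OP_PAIR.  The low
 * half of the destination is built from adjacent pairs of the first
 * source and the high half from adjacent pairs of the second source,
 * which is why the helpers copy Vm to a scratch buffer when it aliases
 * Vd.  Aliasing and host-endian swizzling (H1/H2/H4) are ignored here;
 * the buffers are assumed distinct.
 */
static void sketch_addp_u32(uint32_t *d, const uint32_t *n,
                            const uint32_t *m, int elements)
{
    int half = elements / 2;

    for (int i = 0; i < half; i++) {
        d[i] = n[2 * i] + n[2 * i + 1];
    }
    for (int i = 0; i < half; i++) {
        d[i + half] = m[2 * i] + m[2 * i + 1];
    }
}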
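
/*
 * Illustrative sketch: the TBL/TBX byte lookup performed by simd_tblx.
 * Indices select bytes from a virtual table of 'table_len' bytes made of
 * concatenated 16-byte registers; out-of-range indices produce 0 for TBL
 * and leave the destination byte unchanged for TBX.  Register wrap-around
 * from V31 to V0 and host-endian indexing are omitted; 'table' is just a
 * flat byte array here.
 */
static void sketch_tblx(uint8_t *result, const uint8_t *table,
                        uint32_t table_len, const uint8_t *indices,
                        size_t oprsz, bool is_tbx)
{
    for (size_t i = 0; i < oprsz; i++) {
        uint32_t index = indices[i];

        if (index < table_len) {
            result[i] = table[index];
        } else if (!is_tbx) {
            result[i] = 0;
        }
        /* else: TBX keeps the previous contents of result[i] */
    }
}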
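
/*
 * Illustrative sketch: the NxN->N high-part multiply of gvec_umulh_s,
 * for a single 32-bit lane.  The product is formed in a type twice as
 * wide and only the top half kept; the 64-bit helpers use mulu64() and
 * muls64() instead because no 128-bit integer type is assumed.
 */
static uint32_t sketch_umulh32(uint32_t n, uint32_t m)
{
    return (uint32_t)(((uint64_t)n * m) >> 32);
}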
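
/*
 * Illustrative sketch: the XAR operation of gvec_xar_d on one 64-bit
 * lane: exclusive-or the operands, then rotate right by the immediate.
 * A local rotate is written out here instead of calling QEMU's ror64().
 */
static uint64_t sketch_xar64(uint64_t n, uint64_t m, unsigned shr)
{
    uint64_t x = n ^ m;

    shr &= 63;
    return shr ? (x >> shr) | (x << (64 - shr)) : x;
}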
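
/*
 * Illustrative sketch: the 2x2 block structure of do_mmla_b().  Each
 * 16-byte segment of Vn and Vm is viewed as a 2x8 matrix of bytes, and
 * the result is a 2x2 matrix accumulated into Va, each entry being an
 * 8-element dot product.  Signedness (SMMLA/UMMLA/USMMLA) only changes
 * the element types fed into the dot product; the buffers are assumed
 * distinct, so the read-everything-then-write ordering of the helper is
 * not needed here.
 */
static void sketch_mmla_2x2(uint32_t d[4], const uint32_t a[4],
                            const int8_t n[16], const int8_t m[16])
{
    for (int row = 0; row < 2; row++) {
        for (int col = 0; col < 2; col++) {
            uint32_t sum = a[row * 2 + col];

            for (int k = 0; k < 8; k++) {
                sum += n[row * 8 + k] * m[col * 8 + k];
            }
            d[row * 2 + col] = sum;
        }
    }
}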
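
/*
 * Illustrative sketch: why bfdotadd() and the BFMLAL helpers can work on
 * raw encodings.  A bfloat16 value is exactly the top 16 bits of the
 * corresponding float32, so widening is just a 16-bit left shift of the
 * encoding (e.g. bfloat16 0x3f80, i.e. 1.0, widens to float32
 * 0x3f800000); no softfloat conversion is required.
 */
static uint32_t sketch_bf16_to_f32_bits(uint16_t bf16)
{
    return (uint32_t)bf16 << 16;
}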