/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
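
/*
 * For illustration: a predicate byte with bits 0 and 2 set (0x05) marks
 * bytes 0 and 2 of the 64-bit lane as active, so expand_pred_b_data[0x05]
 * is 0x0000000000ff00ff.  A caller can AND this mask with a data word to
 * zero the inactive byte elements, e.g. (with hypothetical names):
 *
 *     uint64_t mask = expand_pred_b_data[pg & 0xff];
 *     uint64_t kept = data & mask;
 */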

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
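
/*
 * Worked example for the expression above: with neg = false and
 * round = true, do_sqrdmlah_b(0x40, 0x40, 0, false, true) computes
 * ((64 * 64) + (1 << 6)) >> 7 = 4160 >> 7 = 32, the same value as the
 * unsimplified ((2 * 4096) + (1 << 7)) >> 8.  With src3 = 0 the only
 * saturating case is src1 = src2 = INT8_MIN, where (16384 + 64) >> 7
 * = 128 does not fit in int8_t and is clamped to INT8_MAX.
 */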

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment.  But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
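
/*
 * For illustration: in the indexed forms above, the four TYPEM values
 * selected by "index" are re-read at the start of every 16-byte segment.
 * With gvec_sdot_idx_b and index == 2 on a 16-byte AdvSIMD vector, every
 * output lane uses bytes 8..11 of vm; on a 32-byte SVE vector the second
 * segment instead uses bytes 24..27, the same position within its own
 * 128-bit segment.
 */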

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
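
/*
 * For illustration: each even/odd element pair above holds the real and
 * imaginary parts of a complex number.  Since neg_real and neg_imag are
 * always complementary, the loop computes either
 *     d = (n_r - m_i, n_i + m_r)    when neg_imag is set, or
 *     d = (n_r + m_i, n_i - m_r)    when neg_real is set,
 * i.e. an addition with the second operand rotated by 90 or 270 degrees;
 * the translator picks the bit from the instruction's rotate immediate.
 */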

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, void *fpstp)
{
    float_status *fpst = fpstp;
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}
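
/*
 * For illustration: in the comparison helpers above, the unary minus on
 * the 0/1 softfloat result widens it to the element-sized mask, e.g.
 * float32_cge(a, b, stat) returns 0xffffffffu when a >= b and 0
 * otherwise; any comparison involving a NaN operand is false and
 * therefore yields 0.
 */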

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}
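
/*
 * Note: the _nf helpers above round the product and the sum separately
 * (two roundings), which is what the non-fused Neon multiply-accumulate
 * insns need, whereas the fused helpers below hand the whole expression
 * to float*_muladd so that only the final result is rounded, as
 * VFMA/VFMS require.
 */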

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX
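
/*
 * For illustration: with TYPE = uint32_t the segment is four elements,
 * so on a 32-byte SVE vector with idx == 1 the loop above multiplies
 * elements 0..3 by element 1 of vm and elements 4..7 by element 5,
 * i.e. the indexed element is taken from the matching position of each
 * 128-bit segment.  A 16-byte AdvSIMD vector has a single segment and
 * only element 1 is used.
 */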

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  void *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT
INT32_MAX) 1710 1711 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1712 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1713 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1714 1715 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1716 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1717 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1718 1719 #undef DO_SAT 1720 1721 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1722 void *vm, uint32_t desc) 1723 { 1724 intptr_t i, oprsz = simd_oprsz(desc); 1725 uint64_t *d = vd, *n = vn, *m = vm; 1726 bool q = false; 1727 1728 for (i = 0; i < oprsz / 8; i++) { 1729 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1730 if (dd < nn) { 1731 dd = UINT64_MAX; 1732 q = true; 1733 } 1734 d[i] = dd; 1735 } 1736 if (q) { 1737 uint32_t *qc = vq; 1738 qc[0] = 1; 1739 } 1740 clear_tail(d, oprsz, simd_maxsz(desc)); 1741 } 1742 1743 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1744 void *vm, uint32_t desc) 1745 { 1746 intptr_t i, oprsz = simd_oprsz(desc); 1747 uint64_t *d = vd, *n = vn, *m = vm; 1748 bool q = false; 1749 1750 for (i = 0; i < oprsz / 8; i++) { 1751 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1752 if (nn < mm) { 1753 dd = 0; 1754 q = true; 1755 } 1756 d[i] = dd; 1757 } 1758 if (q) { 1759 uint32_t *qc = vq; 1760 qc[0] = 1; 1761 } 1762 clear_tail(d, oprsz, simd_maxsz(desc)); 1763 } 1764 1765 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1766 void *vm, uint32_t desc) 1767 { 1768 intptr_t i, oprsz = simd_oprsz(desc); 1769 int64_t *d = vd, *n = vn, *m = vm; 1770 bool q = false; 1771 1772 for (i = 0; i < oprsz / 8; i++) { 1773 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1774 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1775 dd = (nn >> 63) ^ ~INT64_MIN; 1776 q = true; 1777 } 1778 d[i] = dd; 1779 } 1780 if (q) { 1781 uint32_t *qc = vq; 1782 qc[0] = 1; 1783 } 1784 clear_tail(d, oprsz, simd_maxsz(desc)); 1785 } 1786 1787 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1788 void *vm, uint32_t desc) 1789 { 1790 intptr_t i, oprsz = simd_oprsz(desc); 1791 int64_t *d = vd, *n = vn, *m = vm; 1792 bool q = false; 1793 1794 for (i = 0; i < oprsz / 8; i++) { 1795 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1796 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1797 dd = (nn >> 63) ^ ~INT64_MIN; 1798 q = true; 1799 } 1800 d[i] = dd; 1801 } 1802 if (q) { 1803 uint32_t *qc = vq; 1804 qc[0] = 1; 1805 } 1806 clear_tail(d, oprsz, simd_maxsz(desc)); 1807 } 1808 1809 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1810 void *vm, uint32_t desc) 1811 { 1812 intptr_t i, oprsz = simd_oprsz(desc); 1813 uint64_t *d = vd, *n = vn, *m = vm; 1814 bool q = false; 1815 1816 for (i = 0; i < oprsz / 8; i++) { 1817 uint64_t nn = n[i]; 1818 int64_t mm = m[i]; 1819 uint64_t dd = nn + mm; 1820 1821 if (mm < 0) { 1822 if (nn < (uint64_t)-mm) { 1823 dd = 0; 1824 q = true; 1825 } 1826 } else { 1827 if (dd < nn) { 1828 dd = UINT64_MAX; 1829 q = true; 1830 } 1831 } 1832 d[i] = dd; 1833 } 1834 if (q) { 1835 uint32_t *qc = vq; 1836 qc[0] = 1; 1837 } 1838 clear_tail(d, oprsz, simd_maxsz(desc)); 1839 } 1840 1841 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1842 void *vm, uint32_t desc) 1843 { 1844 intptr_t i, oprsz = simd_oprsz(desc); 1845 uint64_t *d = vd, *n = vn, *m = vm; 1846 bool q = false; 1847 1848 for (i = 0; i < oprsz / 8; i++) { 1849 int64_t nn = n[i]; 1850 uint64_t mm = m[i]; 1851 int64_t dd = nn + mm; 1852 1853 if (mm > 
(uint64_t)(INT64_MAX - nn)) { 1854 dd = INT64_MAX; 1855 q = true; 1856 } 1857 d[i] = dd; 1858 } 1859 if (q) { 1860 uint32_t *qc = vq; 1861 qc[0] = 1; 1862 } 1863 clear_tail(d, oprsz, simd_maxsz(desc)); 1864 } 1865 1866 #define DO_SRA(NAME, TYPE) \ 1867 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1868 { \ 1869 intptr_t i, oprsz = simd_oprsz(desc); \ 1870 int shift = simd_data(desc); \ 1871 TYPE *d = vd, *n = vn; \ 1872 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1873 d[i] += n[i] >> shift; \ 1874 } \ 1875 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1876 } 1877 1878 DO_SRA(gvec_ssra_b, int8_t) 1879 DO_SRA(gvec_ssra_h, int16_t) 1880 DO_SRA(gvec_ssra_s, int32_t) 1881 DO_SRA(gvec_ssra_d, int64_t) 1882 1883 DO_SRA(gvec_usra_b, uint8_t) 1884 DO_SRA(gvec_usra_h, uint16_t) 1885 DO_SRA(gvec_usra_s, uint32_t) 1886 DO_SRA(gvec_usra_d, uint64_t) 1887 1888 #undef DO_SRA 1889 1890 #define DO_RSHR(NAME, TYPE) \ 1891 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1892 { \ 1893 intptr_t i, oprsz = simd_oprsz(desc); \ 1894 int shift = simd_data(desc); \ 1895 TYPE *d = vd, *n = vn; \ 1896 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1897 TYPE tmp = n[i] >> (shift - 1); \ 1898 d[i] = (tmp >> 1) + (tmp & 1); \ 1899 } \ 1900 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1901 } 1902 1903 DO_RSHR(gvec_srshr_b, int8_t) 1904 DO_RSHR(gvec_srshr_h, int16_t) 1905 DO_RSHR(gvec_srshr_s, int32_t) 1906 DO_RSHR(gvec_srshr_d, int64_t) 1907 1908 DO_RSHR(gvec_urshr_b, uint8_t) 1909 DO_RSHR(gvec_urshr_h, uint16_t) 1910 DO_RSHR(gvec_urshr_s, uint32_t) 1911 DO_RSHR(gvec_urshr_d, uint64_t) 1912 1913 #undef DO_RSHR 1914 1915 #define DO_RSRA(NAME, TYPE) \ 1916 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1917 { \ 1918 intptr_t i, oprsz = simd_oprsz(desc); \ 1919 int shift = simd_data(desc); \ 1920 TYPE *d = vd, *n = vn; \ 1921 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1922 TYPE tmp = n[i] >> (shift - 1); \ 1923 d[i] += (tmp >> 1) + (tmp & 1); \ 1924 } \ 1925 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1926 } 1927 1928 DO_RSRA(gvec_srsra_b, int8_t) 1929 DO_RSRA(gvec_srsra_h, int16_t) 1930 DO_RSRA(gvec_srsra_s, int32_t) 1931 DO_RSRA(gvec_srsra_d, int64_t) 1932 1933 DO_RSRA(gvec_ursra_b, uint8_t) 1934 DO_RSRA(gvec_ursra_h, uint16_t) 1935 DO_RSRA(gvec_ursra_s, uint32_t) 1936 DO_RSRA(gvec_ursra_d, uint64_t) 1937 1938 #undef DO_RSRA 1939 1940 #define DO_SRI(NAME, TYPE) \ 1941 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1942 { \ 1943 intptr_t i, oprsz = simd_oprsz(desc); \ 1944 int shift = simd_data(desc); \ 1945 TYPE *d = vd, *n = vn; \ 1946 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1947 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1948 } \ 1949 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1950 } 1951 1952 DO_SRI(gvec_sri_b, uint8_t) 1953 DO_SRI(gvec_sri_h, uint16_t) 1954 DO_SRI(gvec_sri_s, uint32_t) 1955 DO_SRI(gvec_sri_d, uint64_t) 1956 1957 #undef DO_SRI 1958 1959 #define DO_SLI(NAME, TYPE) \ 1960 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1961 { \ 1962 intptr_t i, oprsz = simd_oprsz(desc); \ 1963 int shift = simd_data(desc); \ 1964 TYPE *d = vd, *n = vn; \ 1965 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1966 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1967 } \ 1968 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1969 } 1970 1971 DO_SLI(gvec_sli_b, uint8_t) 1972 DO_SLI(gvec_sli_h, uint16_t) 1973 DO_SLI(gvec_sli_s, uint32_t) 1974 DO_SLI(gvec_sli_d, uint64_t) 1975 1976 #undef DO_SLI 1977 1978 /* 1979 * Convert float16 to float32, raising 
 * no exceptions and preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.
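     *
     * (Illustrative aside: each float16 lane keeps its sign in bit 15,
     * so one 64-bit XOR with 0x8000800080008000 negates all four
     * packed inputs.  Per lane this is the same bitwise operation as
     * float16_chs(); f16_flip_sign() below is only an illustrative
     * name:
     *
     *     static inline uint16_t f16_flip_sign(uint16_t x)
     *     {
     *         return x ^ 0x8000;
     *     }
     *
     * Flipping only the sign bit also negates NaNs without touching
     * their payload, which is the behaviour the FMLSL negation needs.)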
*/ 2054 if (is_s) { 2055 n_4 ^= 0x8000800080008000ull; 2056 } 2057 2058 for (i = 0; i < oprsz / 4; i++) { 2059 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2060 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2061 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2062 } 2063 clear_tail(d, oprsz, simd_maxsz(desc)); 2064 } 2065 2066 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2067 void *venv, uint32_t desc) 2068 { 2069 CPUARMState *env = venv; 2070 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2071 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2072 } 2073 2074 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2075 void *venv, uint32_t desc) 2076 { 2077 CPUARMState *env = venv; 2078 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, 2079 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2080 } 2081 2082 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2083 void *venv, uint32_t desc) 2084 { 2085 intptr_t i, oprsz = simd_oprsz(desc); 2086 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2087 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2088 CPUARMState *env = venv; 2089 float_status *status = &env->vfp.fp_status; 2090 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2091 2092 for (i = 0; i < oprsz; i += sizeof(float32)) { 2093 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2094 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2095 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2096 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2097 float32 aa = *(float32 *)(va + H1_4(i)); 2098 2099 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2100 } 2101 } 2102 2103 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2104 uint32_t desc, bool fz16) 2105 { 2106 intptr_t i, oprsz = simd_oprsz(desc); 2107 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2108 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2109 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2110 int is_q = oprsz == 16; 2111 uint64_t n_4; 2112 float32 m_1; 2113 2114 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2115 n_4 = load4_f16(vn, is_q, is_2); 2116 2117 /* Negate all inputs for FMLSL at once. 
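     *
     * (Simplified sketch of the by-element form that follows, with
     * widen() standing in for float16_to_float32_by_bits() and the
     * H2()/H4() byte-order adjustments omitted:
     *
     *     m_1 = widen(m[index]);
     *     for (i = 0; i < oprsz / 4; i++) {
     *         d[i] = float32_muladd(widen(n_4 >> (i * 16)), m_1,
     *                               d[i], 0, fpst);
     *     }
     *
     * i.e. a single element of Vm is widened once and reused for
     * every element taken from Vn.)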
*/ 2118 if (is_s) { 2119 n_4 ^= 0x8000800080008000ull; 2120 } 2121 2122 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2123 2124 for (i = 0; i < oprsz / 4; i++) { 2125 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2126 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2127 } 2128 clear_tail(d, oprsz, simd_maxsz(desc)); 2129 } 2130 2131 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2132 void *venv, uint32_t desc) 2133 { 2134 CPUARMState *env = venv; 2135 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2136 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2137 } 2138 2139 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2140 void *venv, uint32_t desc) 2141 { 2142 CPUARMState *env = venv; 2143 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, 2144 get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); 2145 } 2146 2147 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2148 void *venv, uint32_t desc) 2149 { 2150 intptr_t i, j, oprsz = simd_oprsz(desc); 2151 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2152 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2153 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2154 CPUARMState *env = venv; 2155 float_status *status = &env->vfp.fp_status; 2156 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16); 2157 2158 for (i = 0; i < oprsz; i += 16) { 2159 float16 mm_16 = *(float16 *)(vm + i + idx); 2160 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2161 2162 for (j = 0; j < 16; j += sizeof(float32)) { 2163 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2164 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2165 float32 aa = *(float32 *)(va + H1_4(i + j)); 2166 2167 *(float32 *)(vd + H1_4(i + j)) = 2168 float32_muladd(nn, mm, aa, 0, status); 2169 } 2170 } 2171 } 2172 2173 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2174 { 2175 intptr_t i, opr_sz = simd_oprsz(desc); 2176 int8_t *d = vd, *n = vn, *m = vm; 2177 2178 for (i = 0; i < opr_sz; ++i) { 2179 int8_t mm = m[i]; 2180 int8_t nn = n[i]; 2181 int8_t res = 0; 2182 if (mm >= 0) { 2183 if (mm < 8) { 2184 res = nn << mm; 2185 } 2186 } else { 2187 res = nn >> (mm > -8 ? -mm : 7); 2188 } 2189 d[i] = res; 2190 } 2191 clear_tail(d, opr_sz, simd_maxsz(desc)); 2192 } 2193 2194 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2195 { 2196 intptr_t i, opr_sz = simd_oprsz(desc); 2197 int16_t *d = vd, *n = vn, *m = vm; 2198 2199 for (i = 0; i < opr_sz / 2; ++i) { 2200 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2201 int16_t nn = n[i]; 2202 int16_t res = 0; 2203 if (mm >= 0) { 2204 if (mm < 16) { 2205 res = nn << mm; 2206 } 2207 } else { 2208 res = nn >> (mm > -16 ? 
                               -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
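 *
 * (Reference sketch of the 64x64->128 carry-less product itself,
 * with XOR in place of the additions of a schoolbook multiply;
 * clmul64_ref() is only an illustrative name:
 *
 *     static Int128 clmul64_ref(uint64_t n, uint64_t m)
 *     {
 *         uint64_t lo = 0, hi = 0;
 *         for (int i = 0; i < 64; i++) {
 *             if ((n >> i) & 1) {
 *                 lo ^= m << i;
 *                 hi ^= i ? m >> (64 - i) : 0;
 *             }
 *         }
 *         return int128_make128(lo, hi);
 *     }
 *
 * clmul_64() from crypto/clmul.h computes the same value.)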
2285 */ 2286 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2287 { 2288 intptr_t i, opr_sz = simd_oprsz(desc); 2289 intptr_t hi = simd_data(desc); 2290 uint64_t *d = vd, *n = vn, *m = vm; 2291 2292 for (i = 0; i < opr_sz / 8; i += 2) { 2293 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2294 d[i] = int128_getlo(r); 2295 d[i + 1] = int128_gethi(r); 2296 } 2297 clear_tail(d, opr_sz, simd_maxsz(desc)); 2298 } 2299 2300 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2301 { 2302 int hi = simd_data(desc); 2303 uint64_t *d = vd, *n = vn, *m = vm; 2304 uint64_t nn = n[hi], mm = m[hi]; 2305 2306 d[0] = clmul_8x4_packed(nn, mm); 2307 nn >>= 32; 2308 mm >>= 32; 2309 d[1] = clmul_8x4_packed(nn, mm); 2310 2311 clear_tail(d, 16, simd_maxsz(desc)); 2312 } 2313 2314 #ifdef TARGET_AARCH64 2315 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2316 { 2317 int shift = simd_data(desc) * 8; 2318 intptr_t i, opr_sz = simd_oprsz(desc); 2319 uint64_t *d = vd, *n = vn, *m = vm; 2320 2321 for (i = 0; i < opr_sz / 8; ++i) { 2322 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2323 } 2324 } 2325 2326 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2327 { 2328 intptr_t sel = H4(simd_data(desc)); 2329 intptr_t i, opr_sz = simd_oprsz(desc); 2330 uint32_t *n = vn, *m = vm; 2331 uint64_t *d = vd; 2332 2333 for (i = 0; i < opr_sz / 8; ++i) { 2334 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2335 } 2336 } 2337 #endif 2338 2339 #define DO_CMP0(NAME, TYPE, OP) \ 2340 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2341 { \ 2342 intptr_t i, opr_sz = simd_oprsz(desc); \ 2343 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2344 TYPE nn = *(TYPE *)(vn + i); \ 2345 *(TYPE *)(vd + i) = -(nn OP 0); \ 2346 } \ 2347 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2348 } 2349 2350 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2351 DO_CMP0(gvec_clt0_b, int8_t, <) 2352 DO_CMP0(gvec_cle0_b, int8_t, <=) 2353 DO_CMP0(gvec_cgt0_b, int8_t, >) 2354 DO_CMP0(gvec_cge0_b, int8_t, >=) 2355 2356 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2357 DO_CMP0(gvec_clt0_h, int16_t, <) 2358 DO_CMP0(gvec_cle0_h, int16_t, <=) 2359 DO_CMP0(gvec_cgt0_h, int16_t, >) 2360 DO_CMP0(gvec_cge0_h, int16_t, >=) 2361 2362 #undef DO_CMP0 2363 2364 #define DO_ABD(NAME, TYPE) \ 2365 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2366 { \ 2367 intptr_t i, opr_sz = simd_oprsz(desc); \ 2368 TYPE *d = vd, *n = vn, *m = vm; \ 2369 \ 2370 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2371 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2372 } \ 2373 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2374 } 2375 2376 DO_ABD(gvec_sabd_b, int8_t) 2377 DO_ABD(gvec_sabd_h, int16_t) 2378 DO_ABD(gvec_sabd_s, int32_t) 2379 DO_ABD(gvec_sabd_d, int64_t) 2380 2381 DO_ABD(gvec_uabd_b, uint8_t) 2382 DO_ABD(gvec_uabd_h, uint16_t) 2383 DO_ABD(gvec_uabd_s, uint32_t) 2384 DO_ABD(gvec_uabd_d, uint64_t) 2385 2386 #undef DO_ABD 2387 2388 #define DO_ABA(NAME, TYPE) \ 2389 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2390 { \ 2391 intptr_t i, opr_sz = simd_oprsz(desc); \ 2392 TYPE *d = vd, *n = vn, *m = vm; \ 2393 \ 2394 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2395 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2396 } \ 2397 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2398 } 2399 2400 DO_ABA(gvec_saba_b, int8_t) 2401 DO_ABA(gvec_saba_h, int16_t) 2402 DO_ABA(gvec_saba_s, int32_t) 2403 DO_ABA(gvec_saba_d, int64_t) 2404 2405 DO_ABA(gvec_uaba_b, uint8_t) 2406 DO_ABA(gvec_uaba_h, uint16_t) 2407 DO_ABA(gvec_uaba_s, uint32_t) 2408 DO_ABA(gvec_uaba_d, uint64_t) 2409 2410 #undef DO_ABA 2411 2412 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2413 void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ 2414 { \ 2415 ARMVectorReg scratch; \ 2416 intptr_t oprsz = simd_oprsz(desc); \ 2417 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2418 TYPE *d = vd, *n = vn, *m = vm; \ 2419 if (unlikely(d == m)) { \ 2420 m = memcpy(&scratch, m, oprsz); \ 2421 } \ 2422 for (intptr_t i = 0; i < half; ++i) { \ 2423 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2424 } \ 2425 for (intptr_t i = 0; i < half; ++i) { \ 2426 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2427 } \ 2428 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2429 } 2430 2431 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2432 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2433 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2434 2435 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2436 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2437 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2438 2439 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2440 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2441 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2442 2443 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2444 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2445 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2446 2447 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2448 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2449 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2450 2451 #undef DO_3OP_PAIR 2452 2453 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2454 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2455 { \ 2456 ARMVectorReg scratch; \ 2457 intptr_t oprsz = simd_oprsz(desc); \ 2458 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2459 TYPE *d = vd, *n = vn, *m = vm; \ 2460 if (unlikely(d == m)) { \ 2461 m = memcpy(&scratch, m, oprsz); \ 2462 } \ 2463 for (intptr_t i = 0; i < half; ++i) { \ 2464 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2465 } \ 2466 for (intptr_t i = 0; i < half; ++i) { \ 2467 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2468 } \ 2469 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2470 } 2471 2472 #define ADD(A, B) (A + B) 2473 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2474 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2475 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2476 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2477 #undef ADD 2478 2479 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2480 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2481 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2482 2483 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2484 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2485 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2486 2487 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2488 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2489 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2490 2491 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2492 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2493 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2494 2495 #undef DO_3OP_PAIR 2496 2497 #define 
DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2498 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2499 { \ 2500 intptr_t i, oprsz = simd_oprsz(desc); \ 2501 int shift = simd_data(desc); \ 2502 TYPE *d = vd, *n = vn; \ 2503 float_status *fpst = stat; \ 2504 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2505 d[i] = FUNC(n[i], shift, fpst); \ 2506 } \ 2507 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2508 } 2509 2510 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2511 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2512 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2513 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2514 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2515 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2516 2517 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2518 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2519 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2520 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2521 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2522 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2523 2524 #undef DO_VCVT_FIXED 2525 2526 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2527 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2528 { \ 2529 float_status *fpst = stat; \ 2530 intptr_t i, oprsz = simd_oprsz(desc); \ 2531 uint32_t rmode = simd_data(desc); \ 2532 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2533 TYPE *d = vd, *n = vn; \ 2534 set_float_rounding_mode(rmode, fpst); \ 2535 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2536 d[i] = FUNC(n[i], 0, fpst); \ 2537 } \ 2538 set_float_rounding_mode(prev_rmode, fpst); \ 2539 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2540 } 2541 2542 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2543 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2544 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2545 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2546 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2547 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2548 2549 #undef DO_VCVT_RMODE 2550 2551 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2552 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ 2553 { \ 2554 float_status *fpst = stat; \ 2555 intptr_t i, oprsz = simd_oprsz(desc); \ 2556 uint32_t rmode = simd_data(desc); \ 2557 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2558 TYPE *d = vd, *n = vn; \ 2559 set_float_rounding_mode(rmode, fpst); \ 2560 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2561 d[i] = FUNC(n[i], fpst); \ 2562 } \ 2563 set_float_rounding_mode(prev_rmode, fpst); \ 2564 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2565 } 2566 2567 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2568 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2569 2570 #undef DO_VRINT_RMODE 2571 2572 #ifdef TARGET_AARCH64 2573 void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) 2574 { 2575 const uint8_t *indices = vm; 2576 CPUARMState *env = venv; 2577 size_t oprsz = simd_oprsz(desc); 2578 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2579 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2580 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2581 union { 2582 uint8_t b[16]; 2583 uint64_t d[2]; 2584 } result; 2585 2586 /* 2587 * We must construct the final result 
in a temp, lest the output 2588 * overlaps the input table. For TBL, begin with zero; for TBX, 2589 * begin with the original register contents. Note that we always 2590 * copy 16 bytes here to avoid an extra branch; clearing the high 2591 * bits of the register for oprsz == 8 is handled below. 2592 */ 2593 if (is_tbx) { 2594 memcpy(&result, vd, 16); 2595 } else { 2596 memset(&result, 0, 16); 2597 } 2598 2599 for (size_t i = 0; i < oprsz; ++i) { 2600 uint32_t index = indices[H1(i)]; 2601 2602 if (index < table_len) { 2603 /* 2604 * Convert index (a byte offset into the virtual table 2605 * which is a series of 128-bit vectors concatenated) 2606 * into the correct register element, bearing in mind 2607 * that the table can wrap around from V31 to V0. 2608 */ 2609 const uint8_t *table = (const uint8_t *) 2610 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2611 result.b[H1(i)] = table[H1(index % 16)]; 2612 } 2613 } 2614 2615 memcpy(vd, &result, 16); 2616 clear_tail(vd, oprsz, simd_maxsz(desc)); 2617 } 2618 #endif 2619 2620 /* 2621 * NxN -> N highpart multiply 2622 * 2623 * TODO: expose this as a generic vector operation. 2624 */ 2625 2626 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2627 { 2628 intptr_t i, opr_sz = simd_oprsz(desc); 2629 int8_t *d = vd, *n = vn, *m = vm; 2630 2631 for (i = 0; i < opr_sz; ++i) { 2632 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2633 } 2634 clear_tail(d, opr_sz, simd_maxsz(desc)); 2635 } 2636 2637 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2638 { 2639 intptr_t i, opr_sz = simd_oprsz(desc); 2640 int16_t *d = vd, *n = vn, *m = vm; 2641 2642 for (i = 0; i < opr_sz / 2; ++i) { 2643 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2644 } 2645 clear_tail(d, opr_sz, simd_maxsz(desc)); 2646 } 2647 2648 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2649 { 2650 intptr_t i, opr_sz = simd_oprsz(desc); 2651 int32_t *d = vd, *n = vn, *m = vm; 2652 2653 for (i = 0; i < opr_sz / 4; ++i) { 2654 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2655 } 2656 clear_tail(d, opr_sz, simd_maxsz(desc)); 2657 } 2658 2659 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2660 { 2661 intptr_t i, opr_sz = simd_oprsz(desc); 2662 uint64_t *d = vd, *n = vn, *m = vm; 2663 uint64_t discard; 2664 2665 for (i = 0; i < opr_sz / 8; ++i) { 2666 muls64(&discard, &d[i], n[i], m[i]); 2667 } 2668 clear_tail(d, opr_sz, simd_maxsz(desc)); 2669 } 2670 2671 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2672 { 2673 intptr_t i, opr_sz = simd_oprsz(desc); 2674 uint8_t *d = vd, *n = vn, *m = vm; 2675 2676 for (i = 0; i < opr_sz; ++i) { 2677 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2678 } 2679 clear_tail(d, opr_sz, simd_maxsz(desc)); 2680 } 2681 2682 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2683 { 2684 intptr_t i, opr_sz = simd_oprsz(desc); 2685 uint16_t *d = vd, *n = vn, *m = vm; 2686 2687 for (i = 0; i < opr_sz / 2; ++i) { 2688 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2689 } 2690 clear_tail(d, opr_sz, simd_maxsz(desc)); 2691 } 2692 2693 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2694 { 2695 intptr_t i, opr_sz = simd_oprsz(desc); 2696 uint32_t *d = vd, *n = vn, *m = vm; 2697 2698 for (i = 0; i < opr_sz / 4; ++i) { 2699 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2700 } 2701 clear_tail(d, opr_sz, simd_maxsz(desc)); 2702 } 2703 2704 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2705 { 2706 intptr_t i, opr_sz = simd_oprsz(desc); 2707 uint64_t *d = vd, *n = 
vn, *m = vm; 2708 uint64_t discard; 2709 2710 for (i = 0; i < opr_sz / 8; ++i) { 2711 mulu64(&discard, &d[i], n[i], m[i]); 2712 } 2713 clear_tail(d, opr_sz, simd_maxsz(desc)); 2714 } 2715 2716 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2717 { 2718 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2719 int shr = simd_data(desc); 2720 uint64_t *d = vd, *n = vn, *m = vm; 2721 2722 for (i = 0; i < opr_sz; ++i) { 2723 d[i] = ror64(n[i] ^ m[i], shr); 2724 } 2725 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2726 } 2727 2728 /* 2729 * Integer matrix-multiply accumulate 2730 */ 2731 2732 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2733 { 2734 int8_t *n = vn, *m = vm; 2735 2736 for (intptr_t k = 0; k < 8; ++k) { 2737 sum += n[H1(k)] * m[H1(k)]; 2738 } 2739 return sum; 2740 } 2741 2742 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2743 { 2744 uint8_t *n = vn, *m = vm; 2745 2746 for (intptr_t k = 0; k < 8; ++k) { 2747 sum += n[H1(k)] * m[H1(k)]; 2748 } 2749 return sum; 2750 } 2751 2752 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2753 { 2754 uint8_t *n = vn; 2755 int8_t *m = vm; 2756 2757 for (intptr_t k = 0; k < 8; ++k) { 2758 sum += n[H1(k)] * m[H1(k)]; 2759 } 2760 return sum; 2761 } 2762 2763 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2764 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2765 { 2766 intptr_t seg, opr_sz = simd_oprsz(desc); 2767 2768 for (seg = 0; seg < opr_sz; seg += 16) { 2769 uint32_t *d = vd + seg; 2770 uint32_t *a = va + seg; 2771 uint32_t sum0, sum1, sum2, sum3; 2772 2773 /* 2774 * Process the entire segment at once, writing back the 2775 * results only after we've consumed all of the inputs. 2776 * 2777 * Key to indices by column: 2778 * i j i j 2779 */ 2780 sum0 = a[H4(0 + 0)]; 2781 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2782 sum1 = a[H4(0 + 1)]; 2783 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2784 sum2 = a[H4(2 + 0)]; 2785 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2786 sum3 = a[H4(2 + 1)]; 2787 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2788 2789 d[H4(0)] = sum0; 2790 d[H4(1)] = sum1; 2791 d[H4(2)] = sum2; 2792 d[H4(3)] = sum3; 2793 } 2794 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2795 } 2796 2797 #define DO_MMLA_B(NAME, INNER) \ 2798 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2799 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2800 2801 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2802 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2803 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2804 2805 /* 2806 * BFloat16 Dot Product 2807 */ 2808 2809 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2810 { 2811 /* 2812 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2813 * For EBF = 0, we ignore the FPCR bits which determine rounding 2814 * mode and denormal-flushing, and we do unfused multiplies and 2815 * additions with intermediate rounding of all products and sums. 2816 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2817 * and we perform a fused two-way sum-of-products without intermediate 2818 * rounding of the products. 2819 * In either case, we don't set fp exception flags. 2820 * 2821 * EBF is AArch64 only, so even if it's set in the FPCR it has 2822 * no effect on AArch32 instructions. 
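     *
     * (Typical use, as in the BFDOT/BFMMLA helpers below:
     *
     *     float_status fpst, fpst_odd;
     *     if (is_ebf(env, &fpst, &fpst_odd)) {
     *         sum = bfdotadd_ebf(sum, e1, e2, &fpst, &fpst_odd);
     *     } else {
     *         sum = bfdotadd(sum, e1, e2, &fpst);
     *     }
     *
     * so the round-to-odd status is only consulted on the EBF = 1
     * path.)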
2823 */ 2824 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2825 2826 *statusp = env->vfp.fp_status; 2827 set_default_nan_mode(true, statusp); 2828 2829 if (ebf) { 2830 /* EBF=1 needs to do a step with round-to-odd semantics */ 2831 *oddstatusp = *statusp; 2832 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2833 } else { 2834 set_flush_to_zero(true, statusp); 2835 set_flush_inputs_to_zero(true, statusp); 2836 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2837 } 2838 return ebf; 2839 } 2840 2841 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2842 { 2843 float32 t1, t2; 2844 2845 /* 2846 * Extract each BFloat16 from the element pair, and shift 2847 * them such that they become float32. 2848 */ 2849 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2850 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2851 t1 = float32_add(t1, t2, fpst); 2852 t1 = float32_add(sum, t1, fpst); 2853 2854 return t1; 2855 } 2856 2857 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2858 float_status *fpst, float_status *fpst_odd) 2859 { 2860 /* 2861 * Compare f16_dotadd() in sme_helper.c, but here we have 2862 * bfloat16 inputs. In particular that means that we do not 2863 * want the FPCR.FZ16 flush semantics, so we use the normal 2864 * float_status for the input handling here. 2865 */ 2866 float64 e1r = float32_to_float64(e1 << 16, fpst); 2867 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2868 float64 e2r = float32_to_float64(e2 << 16, fpst); 2869 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2870 float64 t64; 2871 float32 t32; 2872 2873 /* 2874 * The ARM pseudocode function FPDot performs both multiplies 2875 * and the add with a single rounding operation. Emulate this 2876 * by performing the first multiply in round-to-odd, then doing 2877 * the second multiply as fused multiply-add, and rounding to 2878 * float32 all in one step. 2879 */ 2880 t64 = float64_mul(e1r, e2r, fpst_odd); 2881 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2882 2883 /* This conversion is exact, because we've already rounded. */ 2884 t32 = float64_to_float32(t64, fpst); 2885 2886 /* The final accumulation step is not fused. 
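     *
     * (Net effect, informally: the value returned below is
     *
     *     round32(sum + round32(e1r * e2r + e1c * e2c))
     *
     * with the inner dot product rounded only once, per the FPDot
     * behaviour described above, while the outer addition is an
     * ordinary float32_add().)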
*/ 2887 return float32_add(sum, t32, fpst); 2888 } 2889 2890 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2891 CPUARMState *env, uint32_t desc) 2892 { 2893 intptr_t i, opr_sz = simd_oprsz(desc); 2894 float32 *d = vd, *a = va; 2895 uint32_t *n = vn, *m = vm; 2896 float_status fpst, fpst_odd; 2897 2898 if (is_ebf(env, &fpst, &fpst_odd)) { 2899 for (i = 0; i < opr_sz / 4; ++i) { 2900 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2901 } 2902 } else { 2903 for (i = 0; i < opr_sz / 4; ++i) { 2904 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2905 } 2906 } 2907 clear_tail(d, opr_sz, simd_maxsz(desc)); 2908 } 2909 2910 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2911 void *va, CPUARMState *env, uint32_t desc) 2912 { 2913 intptr_t i, j, opr_sz = simd_oprsz(desc); 2914 intptr_t index = simd_data(desc); 2915 intptr_t elements = opr_sz / 4; 2916 intptr_t eltspersegment = MIN(16 / 4, elements); 2917 float32 *d = vd, *a = va; 2918 uint32_t *n = vn, *m = vm; 2919 float_status fpst, fpst_odd; 2920 2921 if (is_ebf(env, &fpst, &fpst_odd)) { 2922 for (i = 0; i < elements; i += eltspersegment) { 2923 uint32_t m_idx = m[i + H4(index)]; 2924 2925 for (j = i; j < i + eltspersegment; j++) { 2926 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2927 } 2928 } 2929 } else { 2930 for (i = 0; i < elements; i += eltspersegment) { 2931 uint32_t m_idx = m[i + H4(index)]; 2932 2933 for (j = i; j < i + eltspersegment; j++) { 2934 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2935 } 2936 } 2937 } 2938 clear_tail(d, opr_sz, simd_maxsz(desc)); 2939 } 2940 2941 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2942 CPUARMState *env, uint32_t desc) 2943 { 2944 intptr_t s, opr_sz = simd_oprsz(desc); 2945 float32 *d = vd, *a = va; 2946 uint32_t *n = vn, *m = vm; 2947 float_status fpst, fpst_odd; 2948 2949 if (is_ebf(env, &fpst, &fpst_odd)) { 2950 for (s = 0; s < opr_sz / 4; s += 4) { 2951 float32 sum00, sum01, sum10, sum11; 2952 2953 /* 2954 * Process the entire segment at once, writing back the 2955 * results only after we've consumed all of the inputs. 2956 * 2957 * Key to indices by column: 2958 * i j i k j k 2959 */ 2960 sum00 = a[s + H4(0 + 0)]; 2961 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2962 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2963 2964 sum01 = a[s + H4(0 + 1)]; 2965 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2966 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2967 2968 sum10 = a[s + H4(2 + 0)]; 2969 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2970 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2971 2972 sum11 = a[s + H4(2 + 1)]; 2973 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2974 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2975 2976 d[s + H4(0 + 0)] = sum00; 2977 d[s + H4(0 + 1)] = sum01; 2978 d[s + H4(2 + 0)] = sum10; 2979 d[s + H4(2 + 1)] = sum11; 2980 } 2981 } else { 2982 for (s = 0; s < opr_sz / 4; s += 4) { 2983 float32 sum00, sum01, sum10, sum11; 2984 2985 /* 2986 * Process the entire segment at once, writing back the 2987 * results only after we've consumed all of the inputs. 
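             *
             * (Equivalently, viewing d and a as 2x2 float32 tiles and
             * n and m as 2x4 bfloat16 tiles, each segment computes
             *
             *     for (int i = 0; i < 2; i++) {
             *         for (int j = 0; j < 2; j++) {
             *             D[i][j] = A[i][j] + dot4(N_row[i], M_row[j]);
             *         }
             *     }
             *
             * where dot4() stands for the four-element bfloat16 dot
             * product done below as two bfdotadd() steps.)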
2988 * 2989 * Key to indices by column: 2990 * i j i k j k 2991 */ 2992 sum00 = a[s + H4(0 + 0)]; 2993 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 2994 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 2995 2996 sum01 = a[s + H4(0 + 1)]; 2997 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 2998 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 2999 3000 sum10 = a[s + H4(2 + 0)]; 3001 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3002 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3003 3004 sum11 = a[s + H4(2 + 1)]; 3005 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3006 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3007 3008 d[s + H4(0 + 0)] = sum00; 3009 d[s + H4(0 + 1)] = sum01; 3010 d[s + H4(2 + 0)] = sum10; 3011 d[s + H4(2 + 1)] = sum11; 3012 } 3013 } 3014 clear_tail(d, opr_sz, simd_maxsz(desc)); 3015 } 3016 3017 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3018 void *stat, uint32_t desc) 3019 { 3020 intptr_t i, opr_sz = simd_oprsz(desc); 3021 intptr_t sel = simd_data(desc); 3022 float32 *d = vd, *a = va; 3023 bfloat16 *n = vn, *m = vm; 3024 3025 for (i = 0; i < opr_sz / 4; ++i) { 3026 float32 nn = n[H2(i * 2 + sel)] << 16; 3027 float32 mm = m[H2(i * 2 + sel)] << 16; 3028 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3029 } 3030 clear_tail(d, opr_sz, simd_maxsz(desc)); 3031 } 3032 3033 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3034 void *va, void *stat, uint32_t desc) 3035 { 3036 intptr_t i, j, opr_sz = simd_oprsz(desc); 3037 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3038 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3039 intptr_t elements = opr_sz / 4; 3040 intptr_t eltspersegment = MIN(16 / 4, elements); 3041 float32 *d = vd, *a = va; 3042 bfloat16 *n = vn, *m = vm; 3043 3044 for (i = 0; i < elements; i += eltspersegment) { 3045 float32 m_idx = m[H2(2 * i + index)] << 16; 3046 3047 for (j = i; j < i + eltspersegment; j++) { 3048 float32 n_j = n[H2(2 * j + sel)] << 16; 3049 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3050 } 3051 } 3052 clear_tail(d, opr_sz, simd_maxsz(desc)); 3053 } 3054 3055 #define DO_CLAMP(NAME, TYPE) \ 3056 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3057 { \ 3058 intptr_t i, opr_sz = simd_oprsz(desc); \ 3059 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3060 TYPE aa = *(TYPE *)(a + i); \ 3061 TYPE nn = *(TYPE *)(n + i); \ 3062 TYPE mm = *(TYPE *)(m + i); \ 3063 TYPE dd = MIN(MAX(aa, nn), mm); \ 3064 *(TYPE *)(d + i) = dd; \ 3065 } \ 3066 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3067 } 3068 3069 DO_CLAMP(gvec_sclamp_b, int8_t) 3070 DO_CLAMP(gvec_sclamp_h, int16_t) 3071 DO_CLAMP(gvec_sclamp_s, int32_t) 3072 DO_CLAMP(gvec_sclamp_d, int64_t) 3073 3074 DO_CLAMP(gvec_uclamp_b, uint8_t) 3075 DO_CLAMP(gvec_uclamp_h, uint16_t) 3076 DO_CLAMP(gvec_uclamp_s, uint32_t) 3077 DO_CLAMP(gvec_uclamp_d, uint64_t) 3078 3079 /* Bit count in each 8-bit word. 
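 *
 * (Reference sketch of the per-byte operation; popcount8_ref() is an
 * illustrative stand-in for the ctpop8() used below:
 *
 *     static uint8_t popcount8_ref(uint8_t x)
 *     {
 *         uint8_t n = 0;
 *         while (x) {
 *             x &= x - 1;        // clear the lowest set bit
 *             n++;
 *         }
 *         return n;
 *     }
 *
 * CNT applies this independently to every byte of the vector.)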
 */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
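
/*
 * (Illustrative note on gvec_rbit_b above: byte-swapping the lane
 * first and then reversing all 64 bits cancels the cross-byte
 * reordering, so each byte ends up bit-reversed in place.  For a
 * single byte the equivalent operation is, with rbit8_ref() purely
 * as an illustrative name:
 *
 *     static uint8_t rbit8_ref(uint8_t x)
 *     {
 *         x = (x & 0x55) << 1 | (x >> 1) & 0x55;   // swap adjacent bits
 *         x = (x & 0x33) << 2 | (x >> 2) & 0x33;   // swap bit pairs
 *         return x << 4 | x >> 4;                  // swap nibbles
 *     }
 *
 * revbit64(bswap64(x)) produces that result for all eight bytes at
 * once.)
 */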