/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
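 * Predicate bits at odd byte positions do not correspond to the start of
 * a half-word element, so only indices with no bits overlapping 0xaa
 * (at most 0x55) need table entries; the generator below skips the rest.
 *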
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
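    /*
     * Worked example of the saturating case (illustrative only):
     * src1 = src2 = INT32_MIN with src3 = 0 and round = false gives
     * ret = 2^62 before the shift and 2^31 after it, which no longer
     * fits in int32_t, so the result saturates to INT32_MAX and *sat
     * is set.
     */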
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    /*
     * The value fits in int64_t only if the high half is the
     * sign-extension of the low half; otherwise saturate.
     */
    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
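 *
 * Each TYPED result lane i below computes a[i] plus the four products
 * n[4*i + k] * m[4*i + k] (k = 0..3) of the narrower elements that
 * share that quad-width lane.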
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment. But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
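
/*
 * Floating-point complex add with rotate (FCADD). Elements are handled
 * in (real, imaginary) pairs; as the helpers below show, "rot" selects
 * which part of the m operand has its sign changed before the add:
 *   rot = 0:  d.re = n.re - m.im,  d.im = n.im + m.re
 *   rot = 1:  d.re = n.re + m.im,  d.im = n.im - m.re
 */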
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
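
/*
 * Floating-point complex multiply-accumulate (FCMLA). The desc bits
 * decoded below pick one of the four rotations: "flip" chooses whether
 * the real or the imaginary element of n feeds both products (and which
 * element of m is paired with it), while neg_real / neg_imag flip the
 * sign of the m element used for the even / odd result lane.
 */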
void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
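 * (Hence the quiet equality tests and the signaling le/lt tests below.)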
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
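 *
 * As the code below shows, the step computes 2 - op1 * op2, with the
 * architected special case that an infinity multiplied by a zero (in
 * either order) yields exactly 2 rather than a NaN.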
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
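 * In the helpers below this appears as "segment", the number of elements
 * in 128 bits (clamped to the operation size), with the scalar operand
 * m[H(i + idx)] re-read at the start of each segment.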
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below, these accumulate both from and into Vd.
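 * (The fused DO_FMLA_IDX helpers take a separate accumulator operand va.)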
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
dd = nn + mm; 1760 if (dd < nn) { 1761 dd = UINT64_MAX; 1762 q = true; 1763 } 1764 d[i] = dd; 1765 } 1766 if (q) { 1767 uint32_t *qc = vq; 1768 qc[0] = 1; 1769 } 1770 clear_tail(d, oprsz, simd_maxsz(desc)); 1771 } 1772 1773 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1774 void *vm, uint32_t desc) 1775 { 1776 intptr_t i, oprsz = simd_oprsz(desc); 1777 uint64_t *d = vd, *n = vn, *m = vm; 1778 bool q = false; 1779 1780 for (i = 0; i < oprsz / 8; i++) { 1781 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1782 if (nn < mm) { 1783 dd = 0; 1784 q = true; 1785 } 1786 d[i] = dd; 1787 } 1788 if (q) { 1789 uint32_t *qc = vq; 1790 qc[0] = 1; 1791 } 1792 clear_tail(d, oprsz, simd_maxsz(desc)); 1793 } 1794 1795 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1796 void *vm, uint32_t desc) 1797 { 1798 intptr_t i, oprsz = simd_oprsz(desc); 1799 int64_t *d = vd, *n = vn, *m = vm; 1800 bool q = false; 1801 1802 for (i = 0; i < oprsz / 8; i++) { 1803 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1804 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1805 dd = (nn >> 63) ^ ~INT64_MIN; 1806 q = true; 1807 } 1808 d[i] = dd; 1809 } 1810 if (q) { 1811 uint32_t *qc = vq; 1812 qc[0] = 1; 1813 } 1814 clear_tail(d, oprsz, simd_maxsz(desc)); 1815 } 1816 1817 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1818 void *vm, uint32_t desc) 1819 { 1820 intptr_t i, oprsz = simd_oprsz(desc); 1821 int64_t *d = vd, *n = vn, *m = vm; 1822 bool q = false; 1823 1824 for (i = 0; i < oprsz / 8; i++) { 1825 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1826 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1827 dd = (nn >> 63) ^ ~INT64_MIN; 1828 q = true; 1829 } 1830 d[i] = dd; 1831 } 1832 if (q) { 1833 uint32_t *qc = vq; 1834 qc[0] = 1; 1835 } 1836 clear_tail(d, oprsz, simd_maxsz(desc)); 1837 } 1838 1839 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1840 void *vm, uint32_t desc) 1841 { 1842 intptr_t i, oprsz = simd_oprsz(desc); 1843 uint64_t *d = vd, *n = vn, *m = vm; 1844 bool q = false; 1845 1846 for (i = 0; i < oprsz / 8; i++) { 1847 uint64_t nn = n[i]; 1848 int64_t mm = m[i]; 1849 uint64_t dd = nn + mm; 1850 1851 if (mm < 0) { 1852 if (nn < (uint64_t)-mm) { 1853 dd = 0; 1854 q = true; 1855 } 1856 } else { 1857 if (dd < nn) { 1858 dd = UINT64_MAX; 1859 q = true; 1860 } 1861 } 1862 d[i] = dd; 1863 } 1864 if (q) { 1865 uint32_t *qc = vq; 1866 qc[0] = 1; 1867 } 1868 clear_tail(d, oprsz, simd_maxsz(desc)); 1869 } 1870 1871 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1872 void *vm, uint32_t desc) 1873 { 1874 intptr_t i, oprsz = simd_oprsz(desc); 1875 uint64_t *d = vd, *n = vn, *m = vm; 1876 bool q = false; 1877 1878 for (i = 0; i < oprsz / 8; i++) { 1879 int64_t nn = n[i]; 1880 uint64_t mm = m[i]; 1881 int64_t dd = nn + mm; 1882 1883 if (mm > (uint64_t)(INT64_MAX - nn)) { 1884 dd = INT64_MAX; 1885 q = true; 1886 } 1887 d[i] = dd; 1888 } 1889 if (q) { 1890 uint32_t *qc = vq; 1891 qc[0] = 1; 1892 } 1893 clear_tail(d, oprsz, simd_maxsz(desc)); 1894 } 1895 1896 #define DO_SRA(NAME, TYPE) \ 1897 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1898 { \ 1899 intptr_t i, oprsz = simd_oprsz(desc); \ 1900 int shift = simd_data(desc); \ 1901 TYPE *d = vd, *n = vn; \ 1902 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1903 d[i] += n[i] >> shift; \ 1904 } \ 1905 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1906 } 1907 1908 DO_SRA(gvec_ssra_b, int8_t) 1909 DO_SRA(gvec_ssra_h, int16_t) 1910 DO_SRA(gvec_ssra_s, int32_t) 1911 DO_SRA(gvec_ssra_d, int64_t) 1912 1913 DO_SRA(gvec_usra_b, uint8_t) 1914 
DO_SRA(gvec_usra_h, uint16_t) 1915 DO_SRA(gvec_usra_s, uint32_t) 1916 DO_SRA(gvec_usra_d, uint64_t) 1917 1918 #undef DO_SRA 1919 1920 #define DO_RSHR(NAME, TYPE) \ 1921 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1922 { \ 1923 intptr_t i, oprsz = simd_oprsz(desc); \ 1924 int shift = simd_data(desc); \ 1925 TYPE *d = vd, *n = vn; \ 1926 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1927 TYPE tmp = n[i] >> (shift - 1); \ 1928 d[i] = (tmp >> 1) + (tmp & 1); \ 1929 } \ 1930 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1931 } 1932 1933 DO_RSHR(gvec_srshr_b, int8_t) 1934 DO_RSHR(gvec_srshr_h, int16_t) 1935 DO_RSHR(gvec_srshr_s, int32_t) 1936 DO_RSHR(gvec_srshr_d, int64_t) 1937 1938 DO_RSHR(gvec_urshr_b, uint8_t) 1939 DO_RSHR(gvec_urshr_h, uint16_t) 1940 DO_RSHR(gvec_urshr_s, uint32_t) 1941 DO_RSHR(gvec_urshr_d, uint64_t) 1942 1943 #undef DO_RSHR 1944 1945 #define DO_RSRA(NAME, TYPE) \ 1946 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1947 { \ 1948 intptr_t i, oprsz = simd_oprsz(desc); \ 1949 int shift = simd_data(desc); \ 1950 TYPE *d = vd, *n = vn; \ 1951 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1952 TYPE tmp = n[i] >> (shift - 1); \ 1953 d[i] += (tmp >> 1) + (tmp & 1); \ 1954 } \ 1955 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1956 } 1957 1958 DO_RSRA(gvec_srsra_b, int8_t) 1959 DO_RSRA(gvec_srsra_h, int16_t) 1960 DO_RSRA(gvec_srsra_s, int32_t) 1961 DO_RSRA(gvec_srsra_d, int64_t) 1962 1963 DO_RSRA(gvec_ursra_b, uint8_t) 1964 DO_RSRA(gvec_ursra_h, uint16_t) 1965 DO_RSRA(gvec_ursra_s, uint32_t) 1966 DO_RSRA(gvec_ursra_d, uint64_t) 1967 1968 #undef DO_RSRA 1969 1970 #define DO_SRI(NAME, TYPE) \ 1971 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1972 { \ 1973 intptr_t i, oprsz = simd_oprsz(desc); \ 1974 int shift = simd_data(desc); \ 1975 TYPE *d = vd, *n = vn; \ 1976 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1977 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1978 } \ 1979 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1980 } 1981 1982 DO_SRI(gvec_sri_b, uint8_t) 1983 DO_SRI(gvec_sri_h, uint16_t) 1984 DO_SRI(gvec_sri_s, uint32_t) 1985 DO_SRI(gvec_sri_d, uint64_t) 1986 1987 #undef DO_SRI 1988 1989 #define DO_SLI(NAME, TYPE) \ 1990 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1991 { \ 1992 intptr_t i, oprsz = simd_oprsz(desc); \ 1993 int shift = simd_data(desc); \ 1994 TYPE *d = vd, *n = vn; \ 1995 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1996 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1997 } \ 1998 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1999 } 2000 2001 DO_SLI(gvec_sli_b, uint8_t) 2002 DO_SLI(gvec_sli_h, uint16_t) 2003 DO_SLI(gvec_sli_s, uint32_t) 2004 DO_SLI(gvec_sli_d, uint64_t) 2005 2006 #undef DO_SLI 2007 2008 /* 2009 * Convert float16 to float32, raising no exceptions and 2010 * preserving exceptional values, including SNaN. 2011 * This is effectively an unpack+repack operation. 2012 */ 2013 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 2014 { 2015 const int f16_bias = 15; 2016 const int f32_bias = 127; 2017 uint32_t sign = extract32(f16, 15, 1); 2018 uint32_t exp = extract32(f16, 10, 5); 2019 uint32_t frac = extract32(f16, 0, 10); 2020 2021 if (exp == 0x1f) { 2022 /* Inf or NaN */ 2023 exp = 0xff; 2024 } else if (exp == 0) { 2025 /* Zero or denormal. */ 2026 if (frac != 0) { 2027 if (fz16) { 2028 frac = 0; 2029 } else { 2030 /* 2031 * Denormal; these are all normal float32. 
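* (The smallest non-zero float16 denormal is 2**-24, which is far above the float32 minimum normal of 2**-126, so every such input can be represented as a normal float32 after renormalization.)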
2032 * Shift the fraction so that the msb is at bit 11, 2033 * then remove bit 11 as the implicit bit of the 2034 * normalized float32. Note that we still go through 2035 * the shift for normal numbers below, to put the 2036 * float32 fraction at the right place. 2037 */ 2038 int shift = clz32(frac) - 21; 2039 frac = (frac << shift) & 0x3ff; 2040 exp = f32_bias - f16_bias - shift + 1; 2041 } 2042 } 2043 } else { 2044 /* Normal number; adjust the bias. */ 2045 exp += f32_bias - f16_bias; 2046 } 2047 sign <<= 31; 2048 exp <<= 23; 2049 frac <<= 23 - 10; 2050 2051 return sign | exp | frac; 2052 } 2053 2054 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 2055 { 2056 /* 2057 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 2058 * Load the 2nd qword iff is_q & is_2. 2059 * Shift to the 2nd dword iff !is_q & is_2. 2060 * For !is_q & !is_2, the upper bits of the result are garbage. 2061 */ 2062 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 2063 } 2064 2065 /* 2066 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 2067 * as there are not yet SVE versions that might use blocking. 2068 */ 2069 2070 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 2071 uint32_t desc, bool fz16) 2072 { 2073 intptr_t i, oprsz = simd_oprsz(desc); 2074 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2075 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2076 int is_q = oprsz == 16; 2077 uint64_t n_4, m_4; 2078 2079 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2080 n_4 = load4_f16(vn, is_q, is_2); 2081 m_4 = load4_f16(vm, is_q, is_2); 2082 2083 /* Negate all inputs for FMLSL at once. */ 2084 if (is_s) { 2085 n_4 ^= 0x8000800080008000ull; 2086 } 2087 2088 for (i = 0; i < oprsz / 4; i++) { 2089 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2090 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2091 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2092 } 2093 clear_tail(d, oprsz, simd_maxsz(desc)); 2094 } 2095 2096 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2097 CPUARMState *env, uint32_t desc) 2098 { 2099 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2100 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2101 } 2102 2103 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2104 CPUARMState *env, uint32_t desc) 2105 { 2106 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2107 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2108 } 2109 2110 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2111 CPUARMState *env, uint32_t desc) 2112 { 2113 intptr_t i, oprsz = simd_oprsz(desc); 2114 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2115 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2116 float_status *status = &env->vfp.fp_status_a64; 2117 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2118 2119 for (i = 0; i < oprsz; i += sizeof(float32)) { 2120 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2121 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2122 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2123 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2124 float32 aa = *(float32 *)(va + H1_4(i)); 2125 2126 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2127 } 2128 } 2129 2130 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2131 uint32_t desc, bool fz16) 2132 { 2133 intptr_t i, oprsz = simd_oprsz(desc); 2134 int is_s =
extract32(desc, SIMD_DATA_SHIFT, 1); 2135 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2136 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2137 int is_q = oprsz == 16; 2138 uint64_t n_4; 2139 float32 m_1; 2140 2141 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2142 n_4 = load4_f16(vn, is_q, is_2); 2143 2144 /* Negate all inputs for FMLSL at once. */ 2145 if (is_s) { 2146 n_4 ^= 0x8000800080008000ull; 2147 } 2148 2149 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2150 2151 for (i = 0; i < oprsz / 4; i++) { 2152 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2153 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2154 } 2155 clear_tail(d, oprsz, simd_maxsz(desc)); 2156 } 2157 2158 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2159 CPUARMState *env, uint32_t desc) 2160 { 2161 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2162 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2163 } 2164 2165 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2166 CPUARMState *env, uint32_t desc) 2167 { 2168 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2169 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2170 } 2171 2172 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2173 CPUARMState *env, uint32_t desc) 2174 { 2175 intptr_t i, j, oprsz = simd_oprsz(desc); 2176 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2177 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2178 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2179 float_status *status = &env->vfp.fp_status_a64; 2180 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2181 2182 for (i = 0; i < oprsz; i += 16) { 2183 float16 mm_16 = *(float16 *)(vm + i + idx); 2184 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2185 2186 for (j = 0; j < 16; j += sizeof(float32)) { 2187 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2188 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2189 float32 aa = *(float32 *)(va + H1_4(i + j)); 2190 2191 *(float32 *)(vd + H1_4(i + j)) = 2192 float32_muladd(nn, mm, aa, 0, status); 2193 } 2194 } 2195 } 2196 2197 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2198 { 2199 intptr_t i, opr_sz = simd_oprsz(desc); 2200 int8_t *d = vd, *n = vn, *m = vm; 2201 2202 for (i = 0; i < opr_sz; ++i) { 2203 int8_t mm = m[i]; 2204 int8_t nn = n[i]; 2205 int8_t res = 0; 2206 if (mm >= 0) { 2207 if (mm < 8) { 2208 res = nn << mm; 2209 } 2210 } else { 2211 res = nn >> (mm > -8 ? -mm : 7); 2212 } 2213 d[i] = res; 2214 } 2215 clear_tail(d, opr_sz, simd_maxsz(desc)); 2216 } 2217 2218 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2219 { 2220 intptr_t i, opr_sz = simd_oprsz(desc); 2221 int16_t *d = vd, *n = vn, *m = vm; 2222 2223 for (i = 0; i < opr_sz / 2; ++i) { 2224 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2225 int16_t nn = n[i]; 2226 int16_t res = 0; 2227 if (mm >= 0) { 2228 if (mm < 16) { 2229 res = nn << mm; 2230 } 2231 } else { 2232 res = nn >> (mm > -16 ? 
-mm : 15); 2233 } 2234 d[i] = res; 2235 } 2236 clear_tail(d, opr_sz, simd_maxsz(desc)); 2237 } 2238 2239 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2240 { 2241 intptr_t i, opr_sz = simd_oprsz(desc); 2242 uint8_t *d = vd, *n = vn, *m = vm; 2243 2244 for (i = 0; i < opr_sz; ++i) { 2245 int8_t mm = m[i]; 2246 uint8_t nn = n[i]; 2247 uint8_t res = 0; 2248 if (mm >= 0) { 2249 if (mm < 8) { 2250 res = nn << mm; 2251 } 2252 } else { 2253 if (mm > -8) { 2254 res = nn >> -mm; 2255 } 2256 } 2257 d[i] = res; 2258 } 2259 clear_tail(d, opr_sz, simd_maxsz(desc)); 2260 } 2261 2262 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2263 { 2264 intptr_t i, opr_sz = simd_oprsz(desc); 2265 uint16_t *d = vd, *n = vn, *m = vm; 2266 2267 for (i = 0; i < opr_sz / 2; ++i) { 2268 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2269 uint16_t nn = n[i]; 2270 uint16_t res = 0; 2271 if (mm >= 0) { 2272 if (mm < 16) { 2273 res = nn << mm; 2274 } 2275 } else { 2276 if (mm > -16) { 2277 res = nn >> -mm; 2278 } 2279 } 2280 d[i] = res; 2281 } 2282 clear_tail(d, opr_sz, simd_maxsz(desc)); 2283 } 2284 2285 /* 2286 * 8x8->8 polynomial multiply. 2287 * 2288 * Polynomial multiplication is like integer multiplication except the 2289 * partial products are XORed, not added. 2290 * 2291 * TODO: expose this as a generic vector operation, as it is a common 2292 * crypto building block. 2293 */ 2294 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2295 { 2296 intptr_t i, opr_sz = simd_oprsz(desc); 2297 uint64_t *d = vd, *n = vn, *m = vm; 2298 2299 for (i = 0; i < opr_sz / 8; ++i) { 2300 d[i] = clmul_8x8_low(n[i], m[i]); 2301 } 2302 clear_tail(d, opr_sz, simd_maxsz(desc)); 2303 } 2304 2305 /* 2306 * 64x64->128 polynomial multiply. 2307 * Because the lanes are not accessed in strict columns, 2308 * this probably cannot be turned into a generic helper.
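* (Each destination lane pair receives the full 128-bit carry-less product of a single 64-bit lane of n and m, selected by the 'hi' bit of the descriptor, so inputs and outputs do not line up column-for-column.)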
2309 */ 2310 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2311 { 2312 intptr_t i, opr_sz = simd_oprsz(desc); 2313 intptr_t hi = simd_data(desc); 2314 uint64_t *d = vd, *n = vn, *m = vm; 2315 2316 for (i = 0; i < opr_sz / 8; i += 2) { 2317 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2318 d[i] = int128_getlo(r); 2319 d[i + 1] = int128_gethi(r); 2320 } 2321 clear_tail(d, opr_sz, simd_maxsz(desc)); 2322 } 2323 2324 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2325 { 2326 int hi = simd_data(desc); 2327 uint64_t *d = vd, *n = vn, *m = vm; 2328 uint64_t nn = n[hi], mm = m[hi]; 2329 2330 d[0] = clmul_8x4_packed(nn, mm); 2331 nn >>= 32; 2332 mm >>= 32; 2333 d[1] = clmul_8x4_packed(nn, mm); 2334 2335 clear_tail(d, 16, simd_maxsz(desc)); 2336 } 2337 2338 #ifdef TARGET_AARCH64 2339 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2340 { 2341 int shift = simd_data(desc) * 8; 2342 intptr_t i, opr_sz = simd_oprsz(desc); 2343 uint64_t *d = vd, *n = vn, *m = vm; 2344 2345 for (i = 0; i < opr_sz / 8; ++i) { 2346 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2347 } 2348 } 2349 2350 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2351 { 2352 intptr_t sel = H4(simd_data(desc)); 2353 intptr_t i, opr_sz = simd_oprsz(desc); 2354 uint32_t *n = vn, *m = vm; 2355 uint64_t *d = vd; 2356 2357 for (i = 0; i < opr_sz / 8; ++i) { 2358 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2359 } 2360 } 2361 #endif 2362 2363 #define DO_CMP0(NAME, TYPE, OP) \ 2364 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2365 { \ 2366 intptr_t i, opr_sz = simd_oprsz(desc); \ 2367 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2368 TYPE nn = *(TYPE *)(vn + i); \ 2369 *(TYPE *)(vd + i) = -(nn OP 0); \ 2370 } \ 2371 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2372 } 2373 2374 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2375 DO_CMP0(gvec_clt0_b, int8_t, <) 2376 DO_CMP0(gvec_cle0_b, int8_t, <=) 2377 DO_CMP0(gvec_cgt0_b, int8_t, >) 2378 DO_CMP0(gvec_cge0_b, int8_t, >=) 2379 2380 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2381 DO_CMP0(gvec_clt0_h, int16_t, <) 2382 DO_CMP0(gvec_cle0_h, int16_t, <=) 2383 DO_CMP0(gvec_cgt0_h, int16_t, >) 2384 DO_CMP0(gvec_cge0_h, int16_t, >=) 2385 2386 #undef DO_CMP0 2387 2388 #define DO_ABD(NAME, TYPE) \ 2389 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2390 { \ 2391 intptr_t i, opr_sz = simd_oprsz(desc); \ 2392 TYPE *d = vd, *n = vn, *m = vm; \ 2393 \ 2394 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2395 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2396 } \ 2397 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2398 } 2399 2400 DO_ABD(gvec_sabd_b, int8_t) 2401 DO_ABD(gvec_sabd_h, int16_t) 2402 DO_ABD(gvec_sabd_s, int32_t) 2403 DO_ABD(gvec_sabd_d, int64_t) 2404 2405 DO_ABD(gvec_uabd_b, uint8_t) 2406 DO_ABD(gvec_uabd_h, uint16_t) 2407 DO_ABD(gvec_uabd_s, uint32_t) 2408 DO_ABD(gvec_uabd_d, uint64_t) 2409 2410 #undef DO_ABD 2411 2412 #define DO_ABA(NAME, TYPE) \ 2413 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2414 { \ 2415 intptr_t i, opr_sz = simd_oprsz(desc); \ 2416 TYPE *d = vd, *n = vn, *m = vm; \ 2417 \ 2418 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2419 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2420 } \ 2421 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2422 } 2423 2424 DO_ABA(gvec_saba_b, int8_t) 2425 DO_ABA(gvec_saba_h, int16_t) 2426 DO_ABA(gvec_saba_s, int32_t) 2427 DO_ABA(gvec_saba_d, int64_t) 2428 2429 DO_ABA(gvec_uaba_b, uint8_t) 2430 DO_ABA(gvec_uaba_h, uint16_t) 2431 DO_ABA(gvec_uaba_s, uint32_t) 2432 DO_ABA(gvec_uaba_d, uint64_t) 2433 2434 #undef DO_ABA 2435 2436 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2437 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2438 float_status *stat, uint32_t desc) \ 2439 { \ 2440 ARMVectorReg scratch; \ 2441 intptr_t oprsz = simd_oprsz(desc); \ 2442 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2443 TYPE *d = vd, *n = vn, *m = vm; \ 2444 if (unlikely(d == m)) { \ 2445 m = memcpy(&scratch, m, oprsz); \ 2446 } \ 2447 for (intptr_t i = 0; i < half; ++i) { \ 2448 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2449 } \ 2450 for (intptr_t i = 0; i < half; ++i) { \ 2451 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2452 } \ 2453 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2454 } 2455 2456 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2457 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2458 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2459 2460 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2461 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2462 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2463 2464 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2465 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2466 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2467 2468 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2469 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2470 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2471 2472 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2473 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2474 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2475 2476 #ifdef TARGET_AARCH64 2477 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2478 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2479 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2480 2481 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2482 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2483 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2484 #endif 2485 2486 #undef DO_3OP_PAIR 2487 2488 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2489 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2490 { \ 2491 ARMVectorReg scratch; \ 2492 intptr_t oprsz = simd_oprsz(desc); \ 2493 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2494 TYPE *d = vd, *n = vn, *m = vm; \ 2495 if (unlikely(d == m)) { \ 2496 m = memcpy(&scratch, m, oprsz); \ 2497 } \ 2498 for (intptr_t i = 0; i < half; ++i) { \ 2499 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2500 } \ 2501 for (intptr_t i = 0; i < half; ++i) { \ 2502 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2503 } \ 2504 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2505 } 2506 2507 #define ADD(A, B) (A + B) 2508 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2509 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2510 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2511 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2512 #undef ADD 2513 2514 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2515 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2516 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2517 2518 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2519 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2520 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2521 2522 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2523 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2524 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2525 2526 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2527 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2528 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2529 2530 #undef DO_3OP_PAIR 2531 2532 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2533 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2534 { \ 2535 intptr_t i, oprsz = simd_oprsz(desc); \ 2536 int shift = simd_data(desc); \ 2537 TYPE *d = vd, *n = vn; \ 2538 float_status *fpst = stat; \ 2539 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2540 d[i] = FUNC(n[i], shift, fpst); \ 2541 } \ 2542 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2543 } 2544 2545 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2546 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2547 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2548 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2549 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2550 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2551 2552 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2553 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2554 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2555 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2556 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2557 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2558 2559 #undef DO_VCVT_FIXED 2560 2561 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2562 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2563 { \ 2564 intptr_t i, oprsz = simd_oprsz(desc); \ 2565 uint32_t rmode = simd_data(desc); \ 2566 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2567 TYPE *d = vd, *n = vn; \ 2568 set_float_rounding_mode(rmode, fpst); \ 2569 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2570 d[i] = FUNC(n[i], 0, fpst); \ 2571 } \ 2572 set_float_rounding_mode(prev_rmode, fpst); \ 2573 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2574 } 2575 2576 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2577 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2578 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2579 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2580 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2581 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2582 2583 #undef DO_VCVT_RMODE 2584 2585 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2586 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2587 { \ 2588 intptr_t i, oprsz = simd_oprsz(desc); \ 2589 uint32_t rmode = simd_data(desc); \ 2590 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2591 TYPE *d = vd, *n = vn; \ 2592 set_float_rounding_mode(rmode, fpst); \ 2593 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2594 d[i] = FUNC(n[i], fpst); \ 2595 } \ 2596 set_float_rounding_mode(prev_rmode, fpst); \ 2597 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2598 } 2599 2600 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2601 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2602 2603 #undef DO_VRINT_RMODE 2604 2605 #ifdef TARGET_AARCH64 2606 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2607 { 2608 const uint8_t *indices = vm; 2609 size_t oprsz = simd_oprsz(desc); 2610 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2611 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2612 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2613 union { 2614 uint8_t b[16]; 2615 uint64_t d[2]; 2616 } result; 2617 2618 /* 2619 * We must construct the final result in a temp, lest the output 2620 * overlaps the input table. For TBL, begin with zero; for TBX, 2621 * begin with the original register contents. Note that we always 2622 * copy 16 bytes here to avoid an extra branch; clearing the high 2623 * bits of the register for oprsz == 8 is handled below. 2624 */ 2625 if (is_tbx) { 2626 memcpy(&result, vd, 16); 2627 } else { 2628 memset(&result, 0, 16); 2629 } 2630 2631 for (size_t i = 0; i < oprsz; ++i) { 2632 uint32_t index = indices[H1(i)]; 2633 2634 if (index < table_len) { 2635 /* 2636 * Convert index (a byte offset into the virtual table 2637 * which is a series of 128-bit vectors concatenated) 2638 * into the correct register element, bearing in mind 2639 * that the table can wrap around from V31 to V0. 2640 */ 2641 const uint8_t *table = (const uint8_t *) 2642 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2643 result.b[H1(i)] = table[H1(index % 16)]; 2644 } 2645 } 2646 2647 memcpy(vd, &result, 16); 2648 clear_tail(vd, oprsz, simd_maxsz(desc)); 2649 } 2650 #endif 2651 2652 /* 2653 * NxN -> N highpart multiply 2654 * 2655 * TODO: expose this as a generic vector operation. 2656 */ 2657 2658 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2659 { 2660 intptr_t i, opr_sz = simd_oprsz(desc); 2661 int8_t *d = vd, *n = vn, *m = vm; 2662 2663 for (i = 0; i < opr_sz; ++i) { 2664 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2665 } 2666 clear_tail(d, opr_sz, simd_maxsz(desc)); 2667 } 2668 2669 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2670 { 2671 intptr_t i, opr_sz = simd_oprsz(desc); 2672 int16_t *d = vd, *n = vn, *m = vm; 2673 2674 for (i = 0; i < opr_sz / 2; ++i) { 2675 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2676 } 2677 clear_tail(d, opr_sz, simd_maxsz(desc)); 2678 } 2679 2680 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2681 { 2682 intptr_t i, opr_sz = simd_oprsz(desc); 2683 int32_t *d = vd, *n = vn, *m = vm; 2684 2685 for (i = 0; i < opr_sz / 4; ++i) { 2686 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2687 } 2688 clear_tail(d, opr_sz, simd_maxsz(desc)); 2689 } 2690 2691 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2692 { 2693 intptr_t i, opr_sz = simd_oprsz(desc); 2694 uint64_t *d = vd, *n = vn, *m = vm; 2695 uint64_t discard; 2696 2697 for (i = 0; i < opr_sz / 8; ++i) { 2698 muls64(&discard, &d[i], n[i], m[i]); 2699 } 2700 clear_tail(d, opr_sz, simd_maxsz(desc)); 2701 } 2702 2703 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2704 { 2705 intptr_t i, opr_sz = simd_oprsz(desc); 2706 uint8_t *d = vd, *n = vn, *m = vm; 2707 2708 for (i = 0; i < opr_sz; ++i) { 2709 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2710 } 2711 clear_tail(d, opr_sz, simd_maxsz(desc)); 2712 } 2713 2714 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2715 { 2716 intptr_t i, opr_sz = simd_oprsz(desc); 2717 uint16_t *d = vd, *n = vn, *m = vm; 2718 2719 for (i = 0; i < opr_sz / 2; ++i) { 2720 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2721 } 2722 clear_tail(d, opr_sz, simd_maxsz(desc)); 2723 } 2724 2725 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2726 
{ 2727 intptr_t i, opr_sz = simd_oprsz(desc); 2728 uint32_t *d = vd, *n = vn, *m = vm; 2729 2730 for (i = 0; i < opr_sz / 4; ++i) { 2731 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2732 } 2733 clear_tail(d, opr_sz, simd_maxsz(desc)); 2734 } 2735 2736 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2737 { 2738 intptr_t i, opr_sz = simd_oprsz(desc); 2739 uint64_t *d = vd, *n = vn, *m = vm; 2740 uint64_t discard; 2741 2742 for (i = 0; i < opr_sz / 8; ++i) { 2743 mulu64(&discard, &d[i], n[i], m[i]); 2744 } 2745 clear_tail(d, opr_sz, simd_maxsz(desc)); 2746 } 2747 2748 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2749 { 2750 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2751 int shr = simd_data(desc); 2752 uint64_t *d = vd, *n = vn, *m = vm; 2753 2754 for (i = 0; i < opr_sz; ++i) { 2755 d[i] = ror64(n[i] ^ m[i], shr); 2756 } 2757 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2758 } 2759 2760 /* 2761 * Integer matrix-multiply accumulate 2762 */ 2763 2764 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2765 { 2766 int8_t *n = vn, *m = vm; 2767 2768 for (intptr_t k = 0; k < 8; ++k) { 2769 sum += n[H1(k)] * m[H1(k)]; 2770 } 2771 return sum; 2772 } 2773 2774 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2775 { 2776 uint8_t *n = vn, *m = vm; 2777 2778 for (intptr_t k = 0; k < 8; ++k) { 2779 sum += n[H1(k)] * m[H1(k)]; 2780 } 2781 return sum; 2782 } 2783 2784 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2785 { 2786 uint8_t *n = vn; 2787 int8_t *m = vm; 2788 2789 for (intptr_t k = 0; k < 8; ++k) { 2790 sum += n[H1(k)] * m[H1(k)]; 2791 } 2792 return sum; 2793 } 2794 2795 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2796 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2797 { 2798 intptr_t seg, opr_sz = simd_oprsz(desc); 2799 2800 for (seg = 0; seg < opr_sz; seg += 16) { 2801 uint32_t *d = vd + seg; 2802 uint32_t *a = va + seg; 2803 uint32_t sum0, sum1, sum2, sum3; 2804 2805 /* 2806 * Process the entire segment at once, writing back the 2807 * results only after we've consumed all of the inputs. 2808 * 2809 * Key to indices by column: 2810 * i j i j 2811 */ 2812 sum0 = a[H4(0 + 0)]; 2813 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2814 sum1 = a[H4(0 + 1)]; 2815 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2816 sum2 = a[H4(2 + 0)]; 2817 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2818 sum3 = a[H4(2 + 1)]; 2819 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2820 2821 d[H4(0)] = sum0; 2822 d[H4(1)] = sum1; 2823 d[H4(2)] = sum2; 2824 d[H4(3)] = sum3; 2825 } 2826 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2827 } 2828 2829 #define DO_MMLA_B(NAME, INNER) \ 2830 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2831 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2832 2833 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2834 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2835 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2836 2837 /* 2838 * BFloat16 Dot Product 2839 */ 2840 2841 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2842 { 2843 /* 2844 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2845 * For EBF = 0, we ignore the FPCR bits which determine rounding 2846 * mode and denormal-flushing, and we do unfused multiplies and 2847 * additions with intermediate rounding of all products and sums. 
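* (In this code the EBF = 0 behaviour is provided by bfdotadd() below, using the float_round_to_odd_inf rounding mode and the flush-to-zero handling configured on the shared float_status here.)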
2848 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2849 * and we perform a fused two-way sum-of-products without intermediate 2850 * rounding of the products. 2851 * In either case, we don't set fp exception flags. 2852 * 2853 * EBF is AArch64 only, so even if it's set in the FPCR it has 2854 * no effect on AArch32 instructions. 2855 */ 2856 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2857 2858 *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2859 set_default_nan_mode(true, statusp); 2860 2861 if (ebf) { 2862 /* EBF=1 needs to do a step with round-to-odd semantics */ 2863 *oddstatusp = *statusp; 2864 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2865 } else { 2866 set_flush_to_zero(true, statusp); 2867 set_flush_inputs_to_zero(true, statusp); 2868 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2869 } 2870 return ebf; 2871 } 2872 2873 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2874 { 2875 float32 t1, t2; 2876 2877 /* 2878 * Extract each BFloat16 from the element pair, and shift 2879 * them such that they become float32. 2880 */ 2881 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2882 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2883 t1 = float32_add(t1, t2, fpst); 2884 t1 = float32_add(sum, t1, fpst); 2885 2886 return t1; 2887 } 2888 2889 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2890 float_status *fpst, float_status *fpst_odd) 2891 { 2892 /* 2893 * Compare f16_dotadd() in sme_helper.c, but here we have 2894 * bfloat16 inputs. In particular that means that we do not 2895 * want the FPCR.FZ16 flush semantics, so we use the normal 2896 * float_status for the input handling here. 2897 */ 2898 float64 e1r = float32_to_float64(e1 << 16, fpst); 2899 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2900 float64 e2r = float32_to_float64(e2 << 16, fpst); 2901 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2902 float64 t64; 2903 float32 t32; 2904 2905 /* 2906 * The ARM pseudocode function FPDot performs both multiplies 2907 * and the add with a single rounding operation. Emulate this 2908 * by performing the first multiply in round-to-odd, then doing 2909 * the second multiply as fused multiply-add, and rounding to 2910 * float32 all in one step. 2911 */ 2912 t64 = float64_mul(e1r, e2r, fpst_odd); 2913 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2914 2915 /* This conversion is exact, because we've already rounded. */ 2916 t32 = float64_to_float32(t64, fpst); 2917 2918 /* The final accumulation step is not fused. 
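The accumulate with 'sum' is outside the FPDot step emulated above, so it is rounded separately with an ordinary float32_add.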
*/ 2919 return float32_add(sum, t32, fpst); 2920 } 2921 2922 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2923 CPUARMState *env, uint32_t desc) 2924 { 2925 intptr_t i, opr_sz = simd_oprsz(desc); 2926 float32 *d = vd, *a = va; 2927 uint32_t *n = vn, *m = vm; 2928 float_status fpst, fpst_odd; 2929 2930 if (is_ebf(env, &fpst, &fpst_odd)) { 2931 for (i = 0; i < opr_sz / 4; ++i) { 2932 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2933 } 2934 } else { 2935 for (i = 0; i < opr_sz / 4; ++i) { 2936 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2937 } 2938 } 2939 clear_tail(d, opr_sz, simd_maxsz(desc)); 2940 } 2941 2942 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2943 void *va, CPUARMState *env, uint32_t desc) 2944 { 2945 intptr_t i, j, opr_sz = simd_oprsz(desc); 2946 intptr_t index = simd_data(desc); 2947 intptr_t elements = opr_sz / 4; 2948 intptr_t eltspersegment = MIN(16 / 4, elements); 2949 float32 *d = vd, *a = va; 2950 uint32_t *n = vn, *m = vm; 2951 float_status fpst, fpst_odd; 2952 2953 if (is_ebf(env, &fpst, &fpst_odd)) { 2954 for (i = 0; i < elements; i += eltspersegment) { 2955 uint32_t m_idx = m[i + H4(index)]; 2956 2957 for (j = i; j < i + eltspersegment; j++) { 2958 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2959 } 2960 } 2961 } else { 2962 for (i = 0; i < elements; i += eltspersegment) { 2963 uint32_t m_idx = m[i + H4(index)]; 2964 2965 for (j = i; j < i + eltspersegment; j++) { 2966 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2967 } 2968 } 2969 } 2970 clear_tail(d, opr_sz, simd_maxsz(desc)); 2971 } 2972 2973 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2974 CPUARMState *env, uint32_t desc) 2975 { 2976 intptr_t s, opr_sz = simd_oprsz(desc); 2977 float32 *d = vd, *a = va; 2978 uint32_t *n = vn, *m = vm; 2979 float_status fpst, fpst_odd; 2980 2981 if (is_ebf(env, &fpst, &fpst_odd)) { 2982 for (s = 0; s < opr_sz / 4; s += 4) { 2983 float32 sum00, sum01, sum10, sum11; 2984 2985 /* 2986 * Process the entire segment at once, writing back the 2987 * results only after we've consumed all of the inputs. 2988 * 2989 * Key to indices by column: 2990 * i j i k j k 2991 */ 2992 sum00 = a[s + H4(0 + 0)]; 2993 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2994 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2995 2996 sum01 = a[s + H4(0 + 1)]; 2997 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2998 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2999 3000 sum10 = a[s + H4(2 + 0)]; 3001 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3002 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3003 3004 sum11 = a[s + H4(2 + 1)]; 3005 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3006 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3007 3008 d[s + H4(0 + 0)] = sum00; 3009 d[s + H4(0 + 1)] = sum01; 3010 d[s + H4(2 + 0)] = sum10; 3011 d[s + H4(2 + 1)] = sum11; 3012 } 3013 } else { 3014 for (s = 0; s < opr_sz / 4; s += 4) { 3015 float32 sum00, sum01, sum10, sum11; 3016 3017 /* 3018 * Process the entire segment at once, writing back the 3019 * results only after we've consumed all of the inputs. 
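* Each 16-byte segment holds a 2x2 float32 tile: sumXY accumulates row X of n against row Y of m, with each bfdotadd() call consuming one packed pair of bfloat16 elements (two calls per sum).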
3020 * 3021 * Key to indices by column: 3022 * i j i k j k 3023 */ 3024 sum00 = a[s + H4(0 + 0)]; 3025 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3026 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3027 3028 sum01 = a[s + H4(0 + 1)]; 3029 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3030 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3031 3032 sum10 = a[s + H4(2 + 0)]; 3033 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3034 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3035 3036 sum11 = a[s + H4(2 + 1)]; 3037 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3038 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3039 3040 d[s + H4(0 + 0)] = sum00; 3041 d[s + H4(0 + 1)] = sum01; 3042 d[s + H4(2 + 0)] = sum10; 3043 d[s + H4(2 + 1)] = sum11; 3044 } 3045 } 3046 clear_tail(d, opr_sz, simd_maxsz(desc)); 3047 } 3048 3049 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3050 float_status *stat, uint32_t desc) 3051 { 3052 intptr_t i, opr_sz = simd_oprsz(desc); 3053 intptr_t sel = simd_data(desc); 3054 float32 *d = vd, *a = va; 3055 bfloat16 *n = vn, *m = vm; 3056 3057 for (i = 0; i < opr_sz / 4; ++i) { 3058 float32 nn = n[H2(i * 2 + sel)] << 16; 3059 float32 mm = m[H2(i * 2 + sel)] << 16; 3060 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3061 } 3062 clear_tail(d, opr_sz, simd_maxsz(desc)); 3063 } 3064 3065 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3066 void *va, float_status *stat, uint32_t desc) 3067 { 3068 intptr_t i, j, opr_sz = simd_oprsz(desc); 3069 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3070 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3071 intptr_t elements = opr_sz / 4; 3072 intptr_t eltspersegment = MIN(16 / 4, elements); 3073 float32 *d = vd, *a = va; 3074 bfloat16 *n = vn, *m = vm; 3075 3076 for (i = 0; i < elements; i += eltspersegment) { 3077 float32 m_idx = m[H2(2 * i + index)] << 16; 3078 3079 for (j = i; j < i + eltspersegment; j++) { 3080 float32 n_j = n[H2(2 * j + sel)] << 16; 3081 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3082 } 3083 } 3084 clear_tail(d, opr_sz, simd_maxsz(desc)); 3085 } 3086 3087 #define DO_CLAMP(NAME, TYPE) \ 3088 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3089 { \ 3090 intptr_t i, opr_sz = simd_oprsz(desc); \ 3091 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3092 TYPE aa = *(TYPE *)(a + i); \ 3093 TYPE nn = *(TYPE *)(n + i); \ 3094 TYPE mm = *(TYPE *)(m + i); \ 3095 TYPE dd = MIN(MAX(aa, nn), mm); \ 3096 *(TYPE *)(d + i) = dd; \ 3097 } \ 3098 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3099 } 3100 3101 DO_CLAMP(gvec_sclamp_b, int8_t) 3102 DO_CLAMP(gvec_sclamp_h, int16_t) 3103 DO_CLAMP(gvec_sclamp_s, int32_t) 3104 DO_CLAMP(gvec_sclamp_d, int64_t) 3105 3106 DO_CLAMP(gvec_uclamp_b, uint8_t) 3107 DO_CLAMP(gvec_uclamp_h, uint16_t) 3108 DO_CLAMP(gvec_uclamp_s, uint32_t) 3109 DO_CLAMP(gvec_uclamp_d, uint64_t) 3110 3111 /* Bit count in each 8-bit word. 
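For example, ctpop8(0x2c) == 3.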
*/ 3112 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3113 { 3114 intptr_t i, opr_sz = simd_oprsz(desc); 3115 uint8_t *d = vd, *n = vn; 3116 3117 for (i = 0; i < opr_sz; ++i) { 3118 d[i] = ctpop8(n[i]); 3119 } 3120 clear_tail(d, opr_sz, simd_maxsz(desc)); 3121 } 3122 3123 /* Reverse bits in each 8-bit word */ 3124 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3125 { 3126 intptr_t i, opr_sz = simd_oprsz(desc); 3127 uint64_t *d = vd, *n = vn; 3128 3129 for (i = 0; i < opr_sz / 8; ++i) { 3130 d[i] = revbit64(bswap64(n[i])); 3131 } 3132 clear_tail(d, opr_sz, simd_maxsz(desc)); 3133 } 3134 3135 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3136 { 3137 intptr_t i, opr_sz = simd_oprsz(desc); 3138 uint32_t *d = vd, *n = vn; 3139 3140 for (i = 0; i < opr_sz / 4; ++i) { 3141 d[i] = helper_recpe_u32(n[i]); 3142 } 3143 clear_tail(d, opr_sz, simd_maxsz(desc)); 3144 } 3145 3146 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3147 { 3148 intptr_t i, opr_sz = simd_oprsz(desc); 3149 uint32_t *d = vd, *n = vn; 3150 3151 for (i = 0; i < opr_sz / 4; ++i) { 3152 d[i] = helper_rsqrte_u32(n[i]); 3153 } 3154 clear_tail(d, opr_sz, simd_maxsz(desc)); 3155 } 3156