/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

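/*
 * Example lookup: a predicate byte of 0x05 has bits 0 and 2 set, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff, i.e. byte lanes 0
 * and 2 of the expanded mask are active.
 */
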
/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

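/*
 * Worked example for the rounding form (SQRDMULH of 0.5 * 0.5 in Q0.7):
 * src1 = src2 = 0x40 (64), src3 = 0, round = true gives
 * ret = 64 * 64 = 4096; ret += (0 << 7) + (1 << 6) = 4160; ret >>= 7 = 32,
 * which is 0.25 in Q0.7 and does not saturate.
 */
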
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
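/*
 * For example, in gvec_sdot_b each 32-bit lane i of the destination
 * accumulates a[i] plus the four byte products n[4i+k] * m[4i+k],
 * k = 0..3, taken from the corresponding 32-bit lane of Vn and Vm.
 */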
809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 874 875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 876 float_status *fpst, uint32_t desc) 877 { 878 uintptr_t opr_sz = simd_oprsz(desc); 879 float16 *d = vd; 880 float16 *n = vn; 881 float16 *m = vm; 882 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 883 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); 884 uintptr_t i; 885 886 for (i = 0; i < opr_sz / 2; i += 2) { 887 float16 e0 = n[H2(i)]; 888 float16 e1 = m[H2(i + 1)]; 889 float16 e2 = n[H2(i + 1)]; 890 float16 e3 = m[H2(i)]; 891 892 if (rot) { 893 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 894 } else { 895 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 896 } 897 898 d[H2(i)] = float16_add(e0, e1, fpst); 899 d[H2(i + 1)] = float16_add(e2, e3, fpst); 900 } 901 clear_tail(d, opr_sz, simd_maxsz(desc)); 902 } 903 904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 905 float_status *fpst, uint32_t desc) 906 { 907 uintptr_t opr_sz = simd_oprsz(desc); 908 float32 *d = vd; 909 float32 *n = vn; 910 float32 *m = vm; 911 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 912 bool fpcr_ah = extract64(desc, 
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);
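    /*
     * Explanatory note: negx flips the operand's sign bit with an XOR,
     * while negf folds the negation into the fused multiply-add via
     * float_muladd_negate_product.  With FPCR.AH = 1 the negf form is
     * used, which (as for the AH variants of FABD and FMLS below) avoids
     * flipping the sign bit of a NaN operand.
     */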

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
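/*
 * For example, float32_cge(a, b) computes a >= b as float32_le(b, a);
 * negating the 0/1 boolean yields the required all-zeroes/all-ones mask.
 */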
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
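/*
 * Note: the step computes 2 - op1 * op2, i.e. the Newton-Raphson
 * refinement term for a reciprocal estimate; the architected special
 * case for infinity * zero is 2.0.
 */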
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

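/*
 * The non-fused forms above round the intermediate product before the
 * add or subtract, as Neon VMLA/VMLS require; the fused forms below
 * round only once.
 */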
/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
                                   float_status *stat)
{
    return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
                                   float_status *stat)
{
    return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
                                   float_status *stat)
{
    return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
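/*
 * For example, with 32-bit elements, a 256-bit SVE vector and index 1,
 * lanes 0-3 are multiplied by element 1 of the first 128-bit segment of
 * the index operand and lanes 4-7 by element 1 of the second segment
 * (m[1] and m[5] respectively).
 */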
1631 */ 1632 1633 #define DO_MUL_IDX(NAME, TYPE, H) \ 1634 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1635 { \ 1636 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1637 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1638 intptr_t idx = simd_data(desc); \ 1639 TYPE *d = vd, *n = vn, *m = vm; \ 1640 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1641 TYPE mm = m[H(i + idx)]; \ 1642 for (j = 0; j < segment; j++) { \ 1643 d[i + j] = n[i + j] * mm; \ 1644 } \ 1645 } \ 1646 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1647 } 1648 1649 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1650 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1651 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1652 1653 #undef DO_MUL_IDX 1654 1655 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1656 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1657 { \ 1658 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1659 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1660 intptr_t idx = simd_data(desc); \ 1661 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1662 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1663 TYPE mm = m[H(i + idx)]; \ 1664 for (j = 0; j < segment; j++) { \ 1665 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1666 } \ 1667 } \ 1668 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1669 } 1670 1671 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1672 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1673 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1674 1675 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1676 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1677 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1678 1679 #undef DO_MLA_IDX 1680 1681 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1682 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1683 float_status *stat, uint32_t desc) \ 1684 { \ 1685 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1686 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1687 intptr_t idx = simd_data(desc); \ 1688 TYPE *d = vd, *n = vn, *m = vm; \ 1689 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1690 TYPE mm = m[H(i + idx)]; \ 1691 for (j = 0; j < segment; j++) { \ 1692 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1693 } \ 1694 } \ 1695 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1696 } 1697 1698 #define nop(N, M, S) (M) 1699 1700 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1701 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1702 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1703 1704 #ifdef TARGET_AARCH64 1705 1706 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1707 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1708 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1709 1710 #endif 1711 1712 #undef nop 1713 1714 /* 1715 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1716 * the fused ops below they assume accumulate both from and into Vd. 
1717 */ 1718 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1719 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1720 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1721 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1722 1723 #undef DO_FMUL_IDX 1724 1725 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \ 1726 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1727 float_status *stat, uint32_t desc) \ 1728 { \ 1729 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1730 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1731 intptr_t idx = simd_data(desc); \ 1732 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1733 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1734 TYPE mm = m[H(i + idx)]; \ 1735 for (j = 0; j < segment; j++) { \ 1736 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \ 1737 a[i + j], NEGF, stat); \ 1738 } \ 1739 } \ 1740 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1741 } 1742 1743 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) 1744 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) 1745 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) 1746 1747 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) 1748 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) 1749 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) 1750 1751 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) 1752 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) 1753 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) 1754 1755 #undef DO_FMLA_IDX 1756 1757 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1758 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1759 { \ 1760 intptr_t i, oprsz = simd_oprsz(desc); \ 1761 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1762 bool q = false; \ 1763 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1764 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1765 if (dd < MIN) { \ 1766 dd = MIN; \ 1767 q = true; \ 1768 } else if (dd > MAX) { \ 1769 dd = MAX; \ 1770 q = true; \ 1771 } \ 1772 d[i] = dd; \ 1773 } \ 1774 if (q) { \ 1775 uint32_t *qc = vq; \ 1776 qc[0] = 1; \ 1777 } \ 1778 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1779 } 1780 1781 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1782 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1783 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1784 1785 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1786 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1787 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1788 1789 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1790 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1791 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1792 1793 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1794 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1795 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1796 1797 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1798 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1799 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1800 1801 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1802 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1803 DO_SAT(gvec_suqadd_s, int64_t, 
int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1804 1805 #undef DO_SAT 1806 1807 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1808 void *vm, uint32_t desc) 1809 { 1810 intptr_t i, oprsz = simd_oprsz(desc); 1811 uint64_t *d = vd, *n = vn, *m = vm; 1812 bool q = false; 1813 1814 for (i = 0; i < oprsz / 8; i++) { 1815 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1816 if (dd < nn) { 1817 dd = UINT64_MAX; 1818 q = true; 1819 } 1820 d[i] = dd; 1821 } 1822 if (q) { 1823 uint32_t *qc = vq; 1824 qc[0] = 1; 1825 } 1826 clear_tail(d, oprsz, simd_maxsz(desc)); 1827 } 1828 1829 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1830 void *vm, uint32_t desc) 1831 { 1832 intptr_t i, oprsz = simd_oprsz(desc); 1833 uint64_t *d = vd, *n = vn, *m = vm; 1834 bool q = false; 1835 1836 for (i = 0; i < oprsz / 8; i++) { 1837 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1838 if (nn < mm) { 1839 dd = 0; 1840 q = true; 1841 } 1842 d[i] = dd; 1843 } 1844 if (q) { 1845 uint32_t *qc = vq; 1846 qc[0] = 1; 1847 } 1848 clear_tail(d, oprsz, simd_maxsz(desc)); 1849 } 1850 1851 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1852 void *vm, uint32_t desc) 1853 { 1854 intptr_t i, oprsz = simd_oprsz(desc); 1855 int64_t *d = vd, *n = vn, *m = vm; 1856 bool q = false; 1857 1858 for (i = 0; i < oprsz / 8; i++) { 1859 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1860 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1861 dd = (nn >> 63) ^ ~INT64_MIN; 1862 q = true; 1863 } 1864 d[i] = dd; 1865 } 1866 if (q) { 1867 uint32_t *qc = vq; 1868 qc[0] = 1; 1869 } 1870 clear_tail(d, oprsz, simd_maxsz(desc)); 1871 } 1872 1873 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1874 void *vm, uint32_t desc) 1875 { 1876 intptr_t i, oprsz = simd_oprsz(desc); 1877 int64_t *d = vd, *n = vn, *m = vm; 1878 bool q = false; 1879 1880 for (i = 0; i < oprsz / 8; i++) { 1881 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1882 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1883 dd = (nn >> 63) ^ ~INT64_MIN; 1884 q = true; 1885 } 1886 d[i] = dd; 1887 } 1888 if (q) { 1889 uint32_t *qc = vq; 1890 qc[0] = 1; 1891 } 1892 clear_tail(d, oprsz, simd_maxsz(desc)); 1893 } 1894 1895 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1896 void *vm, uint32_t desc) 1897 { 1898 intptr_t i, oprsz = simd_oprsz(desc); 1899 uint64_t *d = vd, *n = vn, *m = vm; 1900 bool q = false; 1901 1902 for (i = 0; i < oprsz / 8; i++) { 1903 uint64_t nn = n[i]; 1904 int64_t mm = m[i]; 1905 uint64_t dd = nn + mm; 1906 1907 if (mm < 0) { 1908 if (nn < (uint64_t)-mm) { 1909 dd = 0; 1910 q = true; 1911 } 1912 } else { 1913 if (dd < nn) { 1914 dd = UINT64_MAX; 1915 q = true; 1916 } 1917 } 1918 d[i] = dd; 1919 } 1920 if (q) { 1921 uint32_t *qc = vq; 1922 qc[0] = 1; 1923 } 1924 clear_tail(d, oprsz, simd_maxsz(desc)); 1925 } 1926 1927 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1928 void *vm, uint32_t desc) 1929 { 1930 intptr_t i, oprsz = simd_oprsz(desc); 1931 uint64_t *d = vd, *n = vn, *m = vm; 1932 bool q = false; 1933 1934 for (i = 0; i < oprsz / 8; i++) { 1935 int64_t nn = n[i]; 1936 uint64_t mm = m[i]; 1937 int64_t dd = nn + mm; 1938 1939 if (mm > (uint64_t)(INT64_MAX - nn)) { 1940 dd = INT64_MAX; 1941 q = true; 1942 } 1943 d[i] = dd; 1944 } 1945 if (q) { 1946 uint32_t *qc = vq; 1947 qc[0] = 1; 1948 } 1949 clear_tail(d, oprsz, simd_maxsz(desc)); 1950 } 1951 1952 #define DO_SRA(NAME, TYPE) \ 1953 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1954 { \ 1955 intptr_t i, oprsz = simd_oprsz(desc); \ 1956 int shift = simd_data(desc); \ 1957 TYPE *d = vd, *n 
= vn; \ 1958 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1959 d[i] += n[i] >> shift; \ 1960 } \ 1961 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1962 } 1963 1964 DO_SRA(gvec_ssra_b, int8_t) 1965 DO_SRA(gvec_ssra_h, int16_t) 1966 DO_SRA(gvec_ssra_s, int32_t) 1967 DO_SRA(gvec_ssra_d, int64_t) 1968 1969 DO_SRA(gvec_usra_b, uint8_t) 1970 DO_SRA(gvec_usra_h, uint16_t) 1971 DO_SRA(gvec_usra_s, uint32_t) 1972 DO_SRA(gvec_usra_d, uint64_t) 1973 1974 #undef DO_SRA 1975 1976 #define DO_RSHR(NAME, TYPE) \ 1977 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1978 { \ 1979 intptr_t i, oprsz = simd_oprsz(desc); \ 1980 int shift = simd_data(desc); \ 1981 TYPE *d = vd, *n = vn; \ 1982 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1983 TYPE tmp = n[i] >> (shift - 1); \ 1984 d[i] = (tmp >> 1) + (tmp & 1); \ 1985 } \ 1986 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1987 } 1988 1989 DO_RSHR(gvec_srshr_b, int8_t) 1990 DO_RSHR(gvec_srshr_h, int16_t) 1991 DO_RSHR(gvec_srshr_s, int32_t) 1992 DO_RSHR(gvec_srshr_d, int64_t) 1993 1994 DO_RSHR(gvec_urshr_b, uint8_t) 1995 DO_RSHR(gvec_urshr_h, uint16_t) 1996 DO_RSHR(gvec_urshr_s, uint32_t) 1997 DO_RSHR(gvec_urshr_d, uint64_t) 1998 1999 #undef DO_RSHR 2000 2001 #define DO_RSRA(NAME, TYPE) \ 2002 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2003 { \ 2004 intptr_t i, oprsz = simd_oprsz(desc); \ 2005 int shift = simd_data(desc); \ 2006 TYPE *d = vd, *n = vn; \ 2007 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2008 TYPE tmp = n[i] >> (shift - 1); \ 2009 d[i] += (tmp >> 1) + (tmp & 1); \ 2010 } \ 2011 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2012 } 2013 2014 DO_RSRA(gvec_srsra_b, int8_t) 2015 DO_RSRA(gvec_srsra_h, int16_t) 2016 DO_RSRA(gvec_srsra_s, int32_t) 2017 DO_RSRA(gvec_srsra_d, int64_t) 2018 2019 DO_RSRA(gvec_ursra_b, uint8_t) 2020 DO_RSRA(gvec_ursra_h, uint16_t) 2021 DO_RSRA(gvec_ursra_s, uint32_t) 2022 DO_RSRA(gvec_ursra_d, uint64_t) 2023 2024 #undef DO_RSRA 2025 2026 #define DO_SRI(NAME, TYPE) \ 2027 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2028 { \ 2029 intptr_t i, oprsz = simd_oprsz(desc); \ 2030 int shift = simd_data(desc); \ 2031 TYPE *d = vd, *n = vn; \ 2032 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2033 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 2034 } \ 2035 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2036 } 2037 2038 DO_SRI(gvec_sri_b, uint8_t) 2039 DO_SRI(gvec_sri_h, uint16_t) 2040 DO_SRI(gvec_sri_s, uint32_t) 2041 DO_SRI(gvec_sri_d, uint64_t) 2042 2043 #undef DO_SRI 2044 2045 #define DO_SLI(NAME, TYPE) \ 2046 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2047 { \ 2048 intptr_t i, oprsz = simd_oprsz(desc); \ 2049 int shift = simd_data(desc); \ 2050 TYPE *d = vd, *n = vn; \ 2051 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2052 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 2053 } \ 2054 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2055 } 2056 2057 DO_SLI(gvec_sli_b, uint8_t) 2058 DO_SLI(gvec_sli_h, uint16_t) 2059 DO_SLI(gvec_sli_s, uint32_t) 2060 DO_SLI(gvec_sli_d, uint64_t) 2061 2062 #undef DO_SLI 2063 2064 /* 2065 * Convert float16 to float32, raising no exceptions and 2066 * preserving exceptional values, including SNaN. 2067 * This is effectively an unpack+repack operation. 
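 * For example, the float16 SNaN 0x7d00 becomes the float32 SNaN 0x7fa00000: the exponent field is widened to 0xff and the fraction is shifted up by 13 bits, leaving the quiet bit clear and raising no exception.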
2068 */ 2069 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 2070 { 2071 const int f16_bias = 15; 2072 const int f32_bias = 127; 2073 uint32_t sign = extract32(f16, 15, 1); 2074 uint32_t exp = extract32(f16, 10, 5); 2075 uint32_t frac = extract32(f16, 0, 10); 2076 2077 if (exp == 0x1f) { 2078 /* Inf or NaN */ 2079 exp = 0xff; 2080 } else if (exp == 0) { 2081 /* Zero or denormal. */ 2082 if (frac != 0) { 2083 if (fz16) { 2084 frac = 0; 2085 } else { 2086 /* 2087 * Denormal; these are all normal float32. 2088 * Shift the fraction so that the msb is at bit 11, 2089 * then remove bit 11 as the implicit bit of the 2090 * normalized float32. Note that we still go through 2091 * the shift for normal numbers below, to put the 2092 * float32 fraction at the right place. 2093 */ 2094 int shift = clz32(frac) - 21; 2095 frac = (frac << shift) & 0x3ff; 2096 exp = f32_bias - f16_bias - shift + 1; 2097 } 2098 } 2099 } else { 2100 /* Normal number; adjust the bias. */ 2101 exp += f32_bias - f16_bias; 2102 } 2103 sign <<= 31; 2104 exp <<= 23; 2105 frac <<= 23 - 10; 2106 2107 return sign | exp | frac; 2108 } 2109 2110 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 2111 { 2112 /* 2113 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 2114 * Load the 2nd qword iff is_q & is_2. 2115 * Shift to the 2nd dword iff !is_q & is_2. 2116 * For !is_q & !is_2, the upper bits of the result are garbage. 2117 */ 2118 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 2119 } 2120 2121 /* 2122 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 2123 * as there are not yet SVE versions that might use blocking. 2124 */ 2125 2126 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 2127 uint64_t negx, int negf, uint32_t desc, bool fz16) 2128 { 2129 intptr_t i, oprsz = simd_oprsz(desc); 2130 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2131 int is_q = oprsz == 16; 2132 uint64_t n_4, m_4; 2133 2134 /* 2135 * Pre-load all of the f16 data, avoiding overlap issues. 2136 * Negate all inputs for AH=0 FMLSL at once. 2137 */ 2138 n_4 = load4_f16(vn, is_q, is_2) ^ negx; 2139 m_4 = load4_f16(vm, is_q, is_2); 2140 2141 for (i = 0; i < oprsz / 4; i++) { 2142 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2143 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2144 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); 2145 } 2146 clear_tail(d, oprsz, simd_maxsz(desc)); 2147 } 2148 2149 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2150 CPUARMState *env, uint32_t desc) 2151 { 2152 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2153 uint64_t negx = is_s ?
0x8000800080008000ull : 0; 2154 2155 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, negx, 0, desc, 2156 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2157 } 2158 2159 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2160 CPUARMState *env, uint32_t desc) 2161 { 2162 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2163 uint64_t negx = 0; 2164 int negf = 0; 2165 2166 if (is_s) { 2167 if (env->vfp.fpcr & FPCR_AH) { 2168 negf = float_muladd_negate_product; 2169 } else { 2170 negx = 0x8000800080008000ull; 2171 } 2172 } 2173 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc, 2174 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2175 } 2176 2177 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2178 CPUARMState *env, uint32_t desc) 2179 { 2180 intptr_t i, oprsz = simd_oprsz(desc); 2181 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2182 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2183 float_status *status = &env->vfp.fp_status_a64; 2184 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2185 int negx = 0, negf = 0; 2186 2187 if (is_s) { 2188 if (env->vfp.fpcr & FPCR_AH) { 2189 negf = float_muladd_negate_product; 2190 } else { 2191 negx = 0x8000; 2192 } 2193 } 2194 2195 for (i = 0; i < oprsz; i += sizeof(float32)) { 2196 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx; 2197 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2198 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2199 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2200 float32 aa = *(float32 *)(va + H1_4(i)); 2201 2202 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status); 2203 } 2204 } 2205 2206 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2207 uint64_t negx, int negf, uint32_t desc, bool fz16) 2208 { 2209 intptr_t i, oprsz = simd_oprsz(desc); 2210 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2211 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2212 int is_q = oprsz == 16; 2213 uint64_t n_4; 2214 float32 m_1; 2215 2216 /* 2217 * Pre-load all of the f16 data, avoiding overlap issues. 2218 * Negate all inputs for AH=0 FMLSL at once. 2219 */ 2220 n_4 = load4_f16(vn, is_q, is_2) ^ negx; 2221 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2222 2223 for (i = 0; i < oprsz / 4; i++) { 2224 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2225 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); 2226 } 2227 clear_tail(d, oprsz, simd_maxsz(desc)); 2228 } 2229 2230 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2231 CPUARMState *env, uint32_t desc) 2232 { 2233 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2234 uint64_t negx = is_s ? 
0x8000800080008000ull : 0; 2235 2236 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, negx, 0, desc, 2237 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2238 } 2239 2240 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2241 CPUARMState *env, uint32_t desc) 2242 { 2243 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2244 uint64_t negx = 0; 2245 int negf = 0; 2246 2247 if (is_s) { 2248 if (env->vfp.fpcr & FPCR_AH) { 2249 negf = float_muladd_negate_product; 2250 } else { 2251 negx = 0x8000800080008000ull; 2252 } 2253 } 2254 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc, 2255 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2256 } 2257 2258 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2259 CPUARMState *env, uint32_t desc) 2260 { 2261 intptr_t i, j, oprsz = simd_oprsz(desc); 2262 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2263 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2264 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2265 float_status *status = &env->vfp.fp_status_a64; 2266 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2267 int negx = 0, negf = 0; 2268 2269 if (is_s) { 2270 if (env->vfp.fpcr & FPCR_AH) { 2271 negf = float_muladd_negate_product; 2272 } else { 2273 negx = 0x8000; 2274 } 2275 } 2276 2277 for (i = 0; i < oprsz; i += 16) { 2278 float16 mm_16 = *(float16 *)(vm + i + idx); 2279 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2280 2281 for (j = 0; j < 16; j += sizeof(float32)) { 2282 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; 2283 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2284 float32 aa = *(float32 *)(va + H1_4(i + j)); 2285 2286 *(float32 *)(vd + H1_4(i + j)) = 2287 float32_muladd(nn, mm, aa, negf, status); 2288 } 2289 } 2290 } 2291 2292 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2293 { 2294 intptr_t i, opr_sz = simd_oprsz(desc); 2295 int8_t *d = vd, *n = vn, *m = vm; 2296 2297 for (i = 0; i < opr_sz; ++i) { 2298 int8_t mm = m[i]; 2299 int8_t nn = n[i]; 2300 int8_t res = 0; 2301 if (mm >= 0) { 2302 if (mm < 8) { 2303 res = nn << mm; 2304 } 2305 } else { 2306 res = nn >> (mm > -8 ? -mm : 7); 2307 } 2308 d[i] = res; 2309 } 2310 clear_tail(d, opr_sz, simd_maxsz(desc)); 2311 } 2312 2313 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2314 { 2315 intptr_t i, opr_sz = simd_oprsz(desc); 2316 int16_t *d = vd, *n = vn, *m = vm; 2317 2318 for (i = 0; i < opr_sz / 2; ++i) { 2319 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2320 int16_t nn = n[i]; 2321 int16_t res = 0; 2322 if (mm >= 0) { 2323 if (mm < 16) { 2324 res = nn << mm; 2325 } 2326 } else { 2327 res = nn >> (mm > -16 ? 
-mm : 15); 2328 } 2329 d[i] = res; 2330 } 2331 clear_tail(d, opr_sz, simd_maxsz(desc)); 2332 } 2333 2334 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2335 { 2336 intptr_t i, opr_sz = simd_oprsz(desc); 2337 uint8_t *d = vd, *n = vn, *m = vm; 2338 2339 for (i = 0; i < opr_sz; ++i) { 2340 int8_t mm = m[i]; 2341 uint8_t nn = n[i]; 2342 uint8_t res = 0; 2343 if (mm >= 0) { 2344 if (mm < 8) { 2345 res = nn << mm; 2346 } 2347 } else { 2348 if (mm > -8) { 2349 res = nn >> -mm; 2350 } 2351 } 2352 d[i] = res; 2353 } 2354 clear_tail(d, opr_sz, simd_maxsz(desc)); 2355 } 2356 2357 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2358 { 2359 intptr_t i, opr_sz = simd_oprsz(desc); 2360 uint16_t *d = vd, *n = vn, *m = vm; 2361 2362 for (i = 0; i < opr_sz / 2; ++i) { 2363 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2364 uint16_t nn = n[i]; 2365 uint16_t res = 0; 2366 if (mm >= 0) { 2367 if (mm < 16) { 2368 res = nn << mm; 2369 } 2370 } else { 2371 if (mm > -16) { 2372 res = nn >> -mm; 2373 } 2374 } 2375 d[i] = res; 2376 } 2377 clear_tail(d, opr_sz, simd_maxsz(desc)); 2378 } 2379 2380 /* 2381 * 8x8->8 polynomial multiply. 2382 * 2383 * Polynomial multiplication is like integer multiplication except the 2384 * partial products are XORed, not added. 2385 * 2386 * TODO: expose this as a generic vector operation, as it is a common 2387 * crypto building block. 2388 */ 2389 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2390 { 2391 intptr_t i, opr_sz = simd_oprsz(desc); 2392 uint64_t *d = vd, *n = vn, *m = vm; 2393 2394 for (i = 0; i < opr_sz / 8; ++i) { 2395 d[i] = clmul_8x8_low(n[i], m[i]); 2396 } 2397 clear_tail(d, opr_sz, simd_maxsz(desc)); 2398 } 2399 2400 /* 2401 * 64x64->128 polynomial multiply. 2402 * Because the lanes are not accessed in strict columns, 2403 * this probably cannot be turned into a generic helper.
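 * As with the 8x8 case above, the partial products are combined with XOR rather than addition: e.g. clmul(0b11, 0b11) is (0b11 << 1) ^ (0b11 << 0) = 0b101, where an integer multiply would give 0b1001.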
2404 */ 2405 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2406 { 2407 intptr_t i, opr_sz = simd_oprsz(desc); 2408 intptr_t hi = simd_data(desc); 2409 uint64_t *d = vd, *n = vn, *m = vm; 2410 2411 for (i = 0; i < opr_sz / 8; i += 2) { 2412 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2413 d[i] = int128_getlo(r); 2414 d[i + 1] = int128_gethi(r); 2415 } 2416 clear_tail(d, opr_sz, simd_maxsz(desc)); 2417 } 2418 2419 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2420 { 2421 int hi = simd_data(desc); 2422 uint64_t *d = vd, *n = vn, *m = vm; 2423 uint64_t nn = n[hi], mm = m[hi]; 2424 2425 d[0] = clmul_8x4_packed(nn, mm); 2426 nn >>= 32; 2427 mm >>= 32; 2428 d[1] = clmul_8x4_packed(nn, mm); 2429 2430 clear_tail(d, 16, simd_maxsz(desc)); 2431 } 2432 2433 #ifdef TARGET_AARCH64 2434 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2435 { 2436 int shift = simd_data(desc) * 8; 2437 intptr_t i, opr_sz = simd_oprsz(desc); 2438 uint64_t *d = vd, *n = vn, *m = vm; 2439 2440 for (i = 0; i < opr_sz / 8; ++i) { 2441 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2442 } 2443 } 2444 2445 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2446 { 2447 intptr_t sel = H4(simd_data(desc)); 2448 intptr_t i, opr_sz = simd_oprsz(desc); 2449 uint32_t *n = vn, *m = vm; 2450 uint64_t *d = vd; 2451 2452 for (i = 0; i < opr_sz / 8; ++i) { 2453 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2454 } 2455 } 2456 #endif 2457 2458 #define DO_CMP0(NAME, TYPE, OP) \ 2459 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2460 { \ 2461 intptr_t i, opr_sz = simd_oprsz(desc); \ 2462 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2463 TYPE nn = *(TYPE *)(vn + i); \ 2464 *(TYPE *)(vd + i) = -(nn OP 0); \ 2465 } \ 2466 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2467 } 2468 2469 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2470 DO_CMP0(gvec_clt0_b, int8_t, <) 2471 DO_CMP0(gvec_cle0_b, int8_t, <=) 2472 DO_CMP0(gvec_cgt0_b, int8_t, >) 2473 DO_CMP0(gvec_cge0_b, int8_t, >=) 2474 2475 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2476 DO_CMP0(gvec_clt0_h, int16_t, <) 2477 DO_CMP0(gvec_cle0_h, int16_t, <=) 2478 DO_CMP0(gvec_cgt0_h, int16_t, >) 2479 DO_CMP0(gvec_cge0_h, int16_t, >=) 2480 2481 #undef DO_CMP0 2482 2483 #define DO_ABD(NAME, TYPE) \ 2484 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2485 { \ 2486 intptr_t i, opr_sz = simd_oprsz(desc); \ 2487 TYPE *d = vd, *n = vn, *m = vm; \ 2488 \ 2489 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2490 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2491 } \ 2492 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2493 } 2494 2495 DO_ABD(gvec_sabd_b, int8_t) 2496 DO_ABD(gvec_sabd_h, int16_t) 2497 DO_ABD(gvec_sabd_s, int32_t) 2498 DO_ABD(gvec_sabd_d, int64_t) 2499 2500 DO_ABD(gvec_uabd_b, uint8_t) 2501 DO_ABD(gvec_uabd_h, uint16_t) 2502 DO_ABD(gvec_uabd_s, uint32_t) 2503 DO_ABD(gvec_uabd_d, uint64_t) 2504 2505 #undef DO_ABD 2506 2507 #define DO_ABA(NAME, TYPE) \ 2508 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2509 { \ 2510 intptr_t i, opr_sz = simd_oprsz(desc); \ 2511 TYPE *d = vd, *n = vn, *m = vm; \ 2512 \ 2513 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2514 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2515 } \ 2516 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2517 } 2518 2519 DO_ABA(gvec_saba_b, int8_t) 2520 DO_ABA(gvec_saba_h, int16_t) 2521 DO_ABA(gvec_saba_s, int32_t) 2522 DO_ABA(gvec_saba_d, int64_t) 2523 2524 DO_ABA(gvec_uaba_b, uint8_t) 2525 DO_ABA(gvec_uaba_h, uint16_t) 2526 DO_ABA(gvec_uaba_s, uint32_t) 2527 DO_ABA(gvec_uaba_d, uint64_t) 2528 2529 #undef DO_ABA 2530 2531 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2532 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2533 float_status *stat, uint32_t desc) \ 2534 { \ 2535 ARMVectorReg scratch; \ 2536 intptr_t oprsz = simd_oprsz(desc); \ 2537 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2538 TYPE *d = vd, *n = vn, *m = vm; \ 2539 if (unlikely(d == m)) { \ 2540 m = memcpy(&scratch, m, oprsz); \ 2541 } \ 2542 for (intptr_t i = 0; i < half; ++i) { \ 2543 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2544 } \ 2545 for (intptr_t i = 0; i < half; ++i) { \ 2546 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2547 } \ 2548 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2549 } 2550 2551 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2552 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2553 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2554 2555 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2556 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2557 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2558 2559 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2560 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2561 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2562 2563 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2564 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2565 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2566 2567 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2568 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2569 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2570 2571 #ifdef TARGET_AARCH64 2572 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2573 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2574 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2575 2576 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2577 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2578 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2579 #endif 2580 2581 #undef DO_3OP_PAIR 2582 2583 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2584 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2585 { \ 2586 ARMVectorReg scratch; \ 2587 intptr_t oprsz = simd_oprsz(desc); \ 2588 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2589 TYPE *d = vd, *n = vn, *m = vm; \ 2590 if (unlikely(d == m)) { \ 2591 m = memcpy(&scratch, m, oprsz); \ 2592 } \ 2593 for (intptr_t i = 0; i < half; ++i) { \ 2594 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2595 } \ 2596 for (intptr_t i = 0; i < half; ++i) { \ 2597 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2598 } \ 2599 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2600 } 2601 2602 #define ADD(A, B) (A + B) 2603 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2604 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2605 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2606 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2607 #undef ADD 2608 2609 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2610 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2611 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2612 2613 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2614 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2615 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2616 2617 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2618 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2619 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2620 2621 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2622 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2623 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2624 2625 #undef DO_3OP_PAIR 2626 2627 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2628 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2629 { \ 2630 intptr_t i, oprsz = simd_oprsz(desc); \ 2631 int shift = simd_data(desc); \ 2632 TYPE *d = vd, *n = vn; \ 2633 float_status *fpst = stat; \ 2634 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2635 d[i] = FUNC(n[i], shift, fpst); \ 2636 } \ 2637 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2638 } 2639 2640 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2641 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2642 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2643 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2644 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2645 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2646 2647 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2648 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2649 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2650 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2651 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2652 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2653 2654 #undef DO_VCVT_FIXED 2655 2656 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2657 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2658 { \ 2659 intptr_t i, oprsz = simd_oprsz(desc); \ 2660 uint32_t rmode = simd_data(desc); \ 2661 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2662 TYPE *d = vd, *n = vn; \ 2663 set_float_rounding_mode(rmode, fpst); \ 2664 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2665 d[i] = FUNC(n[i], 0, fpst); \ 2666 } \ 2667 set_float_rounding_mode(prev_rmode, fpst); \ 2668 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2669 } 2670 2671 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2672 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2673 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2674 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2675 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2676 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2677 2678 #undef DO_VCVT_RMODE 2679 2680 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2681 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2682 { \ 2683 intptr_t i, oprsz = simd_oprsz(desc); \ 2684 uint32_t rmode = simd_data(desc); \ 2685 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2686 TYPE *d = vd, *n = vn; \ 2687 set_float_rounding_mode(rmode, fpst); \ 2688 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2689 d[i] = FUNC(n[i], fpst); \ 2690 } \ 2691 set_float_rounding_mode(prev_rmode, fpst); \ 2692 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2693 } 2694 2695 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2696 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2697 2698 #undef DO_VRINT_RMODE 2699 2700 #ifdef TARGET_AARCH64 2701 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2702 { 2703 const uint8_t *indices = vm; 2704 size_t oprsz = simd_oprsz(desc); 2705 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2706 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2707 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2708 union { 2709 uint8_t b[16]; 2710 uint64_t d[2]; 2711 } result; 2712 2713 /* 2714 * We must construct the final result in a temp, lest the output 2715 * overlaps the input table. For TBL, begin with zero; for TBX, 2716 * begin with the original register contents. Note that we always 2717 * copy 16 bytes here to avoid an extra branch; clearing the high 2718 * bits of the register for oprsz == 8 is handled below. 2719 */ 2720 if (is_tbx) { 2721 memcpy(&result, vd, 16); 2722 } else { 2723 memset(&result, 0, 16); 2724 } 2725 2726 for (size_t i = 0; i < oprsz; ++i) { 2727 uint32_t index = indices[H1(i)]; 2728 2729 if (index < table_len) { 2730 /* 2731 * Convert index (a byte offset into the virtual table 2732 * which is a series of 128-bit vectors concatenated) 2733 * into the correct register element, bearing in mind 2734 * that the table can wrap around from V31 to V0. 2735 */ 2736 const uint8_t *table = (const uint8_t *) 2737 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2738 result.b[H1(i)] = table[H1(index % 16)]; 2739 } 2740 } 2741 2742 memcpy(vd, &result, 16); 2743 clear_tail(vd, oprsz, simd_maxsz(desc)); 2744 } 2745 #endif 2746 2747 /* 2748 * NxN -> N highpart multiply 2749 * 2750 * TODO: expose this as a generic vector operation. 2751 */ 2752 2753 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2754 { 2755 intptr_t i, opr_sz = simd_oprsz(desc); 2756 int8_t *d = vd, *n = vn, *m = vm; 2757 2758 for (i = 0; i < opr_sz; ++i) { 2759 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2760 } 2761 clear_tail(d, opr_sz, simd_maxsz(desc)); 2762 } 2763 2764 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2765 { 2766 intptr_t i, opr_sz = simd_oprsz(desc); 2767 int16_t *d = vd, *n = vn, *m = vm; 2768 2769 for (i = 0; i < opr_sz / 2; ++i) { 2770 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2771 } 2772 clear_tail(d, opr_sz, simd_maxsz(desc)); 2773 } 2774 2775 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2776 { 2777 intptr_t i, opr_sz = simd_oprsz(desc); 2778 int32_t *d = vd, *n = vn, *m = vm; 2779 2780 for (i = 0; i < opr_sz / 4; ++i) { 2781 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2782 } 2783 clear_tail(d, opr_sz, simd_maxsz(desc)); 2784 } 2785 2786 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2787 { 2788 intptr_t i, opr_sz = simd_oprsz(desc); 2789 uint64_t *d = vd, *n = vn, *m = vm; 2790 uint64_t discard; 2791 2792 for (i = 0; i < opr_sz / 8; ++i) { 2793 muls64(&discard, &d[i], n[i], m[i]); 2794 } 2795 clear_tail(d, opr_sz, simd_maxsz(desc)); 2796 } 2797 2798 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2799 { 2800 intptr_t i, opr_sz = simd_oprsz(desc); 2801 uint8_t *d = vd, *n = vn, *m = vm; 2802 2803 for (i = 0; i < opr_sz; ++i) { 2804 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2805 } 2806 clear_tail(d, opr_sz, simd_maxsz(desc)); 2807 } 2808 2809 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2810 { 2811 intptr_t i, opr_sz = simd_oprsz(desc); 2812 uint16_t *d = vd, *n = vn, *m = vm; 2813 2814 for (i = 0; i < opr_sz / 2; ++i) { 2815 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2816 } 2817 clear_tail(d, opr_sz, simd_maxsz(desc)); 2818 } 2819 2820 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2821 
{ 2822 intptr_t i, opr_sz = simd_oprsz(desc); 2823 uint32_t *d = vd, *n = vn, *m = vm; 2824 2825 for (i = 0; i < opr_sz / 4; ++i) { 2826 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2827 } 2828 clear_tail(d, opr_sz, simd_maxsz(desc)); 2829 } 2830 2831 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2832 { 2833 intptr_t i, opr_sz = simd_oprsz(desc); 2834 uint64_t *d = vd, *n = vn, *m = vm; 2835 uint64_t discard; 2836 2837 for (i = 0; i < opr_sz / 8; ++i) { 2838 mulu64(&discard, &d[i], n[i], m[i]); 2839 } 2840 clear_tail(d, opr_sz, simd_maxsz(desc)); 2841 } 2842 2843 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2844 { 2845 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2846 int shr = simd_data(desc); 2847 uint64_t *d = vd, *n = vn, *m = vm; 2848 2849 for (i = 0; i < opr_sz; ++i) { 2850 d[i] = ror64(n[i] ^ m[i], shr); 2851 } 2852 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2853 } 2854 2855 /* 2856 * Integer matrix-multiply accumulate 2857 */ 2858 2859 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2860 { 2861 int8_t *n = vn, *m = vm; 2862 2863 for (intptr_t k = 0; k < 8; ++k) { 2864 sum += n[H1(k)] * m[H1(k)]; 2865 } 2866 return sum; 2867 } 2868 2869 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2870 { 2871 uint8_t *n = vn, *m = vm; 2872 2873 for (intptr_t k = 0; k < 8; ++k) { 2874 sum += n[H1(k)] * m[H1(k)]; 2875 } 2876 return sum; 2877 } 2878 2879 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2880 { 2881 uint8_t *n = vn; 2882 int8_t *m = vm; 2883 2884 for (intptr_t k = 0; k < 8; ++k) { 2885 sum += n[H1(k)] * m[H1(k)]; 2886 } 2887 return sum; 2888 } 2889 2890 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2891 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2892 { 2893 intptr_t seg, opr_sz = simd_oprsz(desc); 2894 2895 for (seg = 0; seg < opr_sz; seg += 16) { 2896 uint32_t *d = vd + seg; 2897 uint32_t *a = va + seg; 2898 uint32_t sum0, sum1, sum2, sum3; 2899 2900 /* 2901 * Process the entire segment at once, writing back the 2902 * results only after we've consumed all of the inputs. 2903 * 2904 * Key to indices by column: 2905 * i j i j 2906 */ 2907 sum0 = a[H4(0 + 0)]; 2908 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2909 sum1 = a[H4(0 + 1)]; 2910 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2911 sum2 = a[H4(2 + 0)]; 2912 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2913 sum3 = a[H4(2 + 1)]; 2914 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2915 2916 d[H4(0)] = sum0; 2917 d[H4(1)] = sum1; 2918 d[H4(2)] = sum2; 2919 d[H4(3)] = sum3; 2920 } 2921 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2922 } 2923 2924 #define DO_MMLA_B(NAME, INNER) \ 2925 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2926 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2927 2928 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2929 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2930 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2931 2932 /* 2933 * BFloat16 Dot Product 2934 */ 2935 2936 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2937 { 2938 /* 2939 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2940 * For EBF = 0, we ignore the FPCR bits which determine rounding 2941 * mode and denormal-flushing, and we do unfused multiplies and 2942 * additions with intermediate rounding of all products and sums. 
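 * (With EBF = 0, each bfloat16 pair is therefore handled as two separate float32_mul operations followed by float32_add steps, each individually rounded; see bfdotadd() below.)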
2943 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2944 * and we perform a fused two-way sum-of-products without intermediate 2945 * rounding of the products. 2946 * In either case, we don't set fp exception flags. 2947 * 2948 * EBF is AArch64 only, so even if it's set in the FPCR it has 2949 * no effect on AArch32 instructions. 2950 */ 2951 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2952 2953 *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2954 set_default_nan_mode(true, statusp); 2955 2956 if (ebf) { 2957 /* EBF=1 needs to do a step with round-to-odd semantics */ 2958 *oddstatusp = *statusp; 2959 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2960 } else { 2961 set_flush_to_zero(true, statusp); 2962 set_flush_inputs_to_zero(true, statusp); 2963 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2964 } 2965 return ebf; 2966 } 2967 2968 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2969 { 2970 float32 t1, t2; 2971 2972 /* 2973 * Extract each BFloat16 from the element pair, and shift 2974 * them such that they become float32. 2975 */ 2976 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2977 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2978 t1 = float32_add(t1, t2, fpst); 2979 t1 = float32_add(sum, t1, fpst); 2980 2981 return t1; 2982 } 2983 2984 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2985 float_status *fpst, float_status *fpst_odd) 2986 { 2987 /* 2988 * Compare f16_dotadd() in sme_helper.c, but here we have 2989 * bfloat16 inputs. In particular that means that we do not 2990 * want the FPCR.FZ16 flush semantics, so we use the normal 2991 * float_status for the input handling here. 2992 */ 2993 float64 e1r = float32_to_float64(e1 << 16, fpst); 2994 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2995 float64 e2r = float32_to_float64(e2 << 16, fpst); 2996 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2997 float64 t64; 2998 float32 t32; 2999 3000 /* 3001 * The ARM pseudocode function FPDot performs both multiplies 3002 * and the add with a single rounding operation. Emulate this 3003 * by performing the first multiply in round-to-odd, then doing 3004 * the second multiply as fused multiply-add, and rounding to 3005 * float32 all in one step. 3006 */ 3007 t64 = float64_mul(e1r, e2r, fpst_odd); 3008 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 3009 3010 /* This conversion is exact, because we've already rounded. */ 3011 t32 = float64_to_float32(t64, fpst); 3012 3013 /* The final accumulation step is not fused. 
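 * The round-to-odd step above preserved the sticky information, so the single float64r32 rounding of the fused step could not suffer a double rounding error; this add simply rounds once more as a separate operation.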
*/ 3014 return float32_add(sum, t32, fpst); 3015 } 3016 3017 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 3018 CPUARMState *env, uint32_t desc) 3019 { 3020 intptr_t i, opr_sz = simd_oprsz(desc); 3021 float32 *d = vd, *a = va; 3022 uint32_t *n = vn, *m = vm; 3023 float_status fpst, fpst_odd; 3024 3025 if (is_ebf(env, &fpst, &fpst_odd)) { 3026 for (i = 0; i < opr_sz / 4; ++i) { 3027 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 3028 } 3029 } else { 3030 for (i = 0; i < opr_sz / 4; ++i) { 3031 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 3032 } 3033 } 3034 clear_tail(d, opr_sz, simd_maxsz(desc)); 3035 } 3036 3037 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 3038 void *va, CPUARMState *env, uint32_t desc) 3039 { 3040 intptr_t i, j, opr_sz = simd_oprsz(desc); 3041 intptr_t index = simd_data(desc); 3042 intptr_t elements = opr_sz / 4; 3043 intptr_t eltspersegment = MIN(16 / 4, elements); 3044 float32 *d = vd, *a = va; 3045 uint32_t *n = vn, *m = vm; 3046 float_status fpst, fpst_odd; 3047 3048 if (is_ebf(env, &fpst, &fpst_odd)) { 3049 for (i = 0; i < elements; i += eltspersegment) { 3050 uint32_t m_idx = m[i + H4(index)]; 3051 3052 for (j = i; j < i + eltspersegment; j++) { 3053 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 3054 } 3055 } 3056 } else { 3057 for (i = 0; i < elements; i += eltspersegment) { 3058 uint32_t m_idx = m[i + H4(index)]; 3059 3060 for (j = i; j < i + eltspersegment; j++) { 3061 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 3062 } 3063 } 3064 } 3065 clear_tail(d, opr_sz, simd_maxsz(desc)); 3066 } 3067 3068 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 3069 CPUARMState *env, uint32_t desc) 3070 { 3071 intptr_t s, opr_sz = simd_oprsz(desc); 3072 float32 *d = vd, *a = va; 3073 uint32_t *n = vn, *m = vm; 3074 float_status fpst, fpst_odd; 3075 3076 if (is_ebf(env, &fpst, &fpst_odd)) { 3077 for (s = 0; s < opr_sz / 4; s += 4) { 3078 float32 sum00, sum01, sum10, sum11; 3079 3080 /* 3081 * Process the entire segment at once, writing back the 3082 * results only after we've consumed all of the inputs. 3083 * 3084 * Key to indices by column: 3085 * i j i k j k 3086 */ 3087 sum00 = a[s + H4(0 + 0)]; 3088 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3089 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3090 3091 sum01 = a[s + H4(0 + 1)]; 3092 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3093 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3094 3095 sum10 = a[s + H4(2 + 0)]; 3096 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3097 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3098 3099 sum11 = a[s + H4(2 + 1)]; 3100 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3101 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3102 3103 d[s + H4(0 + 0)] = sum00; 3104 d[s + H4(0 + 1)] = sum01; 3105 d[s + H4(2 + 0)] = sum10; 3106 d[s + H4(2 + 1)] = sum11; 3107 } 3108 } else { 3109 for (s = 0; s < opr_sz / 4; s += 4) { 3110 float32 sum00, sum01, sum10, sum11; 3111 3112 /* 3113 * Process the entire segment at once, writing back the 3114 * results only after we've consumed all of the inputs. 
3115 * 3116 * Key to indices by column: 3117 * i j i k j k 3118 */ 3119 sum00 = a[s + H4(0 + 0)]; 3120 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3121 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3122 3123 sum01 = a[s + H4(0 + 1)]; 3124 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3125 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3126 3127 sum10 = a[s + H4(2 + 0)]; 3128 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3129 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3130 3131 sum11 = a[s + H4(2 + 1)]; 3132 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3133 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3134 3135 d[s + H4(0 + 0)] = sum00; 3136 d[s + H4(0 + 1)] = sum01; 3137 d[s + H4(2 + 0)] = sum10; 3138 d[s + H4(2 + 1)] = sum11; 3139 } 3140 } 3141 clear_tail(d, opr_sz, simd_maxsz(desc)); 3142 } 3143 3144 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3145 float_status *stat, uint32_t desc) 3146 { 3147 intptr_t i, opr_sz = simd_oprsz(desc); 3148 intptr_t sel = simd_data(desc); 3149 float32 *d = vd, *a = va; 3150 bfloat16 *n = vn, *m = vm; 3151 3152 for (i = 0; i < opr_sz / 4; ++i) { 3153 float32 nn = n[H2(i * 2 + sel)] << 16; 3154 float32 mm = m[H2(i * 2 + sel)] << 16; 3155 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3156 } 3157 clear_tail(d, opr_sz, simd_maxsz(desc)); 3158 } 3159 3160 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3161 void *va, float_status *stat, uint32_t desc) 3162 { 3163 intptr_t i, j, opr_sz = simd_oprsz(desc); 3164 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3165 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3166 intptr_t elements = opr_sz / 4; 3167 intptr_t eltspersegment = MIN(16 / 4, elements); 3168 float32 *d = vd, *a = va; 3169 bfloat16 *n = vn, *m = vm; 3170 3171 for (i = 0; i < elements; i += eltspersegment) { 3172 float32 m_idx = m[H2(2 * i + index)] << 16; 3173 3174 for (j = i; j < i + eltspersegment; j++) { 3175 float32 n_j = n[H2(2 * j + sel)] << 16; 3176 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3177 } 3178 } 3179 clear_tail(d, opr_sz, simd_maxsz(desc)); 3180 } 3181 3182 #define DO_CLAMP(NAME, TYPE) \ 3183 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3184 { \ 3185 intptr_t i, opr_sz = simd_oprsz(desc); \ 3186 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3187 TYPE aa = *(TYPE *)(a + i); \ 3188 TYPE nn = *(TYPE *)(n + i); \ 3189 TYPE mm = *(TYPE *)(m + i); \ 3190 TYPE dd = MIN(MAX(aa, nn), mm); \ 3191 *(TYPE *)(d + i) = dd; \ 3192 } \ 3193 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3194 } 3195 3196 DO_CLAMP(gvec_sclamp_b, int8_t) 3197 DO_CLAMP(gvec_sclamp_h, int16_t) 3198 DO_CLAMP(gvec_sclamp_s, int32_t) 3199 DO_CLAMP(gvec_sclamp_d, int64_t) 3200 3201 DO_CLAMP(gvec_uclamp_b, uint8_t) 3202 DO_CLAMP(gvec_uclamp_h, uint16_t) 3203 DO_CLAMP(gvec_uclamp_s, uint32_t) 3204 DO_CLAMP(gvec_uclamp_d, uint64_t) 3205 3206 /* Bit count in each 8-bit word. 
*/ 3207 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3208 { 3209 intptr_t i, opr_sz = simd_oprsz(desc); 3210 uint8_t *d = vd, *n = vn; 3211 3212 for (i = 0; i < opr_sz; ++i) { 3213 d[i] = ctpop8(n[i]); 3214 } 3215 clear_tail(d, opr_sz, simd_maxsz(desc)); 3216 } 3217 3218 /* Reverse bits in each 8-bit word. */ 3219 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3220 { 3221 intptr_t i, opr_sz = simd_oprsz(desc); 3222 uint64_t *d = vd, *n = vn; 3223 3224 for (i = 0; i < opr_sz / 8; ++i) { 3225 d[i] = revbit64(bswap64(n[i])); 3226 } 3227 clear_tail(d, opr_sz, simd_maxsz(desc)); 3228 } 3229 3230 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3231 { 3232 intptr_t i, opr_sz = simd_oprsz(desc); 3233 uint32_t *d = vd, *n = vn; 3234 3235 for (i = 0; i < opr_sz / 4; ++i) { 3236 d[i] = helper_recpe_u32(n[i]); 3237 } 3238 clear_tail(d, opr_sz, simd_maxsz(desc)); 3239 } 3240 3241 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3242 { 3243 intptr_t i, opr_sz = simd_oprsz(desc); 3244 uint32_t *d = vd, *n = vn; 3245 3246 for (i = 0; i < opr_sz / 4; ++i) { 3247 d[i] = helper_rsqrte_u32(n[i]); 3248 } 3249 clear_tail(d, opr_sz, simd_maxsz(desc)); 3250 } 3251