1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 
133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? 
INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, 315 void *vq, uint32_t desc) 316 { 317 intptr_t i, j, opr_sz = simd_oprsz(desc); 318 int idx = simd_data(desc); 319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 320 intptr_t elements = opr_sz / 2; 321 intptr_t eltspersegment = MIN(16 / 2, elements); 322 323 for (i = 0; i < elements; i += 16 / 2) { 324 int16_t mm = m[i]; 325 for (j = 0; j < eltspersegment; ++j) { 326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); 327 } 328 } 329 clear_tail(d, opr_sz, simd_maxsz(desc)); 330 } 331 332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, 333 void *vq, uint32_t desc) 334 { 335 intptr_t i, j, opr_sz = simd_oprsz(desc); 336 int idx = simd_data(desc); 337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 338 intptr_t elements = opr_sz / 2; 339 intptr_t eltspersegment = MIN(16 / 2, elements); 340 341 for (i = 0; i < elements; i += 16 / 2) { 342 int16_t mm = m[i]; 343 for (j = 0; j < eltspersegment; ++j) { 344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); 345 } 346 } 347 clear_tail(d, opr_sz, simd_maxsz(desc)); 348 } 349 350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm, 351 void *vq, uint32_t desc) 352 { 353 intptr_t i, j, opr_sz = simd_oprsz(desc); 354 int idx = simd_data(desc); 355 
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 356 intptr_t elements = opr_sz / 2; 357 intptr_t eltspersegment = MIN(16 / 2, elements); 358 359 for (i = 0; i < elements; i += 16 / 2) { 360 int16_t mm = m[i]; 361 for (j = 0; j < eltspersegment; ++j) { 362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq); 363 } 364 } 365 clear_tail(d, opr_sz, simd_maxsz(desc)); 366 } 367 368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm, 369 void *vq, uint32_t desc) 370 { 371 intptr_t i, j, opr_sz = simd_oprsz(desc); 372 int idx = simd_data(desc); 373 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 374 intptr_t elements = opr_sz / 2; 375 intptr_t eltspersegment = MIN(16 / 2, elements); 376 377 for (i = 0; i < elements; i += 16 / 2) { 378 int16_t mm = m[i]; 379 for (j = 0; j < eltspersegment; ++j) { 380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq); 381 } 382 } 383 clear_tail(d, opr_sz, simd_maxsz(desc)); 384 } 385 386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 387 void *va, uint32_t desc) 388 { 389 intptr_t i, opr_sz = simd_oprsz(desc); 390 int16_t *d = vd, *n = vn, *m = vm, *a = va; 391 uint32_t discard; 392 393 for (i = 0; i < opr_sz / 2; ++i) { 394 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 395 } 396 } 397 398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 399 void *va, uint32_t desc) 400 { 401 intptr_t i, opr_sz = simd_oprsz(desc); 402 int16_t *d = vd, *n = vn, *m = vm, *a = va; 403 uint32_t discard; 404 405 for (i = 0; i < opr_sz / 2; ++i) { 406 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 407 } 408 } 409 410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 411 { 412 intptr_t i, opr_sz = simd_oprsz(desc); 413 int16_t *d = vd, *n = vn, *m = vm; 414 uint32_t discard; 415 416 for (i = 0; i < opr_sz / 2; ++i) { 417 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 418 } 419 } 420 421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 422 { 423 intptr_t i, opr_sz = simd_oprsz(desc); 424 int16_t *d = vd, *n = vn, *m = vm; 425 uint32_t discard; 426 427 for (i = 0; i < opr_sz / 2; ++i) { 428 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 429 } 430 } 431 432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 433 { 434 intptr_t i, j, opr_sz = simd_oprsz(desc); 435 int idx = simd_data(desc); 436 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 437 uint32_t discard; 438 439 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 440 int16_t mm = m[i]; 441 for (j = 0; j < 16 / 2; ++j) { 442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 443 } 444 } 445 } 446 447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 448 { 449 intptr_t i, j, opr_sz = simd_oprsz(desc); 450 int idx = simd_data(desc); 451 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 452 uint32_t discard; 453 454 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 455 int16_t mm = m[i]; 456 for (j = 0; j < 16 / 2; ++j) { 457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 458 } 459 } 460 } 461 462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 464 bool neg, bool round, uint32_t *sat) 465 { 466 /* Simplify similarly to do_sqrdmlah_b above. 
*/ 467 int64_t ret = (int64_t)src1 * src2; 468 if (neg) { 469 ret = -ret; 470 } 471 ret += ((int64_t)src3 << 31) + (round << 30); 472 ret >>= 31; 473 474 if (ret != (int32_t)ret) { 475 *sat = 1; 476 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 477 } 478 return ret; 479 } 480 481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 482 int32_t src2, int32_t src3) 483 { 484 uint32_t *sat = &env->vfp.qc[0]; 485 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 486 } 487 488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 489 void *vq, uint32_t desc) 490 { 491 uintptr_t opr_sz = simd_oprsz(desc); 492 int32_t *d = vd; 493 int32_t *n = vn; 494 int32_t *m = vm; 495 uintptr_t i; 496 497 for (i = 0; i < opr_sz / 4; ++i) { 498 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 499 } 500 clear_tail(d, opr_sz, simd_maxsz(desc)); 501 } 502 503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 504 int32_t src2, int32_t src3) 505 { 506 uint32_t *sat = &env->vfp.qc[0]; 507 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 508 } 509 510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 511 void *vq, uint32_t desc) 512 { 513 uintptr_t opr_sz = simd_oprsz(desc); 514 int32_t *d = vd; 515 int32_t *n = vn; 516 int32_t *m = vm; 517 uintptr_t i; 518 519 for (i = 0; i < opr_sz / 4; ++i) { 520 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 521 } 522 clear_tail(d, opr_sz, simd_maxsz(desc)); 523 } 524 525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 526 void *vq, uint32_t desc) 527 { 528 intptr_t i, opr_sz = simd_oprsz(desc); 529 int32_t *d = vd, *n = vn, *m = vm; 530 531 for (i = 0; i < opr_sz / 4; ++i) { 532 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 533 } 534 clear_tail(d, opr_sz, simd_maxsz(desc)); 535 } 536 537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 538 void *vq, uint32_t desc) 539 { 540 intptr_t i, opr_sz = simd_oprsz(desc); 541 int32_t *d = vd, *n = vn, *m = vm; 542 543 for (i = 0; i < opr_sz / 4; ++i) { 544 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 545 } 546 clear_tail(d, opr_sz, simd_maxsz(desc)); 547 } 548 549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, 550 void *vq, uint32_t desc) 551 { 552 intptr_t i, j, opr_sz = simd_oprsz(desc); 553 int idx = simd_data(desc); 554 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 555 intptr_t elements = opr_sz / 4; 556 intptr_t eltspersegment = MIN(16 / 4, elements); 557 558 for (i = 0; i < elements; i += 16 / 4) { 559 int32_t mm = m[i]; 560 for (j = 0; j < eltspersegment; ++j) { 561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); 562 } 563 } 564 clear_tail(d, opr_sz, simd_maxsz(desc)); 565 } 566 567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, 568 void *vq, uint32_t desc) 569 { 570 intptr_t i, j, opr_sz = simd_oprsz(desc); 571 int idx = simd_data(desc); 572 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 573 intptr_t elements = opr_sz / 4; 574 intptr_t eltspersegment = MIN(16 / 4, elements); 575 576 for (i = 0; i < elements; i += 16 / 4) { 577 int32_t mm = m[i]; 578 for (j = 0; j < eltspersegment; ++j) { 579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); 580 } 581 } 582 clear_tail(d, opr_sz, simd_maxsz(desc)); 583 } 584 585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm, 586 void *vq, uint32_t desc) 587 { 588 intptr_t i, j, opr_sz = simd_oprsz(desc); 589 int idx = simd_data(desc); 590 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 591 
intptr_t elements = opr_sz / 4; 592 intptr_t eltspersegment = MIN(16 / 4, elements); 593 594 for (i = 0; i < elements; i += 16 / 4) { 595 int32_t mm = m[i]; 596 for (j = 0; j < eltspersegment; ++j) { 597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq); 598 } 599 } 600 clear_tail(d, opr_sz, simd_maxsz(desc)); 601 } 602 603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm, 604 void *vq, uint32_t desc) 605 { 606 intptr_t i, j, opr_sz = simd_oprsz(desc); 607 int idx = simd_data(desc); 608 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 609 intptr_t elements = opr_sz / 4; 610 intptr_t eltspersegment = MIN(16 / 4, elements); 611 612 for (i = 0; i < elements; i += 16 / 4) { 613 int32_t mm = m[i]; 614 for (j = 0; j < eltspersegment; ++j) { 615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq); 616 } 617 } 618 clear_tail(d, opr_sz, simd_maxsz(desc)); 619 } 620 621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 622 void *va, uint32_t desc) 623 { 624 intptr_t i, opr_sz = simd_oprsz(desc); 625 int32_t *d = vd, *n = vn, *m = vm, *a = va; 626 uint32_t discard; 627 628 for (i = 0; i < opr_sz / 4; ++i) { 629 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 630 } 631 } 632 633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 634 void *va, uint32_t desc) 635 { 636 intptr_t i, opr_sz = simd_oprsz(desc); 637 int32_t *d = vd, *n = vn, *m = vm, *a = va; 638 uint32_t discard; 639 640 for (i = 0; i < opr_sz / 4; ++i) { 641 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 642 } 643 } 644 645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 646 { 647 intptr_t i, opr_sz = simd_oprsz(desc); 648 int32_t *d = vd, *n = vn, *m = vm; 649 uint32_t discard; 650 651 for (i = 0; i < opr_sz / 4; ++i) { 652 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 653 } 654 } 655 656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 657 { 658 intptr_t i, opr_sz = simd_oprsz(desc); 659 int32_t *d = vd, *n = vn, *m = vm; 660 uint32_t discard; 661 662 for (i = 0; i < opr_sz / 4; ++i) { 663 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 664 } 665 } 666 667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 668 { 669 intptr_t i, j, opr_sz = simd_oprsz(desc); 670 int idx = simd_data(desc); 671 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 672 uint32_t discard; 673 674 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 675 int32_t mm = m[i]; 676 for (j = 0; j < 16 / 4; ++j) { 677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 678 } 679 } 680 } 681 682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 683 { 684 intptr_t i, j, opr_sz = simd_oprsz(desc); 685 int idx = simd_data(desc); 686 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 687 uint32_t discard; 688 689 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 690 int32_t mm = m[i]; 691 for (j = 0; j < 16 / 4; ++j) { 692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 693 } 694 } 695 } 696 697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 698 static int64_t do_sat128_d(Int128 r) 699 { 700 int64_t ls = int128_getlo(r); 701 int64_t hs = int128_gethi(r); 702 703 if (unlikely(hs != (ls >> 63))) { 704 return hs < 0 ? 
INT64_MIN : INT64_MAX; 705 } 706 return ls; 707 } 708 709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 710 { 711 uint64_t l, h; 712 Int128 r, t; 713 714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 715 muls64(&l, &h, m, n); 716 r = int128_make128(l, h); 717 if (neg) { 718 r = int128_neg(r); 719 } 720 if (a) { 721 t = int128_exts64(a); 722 t = int128_lshift(t, 63); 723 r = int128_add(r, t); 724 } 725 if (round) { 726 t = int128_exts64(1ll << 62); 727 r = int128_add(r, t); 728 } 729 r = int128_rshift(r, 63); 730 731 return do_sat128_d(r); 732 } 733 734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 735 void *va, uint32_t desc) 736 { 737 intptr_t i, opr_sz = simd_oprsz(desc); 738 int64_t *d = vd, *n = vn, *m = vm, *a = va; 739 740 for (i = 0; i < opr_sz / 8; ++i) { 741 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 742 } 743 } 744 745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 746 void *va, uint32_t desc) 747 { 748 intptr_t i, opr_sz = simd_oprsz(desc); 749 int64_t *d = vd, *n = vn, *m = vm, *a = va; 750 751 for (i = 0; i < opr_sz / 8; ++i) { 752 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 753 } 754 } 755 756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 757 { 758 intptr_t i, opr_sz = simd_oprsz(desc); 759 int64_t *d = vd, *n = vn, *m = vm; 760 761 for (i = 0; i < opr_sz / 8; ++i) { 762 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 763 } 764 } 765 766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 767 { 768 intptr_t i, opr_sz = simd_oprsz(desc); 769 int64_t *d = vd, *n = vn, *m = vm; 770 771 for (i = 0; i < opr_sz / 8; ++i) { 772 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 773 } 774 } 775 776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 777 { 778 intptr_t i, j, opr_sz = simd_oprsz(desc); 779 int idx = simd_data(desc); 780 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 781 782 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 783 int64_t mm = m[i]; 784 for (j = 0; j < 16 / 8; ++j) { 785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 786 } 787 } 788 } 789 790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 791 { 792 intptr_t i, j, opr_sz = simd_oprsz(desc); 793 int idx = simd_data(desc); 794 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 795 796 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 797 int64_t mm = m[i]; 798 for (j = 0; j < 16 / 8; ++j) { 799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 800 } 801 } 802 } 803 804 /* Integer 8 and 16-bit dot-product. 805 * 806 * Note that for the loops herein, host endianness does not matter 807 * with respect to the ordering of data within the quad-width lanes. 808 * All elements are treated equally, no matter where they are. 
809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 874 875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 876 float_status *fpst, uint32_t desc) 877 { 878 uintptr_t opr_sz = simd_oprsz(desc); 879 float16 *d = vd; 880 float16 *n = vn; 881 float16 *m = vm; 882 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 883 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); 884 uintptr_t i; 885 886 for (i = 0; i < opr_sz / 2; i += 2) { 887 float16 e0 = n[H2(i)]; 888 float16 e1 = m[H2(i + 1)]; 889 float16 e2 = n[H2(i + 1)]; 890 float16 e3 = m[H2(i)]; 891 892 if (rot) { 893 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 894 } else { 895 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 896 } 897 898 d[H2(i)] = float16_add(e0, e1, fpst); 899 d[H2(i + 1)] = float16_add(e2, e3, fpst); 900 } 901 clear_tail(d, opr_sz, simd_maxsz(desc)); 902 } 903 904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 905 float_status *fpst, uint32_t desc) 906 { 907 uintptr_t opr_sz = simd_oprsz(desc); 908 float32 *d = vd; 909 float32 *n = vn; 910 float32 *m = vm; 911 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 912 bool fpcr_ah = extract64(desc, 
SIMD_DATA_SHIFT + 1, 1); 913 uintptr_t i; 914 915 for (i = 0; i < opr_sz / 4; i += 2) { 916 float32 e0 = n[H4(i)]; 917 float32 e1 = m[H4(i + 1)]; 918 float32 e2 = n[H4(i + 1)]; 919 float32 e3 = m[H4(i)]; 920 921 if (rot) { 922 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 923 } else { 924 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 925 } 926 927 d[H4(i)] = float32_add(e0, e1, fpst); 928 d[H4(i + 1)] = float32_add(e2, e3, fpst); 929 } 930 clear_tail(d, opr_sz, simd_maxsz(desc)); 931 } 932 933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 934 float_status *fpst, uint32_t desc) 935 { 936 uintptr_t opr_sz = simd_oprsz(desc); 937 float64 *d = vd; 938 float64 *n = vn; 939 float64 *m = vm; 940 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 941 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); 942 uintptr_t i; 943 944 for (i = 0; i < opr_sz / 8; i += 2) { 945 float64 e0 = n[i]; 946 float64 e1 = m[i + 1]; 947 float64 e2 = n[i + 1]; 948 float64 e3 = m[i]; 949 950 if (rot) { 951 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 952 } else { 953 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 954 } 955 956 d[i] = float64_add(e0, e1, fpst); 957 d[i + 1] = float64_add(e2, e3, fpst); 958 } 959 clear_tail(d, opr_sz, simd_maxsz(desc)); 960 } 961 962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 963 float_status *fpst, uint32_t desc) 964 { 965 uintptr_t opr_sz = simd_oprsz(desc); 966 float16 *d = vd, *n = vn, *m = vm, *a = va; 967 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 968 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 969 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 970 uint32_t negf_real = flip ^ negf_imag; 971 float16 negx_imag, negx_real; 972 uintptr_t i; 973 974 /* With AH=0, use negx; with AH=1 use negf. */ 975 negx_real = (negf_real & ~fpcr_ah) << 15; 976 negx_imag = (negf_imag & ~fpcr_ah) << 15; 977 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 978 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 979 980 for (i = 0; i < opr_sz / 2; i += 2) { 981 float16 e2 = n[H2(i + flip)]; 982 float16 e1 = m[H2(i + flip)] ^ negx_real; 983 float16 e4 = e2; 984 float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag; 985 986 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst); 987 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst); 988 } 989 clear_tail(d, opr_sz, simd_maxsz(desc)); 990 } 991 992 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 993 float_status *fpst, uint32_t desc) 994 { 995 uintptr_t opr_sz = simd_oprsz(desc); 996 float16 *d = vd, *n = vn, *m = vm, *a = va; 997 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 998 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 999 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1000 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1); 1001 uint32_t negf_real = flip ^ negf_imag; 1002 intptr_t elements = opr_sz / sizeof(float16); 1003 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); 1004 float16 negx_imag, negx_real; 1005 intptr_t i, j; 1006 1007 /* With AH=0, use negx; with AH=1 use negf. */ 1008 negx_real = (negf_real & ~fpcr_ah) << 15; 1009 negx_imag = (negf_imag & ~fpcr_ah) << 15; 1010 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 1011 negf_imag = (negf_imag & fpcr_ah ? 
float_muladd_negate_product : 0); 1012 1013 for (i = 0; i < elements; i += eltspersegment) { 1014 float16 mr = m[H2(i + 2 * index + 0)]; 1015 float16 mi = m[H2(i + 2 * index + 1)]; 1016 float16 e1 = negx_real ^ (flip ? mi : mr); 1017 float16 e3 = negx_imag ^ (flip ? mr : mi); 1018 1019 for (j = i; j < i + eltspersegment; j += 2) { 1020 float16 e2 = n[H2(j + flip)]; 1021 float16 e4 = e2; 1022 1023 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst); 1024 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst); 1025 } 1026 } 1027 clear_tail(d, opr_sz, simd_maxsz(desc)); 1028 } 1029 1030 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 1031 float_status *fpst, uint32_t desc) 1032 { 1033 uintptr_t opr_sz = simd_oprsz(desc); 1034 float32 *d = vd, *n = vn, *m = vm, *a = va; 1035 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1036 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 1037 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1038 uint32_t negf_real = flip ^ negf_imag; 1039 float32 negx_imag, negx_real; 1040 uintptr_t i; 1041 1042 /* With AH=0, use negx; with AH=1 use negf. */ 1043 negx_real = (negf_real & ~fpcr_ah) << 31; 1044 negx_imag = (negf_imag & ~fpcr_ah) << 31; 1045 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 1046 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 1047 1048 for (i = 0; i < opr_sz / 4; i += 2) { 1049 float32 e2 = n[H4(i + flip)]; 1050 float32 e1 = m[H4(i + flip)] ^ negx_real; 1051 float32 e4 = e2; 1052 float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag; 1053 1054 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst); 1055 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst); 1056 } 1057 clear_tail(d, opr_sz, simd_maxsz(desc)); 1058 } 1059 1060 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 1061 float_status *fpst, uint32_t desc) 1062 { 1063 uintptr_t opr_sz = simd_oprsz(desc); 1064 float32 *d = vd, *n = vn, *m = vm, *a = va; 1065 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1066 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1067 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1068 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1); 1069 uint32_t negf_real = flip ^ negf_imag; 1070 intptr_t elements = opr_sz / sizeof(float32); 1071 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); 1072 float32 negx_imag, negx_real; 1073 intptr_t i, j; 1074 1075 /* With AH=0, use negx; with AH=1 use negf. */ 1076 negx_real = (negf_real & ~fpcr_ah) << 31; 1077 negx_imag = (negf_imag & ~fpcr_ah) << 31; 1078 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 1079 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 1080 1081 for (i = 0; i < elements; i += eltspersegment) { 1082 float32 mr = m[H4(i + 2 * index + 0)]; 1083 float32 mi = m[H4(i + 2 * index + 1)]; 1084 float32 e1 = negx_real ^ (flip ? mi : mr); 1085 float32 e3 = negx_imag ^ (flip ? 
mr : mi); 1086 1087 for (j = i; j < i + eltspersegment; j += 2) { 1088 float32 e2 = n[H4(j + flip)]; 1089 float32 e4 = e2; 1090 1091 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst); 1092 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst); 1093 } 1094 } 1095 clear_tail(d, opr_sz, simd_maxsz(desc)); 1096 } 1097 1098 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 1099 float_status *fpst, uint32_t desc) 1100 { 1101 uintptr_t opr_sz = simd_oprsz(desc); 1102 float64 *d = vd, *n = vn, *m = vm, *a = va; 1103 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1104 uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 1105 uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1106 uint32_t negf_real = flip ^ negf_imag; 1107 float64 negx_real, negx_imag; 1108 uintptr_t i; 1109 1110 /* With AH=0, use negx; with AH=1 use negf. */ 1111 negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63; 1112 negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63; 1113 negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0); 1114 negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0); 1115 1116 for (i = 0; i < opr_sz / 8; i += 2) { 1117 float64 e2 = n[i + flip]; 1118 float64 e1 = m[i + flip] ^ negx_real; 1119 float64 e4 = e2; 1120 float64 e3 = m[i + 1 - flip] ^ negx_imag; 1121 1122 d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst); 1123 d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst); 1124 } 1125 clear_tail(d, opr_sz, simd_maxsz(desc)); 1126 } 1127 1128 /* 1129 * Floating point comparisons producing an integer result (all 1s or all 0s). 1130 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1131 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 
1132 */ 1133 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 1134 { 1135 return -float16_eq_quiet(op1, op2, stat); 1136 } 1137 1138 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 1139 { 1140 return -float32_eq_quiet(op1, op2, stat); 1141 } 1142 1143 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat) 1144 { 1145 return -float64_eq_quiet(op1, op2, stat); 1146 } 1147 1148 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 1149 { 1150 return -float16_le(op2, op1, stat); 1151 } 1152 1153 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 1154 { 1155 return -float32_le(op2, op1, stat); 1156 } 1157 1158 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat) 1159 { 1160 return -float64_le(op2, op1, stat); 1161 } 1162 1163 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 1164 { 1165 return -float16_lt(op2, op1, stat); 1166 } 1167 1168 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 1169 { 1170 return -float32_lt(op2, op1, stat); 1171 } 1172 1173 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat) 1174 { 1175 return -float64_lt(op2, op1, stat); 1176 } 1177 1178 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 1179 { 1180 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 1181 } 1182 1183 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1184 { 1185 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1186 } 1187 1188 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat) 1189 { 1190 return -float64_le(float64_abs(op2), float64_abs(op1), stat); 1191 } 1192 1193 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1194 { 1195 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1196 } 1197 1198 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1199 { 1200 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1201 } 1202 1203 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat) 1204 { 1205 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); 1206 } 1207 1208 static int16_t vfp_tosszh(float16 x, float_status *fpst) 1209 { 1210 if (float16_is_any_nan(x)) { 1211 float_raise(float_flag_invalid, fpst); 1212 return 0; 1213 } 1214 return float16_to_int16_round_to_zero(x, fpst); 1215 } 1216 1217 static uint16_t vfp_touszh(float16 x, float_status *fpst) 1218 { 1219 if (float16_is_any_nan(x)) { 1220 float_raise(float_flag_invalid, fpst); 1221 return 0; 1222 } 1223 return float16_to_uint16_round_to_zero(x, fpst); 1224 } 1225 1226 #define DO_2OP(NAME, FUNC, TYPE) \ 1227 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 1228 { \ 1229 intptr_t i, oprsz = simd_oprsz(desc); \ 1230 TYPE *d = vd, *n = vn; \ 1231 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1232 d[i] = FUNC(n[i], stat); \ 1233 } \ 1234 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1235 } 1236 1237 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1238 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1239 DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32) 1240 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1241 1242 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1243 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1244 DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32) 1245 DO_2OP(gvec_frsqrte_d, 
helper_rsqrte_f64, float64) 1246 1247 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1248 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1249 1250 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1251 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1252 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1253 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1254 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1255 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1256 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1257 DO_2OP(gvec_touszh, vfp_touszh, float16) 1258 1259 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1260 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1261 { \ 1262 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1263 } 1264 1265 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1266 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1267 { \ 1268 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1269 } 1270 1271 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1272 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1273 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1274 WRAP_CMP0_##DIRN(FN, CMPOP, float64) \ 1275 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1276 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \ 1277 DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64) 1278 1279 DO_2OP_CMP0(cgt, cgt, FWD) 1280 DO_2OP_CMP0(cge, cge, FWD) 1281 DO_2OP_CMP0(ceq, ceq, FWD) 1282 DO_2OP_CMP0(clt, cgt, REV) 1283 DO_2OP_CMP0(cle, cge, REV) 1284 1285 #undef DO_2OP 1286 #undef DO_2OP_CMP0 1287 1288 /* Floating-point trigonometric starting value. 1289 * See the ARM ARM pseudocode function FPTrigSMul. 1290 */ 1291 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1292 { 1293 float16 result = float16_mul(op1, op1, stat); 1294 if (!float16_is_any_nan(result)) { 1295 result = float16_set_sign(result, op2 & 1); 1296 } 1297 return result; 1298 } 1299 1300 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1301 { 1302 float32 result = float32_mul(op1, op1, stat); 1303 if (!float32_is_any_nan(result)) { 1304 result = float32_set_sign(result, op2 & 1); 1305 } 1306 return result; 1307 } 1308 1309 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1310 { 1311 float64 result = float64_mul(op1, op1, stat); 1312 if (!float64_is_any_nan(result)) { 1313 result = float64_set_sign(result, op2 & 1); 1314 } 1315 return result; 1316 } 1317 1318 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1319 { 1320 return float16_abs(float16_sub(op1, op2, stat)); 1321 } 1322 1323 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1324 { 1325 return float32_abs(float32_sub(op1, op2, stat)); 1326 } 1327 1328 static float64 float64_abd(float64 op1, float64 op2, float_status *stat) 1329 { 1330 return float64_abs(float64_sub(op1, op2, stat)); 1331 } 1332 1333 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 1334 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat) 1335 { 1336 float16 r = float16_sub(op1, op2, stat); 1337 return float16_is_any_nan(r) ? r : float16_abs(r); 1338 } 1339 1340 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat) 1341 { 1342 float32 r = float32_sub(op1, op2, stat); 1343 return float32_is_any_nan(r) ? r : float32_abs(r); 1344 } 1345 1346 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat) 1347 { 1348 float64 r = float64_sub(op1, op2, stat); 1349 return float64_is_any_nan(r) ? 
r : float64_abs(r); 1350 } 1351 1352 /* 1353 * Reciprocal step. These are the AArch32 version which uses a 1354 * non-fused multiply-and-subtract. 1355 */ 1356 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1357 { 1358 op1 = float16_squash_input_denormal(op1, stat); 1359 op2 = float16_squash_input_denormal(op2, stat); 1360 1361 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1362 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1363 return float16_two; 1364 } 1365 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1366 } 1367 1368 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1369 { 1370 op1 = float32_squash_input_denormal(op1, stat); 1371 op2 = float32_squash_input_denormal(op2, stat); 1372 1373 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1374 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1375 return float32_two; 1376 } 1377 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1378 } 1379 1380 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1381 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1382 { 1383 op1 = float16_squash_input_denormal(op1, stat); 1384 op2 = float16_squash_input_denormal(op2, stat); 1385 1386 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1387 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1388 return float16_one_point_five; 1389 } 1390 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1391 return float16_div(op1, float16_two, stat); 1392 } 1393 1394 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1395 { 1396 op1 = float32_squash_input_denormal(op1, stat); 1397 op2 = float32_squash_input_denormal(op2, stat); 1398 1399 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1400 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1401 return float32_one_point_five; 1402 } 1403 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1404 return float32_div(op1, float32_two, stat); 1405 } 1406 1407 #define DO_3OP(NAME, FUNC, TYPE) \ 1408 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1409 float_status *stat, uint32_t desc) \ 1410 { \ 1411 intptr_t i, oprsz = simd_oprsz(desc); \ 1412 TYPE *d = vd, *n = vn, *m = vm; \ 1413 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1414 d[i] = FUNC(n[i], m[i], stat); \ 1415 } \ 1416 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1417 } 1418 1419 DO_3OP(gvec_fadd_h, float16_add, float16) 1420 DO_3OP(gvec_fadd_s, float32_add, float32) 1421 DO_3OP(gvec_fadd_d, float64_add, float64) 1422 1423 DO_3OP(gvec_fsub_h, float16_sub, float16) 1424 DO_3OP(gvec_fsub_s, float32_sub, float32) 1425 DO_3OP(gvec_fsub_d, float64_sub, float64) 1426 1427 DO_3OP(gvec_fmul_h, float16_mul, float16) 1428 DO_3OP(gvec_fmul_s, float32_mul, float32) 1429 DO_3OP(gvec_fmul_d, float64_mul, float64) 1430 1431 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1432 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1433 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1434 1435 DO_3OP(gvec_fabd_h, float16_abd, float16) 1436 DO_3OP(gvec_fabd_s, float32_abd, float32) 1437 DO_3OP(gvec_fabd_d, float64_abd, float64) 1438 1439 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16) 1440 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32) 1441 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64) 1442 1443 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1444 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1445 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1446 
1447 DO_3OP(gvec_fcge_h, float16_cge, float16) 1448 DO_3OP(gvec_fcge_s, float32_cge, float32) 1449 DO_3OP(gvec_fcge_d, float64_cge, float64) 1450 1451 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1452 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1453 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1454 1455 DO_3OP(gvec_facge_h, float16_acge, float16) 1456 DO_3OP(gvec_facge_s, float32_acge, float32) 1457 DO_3OP(gvec_facge_d, float64_acge, float64) 1458 1459 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1460 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1461 DO_3OP(gvec_facgt_d, float64_acgt, float64) 1462 1463 DO_3OP(gvec_fmax_h, float16_max, float16) 1464 DO_3OP(gvec_fmax_s, float32_max, float32) 1465 DO_3OP(gvec_fmax_d, float64_max, float64) 1466 1467 DO_3OP(gvec_fmin_h, float16_min, float16) 1468 DO_3OP(gvec_fmin_s, float32_min, float32) 1469 DO_3OP(gvec_fmin_d, float64_min, float64) 1470 1471 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1472 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1473 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64) 1474 1475 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1476 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1477 DO_3OP(gvec_fminnum_d, float64_minnum, float64) 1478 1479 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1480 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1481 1482 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1483 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1484 1485 #ifdef TARGET_AARCH64 1486 DO_3OP(gvec_fdiv_h, float16_div, float16) 1487 DO_3OP(gvec_fdiv_s, float32_div, float32) 1488 DO_3OP(gvec_fdiv_d, float64_div, float64) 1489 1490 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1491 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1492 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1493 1494 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1495 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1496 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1497 1498 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1499 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1500 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1501 1502 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16) 1503 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32) 1504 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64) 1505 1506 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16) 1507 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32) 1508 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64) 1509 1510 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16) 1511 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32) 1512 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64) 1513 1514 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16) 1515 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32) 1516 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64) 1517 1518 #endif 1519 #undef DO_3OP 1520 1521 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1522 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1523 float_status *stat) 1524 { 1525 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1526 } 1527 1528 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1529 float_status *stat) 1530 { 1531 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1532 } 1533 1534 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1535 float_status *stat) 1536 { 1537 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1538 } 1539 1540 static 
float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1541 float_status *stat) 1542 { 1543 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1544 } 1545 1546 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1547 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1548 float_status *stat) 1549 { 1550 return float16_muladd(op1, op2, dest, 0, stat); 1551 } 1552 1553 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1554 float_status *stat) 1555 { 1556 return float32_muladd(op1, op2, dest, 0, stat); 1557 } 1558 1559 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2, 1560 float_status *stat) 1561 { 1562 return float64_muladd(op1, op2, dest, 0, stat); 1563 } 1564 1565 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1566 float_status *stat) 1567 { 1568 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1569 } 1570 1571 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1572 float_status *stat) 1573 { 1574 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1575 } 1576 1577 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, 1578 float_status *stat) 1579 { 1580 return float64_muladd(float64_chs(op1), op2, dest, 0, stat); 1581 } 1582 1583 static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2, 1584 float_status *stat) 1585 { 1586 return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat); 1587 } 1588 1589 static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2, 1590 float_status *stat) 1591 { 1592 return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat); 1593 } 1594 1595 static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2, 1596 float_status *stat) 1597 { 1598 return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat); 1599 } 1600 1601 #define DO_MULADD(NAME, FUNC, TYPE) \ 1602 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1603 float_status *stat, uint32_t desc) \ 1604 { \ 1605 intptr_t i, oprsz = simd_oprsz(desc); \ 1606 TYPE *d = vd, *n = vn, *m = vm; \ 1607 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1608 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1609 } \ 1610 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1611 } 1612 1613 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1614 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1615 1616 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1617 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1618 1619 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1620 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1621 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1622 1623 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1624 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1625 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1626 1627 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16) 1628 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32) 1629 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64) 1630 1631 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1632 * For AdvSIMD, there is of course only one such vector segment. 
1633 */ 1634 1635 #define DO_MUL_IDX(NAME, TYPE, H) \ 1636 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1637 { \ 1638 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1639 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1640 intptr_t idx = simd_data(desc); \ 1641 TYPE *d = vd, *n = vn, *m = vm; \ 1642 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1643 TYPE mm = m[H(i + idx)]; \ 1644 for (j = 0; j < segment; j++) { \ 1645 d[i + j] = n[i + j] * mm; \ 1646 } \ 1647 } \ 1648 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1649 } 1650 1651 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1652 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1653 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1654 1655 #undef DO_MUL_IDX 1656 1657 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1658 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1659 { \ 1660 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1661 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1662 intptr_t idx = simd_data(desc); \ 1663 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1664 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1665 TYPE mm = m[H(i + idx)]; \ 1666 for (j = 0; j < segment; j++) { \ 1667 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1668 } \ 1669 } \ 1670 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1671 } 1672 1673 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1674 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1675 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1676 1677 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1678 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1679 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1680 1681 #undef DO_MLA_IDX 1682 1683 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1684 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1685 float_status *stat, uint32_t desc) \ 1686 { \ 1687 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1688 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1689 intptr_t idx = simd_data(desc); \ 1690 TYPE *d = vd, *n = vn, *m = vm; \ 1691 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1692 TYPE mm = m[H(i + idx)]; \ 1693 for (j = 0; j < segment; j++) { \ 1694 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1695 } \ 1696 } \ 1697 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1698 } 1699 1700 #define nop(N, M, S) (M) 1701 1702 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1703 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1704 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1705 1706 #ifdef TARGET_AARCH64 1707 1708 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1709 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1710 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1711 1712 #endif 1713 1714 #undef nop 1715 1716 /* 1717 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1718 * the fused ops below they assume accumulate both from and into Vd. 
1719 */ 1720 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1721 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1722 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1723 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1724 1725 #undef DO_FMUL_IDX 1726 1727 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \ 1728 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1729 float_status *stat, uint32_t desc) \ 1730 { \ 1731 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1732 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1733 intptr_t idx = simd_data(desc); \ 1734 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1735 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1736 TYPE mm = m[H(i + idx)]; \ 1737 for (j = 0; j < segment; j++) { \ 1738 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \ 1739 a[i + j], NEGF, stat); \ 1740 } \ 1741 } \ 1742 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1743 } 1744 1745 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) 1746 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) 1747 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) 1748 1749 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) 1750 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) 1751 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) 1752 1753 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) 1754 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) 1755 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) 1756 1757 #undef DO_FMLA_IDX 1758 1759 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1760 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1761 { \ 1762 intptr_t i, oprsz = simd_oprsz(desc); \ 1763 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1764 bool q = false; \ 1765 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1766 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1767 if (dd < MIN) { \ 1768 dd = MIN; \ 1769 q = true; \ 1770 } else if (dd > MAX) { \ 1771 dd = MAX; \ 1772 q = true; \ 1773 } \ 1774 d[i] = dd; \ 1775 } \ 1776 if (q) { \ 1777 uint32_t *qc = vq; \ 1778 qc[0] = 1; \ 1779 } \ 1780 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1781 } 1782 1783 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1784 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1785 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1786 1787 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1788 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1789 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1790 1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1794 1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1798 1799 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1800 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1801 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1802 1803 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1804 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1805 DO_SAT(gvec_suqadd_s, int64_t, 
int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1806 1807 #undef DO_SAT 1808 1809 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1810 void *vm, uint32_t desc) 1811 { 1812 intptr_t i, oprsz = simd_oprsz(desc); 1813 uint64_t *d = vd, *n = vn, *m = vm; 1814 bool q = false; 1815 1816 for (i = 0; i < oprsz / 8; i++) { 1817 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1818 if (dd < nn) { 1819 dd = UINT64_MAX; 1820 q = true; 1821 } 1822 d[i] = dd; 1823 } 1824 if (q) { 1825 uint32_t *qc = vq; 1826 qc[0] = 1; 1827 } 1828 clear_tail(d, oprsz, simd_maxsz(desc)); 1829 } 1830 1831 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1832 void *vm, uint32_t desc) 1833 { 1834 intptr_t i, oprsz = simd_oprsz(desc); 1835 uint64_t *d = vd, *n = vn, *m = vm; 1836 bool q = false; 1837 1838 for (i = 0; i < oprsz / 8; i++) { 1839 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1840 if (nn < mm) { 1841 dd = 0; 1842 q = true; 1843 } 1844 d[i] = dd; 1845 } 1846 if (q) { 1847 uint32_t *qc = vq; 1848 qc[0] = 1; 1849 } 1850 clear_tail(d, oprsz, simd_maxsz(desc)); 1851 } 1852 1853 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1854 void *vm, uint32_t desc) 1855 { 1856 intptr_t i, oprsz = simd_oprsz(desc); 1857 int64_t *d = vd, *n = vn, *m = vm; 1858 bool q = false; 1859 1860 for (i = 0; i < oprsz / 8; i++) { 1861 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1862 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1863 dd = (nn >> 63) ^ ~INT64_MIN; 1864 q = true; 1865 } 1866 d[i] = dd; 1867 } 1868 if (q) { 1869 uint32_t *qc = vq; 1870 qc[0] = 1; 1871 } 1872 clear_tail(d, oprsz, simd_maxsz(desc)); 1873 } 1874 1875 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1876 void *vm, uint32_t desc) 1877 { 1878 intptr_t i, oprsz = simd_oprsz(desc); 1879 int64_t *d = vd, *n = vn, *m = vm; 1880 bool q = false; 1881 1882 for (i = 0; i < oprsz / 8; i++) { 1883 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1884 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1885 dd = (nn >> 63) ^ ~INT64_MIN; 1886 q = true; 1887 } 1888 d[i] = dd; 1889 } 1890 if (q) { 1891 uint32_t *qc = vq; 1892 qc[0] = 1; 1893 } 1894 clear_tail(d, oprsz, simd_maxsz(desc)); 1895 } 1896 1897 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1898 void *vm, uint32_t desc) 1899 { 1900 intptr_t i, oprsz = simd_oprsz(desc); 1901 uint64_t *d = vd, *n = vn, *m = vm; 1902 bool q = false; 1903 1904 for (i = 0; i < oprsz / 8; i++) { 1905 uint64_t nn = n[i]; 1906 int64_t mm = m[i]; 1907 uint64_t dd = nn + mm; 1908 1909 if (mm < 0) { 1910 if (nn < (uint64_t)-mm) { 1911 dd = 0; 1912 q = true; 1913 } 1914 } else { 1915 if (dd < nn) { 1916 dd = UINT64_MAX; 1917 q = true; 1918 } 1919 } 1920 d[i] = dd; 1921 } 1922 if (q) { 1923 uint32_t *qc = vq; 1924 qc[0] = 1; 1925 } 1926 clear_tail(d, oprsz, simd_maxsz(desc)); 1927 } 1928 1929 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1930 void *vm, uint32_t desc) 1931 { 1932 intptr_t i, oprsz = simd_oprsz(desc); 1933 uint64_t *d = vd, *n = vn, *m = vm; 1934 bool q = false; 1935 1936 for (i = 0; i < oprsz / 8; i++) { 1937 int64_t nn = n[i]; 1938 uint64_t mm = m[i]; 1939 int64_t dd = nn + mm; 1940 1941 if (mm > (uint64_t)(INT64_MAX - nn)) { 1942 dd = INT64_MAX; 1943 q = true; 1944 } 1945 d[i] = dd; 1946 } 1947 if (q) { 1948 uint32_t *qc = vq; 1949 qc[0] = 1; 1950 } 1951 clear_tail(d, oprsz, simd_maxsz(desc)); 1952 } 1953 1954 #define DO_SRA(NAME, TYPE) \ 1955 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1956 { \ 1957 intptr_t i, oprsz = simd_oprsz(desc); \ 1958 int shift = simd_data(desc); \ 1959 TYPE *d = vd, *n 
= vn; \ 1960 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1961 d[i] += n[i] >> shift; \ 1962 } \ 1963 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1964 } 1965 1966 DO_SRA(gvec_ssra_b, int8_t) 1967 DO_SRA(gvec_ssra_h, int16_t) 1968 DO_SRA(gvec_ssra_s, int32_t) 1969 DO_SRA(gvec_ssra_d, int64_t) 1970 1971 DO_SRA(gvec_usra_b, uint8_t) 1972 DO_SRA(gvec_usra_h, uint16_t) 1973 DO_SRA(gvec_usra_s, uint32_t) 1974 DO_SRA(gvec_usra_d, uint64_t) 1975 1976 #undef DO_SRA 1977 1978 #define DO_RSHR(NAME, TYPE) \ 1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1980 { \ 1981 intptr_t i, oprsz = simd_oprsz(desc); \ 1982 int shift = simd_data(desc); \ 1983 TYPE *d = vd, *n = vn; \ 1984 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1985 TYPE tmp = n[i] >> (shift - 1); \ 1986 d[i] = (tmp >> 1) + (tmp & 1); \ 1987 } \ 1988 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1989 } 1990 1991 DO_RSHR(gvec_srshr_b, int8_t) 1992 DO_RSHR(gvec_srshr_h, int16_t) 1993 DO_RSHR(gvec_srshr_s, int32_t) 1994 DO_RSHR(gvec_srshr_d, int64_t) 1995 1996 DO_RSHR(gvec_urshr_b, uint8_t) 1997 DO_RSHR(gvec_urshr_h, uint16_t) 1998 DO_RSHR(gvec_urshr_s, uint32_t) 1999 DO_RSHR(gvec_urshr_d, uint64_t) 2000 2001 #undef DO_RSHR 2002 2003 #define DO_RSRA(NAME, TYPE) \ 2004 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2005 { \ 2006 intptr_t i, oprsz = simd_oprsz(desc); \ 2007 int shift = simd_data(desc); \ 2008 TYPE *d = vd, *n = vn; \ 2009 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2010 TYPE tmp = n[i] >> (shift - 1); \ 2011 d[i] += (tmp >> 1) + (tmp & 1); \ 2012 } \ 2013 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2014 } 2015 2016 DO_RSRA(gvec_srsra_b, int8_t) 2017 DO_RSRA(gvec_srsra_h, int16_t) 2018 DO_RSRA(gvec_srsra_s, int32_t) 2019 DO_RSRA(gvec_srsra_d, int64_t) 2020 2021 DO_RSRA(gvec_ursra_b, uint8_t) 2022 DO_RSRA(gvec_ursra_h, uint16_t) 2023 DO_RSRA(gvec_ursra_s, uint32_t) 2024 DO_RSRA(gvec_ursra_d, uint64_t) 2025 2026 #undef DO_RSRA 2027 2028 #define DO_SRI(NAME, TYPE) \ 2029 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2030 { \ 2031 intptr_t i, oprsz = simd_oprsz(desc); \ 2032 int shift = simd_data(desc); \ 2033 TYPE *d = vd, *n = vn; \ 2034 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2035 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 2036 } \ 2037 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2038 } 2039 2040 DO_SRI(gvec_sri_b, uint8_t) 2041 DO_SRI(gvec_sri_h, uint16_t) 2042 DO_SRI(gvec_sri_s, uint32_t) 2043 DO_SRI(gvec_sri_d, uint64_t) 2044 2045 #undef DO_SRI 2046 2047 #define DO_SLI(NAME, TYPE) \ 2048 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2049 { \ 2050 intptr_t i, oprsz = simd_oprsz(desc); \ 2051 int shift = simd_data(desc); \ 2052 TYPE *d = vd, *n = vn; \ 2053 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2054 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 2055 } \ 2056 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2057 } 2058 2059 DO_SLI(gvec_sli_b, uint8_t) 2060 DO_SLI(gvec_sli_h, uint16_t) 2061 DO_SLI(gvec_sli_s, uint32_t) 2062 DO_SLI(gvec_sli_d, uint64_t) 2063 2064 #undef DO_SLI 2065 2066 /* 2067 * Convert float16 to float32, raising no exceptions and 2068 * preserving exceptional values, including SNaN. 2069 * This is effectively an unpack+repack operation. 
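 *
 * For example, the float16 SNaN 0x7c01 (sign 0, exponent 0x1f,
 * fraction 0x001) is repacked as the float32 SNaN 0x7f802000: the
 * exponent field becomes 0xff and the 10-bit fraction is shifted up
 * by 13 bits, so the quiet bit stays clear instead of the NaN being
 * silenced as an ordinary conversion would do.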
2070 */ 2071 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 2072 { 2073 const int f16_bias = 15; 2074 const int f32_bias = 127; 2075 uint32_t sign = extract32(f16, 15, 1); 2076 uint32_t exp = extract32(f16, 10, 5); 2077 uint32_t frac = extract32(f16, 0, 10); 2078 2079 if (exp == 0x1f) { 2080 /* Inf or NaN */ 2081 exp = 0xff; 2082 } else if (exp == 0) { 2083 /* Zero or denormal. */ 2084 if (frac != 0) { 2085 if (fz16) { 2086 frac = 0; 2087 } else { 2088 /* 2089 * Denormal; these are all normal float32. 2090 * Shift the fraction so that the msb is at bit 11, 2091 * then remove bit 11 as the implicit bit of the 2092 * normalized float32. Note that we still go through 2093 * the shift for normal numbers below, to put the 2094 * float32 fraction at the right place. 2095 */ 2096 int shift = clz32(frac) - 21; 2097 frac = (frac << shift) & 0x3ff; 2098 exp = f32_bias - f16_bias - shift + 1; 2099 } 2100 } 2101 } else { 2102 /* Normal number; adjust the bias. */ 2103 exp += f32_bias - f16_bias; 2104 } 2105 sign <<= 31; 2106 exp <<= 23; 2107 frac <<= 23 - 10; 2108 2109 return sign | exp | frac; 2110 } 2111 2112 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 2113 { 2114 /* 2115 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 2116 * Load the 2nd qword iff is_q & is_2. 2117 * Shift to the 2nd dword iff !is_q & is_2. 2118 * For !is_q & !is_2, the upper bits of the result are garbage. 2119 */ 2120 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 2121 } 2122 2123 /* 2124 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 2125 * as there is not yet SVE versions that might use blocking. 2126 */ 2127 2128 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 2129 uint64_t negx, int negf, uint32_t desc, bool fz16) 2130 { 2131 intptr_t i, oprsz = simd_oprsz(desc); 2132 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2133 int is_q = oprsz == 16; 2134 uint64_t n_4, m_4; 2135 2136 /* 2137 * Pre-load all of the f16 data, avoiding overlap issues. 2138 * Negate all inputs for AH=0 FMLSL at once. 2139 */ 2140 n_4 = load4_f16(vn, is_q, is_2) ^ negx; 2141 m_4 = load4_f16(vm, is_q, is_2); 2142 2143 for (i = 0; i < oprsz / 4; i++) { 2144 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2145 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2146 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); 2147 } 2148 clear_tail(d, oprsz, simd_maxsz(desc)); 2149 } 2150 2151 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2152 CPUARMState *env, uint32_t desc) 2153 { 2154 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2155 uint64_t negx = is_s ? 
0x8000800080008000ull : 0; 2156 2157 do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc, 2158 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2159 } 2160 2161 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2162 CPUARMState *env, uint32_t desc) 2163 { 2164 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2165 uint64_t negx = 0; 2166 int negf = 0; 2167 2168 if (is_s) { 2169 if (env->vfp.fpcr & FPCR_AH) { 2170 negf = float_muladd_negate_product; 2171 } else { 2172 negx = 0x8000800080008000ull; 2173 } 2174 } 2175 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc, 2176 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2177 } 2178 2179 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2180 CPUARMState *env, uint32_t desc) 2181 { 2182 intptr_t i, oprsz = simd_oprsz(desc); 2183 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2184 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2185 float_status *status = &env->vfp.fp_status_a64; 2186 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2187 int negx = 0, negf = 0; 2188 2189 if (is_s) { 2190 if (env->vfp.fpcr & FPCR_AH) { 2191 negf = float_muladd_negate_product; 2192 } else { 2193 negx = 0x8000; 2194 } 2195 } 2196 2197 for (i = 0; i < oprsz; i += sizeof(float32)) { 2198 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx; 2199 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2200 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2201 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2202 float32 aa = *(float32 *)(va + H1_4(i)); 2203 2204 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status); 2205 } 2206 } 2207 2208 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2209 uint64_t negx, int negf, uint32_t desc, bool fz16) 2210 { 2211 intptr_t i, oprsz = simd_oprsz(desc); 2212 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2213 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2214 int is_q = oprsz == 16; 2215 uint64_t n_4; 2216 float32 m_1; 2217 2218 /* 2219 * Pre-load all of the f16 data, avoiding overlap issues. 2220 * Negate all inputs for AH=0 FMLSL at once. 2221 */ 2222 n_4 = load4_f16(vn, is_q, is_2) ^ negx; 2223 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2224 2225 for (i = 0; i < oprsz / 4; i++) { 2226 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2227 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); 2228 } 2229 clear_tail(d, oprsz, simd_maxsz(desc)); 2230 } 2231 2232 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2233 CPUARMState *env, uint32_t desc) 2234 { 2235 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2236 uint64_t negx = is_s ? 
0x8000800080008000ull : 0; 2237 2238 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc, 2239 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2240 } 2241 2242 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2243 CPUARMState *env, uint32_t desc) 2244 { 2245 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2246 uint64_t negx = 0; 2247 int negf = 0; 2248 2249 if (is_s) { 2250 if (env->vfp.fpcr & FPCR_AH) { 2251 negf = float_muladd_negate_product; 2252 } else { 2253 negx = 0x8000800080008000ull; 2254 } 2255 } 2256 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, negx, negf, desc, 2257 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2258 } 2259 2260 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2261 CPUARMState *env, uint32_t desc) 2262 { 2263 intptr_t i, j, oprsz = simd_oprsz(desc); 2264 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2265 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2266 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2267 float_status *status = &env->vfp.fp_status_a64; 2268 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2269 int negx = 0, negf = 0; 2270 2271 if (is_s) { 2272 if (env->vfp.fpcr & FPCR_AH) { 2273 negf = float_muladd_negate_product; 2274 } else { 2275 negx = 0x8000; 2276 } 2277 } 2278 2279 for (i = 0; i < oprsz; i += 16) { 2280 float16 mm_16 = *(float16 *)(vm + i + idx); 2281 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2282 2283 for (j = 0; j < 16; j += sizeof(float32)) { 2284 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; 2285 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2286 float32 aa = *(float32 *)(va + H1_4(i + j)); 2287 2288 *(float32 *)(vd + H1_4(i + j)) = 2289 float32_muladd(nn, mm, aa, negf, status); 2290 } 2291 } 2292 } 2293 2294 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2295 { 2296 intptr_t i, opr_sz = simd_oprsz(desc); 2297 int8_t *d = vd, *n = vn, *m = vm; 2298 2299 for (i = 0; i < opr_sz; ++i) { 2300 int8_t mm = m[i]; 2301 int8_t nn = n[i]; 2302 int8_t res = 0; 2303 if (mm >= 0) { 2304 if (mm < 8) { 2305 res = nn << mm; 2306 } 2307 } else { 2308 res = nn >> (mm > -8 ? -mm : 7); 2309 } 2310 d[i] = res; 2311 } 2312 clear_tail(d, opr_sz, simd_maxsz(desc)); 2313 } 2314 2315 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2316 { 2317 intptr_t i, opr_sz = simd_oprsz(desc); 2318 int16_t *d = vd, *n = vn, *m = vm; 2319 2320 for (i = 0; i < opr_sz / 2; ++i) { 2321 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2322 int16_t nn = n[i]; 2323 int16_t res = 0; 2324 if (mm >= 0) { 2325 if (mm < 16) { 2326 res = nn << mm; 2327 } 2328 } else { 2329 res = nn >> (mm > -16 ? 
-mm : 15); 2330 } 2331 d[i] = res; 2332 } 2333 clear_tail(d, opr_sz, simd_maxsz(desc)); 2334 } 2335 2336 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2337 { 2338 intptr_t i, opr_sz = simd_oprsz(desc); 2339 uint8_t *d = vd, *n = vn, *m = vm; 2340 2341 for (i = 0; i < opr_sz; ++i) { 2342 int8_t mm = m[i]; 2343 uint8_t nn = n[i]; 2344 uint8_t res = 0; 2345 if (mm >= 0) { 2346 if (mm < 8) { 2347 res = nn << mm; 2348 } 2349 } else { 2350 if (mm > -8) { 2351 res = nn >> -mm; 2352 } 2353 } 2354 d[i] = res; 2355 } 2356 clear_tail(d, opr_sz, simd_maxsz(desc)); 2357 } 2358 2359 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2360 { 2361 intptr_t i, opr_sz = simd_oprsz(desc); 2362 uint16_t *d = vd, *n = vn, *m = vm; 2363 2364 for (i = 0; i < opr_sz / 2; ++i) { 2365 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2366 uint16_t nn = n[i]; 2367 uint16_t res = 0; 2368 if (mm >= 0) { 2369 if (mm < 16) { 2370 res = nn << mm; 2371 } 2372 } else { 2373 if (mm > -16) { 2374 res = nn >> -mm; 2375 } 2376 } 2377 d[i] = res; 2378 } 2379 clear_tail(d, opr_sz, simd_maxsz(desc)); 2380 } 2381 2382 /* 2383 * 8x8->8 polynomial multiply. 2384 * 2385 * Polynomial multiplication is like integer multiplication except the 2386 * partial products are XORed, not added. 2387 * 2388 * TODO: expose this as a generic vector operation, as it is a common 2389 * crypto building block. 2390 */ 2391 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2392 { 2393 intptr_t i, opr_sz = simd_oprsz(desc); 2394 uint64_t *d = vd, *n = vn, *m = vm; 2395 2396 for (i = 0; i < opr_sz / 8; ++i) { 2397 d[i] = clmul_8x8_low(n[i], m[i]); 2398 } 2399 clear_tail(d, opr_sz, simd_maxsz(desc)); 2400 } 2401 2402 /* 2403 * 64x64->128 polynomial multiply. 2404 * Because of the lanes are not accessed in strict columns, 2405 * this probably cannot be turned into a generic helper. 
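 *
 * Concretely, each iteration below reads the single 64-bit lane
 * n[i + hi] and m[i + hi] (with hi selecting the even or odd lane of
 * each 128-bit pair) and spreads the 128-bit carry-less product across
 * both d[i] and d[i + 1], so inputs and outputs do not line up
 * column-for-column.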
2406 */ 2407 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2408 { 2409 intptr_t i, opr_sz = simd_oprsz(desc); 2410 intptr_t hi = simd_data(desc); 2411 uint64_t *d = vd, *n = vn, *m = vm; 2412 2413 for (i = 0; i < opr_sz / 8; i += 2) { 2414 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2415 d[i] = int128_getlo(r); 2416 d[i + 1] = int128_gethi(r); 2417 } 2418 clear_tail(d, opr_sz, simd_maxsz(desc)); 2419 } 2420 2421 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2422 { 2423 int hi = simd_data(desc); 2424 uint64_t *d = vd, *n = vn, *m = vm; 2425 uint64_t nn = n[hi], mm = m[hi]; 2426 2427 d[0] = clmul_8x4_packed(nn, mm); 2428 nn >>= 32; 2429 mm >>= 32; 2430 d[1] = clmul_8x4_packed(nn, mm); 2431 2432 clear_tail(d, 16, simd_maxsz(desc)); 2433 } 2434 2435 #ifdef TARGET_AARCH64 2436 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2437 { 2438 int shift = simd_data(desc) * 8; 2439 intptr_t i, opr_sz = simd_oprsz(desc); 2440 uint64_t *d = vd, *n = vn, *m = vm; 2441 2442 for (i = 0; i < opr_sz / 8; ++i) { 2443 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2444 } 2445 } 2446 2447 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2448 { 2449 intptr_t sel = H4(simd_data(desc)); 2450 intptr_t i, opr_sz = simd_oprsz(desc); 2451 uint32_t *n = vn, *m = vm; 2452 uint64_t *d = vd; 2453 2454 for (i = 0; i < opr_sz / 8; ++i) { 2455 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2456 } 2457 } 2458 #endif 2459 2460 #define DO_CMP0(NAME, TYPE, OP) \ 2461 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2462 { \ 2463 intptr_t i, opr_sz = simd_oprsz(desc); \ 2464 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2465 TYPE nn = *(TYPE *)(vn + i); \ 2466 *(TYPE *)(vd + i) = -(nn OP 0); \ 2467 } \ 2468 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2469 } 2470 2471 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2472 DO_CMP0(gvec_clt0_b, int8_t, <) 2473 DO_CMP0(gvec_cle0_b, int8_t, <=) 2474 DO_CMP0(gvec_cgt0_b, int8_t, >) 2475 DO_CMP0(gvec_cge0_b, int8_t, >=) 2476 2477 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2478 DO_CMP0(gvec_clt0_h, int16_t, <) 2479 DO_CMP0(gvec_cle0_h, int16_t, <=) 2480 DO_CMP0(gvec_cgt0_h, int16_t, >) 2481 DO_CMP0(gvec_cge0_h, int16_t, >=) 2482 2483 #undef DO_CMP0 2484 2485 #define DO_ABD(NAME, TYPE) \ 2486 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2487 { \ 2488 intptr_t i, opr_sz = simd_oprsz(desc); \ 2489 TYPE *d = vd, *n = vn, *m = vm; \ 2490 \ 2491 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2492 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2493 } \ 2494 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2495 } 2496 2497 DO_ABD(gvec_sabd_b, int8_t) 2498 DO_ABD(gvec_sabd_h, int16_t) 2499 DO_ABD(gvec_sabd_s, int32_t) 2500 DO_ABD(gvec_sabd_d, int64_t) 2501 2502 DO_ABD(gvec_uabd_b, uint8_t) 2503 DO_ABD(gvec_uabd_h, uint16_t) 2504 DO_ABD(gvec_uabd_s, uint32_t) 2505 DO_ABD(gvec_uabd_d, uint64_t) 2506 2507 #undef DO_ABD 2508 2509 #define DO_ABA(NAME, TYPE) \ 2510 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2511 { \ 2512 intptr_t i, opr_sz = simd_oprsz(desc); \ 2513 TYPE *d = vd, *n = vn, *m = vm; \ 2514 \ 2515 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2516 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2517 } \ 2518 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2519 } 2520 2521 DO_ABA(gvec_saba_b, int8_t) 2522 DO_ABA(gvec_saba_h, int16_t) 2523 DO_ABA(gvec_saba_s, int32_t) 2524 DO_ABA(gvec_saba_d, int64_t) 2525 2526 DO_ABA(gvec_uaba_b, uint8_t) 2527 DO_ABA(gvec_uaba_h, uint16_t) 2528 DO_ABA(gvec_uaba_s, uint32_t) 2529 DO_ABA(gvec_uaba_d, uint64_t) 2530 2531 #undef DO_ABA 2532 2533 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2534 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2535 float_status *stat, uint32_t desc) \ 2536 { \ 2537 ARMVectorReg scratch; \ 2538 intptr_t oprsz = simd_oprsz(desc); \ 2539 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2540 TYPE *d = vd, *n = vn, *m = vm; \ 2541 if (unlikely(d == m)) { \ 2542 m = memcpy(&scratch, m, oprsz); \ 2543 } \ 2544 for (intptr_t i = 0; i < half; ++i) { \ 2545 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2546 } \ 2547 for (intptr_t i = 0; i < half; ++i) { \ 2548 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2549 } \ 2550 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2551 } 2552 2553 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2554 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2555 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2556 2557 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2558 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2559 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2560 2561 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2562 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2563 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2564 2565 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2566 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2567 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2568 2569 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2570 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2571 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2572 2573 #ifdef TARGET_AARCH64 2574 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2575 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2576 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2577 2578 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2579 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2580 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2581 #endif 2582 2583 #undef DO_3OP_PAIR 2584 2585 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2586 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2587 { \ 2588 ARMVectorReg scratch; \ 2589 intptr_t oprsz = simd_oprsz(desc); \ 2590 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2591 TYPE *d = vd, *n = vn, *m = vm; \ 2592 if (unlikely(d == m)) { \ 2593 m = memcpy(&scratch, m, oprsz); \ 2594 } \ 2595 for (intptr_t i = 0; i < half; ++i) { \ 2596 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2597 } \ 2598 for (intptr_t i = 0; i < half; ++i) { \ 2599 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2600 } \ 2601 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2602 } 2603 2604 #define ADD(A, B) (A + B) 2605 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2606 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2607 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2608 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2609 #undef ADD 2610 2611 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2612 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2613 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2614 2615 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2616 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2617 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2618 2619 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2620 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2621 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2622 2623 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2624 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2625 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2626 2627 #undef DO_3OP_PAIR 2628 2629 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2630 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2631 { \ 2632 intptr_t i, oprsz = simd_oprsz(desc); \ 2633 int shift = simd_data(desc); \ 2634 TYPE *d = vd, *n = vn; \ 2635 float_status *fpst = stat; \ 2636 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2637 d[i] = FUNC(n[i], shift, fpst); \ 2638 } \ 2639 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2640 } 2641 2642 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2643 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2644 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2645 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2646 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2647 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2648 2649 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2650 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2651 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2652 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2653 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2654 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2655 2656 #undef DO_VCVT_FIXED 2657 2658 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2659 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2660 { \ 2661 intptr_t i, oprsz = simd_oprsz(desc); \ 2662 uint32_t rmode = simd_data(desc); \ 2663 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2664 TYPE *d = vd, *n = vn; \ 2665 set_float_rounding_mode(rmode, fpst); \ 2666 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2667 d[i] = FUNC(n[i], 0, fpst); \ 2668 } \ 2669 set_float_rounding_mode(prev_rmode, fpst); \ 2670 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2671 } 2672 2673 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2674 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2675 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2676 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2677 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2678 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2679 2680 #undef DO_VCVT_RMODE 2681 2682 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2683 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2684 { \ 2685 intptr_t i, oprsz = simd_oprsz(desc); \ 2686 uint32_t rmode = simd_data(desc); \ 2687 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2688 TYPE *d = vd, *n = vn; \ 2689 set_float_rounding_mode(rmode, fpst); \ 2690 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2691 d[i] = FUNC(n[i], fpst); \ 2692 } \ 2693 set_float_rounding_mode(prev_rmode, fpst); \ 2694 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2695 } 2696 2697 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2698 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2699 2700 #undef DO_VRINT_RMODE 2701 2702 #ifdef TARGET_AARCH64 2703 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2704 { 2705 const uint8_t *indices = vm; 2706 size_t oprsz = simd_oprsz(desc); 2707 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2708 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2709 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2710 union { 2711 uint8_t b[16]; 2712 uint64_t d[2]; 2713 } result; 2714 2715 /* 2716 * We must construct the final result in a temp, lest the output 2717 * overlaps the input table. For TBL, begin with zero; for TBX, 2718 * begin with the original register contents. Note that we always 2719 * copy 16 bytes here to avoid an extra branch; clearing the high 2720 * bits of the register for oprsz == 8 is handled below. 2721 */ 2722 if (is_tbx) { 2723 memcpy(&result, vd, 16); 2724 } else { 2725 memset(&result, 0, 16); 2726 } 2727 2728 for (size_t i = 0; i < oprsz; ++i) { 2729 uint32_t index = indices[H1(i)]; 2730 2731 if (index < table_len) { 2732 /* 2733 * Convert index (a byte offset into the virtual table 2734 * which is a series of 128-bit vectors concatenated) 2735 * into the correct register element, bearing in mind 2736 * that the table can wrap around from V31 to V0. 2737 */ 2738 const uint8_t *table = (const uint8_t *) 2739 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2740 result.b[H1(i)] = table[H1(index % 16)]; 2741 } 2742 } 2743 2744 memcpy(vd, &result, 16); 2745 clear_tail(vd, oprsz, simd_maxsz(desc)); 2746 } 2747 #endif 2748 2749 /* 2750 * NxN -> N highpart multiply 2751 * 2752 * TODO: expose this as a generic vector operation. 2753 */ 2754 2755 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2756 { 2757 intptr_t i, opr_sz = simd_oprsz(desc); 2758 int8_t *d = vd, *n = vn, *m = vm; 2759 2760 for (i = 0; i < opr_sz; ++i) { 2761 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2762 } 2763 clear_tail(d, opr_sz, simd_maxsz(desc)); 2764 } 2765 2766 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2767 { 2768 intptr_t i, opr_sz = simd_oprsz(desc); 2769 int16_t *d = vd, *n = vn, *m = vm; 2770 2771 for (i = 0; i < opr_sz / 2; ++i) { 2772 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2773 } 2774 clear_tail(d, opr_sz, simd_maxsz(desc)); 2775 } 2776 2777 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2778 { 2779 intptr_t i, opr_sz = simd_oprsz(desc); 2780 int32_t *d = vd, *n = vn, *m = vm; 2781 2782 for (i = 0; i < opr_sz / 4; ++i) { 2783 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2784 } 2785 clear_tail(d, opr_sz, simd_maxsz(desc)); 2786 } 2787 2788 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2789 { 2790 intptr_t i, opr_sz = simd_oprsz(desc); 2791 uint64_t *d = vd, *n = vn, *m = vm; 2792 uint64_t discard; 2793 2794 for (i = 0; i < opr_sz / 8; ++i) { 2795 muls64(&discard, &d[i], n[i], m[i]); 2796 } 2797 clear_tail(d, opr_sz, simd_maxsz(desc)); 2798 } 2799 2800 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2801 { 2802 intptr_t i, opr_sz = simd_oprsz(desc); 2803 uint8_t *d = vd, *n = vn, *m = vm; 2804 2805 for (i = 0; i < opr_sz; ++i) { 2806 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2807 } 2808 clear_tail(d, opr_sz, simd_maxsz(desc)); 2809 } 2810 2811 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2812 { 2813 intptr_t i, opr_sz = simd_oprsz(desc); 2814 uint16_t *d = vd, *n = vn, *m = vm; 2815 2816 for (i = 0; i < opr_sz / 2; ++i) { 2817 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2818 } 2819 clear_tail(d, opr_sz, simd_maxsz(desc)); 2820 } 2821 2822 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2823 
{ 2824 intptr_t i, opr_sz = simd_oprsz(desc); 2825 uint32_t *d = vd, *n = vn, *m = vm; 2826 2827 for (i = 0; i < opr_sz / 4; ++i) { 2828 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2829 } 2830 clear_tail(d, opr_sz, simd_maxsz(desc)); 2831 } 2832 2833 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2834 { 2835 intptr_t i, opr_sz = simd_oprsz(desc); 2836 uint64_t *d = vd, *n = vn, *m = vm; 2837 uint64_t discard; 2838 2839 for (i = 0; i < opr_sz / 8; ++i) { 2840 mulu64(&discard, &d[i], n[i], m[i]); 2841 } 2842 clear_tail(d, opr_sz, simd_maxsz(desc)); 2843 } 2844 2845 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2846 { 2847 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2848 int shr = simd_data(desc); 2849 uint64_t *d = vd, *n = vn, *m = vm; 2850 2851 for (i = 0; i < opr_sz; ++i) { 2852 d[i] = ror64(n[i] ^ m[i], shr); 2853 } 2854 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2855 } 2856 2857 /* 2858 * Integer matrix-multiply accumulate 2859 */ 2860 2861 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2862 { 2863 int8_t *n = vn, *m = vm; 2864 2865 for (intptr_t k = 0; k < 8; ++k) { 2866 sum += n[H1(k)] * m[H1(k)]; 2867 } 2868 return sum; 2869 } 2870 2871 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2872 { 2873 uint8_t *n = vn, *m = vm; 2874 2875 for (intptr_t k = 0; k < 8; ++k) { 2876 sum += n[H1(k)] * m[H1(k)]; 2877 } 2878 return sum; 2879 } 2880 2881 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2882 { 2883 uint8_t *n = vn; 2884 int8_t *m = vm; 2885 2886 for (intptr_t k = 0; k < 8; ++k) { 2887 sum += n[H1(k)] * m[H1(k)]; 2888 } 2889 return sum; 2890 } 2891 2892 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2893 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2894 { 2895 intptr_t seg, opr_sz = simd_oprsz(desc); 2896 2897 for (seg = 0; seg < opr_sz; seg += 16) { 2898 uint32_t *d = vd + seg; 2899 uint32_t *a = va + seg; 2900 uint32_t sum0, sum1, sum2, sum3; 2901 2902 /* 2903 * Process the entire segment at once, writing back the 2904 * results only after we've consumed all of the inputs. 2905 * 2906 * Key to indices by column: 2907 * i j i j 2908 */ 2909 sum0 = a[H4(0 + 0)]; 2910 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2911 sum1 = a[H4(0 + 1)]; 2912 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2913 sum2 = a[H4(2 + 0)]; 2914 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2915 sum3 = a[H4(2 + 1)]; 2916 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2917 2918 d[H4(0)] = sum0; 2919 d[H4(1)] = sum1; 2920 d[H4(2)] = sum2; 2921 d[H4(3)] = sum3; 2922 } 2923 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2924 } 2925 2926 #define DO_MMLA_B(NAME, INNER) \ 2927 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2928 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2929 2930 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2931 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2932 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2933 2934 /* 2935 * BFloat16 Dot Product 2936 */ 2937 2938 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2939 { 2940 /* 2941 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2942 * For EBF = 0, we ignore the FPCR bits which determine rounding 2943 * mode and denormal-flushing, and we do unfused multiplies and 2944 * additions with intermediate rounding of all products and sums. 
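 * (That is, for EBF = 0 each of the two products and each addition gets
 * its own float32 rounding, as in bfdotadd() below.)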
2945 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2946 * and we perform a fused two-way sum-of-products without intermediate 2947 * rounding of the products. 2948 * In either case, we don't set fp exception flags. 2949 * 2950 * EBF is AArch64 only, so even if it's set in the FPCR it has 2951 * no effect on AArch32 instructions. 2952 */ 2953 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2954 2955 *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2956 set_default_nan_mode(true, statusp); 2957 2958 if (ebf) { 2959 /* EBF=1 needs to do a step with round-to-odd semantics */ 2960 *oddstatusp = *statusp; 2961 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2962 } else { 2963 set_flush_to_zero(true, statusp); 2964 set_flush_inputs_to_zero(true, statusp); 2965 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2966 } 2967 return ebf; 2968 } 2969 2970 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2971 { 2972 float32 t1, t2; 2973 2974 /* 2975 * Extract each BFloat16 from the element pair, and shift 2976 * them such that they become float32. 2977 */ 2978 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2979 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2980 t1 = float32_add(t1, t2, fpst); 2981 t1 = float32_add(sum, t1, fpst); 2982 2983 return t1; 2984 } 2985 2986 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2987 float_status *fpst, float_status *fpst_odd) 2988 { 2989 /* 2990 * Compare f16_dotadd() in sme_helper.c, but here we have 2991 * bfloat16 inputs. In particular that means that we do not 2992 * want the FPCR.FZ16 flush semantics, so we use the normal 2993 * float_status for the input handling here. 2994 */ 2995 float64 e1r = float32_to_float64(e1 << 16, fpst); 2996 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2997 float64 e2r = float32_to_float64(e2 << 16, fpst); 2998 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2999 float64 t64; 3000 float32 t32; 3001 3002 /* 3003 * The ARM pseudocode function FPDot performs both multiplies 3004 * and the add with a single rounding operation. Emulate this 3005 * by performing the first multiply in round-to-odd, then doing 3006 * the second multiply as fused multiply-add, and rounding to 3007 * float32 all in one step. 3008 */ 3009 t64 = float64_mul(e1r, e2r, fpst_odd); 3010 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 3011 3012 /* This conversion is exact, because we've already rounded. */ 3013 t32 = float64_to_float32(t64, fpst); 3014 3015 /* The final accumulation step is not fused. 
*/ 3016 return float32_add(sum, t32, fpst); 3017 } 3018 3019 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 3020 CPUARMState *env, uint32_t desc) 3021 { 3022 intptr_t i, opr_sz = simd_oprsz(desc); 3023 float32 *d = vd, *a = va; 3024 uint32_t *n = vn, *m = vm; 3025 float_status fpst, fpst_odd; 3026 3027 if (is_ebf(env, &fpst, &fpst_odd)) { 3028 for (i = 0; i < opr_sz / 4; ++i) { 3029 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 3030 } 3031 } else { 3032 for (i = 0; i < opr_sz / 4; ++i) { 3033 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 3034 } 3035 } 3036 clear_tail(d, opr_sz, simd_maxsz(desc)); 3037 } 3038 3039 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 3040 void *va, CPUARMState *env, uint32_t desc) 3041 { 3042 intptr_t i, j, opr_sz = simd_oprsz(desc); 3043 intptr_t index = simd_data(desc); 3044 intptr_t elements = opr_sz / 4; 3045 intptr_t eltspersegment = MIN(16 / 4, elements); 3046 float32 *d = vd, *a = va; 3047 uint32_t *n = vn, *m = vm; 3048 float_status fpst, fpst_odd; 3049 3050 if (is_ebf(env, &fpst, &fpst_odd)) { 3051 for (i = 0; i < elements; i += eltspersegment) { 3052 uint32_t m_idx = m[i + H4(index)]; 3053 3054 for (j = i; j < i + eltspersegment; j++) { 3055 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 3056 } 3057 } 3058 } else { 3059 for (i = 0; i < elements; i += eltspersegment) { 3060 uint32_t m_idx = m[i + H4(index)]; 3061 3062 for (j = i; j < i + eltspersegment; j++) { 3063 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 3064 } 3065 } 3066 } 3067 clear_tail(d, opr_sz, simd_maxsz(desc)); 3068 } 3069 3070 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 3071 CPUARMState *env, uint32_t desc) 3072 { 3073 intptr_t s, opr_sz = simd_oprsz(desc); 3074 float32 *d = vd, *a = va; 3075 uint32_t *n = vn, *m = vm; 3076 float_status fpst, fpst_odd; 3077 3078 if (is_ebf(env, &fpst, &fpst_odd)) { 3079 for (s = 0; s < opr_sz / 4; s += 4) { 3080 float32 sum00, sum01, sum10, sum11; 3081 3082 /* 3083 * Process the entire segment at once, writing back the 3084 * results only after we've consumed all of the inputs. 3085 * 3086 * Key to indices by column: 3087 * i j i k j k 3088 */ 3089 sum00 = a[s + H4(0 + 0)]; 3090 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3091 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3092 3093 sum01 = a[s + H4(0 + 1)]; 3094 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3095 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3096 3097 sum10 = a[s + H4(2 + 0)]; 3098 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3099 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3100 3101 sum11 = a[s + H4(2 + 1)]; 3102 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3103 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3104 3105 d[s + H4(0 + 0)] = sum00; 3106 d[s + H4(0 + 1)] = sum01; 3107 d[s + H4(2 + 0)] = sum10; 3108 d[s + H4(2 + 1)] = sum11; 3109 } 3110 } else { 3111 for (s = 0; s < opr_sz / 4; s += 4) { 3112 float32 sum00, sum01, sum10, sum11; 3113 3114 /* 3115 * Process the entire segment at once, writing back the 3116 * results only after we've consumed all of the inputs. 
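 *
 * As an illustrative reading of the code below (indices are relative to
 * the 16-byte segment and ignore H4()):
 *     d[2*i + j] = a[2*i + j] + dot(n[2*i + 0], m[2*j + 0])
 *                             + dot(n[2*i + 1], m[2*j + 1])
 * for i, j in {0, 1}, where dot() stands for the bfloat16 pair product
 * accumulated by bfdotadd(); each segment thus forms a 2x2 block of
 * D = A + N * transpose(M).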
3117 * 3118 * Key to indices by column: 3119 * i j i k j k 3120 */ 3121 sum00 = a[s + H4(0 + 0)]; 3122 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3123 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3124 3125 sum01 = a[s + H4(0 + 1)]; 3126 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3127 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3128 3129 sum10 = a[s + H4(2 + 0)]; 3130 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3131 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3132 3133 sum11 = a[s + H4(2 + 1)]; 3134 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3135 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3136 3137 d[s + H4(0 + 0)] = sum00; 3138 d[s + H4(0 + 1)] = sum01; 3139 d[s + H4(2 + 0)] = sum10; 3140 d[s + H4(2 + 1)] = sum11; 3141 } 3142 } 3143 clear_tail(d, opr_sz, simd_maxsz(desc)); 3144 } 3145 3146 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3147 float_status *stat, uint32_t desc) 3148 { 3149 intptr_t i, opr_sz = simd_oprsz(desc); 3150 intptr_t sel = simd_data(desc); 3151 float32 *d = vd, *a = va; 3152 bfloat16 *n = vn, *m = vm; 3153 3154 for (i = 0; i < opr_sz / 4; ++i) { 3155 float32 nn = n[H2(i * 2 + sel)] << 16; 3156 float32 mm = m[H2(i * 2 + sel)] << 16; 3157 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3158 } 3159 clear_tail(d, opr_sz, simd_maxsz(desc)); 3160 } 3161 3162 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3163 void *va, float_status *stat, uint32_t desc) 3164 { 3165 intptr_t i, j, opr_sz = simd_oprsz(desc); 3166 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3167 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3168 intptr_t elements = opr_sz / 4; 3169 intptr_t eltspersegment = MIN(16 / 4, elements); 3170 float32 *d = vd, *a = va; 3171 bfloat16 *n = vn, *m = vm; 3172 3173 for (i = 0; i < elements; i += eltspersegment) { 3174 float32 m_idx = m[H2(2 * i + index)] << 16; 3175 3176 for (j = i; j < i + eltspersegment; j++) { 3177 float32 n_j = n[H2(2 * j + sel)] << 16; 3178 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3179 } 3180 } 3181 clear_tail(d, opr_sz, simd_maxsz(desc)); 3182 } 3183 3184 #define DO_CLAMP(NAME, TYPE) \ 3185 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3186 { \ 3187 intptr_t i, opr_sz = simd_oprsz(desc); \ 3188 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3189 TYPE aa = *(TYPE *)(a + i); \ 3190 TYPE nn = *(TYPE *)(n + i); \ 3191 TYPE mm = *(TYPE *)(m + i); \ 3192 TYPE dd = MIN(MAX(aa, nn), mm); \ 3193 *(TYPE *)(d + i) = dd; \ 3194 } \ 3195 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3196 } 3197 3198 DO_CLAMP(gvec_sclamp_b, int8_t) 3199 DO_CLAMP(gvec_sclamp_h, int16_t) 3200 DO_CLAMP(gvec_sclamp_s, int32_t) 3201 DO_CLAMP(gvec_sclamp_d, int64_t) 3202 3203 DO_CLAMP(gvec_uclamp_b, uint8_t) 3204 DO_CLAMP(gvec_uclamp_h, uint16_t) 3205 DO_CLAMP(gvec_uclamp_s, uint32_t) 3206 DO_CLAMP(gvec_uclamp_d, uint64_t) 3207 3208 /* Bit count in each 8-bit word. 
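 * For example, ctpop8(0xa3) == 4, since 0xa3 == 0b10100011.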
*/ 3209 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3210 { 3211 intptr_t i, opr_sz = simd_oprsz(desc); 3212 uint8_t *d = vd, *n = vn; 3213 3214 for (i = 0; i < opr_sz; ++i) { 3215 d[i] = ctpop8(n[i]); 3216 } 3217 clear_tail(d, opr_sz, simd_maxsz(desc)); 3218 } 3219 3220 /* Reverse bits in each 8 bit word */ 3221 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3222 { 3223 intptr_t i, opr_sz = simd_oprsz(desc); 3224 uint64_t *d = vd, *n = vn; 3225 3226 for (i = 0; i < opr_sz / 8; ++i) { 3227 d[i] = revbit64(bswap64(n[i])); 3228 } 3229 clear_tail(d, opr_sz, simd_maxsz(desc)); 3230 } 3231 3232 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3233 { 3234 intptr_t i, opr_sz = simd_oprsz(desc); 3235 uint32_t *d = vd, *n = vn; 3236 3237 for (i = 0; i < opr_sz / 4; ++i) { 3238 d[i] = helper_recpe_u32(n[i]); 3239 } 3240 clear_tail(d, opr_sz, simd_maxsz(desc)); 3241 } 3242 3243 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3244 { 3245 intptr_t i, opr_sz = simd_oprsz(desc); 3246 uint32_t *d = vd, *n = vn; 3247 3248 for (i = 0; i < opr_sz / 4; ++i) { 3249 d[i] = helper_rsqrte_u32(n[i]); 3250 } 3251 clear_tail(d, opr_sz, simd_maxsz(desc)); 3252 } 3253