/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
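
/*
 * Illustrative example (added for clarity): predicate bits 0b00000101
 * mark byte lanes 0 and 2 as active, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */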

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
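
/*
 * Worked example (illustrative, not part of the original comments): for
 * SQRDMULH, src3 == 0 and round is set.  With src1 = src2 = 0x40 (64),
 * ret = 64 * 64 + (1 << 6) = 4160 and 4160 >> 7 = 32, matching
 * (2 * 4096 + (1 << 7)) >> 8 from the unsimplified form.  With
 * src1 = src2 = INT8_MIN the shifted result is 128, which no longer
 * fits in int8_t, so it saturates to INT8_MAX.
 */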

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
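
/*
 * Descriptive note (added for clarity): in the _idx_ helpers below, the
 * scalar multiplicand is element 'idx' of each 128-bit segment of vm,
 * and eltspersegment clamps the inner loop for the 64-bit AdvSIMD case,
 * where less than a full segment of elements is present.
 */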

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
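
/*
 * Illustrative expansion (added for clarity): for gvec_sdot_4b each
 * 32-bit lane becomes
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3],
 * with the byte products widened to 32 bits before summing.
 */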
809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_4b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_4b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_4b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_4h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_4h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_4b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_4b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_4b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_4b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_4h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_4h, uint64_t, uint16_t, uint16_t, H8) 874 875 #undef DO_DOT 876 #undef DO_DOT_IDX 877 878 /* Similar for 2-way dot product */ 879 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 880 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 881 { \ 882 intptr_t i, opr_sz = simd_oprsz(desc); \ 883 TYPED *d = vd, *a = va; \ 884 TYPEN *n = vn; \ 885 TYPEM *m = vm; \ 886 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 887 d[i] = (a[i] + \ 888 (TYPED)n[i * 2 + 0] * m[i * 2 + 0] + \ 889 (TYPED)n[i * 2 + 1] * m[i * 2 + 1]); \ 890 } \ 891 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 892 } 893 894 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 895 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 896 { \ 897 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 898 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 899 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 900 intptr_t index = simd_data(desc); \ 901 TYPED *d = vd, *a = va; \ 902 TYPEN *n = vn; \ 903 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 2; \ 904 do { \ 905 TYPED m0 = m_indexed[i * 2 + 0]; \ 906 TYPED m1 = 

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
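
/*
 * Descriptive note (added for clarity): the FCMLA helpers below negate
 * operands in one of two ways, per the "use negx"/"use negf" comments.
 * With FPCR.AH == 0 the sign bit is flipped up front by XORing negx
 * into the multiplicand; with FPCR.AH == 1 the negation is instead
 * folded into the fused multiply-add via float_muladd_negate_product,
 * so a NaN input does not get its sign bit toggled.
 */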

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)
DO_3OP(gvec_bfadd, bfloat16_add, bfloat16)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)
DO_3OP(gvec_bfsub, bfloat16_sub, bfloat16)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

DO_3OP(gvec_fmax_b16, bfloat16_max, bfloat16)
DO_3OP(gvec_fmin_b16, bfloat16_min, bfloat16)
DO_3OP(gvec_fmaxnum_b16, bfloat16_maxnum, bfloat16)
DO_3OP(gvec_fminnum_b16, bfloat16_minnum, bfloat16)
DO_3OP(gvec_ah_fmax_b16, helper_sme2_ah_fmax_b16, bfloat16)
DO_3OP(gvec_ah_fmin_b16, helper_sme2_ah_fmin_b16, bfloat16)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static bfloat16 bfloat16_muladd_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
                                  float_status *stat)
{
    return bfloat16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static bfloat16 bfloat16_mulsub_f(bfloat16 dest, bfloat16 op1, bfloat16 op2,
                                  float_status *stat)
{
    return bfloat16_muladd(bfloat16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}
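
/*
 * Descriptive note (added for clarity): these FPCR.AH == 1 variants fold
 * the negation into the fused operation via float_muladd_negate_product
 * rather than negating op1 up front, so a NaN operand is not returned
 * with its sign bit flipped.
 */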
float16) 1695 DO_MULADD(gvec_fmls_nf_s, float32_mulsub_nf, float32) 1696 1697 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1698 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1699 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1700 DO_MULADD(gvec_bfmla, bfloat16_muladd_f, bfloat16) 1701 1702 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1703 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1704 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1705 DO_MULADD(gvec_bfmls, bfloat16_mulsub_f, bfloat16) 1706 1707 DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16) 1708 DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32) 1709 DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64) 1710 DO_MULADD(gvec_ah_bfmls, bfloat16_ah_mulsub_f, bfloat16) 1711 1712 #undef DO_MULADD 1713 1714 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1715 * For AdvSIMD, there is of course only one such vector segment. 1716 */ 1717 1718 #define DO_MUL_IDX(NAME, TYPE, H) \ 1719 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1720 { \ 1721 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1722 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1723 intptr_t idx = simd_data(desc); \ 1724 TYPE *d = vd, *n = vn, *m = vm; \ 1725 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1726 TYPE mm = m[H(i + idx)]; \ 1727 for (j = 0; j < segment; j++) { \ 1728 d[i + j] = n[i + j] * mm; \ 1729 } \ 1730 } \ 1731 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1732 } 1733 1734 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1735 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1736 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1737 1738 #undef DO_MUL_IDX 1739 1740 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1741 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1742 { \ 1743 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1744 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1745 intptr_t idx = simd_data(desc); \ 1746 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1747 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1748 TYPE mm = m[H(i + idx)]; \ 1749 for (j = 0; j < segment; j++) { \ 1750 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1751 } \ 1752 } \ 1753 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1754 } 1755 1756 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1757 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1758 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1759 1760 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1761 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1762 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1763 1764 #undef DO_MLA_IDX 1765 1766 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1767 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1768 float_status *stat, uint32_t desc) \ 1769 { \ 1770 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1771 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1772 intptr_t idx = simd_data(desc); \ 1773 TYPE *d = vd, *n = vn, *m = vm; \ 1774 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1775 TYPE mm = m[H(i + idx)]; \ 1776 for (j = 0; j < segment; j++) { \ 1777 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1778 } \ 1779 } \ 1780 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1781 } 1782 1783 #define nop(N, M, S) (M) 1784 1785 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1786 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1787 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1788 1789 #ifdef TARGET_AARCH64 1790 1791 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1792 
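/*
 * The mulx helpers used here implement FMULX, which behaves like an
 * ordinary multiply except that 0 * Inf returns 2.0 (with the sign being
 * the exclusive-OR of the operand signs) rather than the default NaN;
 * e.g. +0.0 FMULX -Inf gives -2.0.
 */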
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1793 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1794 1795 #endif 1796 1797 #undef nop 1798 1799 /* 1800 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1801 * the fused ops below they assume accumulate both from and into Vd. 1802 */ 1803 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1804 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1805 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1806 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1807 1808 #undef DO_FMUL_IDX 1809 1810 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \ 1811 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1812 float_status *stat, uint32_t desc) \ 1813 { \ 1814 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1815 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1816 intptr_t idx = simd_data(desc); \ 1817 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1818 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1819 TYPE mm = m[H(i + idx)]; \ 1820 for (j = 0; j < segment; j++) { \ 1821 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \ 1822 a[i + j], NEGF, stat); \ 1823 } \ 1824 } \ 1825 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1826 } 1827 1828 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) 1829 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) 1830 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) 1831 DO_FMLA_IDX(gvec_bfmla_idx, bfloat16, H2, 0, 0) 1832 1833 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) 1834 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) 1835 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) 1836 DO_FMLA_IDX(gvec_bfmls_idx, bfloat16, H2, INT16_MIN, 0) 1837 1838 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) 1839 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) 1840 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) 1841 DO_FMLA_IDX(gvec_ah_bfmls_idx, bfloat16, H2, 0, float_muladd_negate_product) 1842 1843 #undef DO_FMLA_IDX 1844 1845 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1846 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1847 { \ 1848 intptr_t i, oprsz = simd_oprsz(desc); \ 1849 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1850 bool q = false; \ 1851 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1852 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1853 if (dd < MIN) { \ 1854 dd = MIN; \ 1855 q = true; \ 1856 } else if (dd > MAX) { \ 1857 dd = MAX; \ 1858 q = true; \ 1859 } \ 1860 d[i] = dd; \ 1861 } \ 1862 if (q) { \ 1863 uint32_t *qc = vq; \ 1864 qc[0] = 1; \ 1865 } \ 1866 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1867 } 1868 1869 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1870 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1871 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1872 1873 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1874 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1875 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1876 1877 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1878 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1879 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1880 1881 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1882 DO_SAT(gvec_sqsub_h, 
int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1883 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1884 1885 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1886 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1887 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1888 1889 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1890 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1891 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1892 1893 #undef DO_SAT 1894 1895 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1896 void *vm, uint32_t desc) 1897 { 1898 intptr_t i, oprsz = simd_oprsz(desc); 1899 uint64_t *d = vd, *n = vn, *m = vm; 1900 bool q = false; 1901 1902 for (i = 0; i < oprsz / 8; i++) { 1903 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1904 if (dd < nn) { 1905 dd = UINT64_MAX; 1906 q = true; 1907 } 1908 d[i] = dd; 1909 } 1910 if (q) { 1911 uint32_t *qc = vq; 1912 qc[0] = 1; 1913 } 1914 clear_tail(d, oprsz, simd_maxsz(desc)); 1915 } 1916 1917 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1918 void *vm, uint32_t desc) 1919 { 1920 intptr_t i, oprsz = simd_oprsz(desc); 1921 uint64_t *d = vd, *n = vn, *m = vm; 1922 bool q = false; 1923 1924 for (i = 0; i < oprsz / 8; i++) { 1925 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1926 if (nn < mm) { 1927 dd = 0; 1928 q = true; 1929 } 1930 d[i] = dd; 1931 } 1932 if (q) { 1933 uint32_t *qc = vq; 1934 qc[0] = 1; 1935 } 1936 clear_tail(d, oprsz, simd_maxsz(desc)); 1937 } 1938 1939 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1940 void *vm, uint32_t desc) 1941 { 1942 intptr_t i, oprsz = simd_oprsz(desc); 1943 int64_t *d = vd, *n = vn, *m = vm; 1944 bool q = false; 1945 1946 for (i = 0; i < oprsz / 8; i++) { 1947 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1948 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1949 dd = (nn >> 63) ^ ~INT64_MIN; 1950 q = true; 1951 } 1952 d[i] = dd; 1953 } 1954 if (q) { 1955 uint32_t *qc = vq; 1956 qc[0] = 1; 1957 } 1958 clear_tail(d, oprsz, simd_maxsz(desc)); 1959 } 1960 1961 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1962 void *vm, uint32_t desc) 1963 { 1964 intptr_t i, oprsz = simd_oprsz(desc); 1965 int64_t *d = vd, *n = vn, *m = vm; 1966 bool q = false; 1967 1968 for (i = 0; i < oprsz / 8; i++) { 1969 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1970 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1971 dd = (nn >> 63) ^ ~INT64_MIN; 1972 q = true; 1973 } 1974 d[i] = dd; 1975 } 1976 if (q) { 1977 uint32_t *qc = vq; 1978 qc[0] = 1; 1979 } 1980 clear_tail(d, oprsz, simd_maxsz(desc)); 1981 } 1982 1983 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1984 void *vm, uint32_t desc) 1985 { 1986 intptr_t i, oprsz = simd_oprsz(desc); 1987 uint64_t *d = vd, *n = vn, *m = vm; 1988 bool q = false; 1989 1990 for (i = 0; i < oprsz / 8; i++) { 1991 uint64_t nn = n[i]; 1992 int64_t mm = m[i]; 1993 uint64_t dd = nn + mm; 1994 1995 if (mm < 0) { 1996 if (nn < (uint64_t)-mm) { 1997 dd = 0; 1998 q = true; 1999 } 2000 } else { 2001 if (dd < nn) { 2002 dd = UINT64_MAX; 2003 q = true; 2004 } 2005 } 2006 d[i] = dd; 2007 } 2008 if (q) { 2009 uint32_t *qc = vq; 2010 qc[0] = 1; 2011 } 2012 clear_tail(d, oprsz, simd_maxsz(desc)); 2013 } 2014 2015 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 2016 void *vm, uint32_t desc) 2017 { 2018 intptr_t i, oprsz = simd_oprsz(desc); 2019 uint64_t *d = vd, *n = vn, *m = vm; 2020 bool q = false; 2021 2022 for (i = 0; i < 
oprsz / 8; i++) { 2023 int64_t nn = n[i]; 2024 uint64_t mm = m[i]; 2025 int64_t dd = nn + mm; 2026 2027 if (mm > (uint64_t)(INT64_MAX - nn)) { 2028 dd = INT64_MAX; 2029 q = true; 2030 } 2031 d[i] = dd; 2032 } 2033 if (q) { 2034 uint32_t *qc = vq; 2035 qc[0] = 1; 2036 } 2037 clear_tail(d, oprsz, simd_maxsz(desc)); 2038 } 2039 2040 #define DO_SRA(NAME, TYPE) \ 2041 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2042 { \ 2043 intptr_t i, oprsz = simd_oprsz(desc); \ 2044 int shift = simd_data(desc); \ 2045 TYPE *d = vd, *n = vn; \ 2046 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2047 d[i] += n[i] >> shift; \ 2048 } \ 2049 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2050 } 2051 2052 DO_SRA(gvec_ssra_b, int8_t) 2053 DO_SRA(gvec_ssra_h, int16_t) 2054 DO_SRA(gvec_ssra_s, int32_t) 2055 DO_SRA(gvec_ssra_d, int64_t) 2056 2057 DO_SRA(gvec_usra_b, uint8_t) 2058 DO_SRA(gvec_usra_h, uint16_t) 2059 DO_SRA(gvec_usra_s, uint32_t) 2060 DO_SRA(gvec_usra_d, uint64_t) 2061 2062 #undef DO_SRA 2063 2064 #define DO_RSHR(NAME, TYPE) \ 2065 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2066 { \ 2067 intptr_t i, oprsz = simd_oprsz(desc); \ 2068 int shift = simd_data(desc); \ 2069 TYPE *d = vd, *n = vn; \ 2070 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2071 TYPE tmp = n[i] >> (shift - 1); \ 2072 d[i] = (tmp >> 1) + (tmp & 1); \ 2073 } \ 2074 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2075 } 2076 2077 DO_RSHR(gvec_srshr_b, int8_t) 2078 DO_RSHR(gvec_srshr_h, int16_t) 2079 DO_RSHR(gvec_srshr_s, int32_t) 2080 DO_RSHR(gvec_srshr_d, int64_t) 2081 2082 DO_RSHR(gvec_urshr_b, uint8_t) 2083 DO_RSHR(gvec_urshr_h, uint16_t) 2084 DO_RSHR(gvec_urshr_s, uint32_t) 2085 DO_RSHR(gvec_urshr_d, uint64_t) 2086 2087 #undef DO_RSHR 2088 2089 #define DO_RSRA(NAME, TYPE) \ 2090 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2091 { \ 2092 intptr_t i, oprsz = simd_oprsz(desc); \ 2093 int shift = simd_data(desc); \ 2094 TYPE *d = vd, *n = vn; \ 2095 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2096 TYPE tmp = n[i] >> (shift - 1); \ 2097 d[i] += (tmp >> 1) + (tmp & 1); \ 2098 } \ 2099 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2100 } 2101 2102 DO_RSRA(gvec_srsra_b, int8_t) 2103 DO_RSRA(gvec_srsra_h, int16_t) 2104 DO_RSRA(gvec_srsra_s, int32_t) 2105 DO_RSRA(gvec_srsra_d, int64_t) 2106 2107 DO_RSRA(gvec_ursra_b, uint8_t) 2108 DO_RSRA(gvec_ursra_h, uint16_t) 2109 DO_RSRA(gvec_ursra_s, uint32_t) 2110 DO_RSRA(gvec_ursra_d, uint64_t) 2111 2112 #undef DO_RSRA 2113 2114 #define DO_SRI(NAME, TYPE) \ 2115 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2116 { \ 2117 intptr_t i, oprsz = simd_oprsz(desc); \ 2118 int shift = simd_data(desc); \ 2119 TYPE *d = vd, *n = vn; \ 2120 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2121 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 2122 } \ 2123 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2124 } 2125 2126 DO_SRI(gvec_sri_b, uint8_t) 2127 DO_SRI(gvec_sri_h, uint16_t) 2128 DO_SRI(gvec_sri_s, uint32_t) 2129 DO_SRI(gvec_sri_d, uint64_t) 2130 2131 #undef DO_SRI 2132 2133 #define DO_SLI(NAME, TYPE) \ 2134 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2135 { \ 2136 intptr_t i, oprsz = simd_oprsz(desc); \ 2137 int shift = simd_data(desc); \ 2138 TYPE *d = vd, *n = vn; \ 2139 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2140 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 2141 } \ 2142 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2143 } 2144 2145 DO_SLI(gvec_sli_b, uint8_t) 2146 DO_SLI(gvec_sli_h, uint16_t) 2147 DO_SLI(gvec_sli_s, uint32_t) 
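/*
 * In DO_SRI and DO_SLI above, deposit64(dst, pos, len, val) replaces bits
 * [pos, pos+len) of dst with the low len bits of val.  So for SLI on byte
 * elements with shift == 3, d[i] keeps its low 3 bits and takes the low
 * 5 bits of n[i] above them; SRI mirrors this, writing n[i] >> shift into
 * the low 5 bits while preserving the top 3 bits of d[i].
 */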
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI

/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal.  */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias.  */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm,
                     CPUARMState *env, uint32_t desc,
                     ARMFPStatusFlavour fpst_idx,
                     uint64_t negx, int negf)
{
    float_status *fpst = &env->vfp.fp_status[fpst_idx];
    bool fz16 = env->vfp.fpcr & FPCR_FZ16;
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /*
     * Pre-load all of the f16 data, avoiding overlap issues.
     * Negate all inputs for AH=0 FMLSL at once.
     */
    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
    m_4 = load4_f16(vm, is_q, is_2);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = is_s ? 0x8000800080008000ull : 0;

    do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0);
}
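/*
 * XORing n_4 with 0x8000800080008000 flips the sign bit of each of the
 * four packed float16 inputs, which is how FMLSL negates its operand
 * before widening when FPCR.AH == 0.  In the A64 helpers the AH == 1
 * case instead passes float_muladd_negate_product, so that the sign of
 * a NaN input is left unchanged.
 */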
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = 0;
    int negf = 0;

    if (is_s) {
        if (env->vfp.fpcr & FPCR_AH) {
            negf = float_muladd_negate_product;
        } else {
            negx = 0x8000800080008000ull;
        }
    }
    do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf);
}

void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64];
    bool fz16 = env->vfp.fpcr & FPCR_FZ16;
    int negx = 0, negf = 0;

    if (is_s) {
        if (env->vfp.fpcr & FPCR_AH) {
            negf = float_muladd_negate_product;
        } else {
            negx = 0x8000;
        }
    }

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
    }
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm,
                         CPUARMState *env, uint32_t desc,
                         ARMFPStatusFlavour fpst_idx,
                         uint64_t negx, int negf)
{
    float_status *fpst = &env->vfp.fp_status[fpst_idx];
    bool fz16 = env->vfp.fpcr & FPCR_FZ16;
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /*
     * Pre-load all of the f16 data, avoiding overlap issues.
     * Negate all inputs for AH=0 FMLSL at once.
     */
    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = is_s ?
0x8000800080008000ull : 0; 2330 2331 do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0); 2332 } 2333 2334 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2335 CPUARMState *env, uint32_t desc) 2336 { 2337 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2338 uint64_t negx = 0; 2339 int negf = 0; 2340 2341 if (is_s) { 2342 if (env->vfp.fpcr & FPCR_AH) { 2343 negf = float_muladd_negate_product; 2344 } else { 2345 negx = 0x8000800080008000ull; 2346 } 2347 } 2348 do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf); 2349 } 2350 2351 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2352 CPUARMState *env, uint32_t desc) 2353 { 2354 intptr_t i, j, oprsz = simd_oprsz(desc); 2355 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2356 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2357 bool za = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 2358 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 3, 3) * sizeof(float16); 2359 float_status *status = &env->vfp.fp_status[za ? FPST_ZA : FPST_A64]; 2360 bool fz16 = env->vfp.fpcr & FPCR_FZ16; 2361 int negx = 0, negf = 0; 2362 2363 if (is_s) { 2364 if (env->vfp.fpcr & FPCR_AH) { 2365 negf = float_muladd_negate_product; 2366 } else { 2367 negx = 0x8000; 2368 } 2369 } 2370 for (i = 0; i < oprsz; i += 16) { 2371 float16 mm_16 = *(float16 *)(vm + i + idx); 2372 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2373 2374 for (j = 0; j < 16; j += sizeof(float32)) { 2375 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; 2376 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2377 float32 aa = *(float32 *)(va + H1_4(i + j)); 2378 2379 *(float32 *)(vd + H1_4(i + j)) = 2380 float32_muladd(nn, mm, aa, negf, status); 2381 } 2382 } 2383 } 2384 2385 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2386 { 2387 intptr_t i, opr_sz = simd_oprsz(desc); 2388 int8_t *d = vd, *n = vn, *m = vm; 2389 2390 for (i = 0; i < opr_sz; ++i) { 2391 int8_t mm = m[i]; 2392 int8_t nn = n[i]; 2393 int8_t res = 0; 2394 if (mm >= 0) { 2395 if (mm < 8) { 2396 res = nn << mm; 2397 } 2398 } else { 2399 res = nn >> (mm > -8 ? -mm : 7); 2400 } 2401 d[i] = res; 2402 } 2403 clear_tail(d, opr_sz, simd_maxsz(desc)); 2404 } 2405 2406 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2407 { 2408 intptr_t i, opr_sz = simd_oprsz(desc); 2409 int16_t *d = vd, *n = vn, *m = vm; 2410 2411 for (i = 0; i < opr_sz / 2; ++i) { 2412 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2413 int16_t nn = n[i]; 2414 int16_t res = 0; 2415 if (mm >= 0) { 2416 if (mm < 16) { 2417 res = nn << mm; 2418 } 2419 } else { 2420 res = nn >> (mm > -16 ? 
                       -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
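 * As in gvec_pmul_b above, partial products are combined with XOR rather
 * than added (e.g. the carry-less product of 0b11 and 0b11 is 0b101, not
 * 9).  Each iteration below takes the 64-bit lanes n[i + hi] and
 * m[i + hi] and writes their full 128-bit carry-less product to the
 * adjacent output pair d[i], d[i + 1].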
2497 */ 2498 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2499 { 2500 intptr_t i, opr_sz = simd_oprsz(desc); 2501 intptr_t hi = simd_data(desc); 2502 uint64_t *d = vd, *n = vn, *m = vm; 2503 2504 for (i = 0; i < opr_sz / 8; i += 2) { 2505 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2506 d[i] = int128_getlo(r); 2507 d[i + 1] = int128_gethi(r); 2508 } 2509 clear_tail(d, opr_sz, simd_maxsz(desc)); 2510 } 2511 2512 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2513 { 2514 int hi = simd_data(desc); 2515 uint64_t *d = vd, *n = vn, *m = vm; 2516 uint64_t nn = n[hi], mm = m[hi]; 2517 2518 d[0] = clmul_8x4_packed(nn, mm); 2519 nn >>= 32; 2520 mm >>= 32; 2521 d[1] = clmul_8x4_packed(nn, mm); 2522 2523 clear_tail(d, 16, simd_maxsz(desc)); 2524 } 2525 2526 #ifdef TARGET_AARCH64 2527 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2528 { 2529 int shift = simd_data(desc) * 8; 2530 intptr_t i, opr_sz = simd_oprsz(desc); 2531 uint64_t *d = vd, *n = vn, *m = vm; 2532 2533 for (i = 0; i < opr_sz / 8; ++i) { 2534 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2535 } 2536 } 2537 2538 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2539 { 2540 intptr_t sel = H4(simd_data(desc)); 2541 intptr_t i, opr_sz = simd_oprsz(desc); 2542 uint32_t *n = vn, *m = vm; 2543 uint64_t *d = vd; 2544 2545 for (i = 0; i < opr_sz / 8; ++i) { 2546 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2547 } 2548 } 2549 #endif 2550 2551 #define DO_CMP0(NAME, TYPE, OP) \ 2552 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2553 { \ 2554 intptr_t i, opr_sz = simd_oprsz(desc); \ 2555 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2556 TYPE nn = *(TYPE *)(vn + i); \ 2557 *(TYPE *)(vd + i) = -(nn OP 0); \ 2558 } \ 2559 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2560 } 2561 2562 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2563 DO_CMP0(gvec_clt0_b, int8_t, <) 2564 DO_CMP0(gvec_cle0_b, int8_t, <=) 2565 DO_CMP0(gvec_cgt0_b, int8_t, >) 2566 DO_CMP0(gvec_cge0_b, int8_t, >=) 2567 2568 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2569 DO_CMP0(gvec_clt0_h, int16_t, <) 2570 DO_CMP0(gvec_cle0_h, int16_t, <=) 2571 DO_CMP0(gvec_cgt0_h, int16_t, >) 2572 DO_CMP0(gvec_cge0_h, int16_t, >=) 2573 2574 #undef DO_CMP0 2575 2576 #define DO_ABD(NAME, TYPE) \ 2577 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2578 { \ 2579 intptr_t i, opr_sz = simd_oprsz(desc); \ 2580 TYPE *d = vd, *n = vn, *m = vm; \ 2581 \ 2582 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2583 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2584 } \ 2585 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2586 } 2587 2588 DO_ABD(gvec_sabd_b, int8_t) 2589 DO_ABD(gvec_sabd_h, int16_t) 2590 DO_ABD(gvec_sabd_s, int32_t) 2591 DO_ABD(gvec_sabd_d, int64_t) 2592 2593 DO_ABD(gvec_uabd_b, uint8_t) 2594 DO_ABD(gvec_uabd_h, uint16_t) 2595 DO_ABD(gvec_uabd_s, uint32_t) 2596 DO_ABD(gvec_uabd_d, uint64_t) 2597 2598 #undef DO_ABD 2599 2600 #define DO_ABA(NAME, TYPE) \ 2601 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2602 { \ 2603 intptr_t i, opr_sz = simd_oprsz(desc); \ 2604 TYPE *d = vd, *n = vn, *m = vm; \ 2605 \ 2606 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2607 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2608 } \ 2609 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2610 } 2611 2612 DO_ABA(gvec_saba_b, int8_t) 2613 DO_ABA(gvec_saba_h, int16_t) 2614 DO_ABA(gvec_saba_s, int32_t) 2615 DO_ABA(gvec_saba_d, int64_t) 2616 2617 DO_ABA(gvec_uaba_b, uint8_t) 2618 DO_ABA(gvec_uaba_h, uint16_t) 2619 DO_ABA(gvec_uaba_s, uint32_t) 2620 DO_ABA(gvec_uaba_d, uint64_t) 2621 2622 #undef DO_ABA 2623 2624 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2625 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2626 float_status *stat, uint32_t desc) \ 2627 { \ 2628 ARMVectorReg scratch; \ 2629 intptr_t oprsz = simd_oprsz(desc); \ 2630 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2631 TYPE *d = vd, *n = vn, *m = vm; \ 2632 if (unlikely(d == m)) { \ 2633 m = memcpy(&scratch, m, oprsz); \ 2634 } \ 2635 for (intptr_t i = 0; i < half; ++i) { \ 2636 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2637 } \ 2638 for (intptr_t i = 0; i < half; ++i) { \ 2639 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2640 } \ 2641 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2642 } 2643 2644 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2645 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2646 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2647 2648 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2649 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2650 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2651 2652 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2653 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2654 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2655 2656 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2657 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2658 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2659 2660 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2661 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2662 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2663 2664 #ifdef TARGET_AARCH64 2665 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2666 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2667 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2668 2669 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2670 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2671 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2672 #endif 2673 2674 #undef DO_3OP_PAIR 2675 2676 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2677 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2678 { \ 2679 ARMVectorReg scratch; \ 2680 intptr_t oprsz = simd_oprsz(desc); \ 2681 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2682 TYPE *d = vd, *n = vn, *m = vm; \ 2683 if (unlikely(d == m)) { \ 2684 m = memcpy(&scratch, m, oprsz); \ 2685 } \ 2686 for (intptr_t i = 0; i < half; ++i) { \ 2687 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2688 } \ 2689 for (intptr_t i = 0; i < half; ++i) { \ 2690 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2691 } \ 2692 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2693 } 2694 2695 #define ADD(A, B) (A + B) 2696 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2697 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2698 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2699 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2700 #undef ADD 2701 2702 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2703 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2704 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2705 2706 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2707 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2708 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2709 2710 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2711 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2712 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2713 2714 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2715 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2716 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2717 2718 #undef DO_3OP_PAIR 2719 2720 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2721 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2722 { \ 2723 intptr_t i, oprsz = simd_oprsz(desc); \ 2724 int shift = simd_data(desc); \ 2725 TYPE *d = vd, *n = vn; \ 2726 float_status *fpst = stat; \ 2727 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2728 d[i] = FUNC(n[i], shift, fpst); \ 2729 } \ 2730 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2731 } 2732 2733 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2734 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2735 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2736 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2737 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2738 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2739 2740 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2741 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2742 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2743 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2744 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2745 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2746 2747 #undef DO_VCVT_FIXED 2748 2749 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2750 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2751 { \ 2752 intptr_t i, oprsz = simd_oprsz(desc); \ 2753 uint32_t rmode = simd_data(desc); \ 2754 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2755 TYPE *d = vd, *n = vn; \ 2756 set_float_rounding_mode(rmode, fpst); \ 2757 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2758 d[i] = FUNC(n[i], 0, fpst); \ 2759 } \ 2760 set_float_rounding_mode(prev_rmode, fpst); \ 2761 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2762 } 2763 2764 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2765 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2766 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2767 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2768 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2769 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2770 2771 #undef DO_VCVT_RMODE 2772 2773 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2774 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2775 { \ 2776 intptr_t i, oprsz = simd_oprsz(desc); \ 2777 uint32_t rmode = simd_data(desc); \ 2778 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2779 TYPE *d = vd, *n = vn; \ 2780 set_float_rounding_mode(rmode, fpst); \ 2781 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2782 d[i] = FUNC(n[i], fpst); \ 2783 } \ 2784 set_float_rounding_mode(prev_rmode, fpst); \ 2785 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2786 } 2787 2788 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2789 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2790 2791 #undef DO_VRINT_RMODE 2792 2793 #ifdef TARGET_AARCH64 2794 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2795 { 2796 const uint8_t *indices = vm; 2797 size_t oprsz = simd_oprsz(desc); 2798 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2799 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2800 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2801 union { 2802 uint8_t b[16]; 2803 uint64_t d[2]; 2804 } result; 2805 2806 /* 2807 * We must construct the final result in a temp, lest the output 2808 * overlaps the input table. For TBL, begin with zero; for TBX, 2809 * begin with the original register contents. Note that we always 2810 * copy 16 bytes here to avoid an extra branch; clearing the high 2811 * bits of the register for oprsz == 8 is handled below. 2812 */ 2813 if (is_tbx) { 2814 memcpy(&result, vd, 16); 2815 } else { 2816 memset(&result, 0, 16); 2817 } 2818 2819 for (size_t i = 0; i < oprsz; ++i) { 2820 uint32_t index = indices[H1(i)]; 2821 2822 if (index < table_len) { 2823 /* 2824 * Convert index (a byte offset into the virtual table 2825 * which is a series of 128-bit vectors concatenated) 2826 * into the correct register element, bearing in mind 2827 * that the table can wrap around from V31 to V0. 2828 */ 2829 const uint8_t *table = (const uint8_t *) 2830 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2831 result.b[H1(i)] = table[H1(index % 16)]; 2832 } 2833 } 2834 2835 memcpy(vd, &result, 16); 2836 clear_tail(vd, oprsz, simd_maxsz(desc)); 2837 } 2838 #endif 2839 2840 /* 2841 * NxN -> N highpart multiply 2842 * 2843 * TODO: expose this as a generic vector operation. 2844 */ 2845 2846 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2847 { 2848 intptr_t i, opr_sz = simd_oprsz(desc); 2849 int8_t *d = vd, *n = vn, *m = vm; 2850 2851 for (i = 0; i < opr_sz; ++i) { 2852 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2853 } 2854 clear_tail(d, opr_sz, simd_maxsz(desc)); 2855 } 2856 2857 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2858 { 2859 intptr_t i, opr_sz = simd_oprsz(desc); 2860 int16_t *d = vd, *n = vn, *m = vm; 2861 2862 for (i = 0; i < opr_sz / 2; ++i) { 2863 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2864 } 2865 clear_tail(d, opr_sz, simd_maxsz(desc)); 2866 } 2867 2868 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2869 { 2870 intptr_t i, opr_sz = simd_oprsz(desc); 2871 int32_t *d = vd, *n = vn, *m = vm; 2872 2873 for (i = 0; i < opr_sz / 4; ++i) { 2874 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2875 } 2876 clear_tail(d, opr_sz, simd_maxsz(desc)); 2877 } 2878 2879 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2880 { 2881 intptr_t i, opr_sz = simd_oprsz(desc); 2882 uint64_t *d = vd, *n = vn, *m = vm; 2883 uint64_t discard; 2884 2885 for (i = 0; i < opr_sz / 8; ++i) { 2886 muls64(&discard, &d[i], n[i], m[i]); 2887 } 2888 clear_tail(d, opr_sz, simd_maxsz(desc)); 2889 } 2890 2891 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2892 { 2893 intptr_t i, opr_sz = simd_oprsz(desc); 2894 uint8_t *d = vd, *n = vn, *m = vm; 2895 2896 for (i = 0; i < opr_sz; ++i) { 2897 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2898 } 2899 clear_tail(d, opr_sz, simd_maxsz(desc)); 2900 } 2901 2902 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2903 { 2904 intptr_t i, opr_sz = simd_oprsz(desc); 2905 uint16_t *d = vd, *n = vn, *m = vm; 2906 2907 for (i = 0; i < opr_sz / 2; ++i) { 2908 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2909 } 2910 clear_tail(d, opr_sz, simd_maxsz(desc)); 2911 } 2912 2913 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2914 
{ 2915 intptr_t i, opr_sz = simd_oprsz(desc); 2916 uint32_t *d = vd, *n = vn, *m = vm; 2917 2918 for (i = 0; i < opr_sz / 4; ++i) { 2919 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2920 } 2921 clear_tail(d, opr_sz, simd_maxsz(desc)); 2922 } 2923 2924 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2925 { 2926 intptr_t i, opr_sz = simd_oprsz(desc); 2927 uint64_t *d = vd, *n = vn, *m = vm; 2928 uint64_t discard; 2929 2930 for (i = 0; i < opr_sz / 8; ++i) { 2931 mulu64(&discard, &d[i], n[i], m[i]); 2932 } 2933 clear_tail(d, opr_sz, simd_maxsz(desc)); 2934 } 2935 2936 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2937 { 2938 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2939 int shr = simd_data(desc); 2940 uint64_t *d = vd, *n = vn, *m = vm; 2941 2942 for (i = 0; i < opr_sz; ++i) { 2943 d[i] = ror64(n[i] ^ m[i], shr); 2944 } 2945 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2946 } 2947 2948 /* 2949 * Integer matrix-multiply accumulate 2950 */ 2951 2952 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2953 { 2954 int8_t *n = vn, *m = vm; 2955 2956 for (intptr_t k = 0; k < 8; ++k) { 2957 sum += n[H1(k)] * m[H1(k)]; 2958 } 2959 return sum; 2960 } 2961 2962 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2963 { 2964 uint8_t *n = vn, *m = vm; 2965 2966 for (intptr_t k = 0; k < 8; ++k) { 2967 sum += n[H1(k)] * m[H1(k)]; 2968 } 2969 return sum; 2970 } 2971 2972 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2973 { 2974 uint8_t *n = vn; 2975 int8_t *m = vm; 2976 2977 for (intptr_t k = 0; k < 8; ++k) { 2978 sum += n[H1(k)] * m[H1(k)]; 2979 } 2980 return sum; 2981 } 2982 2983 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2984 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2985 { 2986 intptr_t seg, opr_sz = simd_oprsz(desc); 2987 2988 for (seg = 0; seg < opr_sz; seg += 16) { 2989 uint32_t *d = vd + seg; 2990 uint32_t *a = va + seg; 2991 uint32_t sum0, sum1, sum2, sum3; 2992 2993 /* 2994 * Process the entire segment at once, writing back the 2995 * results only after we've consumed all of the inputs. 2996 * 2997 * Key to indices by column: 2998 * i j i j 2999 */ 3000 sum0 = a[H4(0 + 0)]; 3001 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 3002 sum1 = a[H4(0 + 1)]; 3003 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 3004 sum2 = a[H4(2 + 0)]; 3005 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 3006 sum3 = a[H4(2 + 1)]; 3007 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 3008 3009 d[H4(0)] = sum0; 3010 d[H4(1)] = sum1; 3011 d[H4(2)] = sum2; 3012 d[H4(3)] = sum3; 3013 } 3014 clear_tail(vd, opr_sz, simd_maxsz(desc)); 3015 } 3016 3017 #define DO_MMLA_B(NAME, INNER) \ 3018 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 3019 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 3020 3021 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 3022 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 3023 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 3024 3025 /* 3026 * BFloat16 Dot Product 3027 */ 3028 3029 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 3030 { 3031 /* 3032 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 3033 * For EBF = 0, we ignore the FPCR bits which determine rounding 3034 * mode and denormal-flushing, and we do unfused multiplies and 3035 * additions with intermediate rounding of all products and sums. 
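 * (Concretely, in the EBF = 0 case each product and each partial sum in
 * bfdotadd() below is an individual float32 operation with its own
 * rounding, performed with the flush-to-zero and round-to-odd-inf
 * settings chosen at the end of this function.)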
3036 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 3037 * and we perform a fused two-way sum-of-products without intermediate 3038 * rounding of the products. 3039 * In either case, we don't set fp exception flags. 3040 * 3041 * EBF is AArch64 only, so even if it's set in the FPCR it has 3042 * no effect on AArch32 instructions. 3043 */ 3044 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 3045 3046 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32]; 3047 set_default_nan_mode(true, statusp); 3048 3049 if (ebf) { 3050 /* EBF=1 needs to do a step with round-to-odd semantics */ 3051 *oddstatusp = *statusp; 3052 set_float_rounding_mode(float_round_to_odd, oddstatusp); 3053 } else { 3054 set_flush_to_zero(true, statusp); 3055 set_flush_inputs_to_zero(true, statusp); 3056 set_float_rounding_mode(float_round_to_odd_inf, statusp); 3057 } 3058 return ebf; 3059 } 3060 3061 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 3062 { 3063 float32 t1, t2; 3064 3065 /* 3066 * Extract each BFloat16 from the element pair, and shift 3067 * them such that they become float32. 3068 */ 3069 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 3070 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 3071 t1 = float32_add(t1, t2, fpst); 3072 t1 = float32_add(sum, t1, fpst); 3073 3074 return t1; 3075 } 3076 3077 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 3078 float_status *fpst, float_status *fpst_odd) 3079 { 3080 float32 s1r = e1 << 16; 3081 float32 s1c = e1 & 0xffff0000u; 3082 float32 s2r = e2 << 16; 3083 float32 s2c = e2 & 0xffff0000u; 3084 float32 t32; 3085 3086 /* C.f. FPProcessNaNs4 */ 3087 if (float32_is_any_nan(s1r) || float32_is_any_nan(s1c) || 3088 float32_is_any_nan(s2r) || float32_is_any_nan(s2c)) { 3089 if (float32_is_signaling_nan(s1r, fpst)) { 3090 t32 = s1r; 3091 } else if (float32_is_signaling_nan(s1c, fpst)) { 3092 t32 = s1c; 3093 } else if (float32_is_signaling_nan(s2r, fpst)) { 3094 t32 = s2r; 3095 } else if (float32_is_signaling_nan(s2c, fpst)) { 3096 t32 = s2c; 3097 } else if (float32_is_any_nan(s1r)) { 3098 t32 = s1r; 3099 } else if (float32_is_any_nan(s1c)) { 3100 t32 = s1c; 3101 } else if (float32_is_any_nan(s2r)) { 3102 t32 = s2r; 3103 } else { 3104 t32 = s2c; 3105 } 3106 /* 3107 * FPConvertNaN(FPProcessNaN(t32)) will be done as part 3108 * of the final addition below. 3109 */ 3110 } else { 3111 /* 3112 * Compare f16_dotadd() in sme_helper.c, but here we have 3113 * bfloat16 inputs. In particular that means that we do not 3114 * want the FPCR.FZ16 flush semantics, so we use the normal 3115 * float_status for the input handling here. 3116 */ 3117 float64 e1r = float32_to_float64(s1r, fpst); 3118 float64 e1c = float32_to_float64(s1c, fpst); 3119 float64 e2r = float32_to_float64(s2r, fpst); 3120 float64 e2c = float32_to_float64(s2c, fpst); 3121 float64 t64; 3122 3123 /* 3124 * The ARM pseudocode function FPDot performs both multiplies 3125 * and the add with a single rounding operation. Emulate this 3126 * by performing the first multiply in round-to-odd, then doing 3127 * the second multiply as fused multiply-add, and rounding to 3128 * float32 all in one step. 3129 */ 3130 t64 = float64_mul(e1r, e2r, fpst_odd); 3131 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 3132 3133 /* This conversion is exact, because we've already rounded. */ 3134 t32 = float64_to_float32(t64, fpst); 3135 } 3136 3137 /* The final accumulation step is not fused. 
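 * Only the two products of the BFloat16 pair share a single rounding
 * (via the round-to-odd multiply plus fused multiply-add above); the
 * accumulation into sum below is an ordinary float32 addition that
 * rounds separately.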
*/ 3138 return float32_add(sum, t32, fpst); 3139 } 3140 3141 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 3142 CPUARMState *env, uint32_t desc) 3143 { 3144 intptr_t i, opr_sz = simd_oprsz(desc); 3145 float32 *d = vd, *a = va; 3146 uint32_t *n = vn, *m = vm; 3147 float_status fpst, fpst_odd; 3148 3149 if (is_ebf(env, &fpst, &fpst_odd)) { 3150 for (i = 0; i < opr_sz / 4; ++i) { 3151 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 3152 } 3153 } else { 3154 for (i = 0; i < opr_sz / 4; ++i) { 3155 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 3156 } 3157 } 3158 clear_tail(d, opr_sz, simd_maxsz(desc)); 3159 } 3160 3161 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 3162 void *va, CPUARMState *env, uint32_t desc) 3163 { 3164 intptr_t i, j, opr_sz = simd_oprsz(desc); 3165 intptr_t index = simd_data(desc); 3166 intptr_t elements = opr_sz / 4; 3167 intptr_t eltspersegment = MIN(16 / 4, elements); 3168 float32 *d = vd, *a = va; 3169 uint32_t *n = vn, *m = vm; 3170 float_status fpst, fpst_odd; 3171 3172 if (is_ebf(env, &fpst, &fpst_odd)) { 3173 for (i = 0; i < elements; i += eltspersegment) { 3174 uint32_t m_idx = m[i + H4(index)]; 3175 3176 for (j = i; j < i + eltspersegment; j++) { 3177 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 3178 } 3179 } 3180 } else { 3181 for (i = 0; i < elements; i += eltspersegment) { 3182 uint32_t m_idx = m[i + H4(index)]; 3183 3184 for (j = i; j < i + eltspersegment; j++) { 3185 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 3186 } 3187 } 3188 } 3189 clear_tail(d, opr_sz, simd_maxsz(desc)); 3190 } 3191 3192 void HELPER(sme2_bfvdot_idx)(void *vd, void *vn, void *vm, 3193 void *va, CPUARMState *env, uint32_t desc) 3194 { 3195 intptr_t i, j, opr_sz = simd_oprsz(desc); 3196 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT, 2); 3197 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 2, 1); 3198 intptr_t elements = opr_sz / 4; 3199 intptr_t eltspersegment = MIN(16 / 4, elements); 3200 float32 *d = vd, *a = va; 3201 uint16_t *n0 = vn; 3202 uint16_t *n1 = vn + sizeof(ARMVectorReg); 3203 uint32_t *m = vm; 3204 float_status fpst, fpst_odd; 3205 3206 if (is_ebf(env, &fpst, &fpst_odd)) { 3207 for (i = 0; i < elements; i += eltspersegment) { 3208 uint32_t m_idx = m[i + H4(idx)]; 3209 3210 for (j = 0; j < eltspersegment; j++) { 3211 uint32_t nn = (n0[H2(2 * (i + j) + sel)]) 3212 | (n1[H2(2 * (i + j) + sel)] << 16); 3213 d[i + H4(j)] = bfdotadd_ebf(a[i + H4(j)], nn, m_idx, 3214 &fpst, &fpst_odd); 3215 } 3216 } 3217 } else { 3218 for (i = 0; i < elements; i += eltspersegment) { 3219 uint32_t m_idx = m[i + H4(idx)]; 3220 3221 for (j = 0; j < eltspersegment; j++) { 3222 uint32_t nn = (n0[H2(2 * (i + j) + sel)]) 3223 | (n1[H2(2 * (i + j) + sel)] << 16); 3224 d[i + H4(j)] = bfdotadd(a[i + H4(j)], nn, m_idx, &fpst); 3225 } 3226 } 3227 } 3228 clear_tail(d, opr_sz, simd_maxsz(desc)); 3229 } 3230 3231 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 3232 CPUARMState *env, uint32_t desc) 3233 { 3234 intptr_t s, opr_sz = simd_oprsz(desc); 3235 float32 *d = vd, *a = va; 3236 uint32_t *n = vn, *m = vm; 3237 float_status fpst, fpst_odd; 3238 3239 if (is_ebf(env, &fpst, &fpst_odd)) { 3240 for (s = 0; s < opr_sz / 4; s += 4) { 3241 float32 sum00, sum01, sum10, sum11; 3242 3243 /* 3244 * Process the entire segment at once, writing back the 3245 * results only after we've consumed all of the inputs. 
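 * (The destination may alias one of the sources, so deferring the four
 * stores guarantees that every a[], n[] and m[] element of the segment
 * is read before any d[] element is written.)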
3246 * 3247 * Key to indices by column: 3248 * i j i k j k 3249 */ 3250 sum00 = a[s + H4(0 + 0)]; 3251 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3252 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3253 3254 sum01 = a[s + H4(0 + 1)]; 3255 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3256 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3257 3258 sum10 = a[s + H4(2 + 0)]; 3259 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3260 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3261 3262 sum11 = a[s + H4(2 + 1)]; 3263 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3264 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3265 3266 d[s + H4(0 + 0)] = sum00; 3267 d[s + H4(0 + 1)] = sum01; 3268 d[s + H4(2 + 0)] = sum10; 3269 d[s + H4(2 + 1)] = sum11; 3270 } 3271 } else { 3272 for (s = 0; s < opr_sz / 4; s += 4) { 3273 float32 sum00, sum01, sum10, sum11; 3274 3275 /* 3276 * Process the entire segment at once, writing back the 3277 * results only after we've consumed all of the inputs. 3278 * 3279 * Key to indices by column: 3280 * i j i k j k 3281 */ 3282 sum00 = a[s + H4(0 + 0)]; 3283 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3284 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3285 3286 sum01 = a[s + H4(0 + 1)]; 3287 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3288 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3289 3290 sum10 = a[s + H4(2 + 0)]; 3291 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3292 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3293 3294 sum11 = a[s + H4(2 + 1)]; 3295 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3296 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3297 3298 d[s + H4(0 + 0)] = sum00; 3299 d[s + H4(0 + 1)] = sum01; 3300 d[s + H4(2 + 0)] = sum10; 3301 d[s + H4(2 + 1)] = sum11; 3302 } 3303 } 3304 clear_tail(d, opr_sz, simd_maxsz(desc)); 3305 } 3306 3307 static void do_bfmlal(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a, 3308 float_status *stat, uint32_t desc, int negx, int negf) 3309 { 3310 intptr_t i, opr_sz = simd_oprsz(desc); 3311 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3312 3313 for (i = 0; i < opr_sz / 4; ++i) { 3314 float32 nn = (negx ^ n[H2(i * 2 + sel)]) << 16; 3315 float32 mm = m[H2(i * 2 + sel)] << 16; 3316 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], negf, stat); 3317 } 3318 clear_tail(d, opr_sz, simd_maxsz(desc)); 3319 } 3320 3321 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3322 float_status *stat, uint32_t desc) 3323 { 3324 do_bfmlal(vd, vn, vm, va, stat, desc, 0, 0); 3325 } 3326 3327 void HELPER(gvec_bfmlsl)(void *vd, void *vn, void *vm, void *va, 3328 float_status *stat, uint32_t desc) 3329 { 3330 do_bfmlal(vd, vn, vm, va, stat, desc, 0x8000, 0); 3331 } 3332 3333 void HELPER(gvec_ah_bfmlsl)(void *vd, void *vn, void *vm, void *va, 3334 float_status *stat, uint32_t desc) 3335 { 3336 do_bfmlal(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product); 3337 } 3338 3339 static void do_bfmlal_idx(float32 *d, bfloat16 *n, bfloat16 *m, float32 *a, 3340 float_status *stat, uint32_t desc, int negx, int negf) 3341 { 3342 intptr_t i, j, opr_sz = simd_oprsz(desc); 3343 
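    /*
     * sel picks the even or odd BFloat16 lane of each 32-bit pair in n,
     * while index selects which BFloat16 element of m is broadcast,
     * applied separately within each 128-bit segment by the loop below.
     */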
intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3344 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3345 intptr_t elements = opr_sz / 4; 3346 intptr_t eltspersegment = MIN(16 / 4, elements); 3347 3348 for (i = 0; i < elements; i += eltspersegment) { 3349 float32 m_idx = m[H2(2 * i + index)] << 16; 3350 3351 for (j = i; j < i + eltspersegment; j++) { 3352 float32 n_j = (negx ^ n[H2(2 * j + sel)]) << 16; 3353 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], negf, stat); 3354 } 3355 } 3356 clear_tail(d, opr_sz, simd_maxsz(desc)); 3357 } 3358 3359 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, void *va, 3360 float_status *stat, uint32_t desc) 3361 { 3362 do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, 0); 3363 } 3364 3365 void HELPER(gvec_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va, 3366 float_status *stat, uint32_t desc) 3367 { 3368 do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0x8000, 0); 3369 } 3370 3371 void HELPER(gvec_ah_bfmlsl_idx)(void *vd, void *vn, void *vm, void *va, 3372 float_status *stat, uint32_t desc) 3373 { 3374 do_bfmlal_idx(vd, vn, vm, va, stat, desc, 0, float_muladd_negate_product); 3375 } 3376 3377 #define DO_CLAMP(NAME, TYPE) \ 3378 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3379 { \ 3380 intptr_t i, opr_sz = simd_oprsz(desc); \ 3381 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3382 TYPE aa = *(TYPE *)(a + i); \ 3383 TYPE nn = *(TYPE *)(n + i); \ 3384 TYPE mm = *(TYPE *)(m + i); \ 3385 TYPE dd = MIN(MAX(aa, nn), mm); \ 3386 *(TYPE *)(d + i) = dd; \ 3387 } \ 3388 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3389 } 3390 3391 DO_CLAMP(gvec_sclamp_b, int8_t) 3392 DO_CLAMP(gvec_sclamp_h, int16_t) 3393 DO_CLAMP(gvec_sclamp_s, int32_t) 3394 DO_CLAMP(gvec_sclamp_d, int64_t) 3395 3396 DO_CLAMP(gvec_uclamp_b, uint8_t) 3397 DO_CLAMP(gvec_uclamp_h, uint16_t) 3398 DO_CLAMP(gvec_uclamp_s, uint32_t) 3399 DO_CLAMP(gvec_uclamp_d, uint64_t) 3400 3401 /* Bit count in each 8-bit word. 
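 * For example, ctpop8(0x2c) == 3, since 0x2c == 0b00101100.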
*/ 3402 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3403 { 3404 intptr_t i, opr_sz = simd_oprsz(desc); 3405 uint8_t *d = vd, *n = vn; 3406 3407 for (i = 0; i < opr_sz; ++i) { 3408 d[i] = ctpop8(n[i]); 3409 } 3410 clear_tail(d, opr_sz, simd_maxsz(desc)); 3411 } 3412 3413 /* Reverse bits in each 8 bit word */ 3414 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3415 { 3416 intptr_t i, opr_sz = simd_oprsz(desc); 3417 uint64_t *d = vd, *n = vn; 3418 3419 for (i = 0; i < opr_sz / 8; ++i) { 3420 d[i] = revbit64(bswap64(n[i])); 3421 } 3422 clear_tail(d, opr_sz, simd_maxsz(desc)); 3423 } 3424 3425 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3426 { 3427 intptr_t i, opr_sz = simd_oprsz(desc); 3428 uint32_t *d = vd, *n = vn; 3429 3430 for (i = 0; i < opr_sz / 4; ++i) { 3431 d[i] = helper_recpe_u32(n[i]); 3432 } 3433 clear_tail(d, opr_sz, simd_maxsz(desc)); 3434 } 3435 3436 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3437 { 3438 intptr_t i, opr_sz = simd_oprsz(desc); 3439 uint32_t *d = vd, *n = vn; 3440 3441 for (i = 0; i < opr_sz / 4; ++i) { 3442 d[i] = helper_rsqrte_u32(n[i]); 3443 } 3444 clear_tail(d, opr_sz, simd_maxsz(desc)); 3445 } 3446 3447 static inline void do_lut_b(void *zd, uint64_t *indexes, uint64_t *table, 3448 unsigned elements, unsigned segbase, 3449 unsigned dstride, unsigned isize, 3450 unsigned tsize, unsigned nreg) 3451 { 3452 for (unsigned r = 0; r < nreg; ++r) { 3453 uint8_t *dst = zd + dstride * r; 3454 unsigned base = segbase + r * elements; 3455 3456 for (unsigned e = 0; e < elements; ++e) { 3457 unsigned index = extractn(indexes, (base + e) * isize, isize); 3458 dst[H1(e)] = extractn(table, index * tsize, 8); 3459 } 3460 } 3461 } 3462 3463 static inline void do_lut_h(void *zd, uint64_t *indexes, uint64_t *table, 3464 unsigned elements, unsigned segbase, 3465 unsigned dstride, unsigned isize, 3466 unsigned tsize, unsigned nreg) 3467 { 3468 for (unsigned r = 0; r < nreg; ++r) { 3469 uint16_t *dst = zd + dstride * r; 3470 unsigned base = segbase + r * elements; 3471 3472 for (unsigned e = 0; e < elements; ++e) { 3473 unsigned index = extractn(indexes, (base + e) * isize, isize); 3474 dst[H2(e)] = extractn(table, index * tsize, 16); 3475 } 3476 } 3477 } 3478 3479 static inline void do_lut_s(void *zd, uint64_t *indexes, uint32_t *table, 3480 unsigned elements, unsigned segbase, 3481 unsigned dstride, unsigned isize, 3482 unsigned tsize, unsigned nreg) 3483 { 3484 for (unsigned r = 0; r < nreg; ++r) { 3485 uint32_t *dst = zd + dstride * r; 3486 unsigned base = segbase + r * elements; 3487 3488 for (unsigned e = 0; e < elements; ++e) { 3489 unsigned index = extractn(indexes, (base + e) * isize, isize); 3490 dst[H4(e)] = table[H4(index)]; 3491 } 3492 } 3493 } 3494 3495 #define DO_SME2_LUT(ISIZE, NREG, SUFF, ESIZE) \ 3496 void helper_sme2_luti##ISIZE##_##NREG##SUFF \ 3497 (void *zd, void *zn, CPUARMState *env, uint32_t desc) \ 3498 { \ 3499 unsigned vl = simd_oprsz(desc); \ 3500 unsigned strided = extract32(desc, SIMD_DATA_SHIFT, 1); \ 3501 unsigned idx = extract32(desc, SIMD_DATA_SHIFT + 1, 4); \ 3502 unsigned elements = vl / ESIZE; \ 3503 unsigned dstride = (!strided ? 1 : NREG == 4 ? 
                                             4 : 8);               \
    unsigned segments = (ESIZE * 8) / (ISIZE * NREG);               \
    unsigned segment = idx & (segments - 1);                        \
    ARMVectorReg indexes;                                           \
    memcpy(&indexes, zn, vl);                                       \
    do_lut_##SUFF(zd, indexes.d, (void *)env->za_state.zt0, elements, \
                  segment * NREG * elements,                        \
                  dstride * sizeof(ARMVectorReg), ISIZE, 32, NREG); \
}

DO_SME2_LUT(2,1,b, 1)
DO_SME2_LUT(2,1,h, 2)
DO_SME2_LUT(2,1,s, 4)
DO_SME2_LUT(2,2,b, 1)
DO_SME2_LUT(2,2,h, 2)
DO_SME2_LUT(2,2,s, 4)
DO_SME2_LUT(2,4,b, 1)
DO_SME2_LUT(2,4,h, 2)
DO_SME2_LUT(2,4,s, 4)

DO_SME2_LUT(4,1,b, 1)
DO_SME2_LUT(4,1,h, 2)
DO_SME2_LUT(4,1,s, 4)
DO_SME2_LUT(4,2,b, 1)
DO_SME2_LUT(4,2,h, 2)
DO_SME2_LUT(4,2,s, 4)
DO_SME2_LUT(4,4,b, 1)
DO_SME2_LUT(4,4,h, 2)
DO_SME2_LUT(4,4,s, 4)

#undef DO_SME2_LUT
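/*
 * The instantiations above cover LUTI2 (2-bit indexes) and LUTI4 (4-bit
 * indexes) for 1, 2 or 4 destination registers and byte, halfword or
 * word elements.  ZT0 is treated as an array of 32-bit table entries:
 * do_lut_b and do_lut_h truncate each entry to the destination element
 * size, while do_lut_s uses it directly.  For example,
 * helper_sme2_luti4_4s expands groups of 4-bit indexes from zn into four
 * vectors of word elements looked up in ZT0.
 */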