/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

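/*
 * For illustration: a predicate byte with bits 0 and 2 set selects
 * bytes 0 and 2 of the corresponding 64-bit lane, so
 *
 *     expand_pred_b_data[0x05] == 0x0000000000ff00ff
 *
 * matching the generator loop shown in the comment above.
 */
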
/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
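
/*
 * For illustration: the classic saturating case for a 16-bit SQRDMULH
 * (no accumulate) is INT16_MIN * INT16_MIN, whose doubled product does
 * not fit in the signed high half:
 *
 *     ret  = (int32_t)-32768 * -32768;   ret == 0x40000000
 *     ret += 1 << 14;                       (rounding)
 *     ret >>= 15;                        ret == 32768
 *
 * 32768 != (int16_t)32768, so the result saturates to INT16_MAX and
 * *sat is set.
 */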

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
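
/*
 * For illustration: each wide accumulator lane consumes four narrow
 * elements, so e.g. gvec_sdot_b computes, per int32_t lane i,
 *
 *     d[i] = a[i] + n[4*i+0]*m[4*i+0] + n[4*i+1]*m[4*i+1]
 *                 + n[4*i+2]*m[4*i+2] + n[4*i+3]*m[4*i+3]
 *
 * with the int8_t inputs widened to the accumulator type before the
 * multiplies.
 */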

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment. But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
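/*
 * For illustration: float32_eq_quiet() and friends return 0 or 1, so the
 * wrappers below negate the result to get the all-zeroes/all-ones element
 * Neon expects, e.g.
 *
 *     float32_ceq(x, x, s) == (uint32_t)-1 == 0xffffffff   (x not a NaN)
 *     float32_ceq(x, y, s) == 0                            (x != y)
 */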
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
                                   float_status *stat)
{
    return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
                                   float_status *stat)
{
    return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
                                   float_status *stat)
{
    return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
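
/*
 * For illustration: with 32-bit elements and a 32-byte SVE vector,
 * gvec_fmul_idx_s with idx == 1 multiplies elements 0..3 by the element
 * at index 1 of the first 128-bit segment of m, and elements 4..7 by the
 * element at index 5 (index 1 within the second segment).
 */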

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
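/*
 * For illustration: gvec_fmla_nf_idx_s computes d[i] = d[i] + n[i] * mm
 * with two separate roundings (a multiply then an add), whereas the fused
 * gvec_vfma_s above rounds once via float32_muladd().
 */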
#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H)                            \
void HELPER(NAME)(void *vd, void *vn, void *vm,                         \
                  float_status *stat, uint32_t desc)                    \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                   \
    intptr_t idx = simd_data(desc);                                     \
    TYPE *d = vd, *n = vn, *m = vm;                                     \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
        TYPE mm = m[H(i + idx)];                                        \
        for (j = 0; j < segment; j++) {                                 \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat);    \
        }                                                               \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below, these accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF)                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va,               \
                  float_status *stat, uint32_t desc)                    \
{                                                                       \
    intptr_t i, j, oprsz = simd_oprsz(desc);                            \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE);                   \
    intptr_t idx = simd_data(desc);                                     \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                            \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
        TYPE mm = m[H(i + idx)];                                        \
        for (j = 0; j < segment; j++) {                                 \
            d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm,               \
                                     a[i + j], NEGF, stat);             \
        }                                                               \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0)

DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0)
DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0)

DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product)
DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX)                 \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm;                              \
    bool q = false;                                                     \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) {                       \
        WTYPE dd = (WTYPE)n[i] OP m[i];                                 \
        if (dd < MIN) {                                                 \
            dd = MIN;                                                   \
            q = true;                                                   \
        } else if (dd > MAX) {                                          \
            dd = MAX;                                                   \
            q = true;                                                   \
        }                                                               \
        d[i] = dd;                                                      \
    }                                                                   \
    if (q) {                                                            \
        uint32_t *qc = vq;                                              \
        qc[0] = 1;                                                      \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT

void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (dd < nn) {
            dd = UINT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (nn < mm) {
            dd = 0;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn + mm;
        if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
                          void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i], mm = m[i], dd = nn - mm;
        if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
            dd = (nn >> 63) ^ ~INT64_MIN;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn,
                           void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        uint64_t nn = n[i];
        int64_t mm = m[i];
        uint64_t dd = nn + mm;

        if (mm < 0) {
            if (nn < (uint64_t)-mm) {
                dd = 0;
                q = true;
            }
        } else {
            if (dd < nn) {
                dd = UINT64_MAX;
                q = true;
            }
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn,
                           void *vm, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    bool q = false;

    for (i = 0; i < oprsz / 8; i++) {
        int64_t nn = n[i];
        uint64_t mm = m[i];
        int64_t dd = nn + mm;

        if (mm > (uint64_t)(INT64_MAX - nn)) {
            dd = INT64_MAX;
            q = true;
        }
        d[i] = dd;
    }
    if (q) {
        uint32_t *qc = vq;
        qc[0] = 1;
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
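
/*
 * Illustrative sketch (not part of the helpers, kept out of the build):
 * the branch-free overflow test used by gvec_sqadd_d() above.  Signed
 * addition overflows exactly when both operands have the same sign and
 * the result has the opposite sign, i.e. ((dd ^ nn) & ~(nn ^ mm)) has its
 * top bit set; on overflow the saturated value is INT64_MAX or INT64_MIN
 * depending on the sign of nn, which is what (nn >> 63) ^ ~INT64_MIN
 * computes.  The hypothetical function below does the addition in
 * uint64_t to stay clear of C's undefined behaviour on signed overflow.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static int64_t demo_sat_add_s64(int64_t nn, int64_t mm, bool *qc)
{
    int64_t dd = (int64_t)((uint64_t)nn + (uint64_t)mm);

    if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
        dd = (nn >> 63) ^ INT64_MAX;    /* same value as ~INT64_MIN */
        *qc = true;
    }
    return dd;
}

/*
 * demo_sat_add_s64(INT64_MAX, 1, &q)  == INT64_MAX, q set;
 * demo_sat_add_s64(INT64_MIN, -1, &q) == INT64_MIN, q set;
 * demo_sat_add_s64(40, 2, &q)         == 42, q untouched.
 */
#endif
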
#define DO_SRA(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] += n[i] >> shift;                                          \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA

#define DO_RSHR(NAME, TYPE)                                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        TYPE tmp = n[i] >> (shift - 1);                                 \
        d[i] = (tmp >> 1) + (tmp & 1);                                  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR

#define DO_RSRA(NAME, TYPE)                                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        TYPE tmp = n[i] >> (shift - 1);                                 \
        d[i] += (tmp >> 1) + (tmp & 1);                                 \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA

#define DO_SRI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI

#define DO_SLI(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]);  \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
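
/*
 * Illustrative sketch (not part of the helpers, kept out of the build):
 * the rounding-shift-right trick used by DO_RSHR/DO_RSRA above.  Shifting
 * by (shift - 1) first and then adding back the low bit is equivalent to
 * adding 1 << (shift - 1) before the final shift, but cannot overflow the
 * element type.  The function name below is hypothetical.
 */
#if 0
#include <stdint.h>

static uint8_t demo_rshr_u8(uint8_t x, int shift)   /* 1 <= shift <= 8 */
{
    uint8_t tmp = x >> (shift - 1);

    return (tmp >> 1) + (tmp & 1);
}

/* demo_rshr_u8(42, 3) == 5, i.e. (42 + 4) >> 3, without forming 42 + 4. */
#endif
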
/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint64_t negx, int negf, uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /*
     * Pre-load all of the f16 data, avoiding overlap issues.
     * Negate all inputs for AH=0 FMLSL at once.
     */
    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
    m_4 = load4_f16(vm, is_q, is_2);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
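
/*
 * Illustrative sketch (not part of the helpers, kept out of the build):
 * a worked example of the bit repacking done by
 * float16_to_float32_by_bits() above for a normal number.  The function
 * and checks below are hypothetical.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint32_t demo_f16_to_f32_bits_normal(uint32_t f16)
{
    uint32_t sign = (f16 >> 15) & 1;
    uint32_t exp  = (f16 >> 10) & 0x1f;   /* assumed 1..30, i.e. normal */
    uint32_t frac = f16 & 0x3ff;

    /* Re-bias the exponent (15 -> 127) and widen the fraction field. */
    return (sign << 31) | ((exp - 15 + 127) << 23) | (frac << 13);
}

static void demo_f16_check(void)
{
    assert(demo_f16_to_f32_bits_normal(0x3c00) == 0x3f800000);  /*  1.0 */
    assert(demo_f16_to_f32_bits_normal(0xc500) == 0xc0a00000);  /* -5.0 */
}
#endif
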
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = is_s ? 0x8000800080008000ull : 0;

    do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = 0;
    int negf = 0;

    if (is_s) {
        if (env->vfp.fpcr & FPCR_AH) {
            negf = float_muladd_negate_product;
        } else {
            negx = 0x8000800080008000ull;
        }
    }
    do_fmlal(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
}

void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    float_status *status = &env->vfp.fp_status[FPST_A64];
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]);
    int negx = 0, negf = 0;

    if (is_s) {
        if (env->vfp.fpcr & FPCR_AH) {
            negf = float_muladd_negate_product;
        } else {
            negx = 0x8000;
        }
    }

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status);
    }
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint64_t negx, int negf, uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /*
     * Pre-load all of the f16 data, avoiding overlap issues.
     * Negate all inputs for AH=0 FMLSL at once.
     */
    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = is_s ? 0x8000800080008000ull : 0;

    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_STD], negx, 0, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A32_F16]));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = 0;
    int negf = 0;

    if (is_s) {
        if (env->vfp.fpcr & FPCR_AH) {
            negf = float_muladd_negate_product;
        } else {
            negx = 0x8000800080008000ull;
        }
    }
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status[FPST_A64], negx, negf, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]));
}

void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    float_status *status = &env->vfp.fp_status[FPST_A64];
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status[FPST_A64_F16]);
    int negx = 0, negf = 0;

    if (is_s) {
        if (env->vfp.fpcr & FPCR_AH) {
            negf = float_muladd_negate_product;
        } else {
            negx = 0x8000;
        }
    }
    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, negf, status);
        }
    }
}

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
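
/*
 * Illustrative sketch (not part of the helpers, kept out of the build):
 * the per-element semantics of the SSHL helpers above.  The shift count
 * is signed: non-negative counts shift left (a count >= the element width
 * gives 0), negative counts shift right arithmetically, with the count
 * clamped to width - 1 so that large negative counts return just the
 * sign.  The function name and checks below are hypothetical.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static int8_t demo_sshl8(int8_t nn, int8_t mm)
{
    int8_t res = 0;

    if (mm >= 0) {
        if (mm < 8) {
            res = nn << mm;
        }
    } else {
        res = nn >> (mm > -8 ? -mm : 7);
    }
    return res;
}

static void demo_sshl8_check(void)
{
    assert(demo_sshl8(3, 2) == 12);       /* left shift */
    assert(demo_sshl8(-64, -3) == -8);    /* arithmetic right shift */
    assert(demo_sshl8(1, 8) == 0);        /* shift count >= width */
    assert(demo_sshl8(-1, -100) == -1);   /* clamped to >> 7 */
}
#endif
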
void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
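
/*
 * Illustrative sketch (not part of the helpers, kept out of the build):
 * one 8x8 carry-less multiply, written out as the shift-and-XOR loop the
 * comment above describes.  clmul_8x8_low() applies the same operation to
 * all eight byte lanes of a uint64_t at once and keeps only the low 8
 * bits of each 16-bit product.  The function name below is hypothetical.
 */
#if 0
#include <stdint.h>

static uint16_t demo_clmul_8x8(uint8_t a, uint8_t b)
{
    uint16_t r = 0;

    for (int i = 0; i < 8; i++) {
        if (a & (1u << i)) {
            r ^= (uint16_t)b << i;      /* XOR the partial product */
        }
    }
    return r;
}

/* demo_clmul_8x8(0x03, 0x03) == 0x05, whereas 3 * 3 == 9. */
#endif
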
/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 r = clmul_64(n[i + hi], m[i + hi]);
        d[i] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = clmul_8x4_packed(nn, mm);
    nn >>= 32;
    mm >>= 32;
    d[1] = clmul_8x4_packed(nn, mm);

    clear_tail(d, 16, simd_maxsz(desc));
}

#ifdef TARGET_AARCH64
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
    }
}

void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
    }
}
#endif

#define DO_CMP0(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                                 \
    }                                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));                           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0

#define DO_ABD(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    TYPE *d = vd, *n = vn, *m = vm;                                     \
                                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {                       \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];                 \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD

#define DO_ABA(NAME, TYPE)                                              \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    TYPE *d = vd, *n = vn, *m = vm;                                     \
                                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {                       \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];                \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA

#define DO_3OP_PAIR(NAME, FUNC, TYPE, H)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm,                         \
                  float_status *stat, uint32_t desc)                    \
{                                                                       \
    ARMVectorReg scratch;                                               \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    intptr_t half = oprsz / sizeof(TYPE) / 2;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                     \
    if (unlikely(d == m)) {                                             \
        m = memcpy(&scratch, m, oprsz);                                 \
    }                                                                   \
    for (intptr_t i = 0; i < half; ++i) {                               \
        d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat);             \
    }                                                                   \
    for (intptr_t i = 0; i < half; ++i) {                               \
        d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat);      \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2)
DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4)
DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, )

DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2)
DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4)
DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, )

DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2)
DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4)
DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, )

DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2)
DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4)
DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, )

DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2)
DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4)
DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, )

#ifdef TARGET_AARCH64
DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2)
DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4)
DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, )

DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2)
DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4)
DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, )
#endif

#undef DO_3OP_PAIR

#define DO_3OP_PAIR(NAME, FUNC, TYPE, H)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    ARMVectorReg scratch;                                               \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    intptr_t half = oprsz / sizeof(TYPE) / 2;                           \
    TYPE *d = vd, *n = vn, *m = vm;                                     \
    if (unlikely(d == m)) {                                             \
        m = memcpy(&scratch, m, oprsz);                                 \
    }                                                                   \
    for (intptr_t i = 0; i < half; ++i) {                               \
        d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]);                   \
    }                                                                   \
    for (intptr_t i = 0; i < half; ++i) {                               \
        d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]);            \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

#define ADD(A, B) (A + B)
DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1)
DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2)
DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4)
DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, )
#undef ADD

DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1)
DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2)
DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4)

DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1)
DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2)
DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4)

DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1)
DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2)
DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4)

DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1)
DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2)
DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4)

#undef DO_3OP_PAIR

#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    int shift = simd_data(desc);                                        \
    TYPE *d = vd, *n = vn;                                              \
    float_status *fpst = stat;                                          \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = FUNC(n[i], shift, fpst);                                 \
    }                                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)

DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t)
DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED

#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    uint32_t rmode = simd_data(desc);                                   \
    uint32_t prev_rmode = get_float_rounding_mode(fpst);                \
    TYPE *d = vd, *n = vn;                                              \
    set_float_rounding_mode(rmode, fpst);                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = FUNC(n[i], 0, fpst);                                     \
    }                                                                   \
    set_float_rounding_mode(prev_rmode, fpst);                          \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t)
DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t)
DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE

#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \
{                                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);                               \
    uint32_t rmode = simd_data(desc);                                   \
    uint32_t prev_rmode = get_float_rounding_mode(fpst);                \
    TYPE *d = vd, *n = vn;                                              \
    set_float_rounding_mode(rmode, fpst);                               \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {                        \
        d[i] = FUNC(n[i], fpst);                                        \
    }                                                                   \
    set_float_rounding_mode(prev_rmode, fpst);                          \
    clear_tail(d, oprsz, simd_maxsz(desc));                             \
}

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE

#ifdef TARGET_AARCH64
void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc)
{
    const uint8_t *indices = vm;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
#endif

/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */

void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}

/*
 * Integer matrix-multiply accumulate
 */

static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *             i   j                    i          j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}

#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
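
/*
 * Illustrative sketch (not part of the helpers, kept out of the build):
 * what one 128-bit segment of the MMLA operations above computes.  The
 * segment holds a 2x8 matrix in Vn, a 2x8 matrix in Vm and a 2x2
 * accumulator, and each output element is an 8-deep dot product of one
 * row of Vn with one row of Vm.  The function name below is hypothetical
 * and ignores the H4()/H1() byte-order swizzles.
 */
#if 0
#include <stdint.h>

static void demo_smmla_segment(int32_t d[4], const int8_t n[16],
                               const int8_t m[16], const int32_t a[4])
{
    for (int row = 0; row < 2; row++) {
        for (int col = 0; col < 2; col++) {
            int32_t sum = a[row * 2 + col];

            for (int k = 0; k < 8; k++) {
                sum += n[row * 8 + k] * m[col * 8 + k];
            }
            d[row * 2 + col] = sum;
        }
    }
}
#endif
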
/*
 * BFloat16 Dot Product
 */

bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp)
{
    /*
     * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF.
     * For EBF = 0, we ignore the FPCR bits which determine rounding
     * mode and denormal-flushing, and we do unfused multiplies and
     * additions with intermediate rounding of all products and sums.
     * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
     * and we perform a fused two-way sum-of-products without intermediate
     * rounding of the products.
     * In either case, we don't set fp exception flags.
     *
     * EBF is AArch64 only, so even if it's set in the FPCR it has
     * no effect on AArch32 instructions.
     */
    bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;

    *statusp = is_a64(env) ? env->vfp.fp_status[FPST_A64]
                           : env->vfp.fp_status[FPST_A32];
    set_default_nan_mode(true, statusp);

    if (ebf) {
        /* EBF=1 needs to do a step with round-to-odd semantics */
        *oddstatusp = *statusp;
        set_float_rounding_mode(float_round_to_odd, oddstatusp);
    } else {
        set_flush_to_zero(true, statusp);
        set_flush_inputs_to_zero(true, statusp);
        set_float_rounding_mode(float_round_to_odd_inf, statusp);
    }
    return ebf;
}

float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
{
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, fpst);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
    t1 = float32_add(t1, t2, fpst);
    t1 = float32_add(sum, t1, fpst);

    return t1;
}
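
/*
 * Illustrative note (not part of the helpers, kept out of the build):
 * bfloat16 is simply the top 16 bits of an IEEE float32, so placing a
 * bfloat16 value in bits [31:16] of a zero word, as bfdotadd() above does
 * with "e1 << 16" and "e1 & 0xffff0000u", yields exactly the float32 with
 * the same sign, exponent and fraction.  The check below is hypothetical.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint32_t demo_bf16_to_f32_bits(uint16_t bf)
{
    return (uint32_t)bf << 16;          /* no re-biasing needed */
}

static void demo_bf16_check(void)
{
    assert(demo_bf16_to_f32_bits(0x3f80) == 0x3f800000);   /*  1.0 */
    assert(demo_bf16_to_f32_bits(0xc0a0) == 0xc0a00000);   /* -5.0 */
}
#endif
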
float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    /*
     * Compare f16_dotadd() in sme_helper.c, but here we have
     * bfloat16 inputs.  In particular that means that we do not
     * want the FPCR.FZ16 flush semantics, so we use the normal
     * float_status for the input handling here.
     */
    float64 e1r = float32_to_float64(e1 << 16, fpst);
    float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
    float64 e2r = float32_to_float64(e2 << 16, fpst);
    float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, fpst_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, fpst);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, fpst);
}

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
                        CPUARMState *env, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
        }
    } else {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *             i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *             i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, float_status *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_CLAMP(NAME, TYPE)                                            \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}