/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
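/*
 * Annotation (not from the original source): both tables map a byte's worth
 * of predicate bits to a 64-bit lane mask, e.g. (illustrative only)
 *
 *     uint64_t mask = expand_pred_b_data[pg & 0xff];
 *     d[i] = (n[i] & mask) | (d[i] & ~mask);
 *
 * The halfword table only needs entries whose odd bits are clear, because
 * only the even predicate bits are significant for halfword elements;
 * that is why the generator above skips any i with (i & 0xaa) set.
 */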

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}
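/*
 * Annotation (not from the original source), a worked example of the
 * saturating case: do_sqrdmlah_h(0x8000, 0x8000, 0, false, true) computes
 * ret = 0x40000000, adds the rounding constant 1 << 14 and shifts right
 * by 15, giving +32768, which does not fit in int16_t; the result
 * saturates to INT16_MAX and *sat (the QC flag for the Neon callers) is
 * set.  The 8-, 32- and 64-bit helpers follow the same pattern at their
 * respective widths.
 */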

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above.  */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}
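/*
 * Annotation (not from the original source): a 128-bit value fits in an
 * int64_t exactly when its high limb equals the sign-extension of the low
 * limb, i.e. hs == (ls >> 63); any other value saturates toward the sign
 * of the high limb.
 */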

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
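/*
 * Annotation (not from the original source): in DO_DOT below, casting one
 * factor to the accumulator type TYPED widens the multiplication, so e.g.
 * gvec_sdot_h computes each int16_t * int16_t product in 64-bit arithmetic
 * before summing four of them into the 64-bit lane:
 *
 *     d[i] = a[i] + (int64_t)n[4*i] * m[4*i] + ... + (int64_t)n[4*i+3] * m[4*i+3];
 */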

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment. But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
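/*
 * Annotation (not from the original source): the FCADD/FCMLA helpers apply
 * the required negation in one of two ways.  With FPCR.AH == 0, negx_* is
 * just the sign bit of the element type and is XORed into one multiplicand,
 * i.e. an explicit pre-negation of that input.  With FPCR.AH == 1 the flip
 * is instead requested from the fused operation via
 * float_muladd_negate_product, so the negation happens inside the
 * multiply-add (which, among other things, leaves NaN inputs unmodified).
 */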
void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    float16 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = negx_real ^ (flip ? mi : mr);
        float16 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], negf_real, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 4, 1);
    uint32_t negf_real = flip ^ negf_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    float32 negx_imag, negx_real;
    intptr_t i, j;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = negx_real ^ (flip ? mi : mr);
        float32 e3 = negx_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], negf_real, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], negf_imag, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
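/*
 * Annotation (not from the original source): the wrappers below implement
 * GE and GT with the operands swapped, using the signaling le/lt
 * comparisons (which raise InvalidOp for any NaN), while EQ uses the quiet
 * equality; negating softfloat's 0/1 result yields the 0/-1 lane mask.
 */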
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_rpres_s, helper_recpe_rpres_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_rpres_s, helper_rsqrte_rpres_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 versions, which use a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}
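/*
 * Annotation (not from the original source): the _nf helpers round twice,
 * once after the multiply and once after the add/sub, which is the AArch32
 * Neon VMLA/VMLS behaviour; the fused helpers below hand the whole
 * operation to float*_muladd so only a single rounding is applied.
 */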

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
                                   float_status *stat)
{
    return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
                                   float_status *stat)
{
    return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
                                   float_status *stat)
{
    return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */
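/*
 * Annotation (not from the original source): e.g. for 32-bit elements with
 * a 32-byte SVE vector and idx == 1, lanes 0-3 are multiplied by m[1] and
 * lanes 4-7 by m[5], i.e. element idx of each 16-byte segment of the
 * index operand.
 */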

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below, these accumulate both from and into Vd.
 */
1719 */ 1720 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1721 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1722 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1723 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1724 1725 #undef DO_FMUL_IDX 1726 1727 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \ 1728 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1729 float_status *stat, uint32_t desc) \ 1730 { \ 1731 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1732 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1733 intptr_t idx = simd_data(desc); \ 1734 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1735 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1736 TYPE mm = m[H(i + idx)]; \ 1737 for (j = 0; j < segment; j++) { \ 1738 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \ 1739 a[i + j], NEGF, stat); \ 1740 } \ 1741 } \ 1742 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1743 } 1744 1745 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) 1746 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) 1747 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) 1748 1749 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) 1750 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) 1751 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) 1752 1753 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) 1754 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) 1755 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) 1756 1757 #undef DO_FMLA_IDX 1758 1759 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1760 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1761 { \ 1762 intptr_t i, oprsz = simd_oprsz(desc); \ 1763 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1764 bool q = false; \ 1765 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1766 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1767 if (dd < MIN) { \ 1768 dd = MIN; \ 1769 q = true; \ 1770 } else if (dd > MAX) { \ 1771 dd = MAX; \ 1772 q = true; \ 1773 } \ 1774 d[i] = dd; \ 1775 } \ 1776 if (q) { \ 1777 uint32_t *qc = vq; \ 1778 qc[0] = 1; \ 1779 } \ 1780 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1781 } 1782 1783 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1784 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1785 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1786 1787 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1788 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1789 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1790 1791 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1792 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1793 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1794 1795 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1796 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1797 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1798 1799 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1800 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1801 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1802 1803 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1804 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1805 DO_SAT(gvec_suqadd_s, int64_t, 
int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1806 1807 #undef DO_SAT 1808 1809 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1810 void *vm, uint32_t desc) 1811 { 1812 intptr_t i, oprsz = simd_oprsz(desc); 1813 uint64_t *d = vd, *n = vn, *m = vm; 1814 bool q = false; 1815 1816 for (i = 0; i < oprsz / 8; i++) { 1817 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1818 if (dd < nn) { 1819 dd = UINT64_MAX; 1820 q = true; 1821 } 1822 d[i] = dd; 1823 } 1824 if (q) { 1825 uint32_t *qc = vq; 1826 qc[0] = 1; 1827 } 1828 clear_tail(d, oprsz, simd_maxsz(desc)); 1829 } 1830 1831 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1832 void *vm, uint32_t desc) 1833 { 1834 intptr_t i, oprsz = simd_oprsz(desc); 1835 uint64_t *d = vd, *n = vn, *m = vm; 1836 bool q = false; 1837 1838 for (i = 0; i < oprsz / 8; i++) { 1839 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1840 if (nn < mm) { 1841 dd = 0; 1842 q = true; 1843 } 1844 d[i] = dd; 1845 } 1846 if (q) { 1847 uint32_t *qc = vq; 1848 qc[0] = 1; 1849 } 1850 clear_tail(d, oprsz, simd_maxsz(desc)); 1851 } 1852 1853 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1854 void *vm, uint32_t desc) 1855 { 1856 intptr_t i, oprsz = simd_oprsz(desc); 1857 int64_t *d = vd, *n = vn, *m = vm; 1858 bool q = false; 1859 1860 for (i = 0; i < oprsz / 8; i++) { 1861 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1862 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1863 dd = (nn >> 63) ^ ~INT64_MIN; 1864 q = true; 1865 } 1866 d[i] = dd; 1867 } 1868 if (q) { 1869 uint32_t *qc = vq; 1870 qc[0] = 1; 1871 } 1872 clear_tail(d, oprsz, simd_maxsz(desc)); 1873 } 1874 1875 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1876 void *vm, uint32_t desc) 1877 { 1878 intptr_t i, oprsz = simd_oprsz(desc); 1879 int64_t *d = vd, *n = vn, *m = vm; 1880 bool q = false; 1881 1882 for (i = 0; i < oprsz / 8; i++) { 1883 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1884 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1885 dd = (nn >> 63) ^ ~INT64_MIN; 1886 q = true; 1887 } 1888 d[i] = dd; 1889 } 1890 if (q) { 1891 uint32_t *qc = vq; 1892 qc[0] = 1; 1893 } 1894 clear_tail(d, oprsz, simd_maxsz(desc)); 1895 } 1896 1897 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1898 void *vm, uint32_t desc) 1899 { 1900 intptr_t i, oprsz = simd_oprsz(desc); 1901 uint64_t *d = vd, *n = vn, *m = vm; 1902 bool q = false; 1903 1904 for (i = 0; i < oprsz / 8; i++) { 1905 uint64_t nn = n[i]; 1906 int64_t mm = m[i]; 1907 uint64_t dd = nn + mm; 1908 1909 if (mm < 0) { 1910 if (nn < (uint64_t)-mm) { 1911 dd = 0; 1912 q = true; 1913 } 1914 } else { 1915 if (dd < nn) { 1916 dd = UINT64_MAX; 1917 q = true; 1918 } 1919 } 1920 d[i] = dd; 1921 } 1922 if (q) { 1923 uint32_t *qc = vq; 1924 qc[0] = 1; 1925 } 1926 clear_tail(d, oprsz, simd_maxsz(desc)); 1927 } 1928 1929 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1930 void *vm, uint32_t desc) 1931 { 1932 intptr_t i, oprsz = simd_oprsz(desc); 1933 uint64_t *d = vd, *n = vn, *m = vm; 1934 bool q = false; 1935 1936 for (i = 0; i < oprsz / 8; i++) { 1937 int64_t nn = n[i]; 1938 uint64_t mm = m[i]; 1939 int64_t dd = nn + mm; 1940 1941 if (mm > (uint64_t)(INT64_MAX - nn)) { 1942 dd = INT64_MAX; 1943 q = true; 1944 } 1945 d[i] = dd; 1946 } 1947 if (q) { 1948 uint32_t *qc = vq; 1949 qc[0] = 1; 1950 } 1951 clear_tail(d, oprsz, simd_maxsz(desc)); 1952 } 1953 1954 #define DO_SRA(NAME, TYPE) \ 1955 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1956 { \ 1957 intptr_t i, oprsz = simd_oprsz(desc); \ 1958 int shift = simd_data(desc); \ 1959 TYPE *d = vd, *n 
= vn; \ 1960 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1961 d[i] += n[i] >> shift; \ 1962 } \ 1963 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1964 } 1965 1966 DO_SRA(gvec_ssra_b, int8_t) 1967 DO_SRA(gvec_ssra_h, int16_t) 1968 DO_SRA(gvec_ssra_s, int32_t) 1969 DO_SRA(gvec_ssra_d, int64_t) 1970 1971 DO_SRA(gvec_usra_b, uint8_t) 1972 DO_SRA(gvec_usra_h, uint16_t) 1973 DO_SRA(gvec_usra_s, uint32_t) 1974 DO_SRA(gvec_usra_d, uint64_t) 1975 1976 #undef DO_SRA 1977 1978 #define DO_RSHR(NAME, TYPE) \ 1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1980 { \ 1981 intptr_t i, oprsz = simd_oprsz(desc); \ 1982 int shift = simd_data(desc); \ 1983 TYPE *d = vd, *n = vn; \ 1984 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1985 TYPE tmp = n[i] >> (shift - 1); \ 1986 d[i] = (tmp >> 1) + (tmp & 1); \ 1987 } \ 1988 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1989 } 1990 1991 DO_RSHR(gvec_srshr_b, int8_t) 1992 DO_RSHR(gvec_srshr_h, int16_t) 1993 DO_RSHR(gvec_srshr_s, int32_t) 1994 DO_RSHR(gvec_srshr_d, int64_t) 1995 1996 DO_RSHR(gvec_urshr_b, uint8_t) 1997 DO_RSHR(gvec_urshr_h, uint16_t) 1998 DO_RSHR(gvec_urshr_s, uint32_t) 1999 DO_RSHR(gvec_urshr_d, uint64_t) 2000 2001 #undef DO_RSHR 2002 2003 #define DO_RSRA(NAME, TYPE) \ 2004 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2005 { \ 2006 intptr_t i, oprsz = simd_oprsz(desc); \ 2007 int shift = simd_data(desc); \ 2008 TYPE *d = vd, *n = vn; \ 2009 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2010 TYPE tmp = n[i] >> (shift - 1); \ 2011 d[i] += (tmp >> 1) + (tmp & 1); \ 2012 } \ 2013 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2014 } 2015 2016 DO_RSRA(gvec_srsra_b, int8_t) 2017 DO_RSRA(gvec_srsra_h, int16_t) 2018 DO_RSRA(gvec_srsra_s, int32_t) 2019 DO_RSRA(gvec_srsra_d, int64_t) 2020 2021 DO_RSRA(gvec_ursra_b, uint8_t) 2022 DO_RSRA(gvec_ursra_h, uint16_t) 2023 DO_RSRA(gvec_ursra_s, uint32_t) 2024 DO_RSRA(gvec_ursra_d, uint64_t) 2025 2026 #undef DO_RSRA 2027 2028 #define DO_SRI(NAME, TYPE) \ 2029 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2030 { \ 2031 intptr_t i, oprsz = simd_oprsz(desc); \ 2032 int shift = simd_data(desc); \ 2033 TYPE *d = vd, *n = vn; \ 2034 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2035 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 2036 } \ 2037 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2038 } 2039 2040 DO_SRI(gvec_sri_b, uint8_t) 2041 DO_SRI(gvec_sri_h, uint16_t) 2042 DO_SRI(gvec_sri_s, uint32_t) 2043 DO_SRI(gvec_sri_d, uint64_t) 2044 2045 #undef DO_SRI 2046 2047 #define DO_SLI(NAME, TYPE) \ 2048 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2049 { \ 2050 intptr_t i, oprsz = simd_oprsz(desc); \ 2051 int shift = simd_data(desc); \ 2052 TYPE *d = vd, *n = vn; \ 2053 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2054 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 2055 } \ 2056 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2057 } 2058 2059 DO_SLI(gvec_sli_b, uint8_t) 2060 DO_SLI(gvec_sli_h, uint16_t) 2061 DO_SLI(gvec_sli_s, uint32_t) 2062 DO_SLI(gvec_sli_d, uint64_t) 2063 2064 #undef DO_SLI 2065 2066 /* 2067 * Convert float16 to float32, raising no exceptions and 2068 * preserving exceptional values, including SNaN. 2069 * This is effectively an unpack+repack operation. 
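 *
 * Example bit patterns (for illustration only):
 *   f16 0x3c00 (1.0)               -> f32 0x3f800000 (1.0)
 *   f16 0x7e00 (quiet NaN)         -> f32 0x7fc00000 (quiet NaN)
 *   f16 0x0001 (smallest denormal) -> f32 0x33800000 (2^-24),
 *                                     or +0.0 when fz16 is set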
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm,
                     CPUARMState *env, uint32_t desc,
                     ARMFPStatusFlavour fpst_idx,
                     uint64_t negx, int negf)
{
    float_status *fpst = &env->vfp.fp_status[fpst_idx];
    bool fz16 = env->vfp.fpcr & FPCR_FZ16;
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /*
     * Pre-load all of the f16 data, avoiding overlap issues.
     * Negate all inputs for AH=0 FMLSL at once.
     */
    n_4 = load4_f16(vn, is_q, is_2) ^ negx;
    m_4 = load4_f16(vm, is_q, is_2);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t negx = is_s ?
0x8000800080008000ull : 0; 2160 2161 do_fmlal(vd, vn, vm, env, desc, FPST_STD, negx, 0); 2162 } 2163 2164 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2165 CPUARMState *env, uint32_t desc) 2166 { 2167 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2168 uint64_t negx = 0; 2169 int negf = 0; 2170 2171 if (is_s) { 2172 if (env->vfp.fpcr & FPCR_AH) { 2173 negf = float_muladd_negate_product; 2174 } else { 2175 negx = 0x8000800080008000ull; 2176 } 2177 } 2178 do_fmlal(vd, vn, vm, env, desc, FPST_A64, negx, negf); 2179 } 2180 2181 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2182 CPUARMState *env, uint32_t desc) 2183 { 2184 intptr_t i, oprsz = simd_oprsz(desc); 2185 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2186 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2187 float_status *status = &env->vfp.fp_status[FPST_A64]; 2188 bool fz16 = env->vfp.fpcr & FPCR_FZ16; 2189 int negx = 0, negf = 0; 2190 2191 if (is_s) { 2192 if (env->vfp.fpcr & FPCR_AH) { 2193 negf = float_muladd_negate_product; 2194 } else { 2195 negx = 0x8000; 2196 } 2197 } 2198 2199 for (i = 0; i < oprsz; i += sizeof(float32)) { 2200 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negx; 2201 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2202 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2203 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2204 float32 aa = *(float32 *)(va + H1_4(i)); 2205 2206 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, negf, status); 2207 } 2208 } 2209 2210 static void do_fmlal_idx(float32 *d, void *vn, void *vm, 2211 CPUARMState *env, uint32_t desc, 2212 ARMFPStatusFlavour fpst_idx, 2213 uint64_t negx, int negf) 2214 { 2215 float_status *fpst = &env->vfp.fp_status[fpst_idx]; 2216 bool fz16 = env->vfp.fpcr & FPCR_FZ16; 2217 intptr_t i, oprsz = simd_oprsz(desc); 2218 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2219 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2220 int is_q = oprsz == 16; 2221 uint64_t n_4; 2222 float32 m_1; 2223 2224 /* 2225 * Pre-load all of the f16 data, avoiding overlap issues. 2226 * Negate all inputs for AH=0 FMLSL at once. 2227 */ 2228 n_4 = load4_f16(vn, is_q, is_2) ^ negx; 2229 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2230 2231 for (i = 0; i < oprsz / 4; i++) { 2232 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2233 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], negf, fpst); 2234 } 2235 clear_tail(d, oprsz, simd_maxsz(desc)); 2236 } 2237 2238 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2239 CPUARMState *env, uint32_t desc) 2240 { 2241 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2242 uint64_t negx = is_s ? 
0x8000800080008000ull : 0; 2243 2244 do_fmlal_idx(vd, vn, vm, env, desc, FPST_STD, negx, 0); 2245 } 2246 2247 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2248 CPUARMState *env, uint32_t desc) 2249 { 2250 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2251 uint64_t negx = 0; 2252 int negf = 0; 2253 2254 if (is_s) { 2255 if (env->vfp.fpcr & FPCR_AH) { 2256 negf = float_muladd_negate_product; 2257 } else { 2258 negx = 0x8000800080008000ull; 2259 } 2260 } 2261 do_fmlal_idx(vd, vn, vm, env, desc, FPST_A64, negx, negf); 2262 } 2263 2264 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2265 CPUARMState *env, uint32_t desc) 2266 { 2267 intptr_t i, j, oprsz = simd_oprsz(desc); 2268 bool is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2269 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2270 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2271 float_status *status = &env->vfp.fp_status[FPST_A64]; 2272 bool fz16 = env->vfp.fpcr & FPCR_FZ16; 2273 int negx = 0, negf = 0; 2274 2275 if (is_s) { 2276 if (env->vfp.fpcr & FPCR_AH) { 2277 negf = float_muladd_negate_product; 2278 } else { 2279 negx = 0x8000; 2280 } 2281 } 2282 for (i = 0; i < oprsz; i += 16) { 2283 float16 mm_16 = *(float16 *)(vm + i + idx); 2284 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2285 2286 for (j = 0; j < 16; j += sizeof(float32)) { 2287 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negx; 2288 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2289 float32 aa = *(float32 *)(va + H1_4(i + j)); 2290 2291 *(float32 *)(vd + H1_4(i + j)) = 2292 float32_muladd(nn, mm, aa, negf, status); 2293 } 2294 } 2295 } 2296 2297 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2298 { 2299 intptr_t i, opr_sz = simd_oprsz(desc); 2300 int8_t *d = vd, *n = vn, *m = vm; 2301 2302 for (i = 0; i < opr_sz; ++i) { 2303 int8_t mm = m[i]; 2304 int8_t nn = n[i]; 2305 int8_t res = 0; 2306 if (mm >= 0) { 2307 if (mm < 8) { 2308 res = nn << mm; 2309 } 2310 } else { 2311 res = nn >> (mm > -8 ? -mm : 7); 2312 } 2313 d[i] = res; 2314 } 2315 clear_tail(d, opr_sz, simd_maxsz(desc)); 2316 } 2317 2318 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2319 { 2320 intptr_t i, opr_sz = simd_oprsz(desc); 2321 int16_t *d = vd, *n = vn, *m = vm; 2322 2323 for (i = 0; i < opr_sz / 2; ++i) { 2324 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2325 int16_t nn = n[i]; 2326 int16_t res = 0; 2327 if (mm >= 0) { 2328 if (mm < 16) { 2329 res = nn << mm; 2330 } 2331 } else { 2332 res = nn >> (mm > -16 ? 
-mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
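 *
 * For reference, clmul_64(n, m) computes the value produced by this
 * bit-serial sketch (which assumes a host 128-bit integer type and is
 * for illustration only):
 *
 *   unsigned __int128 r = 0;
 *   for (int i = 0; i < 64; i++) {
 *       if ((n >> i) & 1) {
 *           r ^= (unsigned __int128)m << i;
 *       }
 *   }
 *
 * The low and high halves of r land in d[i] and d[i + 1] below.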
2409 */ 2410 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2411 { 2412 intptr_t i, opr_sz = simd_oprsz(desc); 2413 intptr_t hi = simd_data(desc); 2414 uint64_t *d = vd, *n = vn, *m = vm; 2415 2416 for (i = 0; i < opr_sz / 8; i += 2) { 2417 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2418 d[i] = int128_getlo(r); 2419 d[i + 1] = int128_gethi(r); 2420 } 2421 clear_tail(d, opr_sz, simd_maxsz(desc)); 2422 } 2423 2424 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2425 { 2426 int hi = simd_data(desc); 2427 uint64_t *d = vd, *n = vn, *m = vm; 2428 uint64_t nn = n[hi], mm = m[hi]; 2429 2430 d[0] = clmul_8x4_packed(nn, mm); 2431 nn >>= 32; 2432 mm >>= 32; 2433 d[1] = clmul_8x4_packed(nn, mm); 2434 2435 clear_tail(d, 16, simd_maxsz(desc)); 2436 } 2437 2438 #ifdef TARGET_AARCH64 2439 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2440 { 2441 int shift = simd_data(desc) * 8; 2442 intptr_t i, opr_sz = simd_oprsz(desc); 2443 uint64_t *d = vd, *n = vn, *m = vm; 2444 2445 for (i = 0; i < opr_sz / 8; ++i) { 2446 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2447 } 2448 } 2449 2450 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2451 { 2452 intptr_t sel = H4(simd_data(desc)); 2453 intptr_t i, opr_sz = simd_oprsz(desc); 2454 uint32_t *n = vn, *m = vm; 2455 uint64_t *d = vd; 2456 2457 for (i = 0; i < opr_sz / 8; ++i) { 2458 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2459 } 2460 } 2461 #endif 2462 2463 #define DO_CMP0(NAME, TYPE, OP) \ 2464 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2465 { \ 2466 intptr_t i, opr_sz = simd_oprsz(desc); \ 2467 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2468 TYPE nn = *(TYPE *)(vn + i); \ 2469 *(TYPE *)(vd + i) = -(nn OP 0); \ 2470 } \ 2471 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2472 } 2473 2474 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2475 DO_CMP0(gvec_clt0_b, int8_t, <) 2476 DO_CMP0(gvec_cle0_b, int8_t, <=) 2477 DO_CMP0(gvec_cgt0_b, int8_t, >) 2478 DO_CMP0(gvec_cge0_b, int8_t, >=) 2479 2480 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2481 DO_CMP0(gvec_clt0_h, int16_t, <) 2482 DO_CMP0(gvec_cle0_h, int16_t, <=) 2483 DO_CMP0(gvec_cgt0_h, int16_t, >) 2484 DO_CMP0(gvec_cge0_h, int16_t, >=) 2485 2486 #undef DO_CMP0 2487 2488 #define DO_ABD(NAME, TYPE) \ 2489 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2490 { \ 2491 intptr_t i, opr_sz = simd_oprsz(desc); \ 2492 TYPE *d = vd, *n = vn, *m = vm; \ 2493 \ 2494 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2495 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2496 } \ 2497 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2498 } 2499 2500 DO_ABD(gvec_sabd_b, int8_t) 2501 DO_ABD(gvec_sabd_h, int16_t) 2502 DO_ABD(gvec_sabd_s, int32_t) 2503 DO_ABD(gvec_sabd_d, int64_t) 2504 2505 DO_ABD(gvec_uabd_b, uint8_t) 2506 DO_ABD(gvec_uabd_h, uint16_t) 2507 DO_ABD(gvec_uabd_s, uint32_t) 2508 DO_ABD(gvec_uabd_d, uint64_t) 2509 2510 #undef DO_ABD 2511 2512 #define DO_ABA(NAME, TYPE) \ 2513 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2514 { \ 2515 intptr_t i, opr_sz = simd_oprsz(desc); \ 2516 TYPE *d = vd, *n = vn, *m = vm; \ 2517 \ 2518 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2519 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2520 } \ 2521 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2522 } 2523 2524 DO_ABA(gvec_saba_b, int8_t) 2525 DO_ABA(gvec_saba_h, int16_t) 2526 DO_ABA(gvec_saba_s, int32_t) 2527 DO_ABA(gvec_saba_d, int64_t) 2528 2529 DO_ABA(gvec_uaba_b, uint8_t) 2530 DO_ABA(gvec_uaba_h, uint16_t) 2531 DO_ABA(gvec_uaba_s, uint32_t) 2532 DO_ABA(gvec_uaba_d, uint64_t) 2533 2534 #undef DO_ABA 2535 2536 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2537 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2538 float_status *stat, uint32_t desc) \ 2539 { \ 2540 ARMVectorReg scratch; \ 2541 intptr_t oprsz = simd_oprsz(desc); \ 2542 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2543 TYPE *d = vd, *n = vn, *m = vm; \ 2544 if (unlikely(d == m)) { \ 2545 m = memcpy(&scratch, m, oprsz); \ 2546 } \ 2547 for (intptr_t i = 0; i < half; ++i) { \ 2548 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2549 } \ 2550 for (intptr_t i = 0; i < half; ++i) { \ 2551 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2552 } \ 2553 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2554 } 2555 2556 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2557 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2558 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2559 2560 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2561 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2562 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2563 2564 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2565 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2566 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2567 2568 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2569 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2570 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2571 2572 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2573 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2574 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2575 2576 #ifdef TARGET_AARCH64 2577 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2578 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2579 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2580 2581 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2582 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2583 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2584 #endif 2585 2586 #undef DO_3OP_PAIR 2587 2588 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2589 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2590 { \ 2591 ARMVectorReg scratch; \ 2592 intptr_t oprsz = simd_oprsz(desc); \ 2593 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2594 TYPE *d = vd, *n = vn, *m = vm; \ 2595 if (unlikely(d == m)) { \ 2596 m = memcpy(&scratch, m, oprsz); \ 2597 } \ 2598 for (intptr_t i = 0; i < half; ++i) { \ 2599 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2600 } \ 2601 for (intptr_t i = 0; i < half; ++i) { \ 2602 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2603 } \ 2604 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2605 } 2606 2607 #define ADD(A, B) (A + B) 2608 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2609 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2610 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2611 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2612 #undef ADD 2613 2614 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2615 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2616 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2617 2618 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2619 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2620 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2621 2622 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2623 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2624 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2625 2626 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2627 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2628 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2629 2630 #undef DO_3OP_PAIR 2631 2632 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2633 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2634 { \ 2635 intptr_t i, oprsz = simd_oprsz(desc); \ 2636 int shift = simd_data(desc); \ 2637 TYPE *d = vd, *n = vn; \ 2638 float_status *fpst = stat; \ 2639 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2640 d[i] = FUNC(n[i], shift, fpst); \ 2641 } \ 2642 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2643 } 2644 2645 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2646 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2647 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2648 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2649 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2650 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2651 2652 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2653 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2654 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2655 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2656 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2657 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2658 2659 #undef DO_VCVT_FIXED 2660 2661 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2662 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2663 { \ 2664 intptr_t i, oprsz = simd_oprsz(desc); \ 2665 uint32_t rmode = simd_data(desc); \ 2666 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2667 TYPE *d = vd, *n = vn; \ 2668 set_float_rounding_mode(rmode, fpst); \ 2669 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2670 d[i] = FUNC(n[i], 0, fpst); \ 2671 } \ 2672 set_float_rounding_mode(prev_rmode, fpst); \ 2673 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2674 } 2675 2676 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2677 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2678 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2679 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2680 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2681 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2682 2683 #undef DO_VCVT_RMODE 2684 2685 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2686 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2687 { \ 2688 intptr_t i, oprsz = simd_oprsz(desc); \ 2689 uint32_t rmode = simd_data(desc); \ 2690 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2691 TYPE *d = vd, *n = vn; \ 2692 set_float_rounding_mode(rmode, fpst); \ 2693 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2694 d[i] = FUNC(n[i], fpst); \ 2695 } \ 2696 set_float_rounding_mode(prev_rmode, fpst); \ 2697 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2698 } 2699 2700 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2701 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2702 2703 #undef DO_VRINT_RMODE 2704 2705 #ifdef TARGET_AARCH64 2706 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2707 { 2708 const uint8_t *indices = vm; 2709 size_t oprsz = simd_oprsz(desc); 2710 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2711 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2712 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2713 union { 2714 uint8_t b[16]; 2715 uint64_t d[2]; 2716 } result; 2717 2718 /* 2719 * We must construct the final result in a temp, lest the output 2720 * overlaps the input table. For TBL, begin with zero; for TBX, 2721 * begin with the original register contents. Note that we always 2722 * copy 16 bytes here to avoid an extra branch; clearing the high 2723 * bits of the register for oprsz == 8 is handled below. 2724 */ 2725 if (is_tbx) { 2726 memcpy(&result, vd, 16); 2727 } else { 2728 memset(&result, 0, 16); 2729 } 2730 2731 for (size_t i = 0; i < oprsz; ++i) { 2732 uint32_t index = indices[H1(i)]; 2733 2734 if (index < table_len) { 2735 /* 2736 * Convert index (a byte offset into the virtual table 2737 * which is a series of 128-bit vectors concatenated) 2738 * into the correct register element, bearing in mind 2739 * that the table can wrap around from V31 to V0. 2740 */ 2741 const uint8_t *table = (const uint8_t *) 2742 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2743 result.b[H1(i)] = table[H1(index % 16)]; 2744 } 2745 } 2746 2747 memcpy(vd, &result, 16); 2748 clear_tail(vd, oprsz, simd_maxsz(desc)); 2749 } 2750 #endif 2751 2752 /* 2753 * NxN -> N highpart multiply 2754 * 2755 * TODO: expose this as a generic vector operation. 2756 */ 2757 2758 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2759 { 2760 intptr_t i, opr_sz = simd_oprsz(desc); 2761 int8_t *d = vd, *n = vn, *m = vm; 2762 2763 for (i = 0; i < opr_sz; ++i) { 2764 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2765 } 2766 clear_tail(d, opr_sz, simd_maxsz(desc)); 2767 } 2768 2769 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2770 { 2771 intptr_t i, opr_sz = simd_oprsz(desc); 2772 int16_t *d = vd, *n = vn, *m = vm; 2773 2774 for (i = 0; i < opr_sz / 2; ++i) { 2775 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2776 } 2777 clear_tail(d, opr_sz, simd_maxsz(desc)); 2778 } 2779 2780 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2781 { 2782 intptr_t i, opr_sz = simd_oprsz(desc); 2783 int32_t *d = vd, *n = vn, *m = vm; 2784 2785 for (i = 0; i < opr_sz / 4; ++i) { 2786 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2787 } 2788 clear_tail(d, opr_sz, simd_maxsz(desc)); 2789 } 2790 2791 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2792 { 2793 intptr_t i, opr_sz = simd_oprsz(desc); 2794 uint64_t *d = vd, *n = vn, *m = vm; 2795 uint64_t discard; 2796 2797 for (i = 0; i < opr_sz / 8; ++i) { 2798 muls64(&discard, &d[i], n[i], m[i]); 2799 } 2800 clear_tail(d, opr_sz, simd_maxsz(desc)); 2801 } 2802 2803 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2804 { 2805 intptr_t i, opr_sz = simd_oprsz(desc); 2806 uint8_t *d = vd, *n = vn, *m = vm; 2807 2808 for (i = 0; i < opr_sz; ++i) { 2809 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2810 } 2811 clear_tail(d, opr_sz, simd_maxsz(desc)); 2812 } 2813 2814 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2815 { 2816 intptr_t i, opr_sz = simd_oprsz(desc); 2817 uint16_t *d = vd, *n = vn, *m = vm; 2818 2819 for (i = 0; i < opr_sz / 2; ++i) { 2820 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2821 } 2822 clear_tail(d, opr_sz, simd_maxsz(desc)); 2823 } 2824 2825 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2826 
{ 2827 intptr_t i, opr_sz = simd_oprsz(desc); 2828 uint32_t *d = vd, *n = vn, *m = vm; 2829 2830 for (i = 0; i < opr_sz / 4; ++i) { 2831 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2832 } 2833 clear_tail(d, opr_sz, simd_maxsz(desc)); 2834 } 2835 2836 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2837 { 2838 intptr_t i, opr_sz = simd_oprsz(desc); 2839 uint64_t *d = vd, *n = vn, *m = vm; 2840 uint64_t discard; 2841 2842 for (i = 0; i < opr_sz / 8; ++i) { 2843 mulu64(&discard, &d[i], n[i], m[i]); 2844 } 2845 clear_tail(d, opr_sz, simd_maxsz(desc)); 2846 } 2847 2848 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2849 { 2850 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2851 int shr = simd_data(desc); 2852 uint64_t *d = vd, *n = vn, *m = vm; 2853 2854 for (i = 0; i < opr_sz; ++i) { 2855 d[i] = ror64(n[i] ^ m[i], shr); 2856 } 2857 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2858 } 2859 2860 /* 2861 * Integer matrix-multiply accumulate 2862 */ 2863 2864 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2865 { 2866 int8_t *n = vn, *m = vm; 2867 2868 for (intptr_t k = 0; k < 8; ++k) { 2869 sum += n[H1(k)] * m[H1(k)]; 2870 } 2871 return sum; 2872 } 2873 2874 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2875 { 2876 uint8_t *n = vn, *m = vm; 2877 2878 for (intptr_t k = 0; k < 8; ++k) { 2879 sum += n[H1(k)] * m[H1(k)]; 2880 } 2881 return sum; 2882 } 2883 2884 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2885 { 2886 uint8_t *n = vn; 2887 int8_t *m = vm; 2888 2889 for (intptr_t k = 0; k < 8; ++k) { 2890 sum += n[H1(k)] * m[H1(k)]; 2891 } 2892 return sum; 2893 } 2894 2895 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2896 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2897 { 2898 intptr_t seg, opr_sz = simd_oprsz(desc); 2899 2900 for (seg = 0; seg < opr_sz; seg += 16) { 2901 uint32_t *d = vd + seg; 2902 uint32_t *a = va + seg; 2903 uint32_t sum0, sum1, sum2, sum3; 2904 2905 /* 2906 * Process the entire segment at once, writing back the 2907 * results only after we've consumed all of the inputs. 2908 * 2909 * Key to indices by column: 2910 * i j i j 2911 */ 2912 sum0 = a[H4(0 + 0)]; 2913 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2914 sum1 = a[H4(0 + 1)]; 2915 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2916 sum2 = a[H4(2 + 0)]; 2917 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2918 sum3 = a[H4(2 + 1)]; 2919 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2920 2921 d[H4(0)] = sum0; 2922 d[H4(1)] = sum1; 2923 d[H4(2)] = sum2; 2924 d[H4(3)] = sum3; 2925 } 2926 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2927 } 2928 2929 #define DO_MMLA_B(NAME, INNER) \ 2930 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2931 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2932 2933 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2934 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2935 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2936 2937 /* 2938 * BFloat16 Dot Product 2939 */ 2940 2941 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2942 { 2943 /* 2944 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2945 * For EBF = 0, we ignore the FPCR bits which determine rounding 2946 * mode and denormal-flushing, and we do unfused multiplies and 2947 * additions with intermediate rounding of all products and sums. 
2948 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2949 * and we perform a fused two-way sum-of-products without intermediate 2950 * rounding of the products. 2951 * In either case, we don't set fp exception flags. 2952 * 2953 * EBF is AArch64 only, so even if it's set in the FPCR it has 2954 * no effect on AArch32 instructions. 2955 */ 2956 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2957 2958 *statusp = env->vfp.fp_status[is_a64(env) ? FPST_A64 : FPST_A32]; 2959 set_default_nan_mode(true, statusp); 2960 2961 if (ebf) { 2962 /* EBF=1 needs to do a step with round-to-odd semantics */ 2963 *oddstatusp = *statusp; 2964 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2965 } else { 2966 set_flush_to_zero(true, statusp); 2967 set_flush_inputs_to_zero(true, statusp); 2968 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2969 } 2970 return ebf; 2971 } 2972 2973 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2974 { 2975 float32 t1, t2; 2976 2977 /* 2978 * Extract each BFloat16 from the element pair, and shift 2979 * them such that they become float32. 2980 */ 2981 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2982 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2983 t1 = float32_add(t1, t2, fpst); 2984 t1 = float32_add(sum, t1, fpst); 2985 2986 return t1; 2987 } 2988 2989 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2990 float_status *fpst, float_status *fpst_odd) 2991 { 2992 /* 2993 * Compare f16_dotadd() in sme_helper.c, but here we have 2994 * bfloat16 inputs. In particular that means that we do not 2995 * want the FPCR.FZ16 flush semantics, so we use the normal 2996 * float_status for the input handling here. 2997 */ 2998 float64 e1r = float32_to_float64(e1 << 16, fpst); 2999 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 3000 float64 e2r = float32_to_float64(e2 << 16, fpst); 3001 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 3002 float64 t64; 3003 float32 t32; 3004 3005 /* 3006 * The ARM pseudocode function FPDot performs both multiplies 3007 * and the add with a single rounding operation. Emulate this 3008 * by performing the first multiply in round-to-odd, then doing 3009 * the second multiply as fused multiply-add, and rounding to 3010 * float32 all in one step. 3011 */ 3012 t64 = float64_mul(e1r, e2r, fpst_odd); 3013 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 3014 3015 /* This conversion is exact, because we've already rounded. */ 3016 t32 = float64_to_float32(t64, fpst); 3017 3018 /* The final accumulation step is not fused. 
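 * The rounded sum-of-products t32 is added to the running accumulator
 * sum with an ordinary, separately rounded float32_add below.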
*/ 3019 return float32_add(sum, t32, fpst); 3020 } 3021 3022 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 3023 CPUARMState *env, uint32_t desc) 3024 { 3025 intptr_t i, opr_sz = simd_oprsz(desc); 3026 float32 *d = vd, *a = va; 3027 uint32_t *n = vn, *m = vm; 3028 float_status fpst, fpst_odd; 3029 3030 if (is_ebf(env, &fpst, &fpst_odd)) { 3031 for (i = 0; i < opr_sz / 4; ++i) { 3032 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 3033 } 3034 } else { 3035 for (i = 0; i < opr_sz / 4; ++i) { 3036 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 3037 } 3038 } 3039 clear_tail(d, opr_sz, simd_maxsz(desc)); 3040 } 3041 3042 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 3043 void *va, CPUARMState *env, uint32_t desc) 3044 { 3045 intptr_t i, j, opr_sz = simd_oprsz(desc); 3046 intptr_t index = simd_data(desc); 3047 intptr_t elements = opr_sz / 4; 3048 intptr_t eltspersegment = MIN(16 / 4, elements); 3049 float32 *d = vd, *a = va; 3050 uint32_t *n = vn, *m = vm; 3051 float_status fpst, fpst_odd; 3052 3053 if (is_ebf(env, &fpst, &fpst_odd)) { 3054 for (i = 0; i < elements; i += eltspersegment) { 3055 uint32_t m_idx = m[i + H4(index)]; 3056 3057 for (j = i; j < i + eltspersegment; j++) { 3058 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 3059 } 3060 } 3061 } else { 3062 for (i = 0; i < elements; i += eltspersegment) { 3063 uint32_t m_idx = m[i + H4(index)]; 3064 3065 for (j = i; j < i + eltspersegment; j++) { 3066 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 3067 } 3068 } 3069 } 3070 clear_tail(d, opr_sz, simd_maxsz(desc)); 3071 } 3072 3073 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 3074 CPUARMState *env, uint32_t desc) 3075 { 3076 intptr_t s, opr_sz = simd_oprsz(desc); 3077 float32 *d = vd, *a = va; 3078 uint32_t *n = vn, *m = vm; 3079 float_status fpst, fpst_odd; 3080 3081 if (is_ebf(env, &fpst, &fpst_odd)) { 3082 for (s = 0; s < opr_sz / 4; s += 4) { 3083 float32 sum00, sum01, sum10, sum11; 3084 3085 /* 3086 * Process the entire segment at once, writing back the 3087 * results only after we've consumed all of the inputs. 3088 * 3089 * Key to indices by column: 3090 * i j i k j k 3091 */ 3092 sum00 = a[s + H4(0 + 0)]; 3093 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3094 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3095 3096 sum01 = a[s + H4(0 + 1)]; 3097 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3098 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3099 3100 sum10 = a[s + H4(2 + 0)]; 3101 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3102 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3103 3104 sum11 = a[s + H4(2 + 1)]; 3105 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3106 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3107 3108 d[s + H4(0 + 0)] = sum00; 3109 d[s + H4(0 + 1)] = sum01; 3110 d[s + H4(2 + 0)] = sum10; 3111 d[s + H4(2 + 1)] = sum11; 3112 } 3113 } else { 3114 for (s = 0; s < opr_sz / 4; s += 4) { 3115 float32 sum00, sum01, sum10, sum11; 3116 3117 /* 3118 * Process the entire segment at once, writing back the 3119 * results only after we've consumed all of the inputs. 
3120 * 3121 * Key to indices by column: 3122 * i j i k j k 3123 */ 3124 sum00 = a[s + H4(0 + 0)]; 3125 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3126 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3127 3128 sum01 = a[s + H4(0 + 1)]; 3129 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3130 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3131 3132 sum10 = a[s + H4(2 + 0)]; 3133 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3134 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3135 3136 sum11 = a[s + H4(2 + 1)]; 3137 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3138 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3139 3140 d[s + H4(0 + 0)] = sum00; 3141 d[s + H4(0 + 1)] = sum01; 3142 d[s + H4(2 + 0)] = sum10; 3143 d[s + H4(2 + 1)] = sum11; 3144 } 3145 } 3146 clear_tail(d, opr_sz, simd_maxsz(desc)); 3147 } 3148 3149 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3150 float_status *stat, uint32_t desc) 3151 { 3152 intptr_t i, opr_sz = simd_oprsz(desc); 3153 intptr_t sel = simd_data(desc); 3154 float32 *d = vd, *a = va; 3155 bfloat16 *n = vn, *m = vm; 3156 3157 for (i = 0; i < opr_sz / 4; ++i) { 3158 float32 nn = n[H2(i * 2 + sel)] << 16; 3159 float32 mm = m[H2(i * 2 + sel)] << 16; 3160 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3161 } 3162 clear_tail(d, opr_sz, simd_maxsz(desc)); 3163 } 3164 3165 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3166 void *va, float_status *stat, uint32_t desc) 3167 { 3168 intptr_t i, j, opr_sz = simd_oprsz(desc); 3169 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3170 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3171 intptr_t elements = opr_sz / 4; 3172 intptr_t eltspersegment = MIN(16 / 4, elements); 3173 float32 *d = vd, *a = va; 3174 bfloat16 *n = vn, *m = vm; 3175 3176 for (i = 0; i < elements; i += eltspersegment) { 3177 float32 m_idx = m[H2(2 * i + index)] << 16; 3178 3179 for (j = i; j < i + eltspersegment; j++) { 3180 float32 n_j = n[H2(2 * j + sel)] << 16; 3181 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3182 } 3183 } 3184 clear_tail(d, opr_sz, simd_maxsz(desc)); 3185 } 3186 3187 #define DO_CLAMP(NAME, TYPE) \ 3188 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3189 { \ 3190 intptr_t i, opr_sz = simd_oprsz(desc); \ 3191 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3192 TYPE aa = *(TYPE *)(a + i); \ 3193 TYPE nn = *(TYPE *)(n + i); \ 3194 TYPE mm = *(TYPE *)(m + i); \ 3195 TYPE dd = MIN(MAX(aa, nn), mm); \ 3196 *(TYPE *)(d + i) = dd; \ 3197 } \ 3198 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3199 } 3200 3201 DO_CLAMP(gvec_sclamp_b, int8_t) 3202 DO_CLAMP(gvec_sclamp_h, int16_t) 3203 DO_CLAMP(gvec_sclamp_s, int32_t) 3204 DO_CLAMP(gvec_sclamp_d, int64_t) 3205 3206 DO_CLAMP(gvec_uclamp_b, uint8_t) 3207 DO_CLAMP(gvec_uclamp_h, uint16_t) 3208 DO_CLAMP(gvec_uclamp_s, uint32_t) 3209 DO_CLAMP(gvec_uclamp_d, uint64_t) 3210 3211 /* Bit count in each 8-bit word. 
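 * E.g. ctpop8(0xa3) == 4, since 0xa3 == 0b10100011 has four set bits.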
*/ 3212 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3213 { 3214 intptr_t i, opr_sz = simd_oprsz(desc); 3215 uint8_t *d = vd, *n = vn; 3216 3217 for (i = 0; i < opr_sz; ++i) { 3218 d[i] = ctpop8(n[i]); 3219 } 3220 clear_tail(d, opr_sz, simd_maxsz(desc)); 3221 } 3222 3223 /* Reverse bits in each 8 bit word */ 3224 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3225 { 3226 intptr_t i, opr_sz = simd_oprsz(desc); 3227 uint64_t *d = vd, *n = vn; 3228 3229 for (i = 0; i < opr_sz / 8; ++i) { 3230 d[i] = revbit64(bswap64(n[i])); 3231 } 3232 clear_tail(d, opr_sz, simd_maxsz(desc)); 3233 } 3234 3235 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3236 { 3237 intptr_t i, opr_sz = simd_oprsz(desc); 3238 uint32_t *d = vd, *n = vn; 3239 3240 for (i = 0; i < opr_sz / 4; ++i) { 3241 d[i] = helper_recpe_u32(n[i]); 3242 } 3243 clear_tail(d, opr_sz, simd_maxsz(desc)); 3244 } 3245 3246 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3247 { 3248 intptr_t i, opr_sz = simd_oprsz(desc); 3249 uint32_t *d = vd, *n = vn; 3250 3251 for (i = 0; i < opr_sz / 4; ++i) { 3252 d[i] = helper_rsqrte_u32(n[i]); 3253 } 3254 clear_tail(d, opr_sz, simd_maxsz(desc)); 3255 } 3256