/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
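 *
 * Only the even-numbered predicate bits are significant for half-word
 * elements, so inputs with any odd bit set (i & 0xaa) are omitted; for
 * example, [0x15] (bits 0, 2 and 4) expands to 0x0000ffffffffffff.
 *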
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /*
     * Simplify similarly to do_sqrdmlah_b above:
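     * = ((a3 << 32) + ((e1 * e2) << 1) + (round << 31)) >> 32
     * = ((a3 << 31) + (e1 * e2) + (round << 30)) >> 31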
     */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
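 *
 * Each wide result element accumulates four adjacent narrow products,
 * e.g. for the byte forms:
 *   d[i] = a[i] + n[4i]*m[4i] + n[4i+1]*m[4i+1]
 *               + n[4i+2]*m[4i+2] + n[4i+3]*m[4i+3]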
 */

#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment. But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)];
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)];

        if (rot) {
            e3 = float16_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float16_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)];
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)];

        if (rot) {
            e3 = float32_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float32_maybe_ah_chs(e1, fpcr_ah);
        }

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    bool rot = extract32(desc, SIMD_DATA_SHIFT, 1);
    bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1);
    uintptr_t i;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1];
        float64 e2 = n[i + 1];
        float64 e3 = m[i];

        if (rot) {
            e3 = float64_maybe_ah_chs(e3, fpcr_ah);
        } else {
            e1 = float64_maybe_ah_chs(e1, fpcr_ah);
        }

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float16 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 15;
    negx_imag = (negf_imag & ~fpcr_ah) << 15;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ negx_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ negx_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], negf_real, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float32 negx_imag, negx_real;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (negf_real & ~fpcr_ah) << 31;
    negx_imag = (negf_imag & ~fpcr_ah) << 31;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ negx_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ negx_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], negf_real, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t fpcr_ah = extract32(desc, SIMD_DATA_SHIFT + 2, 1);
    uint32_t negf_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t negf_real = flip ^ negf_imag;
    float64 negx_real, negx_imag;
    uintptr_t i;

    /* With AH=0, use negx; with AH=1 use negf. */
    negx_real = (uint64_t)(negf_real & ~fpcr_ah) << 63;
    negx_imag = (uint64_t)(negf_imag & ~fpcr_ah) << 63;
    negf_real = (negf_real & fpcr_ah ? float_muladd_negate_product : 0);
    negf_imag = (negf_imag & fpcr_ah ? float_muladd_negate_product : 0);

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ negx_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ negx_imag;

        d[i] = float64_muladd(e2, e1, a[i], negf_real, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], negf_imag, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
}

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
{ \
    return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
}

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
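 *
 * The result is op1 * op1, with the sign of a non-NaN result taken
 * from bit 0 of op2 (negative when the bit is set).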
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/*
 * Reciprocal square-root step. AArch32 non-fused semantics.
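 *
 * Computes (3 - op1 * op2) / 2, with the special case that an infinity
 * times a zero yields 1.5 rather than a NaN.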
 */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16)
DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32)
DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64)

DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16)
DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32)
DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

static float16 float16_ah_mulsub_f(float16 dest, float16 op1, float16 op2,
                                   float_status *stat)
{
    return float16_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float32 float32_ah_mulsub_f(float32 dest, float32 op1, float32 op2,
                                   float_status *stat)
{
    return float32_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

static float64 float64_ah_mulsub_f(float64 dest, float64 op1, float64 op2,
                                   float_status *stat)
{
    return float64_muladd(op1, op2, dest, float_muladd_negate_product, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

DO_MULADD(gvec_ah_vfms_h, float16_ah_mulsub_f, float16)
DO_MULADD(gvec_ah_vfms_s, float32_ah_mulsub_f, float32)
DO_MULADD(gvec_ah_vfms_d, float64_ah_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
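 *
 * E.g. for 16-bit elements on a 256-bit SVE vector, the indexed element
 * is re-read at the start of each 16-byte segment, so lanes 0..7 use
 * the element from the first segment and lanes 8..15 the one from the
 * second.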
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
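 *
 * i.e. d[i] = ADD(d[i], MUL(n[i], m[idx])), with the multiply rounded
 * separately from the addition.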
1709 */ 1710 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1711 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1712 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1713 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1714 1715 #undef DO_FMUL_IDX 1716 1717 #define DO_FMLA_IDX(NAME, TYPE, H, NEGX, NEGF) \ 1718 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1719 float_status *stat, uint32_t desc) \ 1720 { \ 1721 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1722 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1723 intptr_t idx = simd_data(desc); \ 1724 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1725 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1726 TYPE mm = m[H(i + idx)]; \ 1727 for (j = 0; j < segment; j++) { \ 1728 d[i + j] = TYPE##_muladd(n[i + j] ^ NEGX, mm, \ 1729 a[i + j], NEGF, stat); \ 1730 } \ 1731 } \ 1732 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1733 } 1734 1735 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2, 0, 0) 1736 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4, 0, 0) 1737 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8, 0, 0) 1738 1739 DO_FMLA_IDX(gvec_fmls_idx_h, float16, H2, INT16_MIN, 0) 1740 DO_FMLA_IDX(gvec_fmls_idx_s, float32, H4, INT32_MIN, 0) 1741 DO_FMLA_IDX(gvec_fmls_idx_d, float64, H8, INT64_MIN, 0) 1742 1743 DO_FMLA_IDX(gvec_ah_fmls_idx_h, float16, H2, 0, float_muladd_negate_product) 1744 DO_FMLA_IDX(gvec_ah_fmls_idx_s, float32, H4, 0, float_muladd_negate_product) 1745 DO_FMLA_IDX(gvec_ah_fmls_idx_d, float64, H8, 0, float_muladd_negate_product) 1746 1747 #undef DO_FMLA_IDX 1748 1749 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1750 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1751 { \ 1752 intptr_t i, oprsz = simd_oprsz(desc); \ 1753 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1754 bool q = false; \ 1755 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1756 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1757 if (dd < MIN) { \ 1758 dd = MIN; \ 1759 q = true; \ 1760 } else if (dd > MAX) { \ 1761 dd = MAX; \ 1762 q = true; \ 1763 } \ 1764 d[i] = dd; \ 1765 } \ 1766 if (q) { \ 1767 uint32_t *qc = vq; \ 1768 qc[0] = 1; \ 1769 } \ 1770 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1771 } 1772 1773 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1774 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1775 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1776 1777 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1778 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1779 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1780 1781 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1782 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1783 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1784 1785 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1786 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1787 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1788 1789 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1790 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1791 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1792 1793 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1794 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1795 DO_SAT(gvec_suqadd_s, int64_t, 
int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1796 1797 #undef DO_SAT 1798 1799 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1800 void *vm, uint32_t desc) 1801 { 1802 intptr_t i, oprsz = simd_oprsz(desc); 1803 uint64_t *d = vd, *n = vn, *m = vm; 1804 bool q = false; 1805 1806 for (i = 0; i < oprsz / 8; i++) { 1807 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1808 if (dd < nn) { 1809 dd = UINT64_MAX; 1810 q = true; 1811 } 1812 d[i] = dd; 1813 } 1814 if (q) { 1815 uint32_t *qc = vq; 1816 qc[0] = 1; 1817 } 1818 clear_tail(d, oprsz, simd_maxsz(desc)); 1819 } 1820 1821 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1822 void *vm, uint32_t desc) 1823 { 1824 intptr_t i, oprsz = simd_oprsz(desc); 1825 uint64_t *d = vd, *n = vn, *m = vm; 1826 bool q = false; 1827 1828 for (i = 0; i < oprsz / 8; i++) { 1829 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1830 if (nn < mm) { 1831 dd = 0; 1832 q = true; 1833 } 1834 d[i] = dd; 1835 } 1836 if (q) { 1837 uint32_t *qc = vq; 1838 qc[0] = 1; 1839 } 1840 clear_tail(d, oprsz, simd_maxsz(desc)); 1841 } 1842 1843 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1844 void *vm, uint32_t desc) 1845 { 1846 intptr_t i, oprsz = simd_oprsz(desc); 1847 int64_t *d = vd, *n = vn, *m = vm; 1848 bool q = false; 1849 1850 for (i = 0; i < oprsz / 8; i++) { 1851 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1852 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1853 dd = (nn >> 63) ^ ~INT64_MIN; 1854 q = true; 1855 } 1856 d[i] = dd; 1857 } 1858 if (q) { 1859 uint32_t *qc = vq; 1860 qc[0] = 1; 1861 } 1862 clear_tail(d, oprsz, simd_maxsz(desc)); 1863 } 1864 1865 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1866 void *vm, uint32_t desc) 1867 { 1868 intptr_t i, oprsz = simd_oprsz(desc); 1869 int64_t *d = vd, *n = vn, *m = vm; 1870 bool q = false; 1871 1872 for (i = 0; i < oprsz / 8; i++) { 1873 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1874 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1875 dd = (nn >> 63) ^ ~INT64_MIN; 1876 q = true; 1877 } 1878 d[i] = dd; 1879 } 1880 if (q) { 1881 uint32_t *qc = vq; 1882 qc[0] = 1; 1883 } 1884 clear_tail(d, oprsz, simd_maxsz(desc)); 1885 } 1886 1887 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1888 void *vm, uint32_t desc) 1889 { 1890 intptr_t i, oprsz = simd_oprsz(desc); 1891 uint64_t *d = vd, *n = vn, *m = vm; 1892 bool q = false; 1893 1894 for (i = 0; i < oprsz / 8; i++) { 1895 uint64_t nn = n[i]; 1896 int64_t mm = m[i]; 1897 uint64_t dd = nn + mm; 1898 1899 if (mm < 0) { 1900 if (nn < (uint64_t)-mm) { 1901 dd = 0; 1902 q = true; 1903 } 1904 } else { 1905 if (dd < nn) { 1906 dd = UINT64_MAX; 1907 q = true; 1908 } 1909 } 1910 d[i] = dd; 1911 } 1912 if (q) { 1913 uint32_t *qc = vq; 1914 qc[0] = 1; 1915 } 1916 clear_tail(d, oprsz, simd_maxsz(desc)); 1917 } 1918 1919 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1920 void *vm, uint32_t desc) 1921 { 1922 intptr_t i, oprsz = simd_oprsz(desc); 1923 uint64_t *d = vd, *n = vn, *m = vm; 1924 bool q = false; 1925 1926 for (i = 0; i < oprsz / 8; i++) { 1927 int64_t nn = n[i]; 1928 uint64_t mm = m[i]; 1929 int64_t dd = nn + mm; 1930 1931 if (mm > (uint64_t)(INT64_MAX - nn)) { 1932 dd = INT64_MAX; 1933 q = true; 1934 } 1935 d[i] = dd; 1936 } 1937 if (q) { 1938 uint32_t *qc = vq; 1939 qc[0] = 1; 1940 } 1941 clear_tail(d, oprsz, simd_maxsz(desc)); 1942 } 1943 1944 #define DO_SRA(NAME, TYPE) \ 1945 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1946 { \ 1947 intptr_t i, oprsz = simd_oprsz(desc); \ 1948 int shift = simd_data(desc); \ 1949 TYPE *d = vd, *n 
= vn; \ 1950 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1951 d[i] += n[i] >> shift; \ 1952 } \ 1953 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1954 } 1955 1956 DO_SRA(gvec_ssra_b, int8_t) 1957 DO_SRA(gvec_ssra_h, int16_t) 1958 DO_SRA(gvec_ssra_s, int32_t) 1959 DO_SRA(gvec_ssra_d, int64_t) 1960 1961 DO_SRA(gvec_usra_b, uint8_t) 1962 DO_SRA(gvec_usra_h, uint16_t) 1963 DO_SRA(gvec_usra_s, uint32_t) 1964 DO_SRA(gvec_usra_d, uint64_t) 1965 1966 #undef DO_SRA 1967 1968 #define DO_RSHR(NAME, TYPE) \ 1969 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1970 { \ 1971 intptr_t i, oprsz = simd_oprsz(desc); \ 1972 int shift = simd_data(desc); \ 1973 TYPE *d = vd, *n = vn; \ 1974 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1975 TYPE tmp = n[i] >> (shift - 1); \ 1976 d[i] = (tmp >> 1) + (tmp & 1); \ 1977 } \ 1978 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1979 } 1980 1981 DO_RSHR(gvec_srshr_b, int8_t) 1982 DO_RSHR(gvec_srshr_h, int16_t) 1983 DO_RSHR(gvec_srshr_s, int32_t) 1984 DO_RSHR(gvec_srshr_d, int64_t) 1985 1986 DO_RSHR(gvec_urshr_b, uint8_t) 1987 DO_RSHR(gvec_urshr_h, uint16_t) 1988 DO_RSHR(gvec_urshr_s, uint32_t) 1989 DO_RSHR(gvec_urshr_d, uint64_t) 1990 1991 #undef DO_RSHR 1992 1993 #define DO_RSRA(NAME, TYPE) \ 1994 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1995 { \ 1996 intptr_t i, oprsz = simd_oprsz(desc); \ 1997 int shift = simd_data(desc); \ 1998 TYPE *d = vd, *n = vn; \ 1999 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2000 TYPE tmp = n[i] >> (shift - 1); \ 2001 d[i] += (tmp >> 1) + (tmp & 1); \ 2002 } \ 2003 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2004 } 2005 2006 DO_RSRA(gvec_srsra_b, int8_t) 2007 DO_RSRA(gvec_srsra_h, int16_t) 2008 DO_RSRA(gvec_srsra_s, int32_t) 2009 DO_RSRA(gvec_srsra_d, int64_t) 2010 2011 DO_RSRA(gvec_ursra_b, uint8_t) 2012 DO_RSRA(gvec_ursra_h, uint16_t) 2013 DO_RSRA(gvec_ursra_s, uint32_t) 2014 DO_RSRA(gvec_ursra_d, uint64_t) 2015 2016 #undef DO_RSRA 2017 2018 #define DO_SRI(NAME, TYPE) \ 2019 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2020 { \ 2021 intptr_t i, oprsz = simd_oprsz(desc); \ 2022 int shift = simd_data(desc); \ 2023 TYPE *d = vd, *n = vn; \ 2024 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2025 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 2026 } \ 2027 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2028 } 2029 2030 DO_SRI(gvec_sri_b, uint8_t) 2031 DO_SRI(gvec_sri_h, uint16_t) 2032 DO_SRI(gvec_sri_s, uint32_t) 2033 DO_SRI(gvec_sri_d, uint64_t) 2034 2035 #undef DO_SRI 2036 2037 #define DO_SLI(NAME, TYPE) \ 2038 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2039 { \ 2040 intptr_t i, oprsz = simd_oprsz(desc); \ 2041 int shift = simd_data(desc); \ 2042 TYPE *d = vd, *n = vn; \ 2043 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2044 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 2045 } \ 2046 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2047 } 2048 2049 DO_SLI(gvec_sli_b, uint8_t) 2050 DO_SLI(gvec_sli_h, uint16_t) 2051 DO_SLI(gvec_sli_s, uint32_t) 2052 DO_SLI(gvec_sli_d, uint64_t) 2053 2054 #undef DO_SLI 2055 2056 /* 2057 * Convert float16 to float32, raising no exceptions and 2058 * preserving exceptional values, including SNaN. 2059 * This is effectively an unpack+repack operation. 
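 *
 * A worked example (illustrative only, not in the original source):
 * f16 0x3c00 (1.0) has sign 0, exp 15, frac 0; the bias adjustment
 * adds f32_bias - f16_bias = 112 to give exp 127, and the fields are
 * repacked as (sign << 31) | (exp << 23) | (frac << 13) = 0x3f800000,
 * i.e. 1.0 as float32.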
2060 */ 2061 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 2062 { 2063 const int f16_bias = 15; 2064 const int f32_bias = 127; 2065 uint32_t sign = extract32(f16, 15, 1); 2066 uint32_t exp = extract32(f16, 10, 5); 2067 uint32_t frac = extract32(f16, 0, 10); 2068 2069 if (exp == 0x1f) { 2070 /* Inf or NaN */ 2071 exp = 0xff; 2072 } else if (exp == 0) { 2073 /* Zero or denormal. */ 2074 if (frac != 0) { 2075 if (fz16) { 2076 frac = 0; 2077 } else { 2078 /* 2079 * Denormal; these are all normal float32. 2080 * Shift the fraction so that the msb is at bit 11, 2081 * then remove bit 11 as the implicit bit of the 2082 * normalized float32. Note that we still go through 2083 * the shift for normal numbers below, to put the 2084 * float32 fraction at the right place. 2085 */ 2086 int shift = clz32(frac) - 21; 2087 frac = (frac << shift) & 0x3ff; 2088 exp = f32_bias - f16_bias - shift + 1; 2089 } 2090 } 2091 } else { 2092 /* Normal number; adjust the bias. */ 2093 exp += f32_bias - f16_bias; 2094 } 2095 sign <<= 31; 2096 exp <<= 23; 2097 frac <<= 23 - 10; 2098 2099 return sign | exp | frac; 2100 } 2101 2102 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 2103 { 2104 /* 2105 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 2106 * Load the 2nd qword iff is_q & is_2. 2107 * Shift to the 2nd dword iff !is_q & is_2. 2108 * For !is_q & !is_2, the upper bits of the result are garbage. 2109 */ 2110 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 2111 } 2112 2113 /* 2114 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 2115 * as there are not yet SVE versions that might use blocking. 2116 */ 2117 2118 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 2119 uint32_t desc, bool fz16) 2120 { 2121 intptr_t i, oprsz = simd_oprsz(desc); 2122 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2123 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2124 int is_q = oprsz == 16; 2125 uint64_t n_4, m_4; 2126 2127 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2128 n_4 = load4_f16(vn, is_q, is_2); 2129 m_4 = load4_f16(vm, is_q, is_2); 2130 2131 /* Negate all inputs for FMLSL at once.
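 * (Explanatory note, not in the original source: XOR-ing with
 * 0x8000800080008000 flips bit 15, the sign bit, of each of the four
 * packed float16 lanes, negating all of them in one 64-bit operation.)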
*/ 2132 if (is_s) { 2133 n_4 ^= 0x8000800080008000ull; 2134 } 2135 2136 for (i = 0; i < oprsz / 4; i++) { 2137 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2138 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2139 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2140 } 2141 clear_tail(d, oprsz, simd_maxsz(desc)); 2142 } 2143 2144 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2145 CPUARMState *env, uint32_t desc) 2146 { 2147 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2148 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2149 } 2150 2151 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2152 CPUARMState *env, uint32_t desc) 2153 { 2154 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2155 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2156 } 2157 2158 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2159 CPUARMState *env, uint32_t desc) 2160 { 2161 intptr_t i, oprsz = simd_oprsz(desc); 2162 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2163 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2164 float_status *status = &env->vfp.fp_status_a64; 2165 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2166 2167 for (i = 0; i < oprsz; i += sizeof(float32)) { 2168 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2169 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2170 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2171 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2172 float32 aa = *(float32 *)(va + H1_4(i)); 2173 2174 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2175 } 2176 } 2177 2178 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2179 uint32_t desc, bool fz16) 2180 { 2181 intptr_t i, oprsz = simd_oprsz(desc); 2182 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2183 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2184 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2185 int is_q = oprsz == 16; 2186 uint64_t n_4; 2187 float32 m_1; 2188 2189 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2190 n_4 = load4_f16(vn, is_q, is_2); 2191 2192 /* Negate all inputs for FMLSL at once. 
*/ 2193 if (is_s) { 2194 n_4 ^= 0x8000800080008000ull; 2195 } 2196 2197 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2198 2199 for (i = 0; i < oprsz / 4; i++) { 2200 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2201 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2202 } 2203 clear_tail(d, oprsz, simd_maxsz(desc)); 2204 } 2205 2206 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2207 CPUARMState *env, uint32_t desc) 2208 { 2209 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2210 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2211 } 2212 2213 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2214 CPUARMState *env, uint32_t desc) 2215 { 2216 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2217 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2218 } 2219 2220 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2221 CPUARMState *env, uint32_t desc) 2222 { 2223 intptr_t i, j, oprsz = simd_oprsz(desc); 2224 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2225 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2226 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2227 float_status *status = &env->vfp.fp_status_a64; 2228 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2229 2230 for (i = 0; i < oprsz; i += 16) { 2231 float16 mm_16 = *(float16 *)(vm + i + idx); 2232 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2233 2234 for (j = 0; j < 16; j += sizeof(float32)) { 2235 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2236 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2237 float32 aa = *(float32 *)(va + H1_4(i + j)); 2238 2239 *(float32 *)(vd + H1_4(i + j)) = 2240 float32_muladd(nn, mm, aa, 0, status); 2241 } 2242 } 2243 } 2244 2245 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2246 { 2247 intptr_t i, opr_sz = simd_oprsz(desc); 2248 int8_t *d = vd, *n = vn, *m = vm; 2249 2250 for (i = 0; i < opr_sz; ++i) { 2251 int8_t mm = m[i]; 2252 int8_t nn = n[i]; 2253 int8_t res = 0; 2254 if (mm >= 0) { 2255 if (mm < 8) { 2256 res = nn << mm; 2257 } 2258 } else { 2259 res = nn >> (mm > -8 ? -mm : 7); 2260 } 2261 d[i] = res; 2262 } 2263 clear_tail(d, opr_sz, simd_maxsz(desc)); 2264 } 2265 2266 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2267 { 2268 intptr_t i, opr_sz = simd_oprsz(desc); 2269 int16_t *d = vd, *n = vn, *m = vm; 2270 2271 for (i = 0; i < opr_sz / 2; ++i) { 2272 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2273 int16_t nn = n[i]; 2274 int16_t res = 0; 2275 if (mm >= 0) { 2276 if (mm < 16) { 2277 res = nn << mm; 2278 } 2279 } else { 2280 res = nn >> (mm > -16 ? 
-mm : 15); 2281 } 2282 d[i] = res; 2283 } 2284 clear_tail(d, opr_sz, simd_maxsz(desc)); 2285 } 2286 2287 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2288 { 2289 intptr_t i, opr_sz = simd_oprsz(desc); 2290 uint8_t *d = vd, *n = vn, *m = vm; 2291 2292 for (i = 0; i < opr_sz; ++i) { 2293 int8_t mm = m[i]; 2294 uint8_t nn = n[i]; 2295 uint8_t res = 0; 2296 if (mm >= 0) { 2297 if (mm < 8) { 2298 res = nn << mm; 2299 } 2300 } else { 2301 if (mm > -8) { 2302 res = nn >> -mm; 2303 } 2304 } 2305 d[i] = res; 2306 } 2307 clear_tail(d, opr_sz, simd_maxsz(desc)); 2308 } 2309 2310 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2311 { 2312 intptr_t i, opr_sz = simd_oprsz(desc); 2313 uint16_t *d = vd, *n = vn, *m = vm; 2314 2315 for (i = 0; i < opr_sz / 2; ++i) { 2316 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2317 uint16_t nn = n[i]; 2318 uint16_t res = 0; 2319 if (mm >= 0) { 2320 if (mm < 16) { 2321 res = nn << mm; 2322 } 2323 } else { 2324 if (mm > -16) { 2325 res = nn >> -mm; 2326 } 2327 } 2328 d[i] = res; 2329 } 2330 clear_tail(d, opr_sz, simd_maxsz(desc)); 2331 } 2332 2333 /* 2334 * 8x8->8 polynomial multiply. 2335 * 2336 * Polynomial multiplication is like integer multiplication except the 2337 * partial products are XORed, not added. 2338 * 2339 * TODO: expose this as a generic vector operation, as it is a common 2340 * crypto building block. 2341 */ 2342 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2343 { 2344 intptr_t i, opr_sz = simd_oprsz(desc); 2345 uint64_t *d = vd, *n = vn, *m = vm; 2346 2347 for (i = 0; i < opr_sz / 8; ++i) { 2348 d[i] = clmul_8x8_low(n[i], m[i]); 2349 } 2350 clear_tail(d, opr_sz, simd_maxsz(desc)); 2351 } 2352 2353 /* 2354 * 64x64->128 polynomial multiply. 2355 * Because the lanes are not accessed in strict columns, 2356 * this probably cannot be turned into a generic helper.
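 *
 * As a tiny worked example of carryless multiplication (illustrative,
 * not in the original source): clmul(0b0110, 0b0101) XORs the shifted
 * partial products 0b0110 and 0b0110 << 2 to give 0b11110; no carries
 * propagate between bit positions, which is what clmul_8x8_low() above
 * and clmul_64() below compute per lane.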
2357 */ 2358 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2359 { 2360 intptr_t i, opr_sz = simd_oprsz(desc); 2361 intptr_t hi = simd_data(desc); 2362 uint64_t *d = vd, *n = vn, *m = vm; 2363 2364 for (i = 0; i < opr_sz / 8; i += 2) { 2365 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2366 d[i] = int128_getlo(r); 2367 d[i + 1] = int128_gethi(r); 2368 } 2369 clear_tail(d, opr_sz, simd_maxsz(desc)); 2370 } 2371 2372 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2373 { 2374 int hi = simd_data(desc); 2375 uint64_t *d = vd, *n = vn, *m = vm; 2376 uint64_t nn = n[hi], mm = m[hi]; 2377 2378 d[0] = clmul_8x4_packed(nn, mm); 2379 nn >>= 32; 2380 mm >>= 32; 2381 d[1] = clmul_8x4_packed(nn, mm); 2382 2383 clear_tail(d, 16, simd_maxsz(desc)); 2384 } 2385 2386 #ifdef TARGET_AARCH64 2387 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2388 { 2389 int shift = simd_data(desc) * 8; 2390 intptr_t i, opr_sz = simd_oprsz(desc); 2391 uint64_t *d = vd, *n = vn, *m = vm; 2392 2393 for (i = 0; i < opr_sz / 8; ++i) { 2394 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2395 } 2396 } 2397 2398 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2399 { 2400 intptr_t sel = H4(simd_data(desc)); 2401 intptr_t i, opr_sz = simd_oprsz(desc); 2402 uint32_t *n = vn, *m = vm; 2403 uint64_t *d = vd; 2404 2405 for (i = 0; i < opr_sz / 8; ++i) { 2406 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2407 } 2408 } 2409 #endif 2410 2411 #define DO_CMP0(NAME, TYPE, OP) \ 2412 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2413 { \ 2414 intptr_t i, opr_sz = simd_oprsz(desc); \ 2415 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2416 TYPE nn = *(TYPE *)(vn + i); \ 2417 *(TYPE *)(vd + i) = -(nn OP 0); \ 2418 } \ 2419 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2420 } 2421 2422 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2423 DO_CMP0(gvec_clt0_b, int8_t, <) 2424 DO_CMP0(gvec_cle0_b, int8_t, <=) 2425 DO_CMP0(gvec_cgt0_b, int8_t, >) 2426 DO_CMP0(gvec_cge0_b, int8_t, >=) 2427 2428 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2429 DO_CMP0(gvec_clt0_h, int16_t, <) 2430 DO_CMP0(gvec_cle0_h, int16_t, <=) 2431 DO_CMP0(gvec_cgt0_h, int16_t, >) 2432 DO_CMP0(gvec_cge0_h, int16_t, >=) 2433 2434 #undef DO_CMP0 2435 2436 #define DO_ABD(NAME, TYPE) \ 2437 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2438 { \ 2439 intptr_t i, opr_sz = simd_oprsz(desc); \ 2440 TYPE *d = vd, *n = vn, *m = vm; \ 2441 \ 2442 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2443 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2444 } \ 2445 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2446 } 2447 2448 DO_ABD(gvec_sabd_b, int8_t) 2449 DO_ABD(gvec_sabd_h, int16_t) 2450 DO_ABD(gvec_sabd_s, int32_t) 2451 DO_ABD(gvec_sabd_d, int64_t) 2452 2453 DO_ABD(gvec_uabd_b, uint8_t) 2454 DO_ABD(gvec_uabd_h, uint16_t) 2455 DO_ABD(gvec_uabd_s, uint32_t) 2456 DO_ABD(gvec_uabd_d, uint64_t) 2457 2458 #undef DO_ABD 2459 2460 #define DO_ABA(NAME, TYPE) \ 2461 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2462 { \ 2463 intptr_t i, opr_sz = simd_oprsz(desc); \ 2464 TYPE *d = vd, *n = vn, *m = vm; \ 2465 \ 2466 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2467 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2468 } \ 2469 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2470 } 2471 2472 DO_ABA(gvec_saba_b, int8_t) 2473 DO_ABA(gvec_saba_h, int16_t) 2474 DO_ABA(gvec_saba_s, int32_t) 2475 DO_ABA(gvec_saba_d, int64_t) 2476 2477 DO_ABA(gvec_uaba_b, uint8_t) 2478 DO_ABA(gvec_uaba_h, uint16_t) 2479 DO_ABA(gvec_uaba_s, uint32_t) 2480 DO_ABA(gvec_uaba_d, uint64_t) 2481 2482 #undef DO_ABA 2483 2484 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2485 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2486 float_status *stat, uint32_t desc) \ 2487 { \ 2488 ARMVectorReg scratch; \ 2489 intptr_t oprsz = simd_oprsz(desc); \ 2490 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2491 TYPE *d = vd, *n = vn, *m = vm; \ 2492 if (unlikely(d == m)) { \ 2493 m = memcpy(&scratch, m, oprsz); \ 2494 } \ 2495 for (intptr_t i = 0; i < half; ++i) { \ 2496 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2497 } \ 2498 for (intptr_t i = 0; i < half; ++i) { \ 2499 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2500 } \ 2501 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2502 } 2503 2504 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2505 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2506 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2507 2508 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2509 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2510 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2511 2512 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2513 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2514 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2515 2516 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2517 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2518 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2519 2520 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2521 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2522 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2523 2524 #ifdef TARGET_AARCH64 2525 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2526 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2527 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2528 2529 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2530 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2531 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2532 #endif 2533 2534 #undef DO_3OP_PAIR 2535 2536 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2537 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2538 { \ 2539 ARMVectorReg scratch; \ 2540 intptr_t oprsz = simd_oprsz(desc); \ 2541 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2542 TYPE *d = vd, *n = vn, *m = vm; \ 2543 if (unlikely(d == m)) { \ 2544 m = memcpy(&scratch, m, oprsz); \ 2545 } \ 2546 for (intptr_t i = 0; i < half; ++i) { \ 2547 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2548 } \ 2549 for (intptr_t i = 0; i < half; ++i) { \ 2550 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2551 } \ 2552 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2553 } 2554 2555 #define ADD(A, B) (A + B) 2556 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2557 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2558 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2559 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2560 #undef ADD 2561 2562 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2563 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2564 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2565 2566 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2567 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2568 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2569 2570 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2571 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2572 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2573 2574 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2575 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2576 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2577 2578 #undef DO_3OP_PAIR 2579 2580 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2581 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2582 { \ 2583 intptr_t i, oprsz = simd_oprsz(desc); \ 2584 int shift = simd_data(desc); \ 2585 TYPE *d = vd, *n = vn; \ 2586 float_status *fpst = stat; \ 2587 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2588 d[i] = FUNC(n[i], shift, fpst); \ 2589 } \ 2590 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2591 } 2592 2593 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2594 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2595 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2596 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2597 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2598 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2599 2600 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2601 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2602 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2603 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2604 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2605 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2606 2607 #undef DO_VCVT_FIXED 2608 2609 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2610 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2611 { \ 2612 intptr_t i, oprsz = simd_oprsz(desc); \ 2613 uint32_t rmode = simd_data(desc); \ 2614 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2615 TYPE *d = vd, *n = vn; \ 2616 set_float_rounding_mode(rmode, fpst); \ 2617 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2618 d[i] = FUNC(n[i], 0, fpst); \ 2619 } \ 2620 set_float_rounding_mode(prev_rmode, fpst); \ 2621 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2622 } 2623 2624 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2625 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2626 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2627 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2628 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2629 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2630 2631 #undef DO_VCVT_RMODE 2632 2633 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2634 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2635 { \ 2636 intptr_t i, oprsz = simd_oprsz(desc); \ 2637 uint32_t rmode = simd_data(desc); \ 2638 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2639 TYPE *d = vd, *n = vn; \ 2640 set_float_rounding_mode(rmode, fpst); \ 2641 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2642 d[i] = FUNC(n[i], fpst); \ 2643 } \ 2644 set_float_rounding_mode(prev_rmode, fpst); \ 2645 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2646 } 2647 2648 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2649 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2650 2651 #undef DO_VRINT_RMODE 2652 2653 #ifdef TARGET_AARCH64 2654 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2655 { 2656 const uint8_t *indices = vm; 2657 size_t oprsz = simd_oprsz(desc); 2658 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2659 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2660 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2661 union { 2662 uint8_t b[16]; 2663 uint64_t d[2]; 2664 } result; 2665 2666 /* 2667 * We must construct the final result in a temp, lest the output 2668 * overlaps the input table. For TBL, begin with zero; for TBX, 2669 * begin with the original register contents. Note that we always 2670 * copy 16 bytes here to avoid an extra branch; clearing the high 2671 * bits of the register for oprsz == 8 is handled below. 2672 */ 2673 if (is_tbx) { 2674 memcpy(&result, vd, 16); 2675 } else { 2676 memset(&result, 0, 16); 2677 } 2678 2679 for (size_t i = 0; i < oprsz; ++i) { 2680 uint32_t index = indices[H1(i)]; 2681 2682 if (index < table_len) { 2683 /* 2684 * Convert index (a byte offset into the virtual table 2685 * which is a series of 128-bit vectors concatenated) 2686 * into the correct register element, bearing in mind 2687 * that the table can wrap around from V31 to V0. 2688 */ 2689 const uint8_t *table = (const uint8_t *) 2690 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2691 result.b[H1(i)] = table[H1(index % 16)]; 2692 } 2693 } 2694 2695 memcpy(vd, &result, 16); 2696 clear_tail(vd, oprsz, simd_maxsz(desc)); 2697 } 2698 #endif 2699 2700 /* 2701 * NxN -> N highpart multiply 2702 * 2703 * TODO: expose this as a generic vector operation. 2704 */ 2705 2706 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2707 { 2708 intptr_t i, opr_sz = simd_oprsz(desc); 2709 int8_t *d = vd, *n = vn, *m = vm; 2710 2711 for (i = 0; i < opr_sz; ++i) { 2712 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2713 } 2714 clear_tail(d, opr_sz, simd_maxsz(desc)); 2715 } 2716 2717 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2718 { 2719 intptr_t i, opr_sz = simd_oprsz(desc); 2720 int16_t *d = vd, *n = vn, *m = vm; 2721 2722 for (i = 0; i < opr_sz / 2; ++i) { 2723 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2724 } 2725 clear_tail(d, opr_sz, simd_maxsz(desc)); 2726 } 2727 2728 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2729 { 2730 intptr_t i, opr_sz = simd_oprsz(desc); 2731 int32_t *d = vd, *n = vn, *m = vm; 2732 2733 for (i = 0; i < opr_sz / 4; ++i) { 2734 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2735 } 2736 clear_tail(d, opr_sz, simd_maxsz(desc)); 2737 } 2738 2739 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2740 { 2741 intptr_t i, opr_sz = simd_oprsz(desc); 2742 uint64_t *d = vd, *n = vn, *m = vm; 2743 uint64_t discard; 2744 2745 for (i = 0; i < opr_sz / 8; ++i) { 2746 muls64(&discard, &d[i], n[i], m[i]); 2747 } 2748 clear_tail(d, opr_sz, simd_maxsz(desc)); 2749 } 2750 2751 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2752 { 2753 intptr_t i, opr_sz = simd_oprsz(desc); 2754 uint8_t *d = vd, *n = vn, *m = vm; 2755 2756 for (i = 0; i < opr_sz; ++i) { 2757 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2758 } 2759 clear_tail(d, opr_sz, simd_maxsz(desc)); 2760 } 2761 2762 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2763 { 2764 intptr_t i, opr_sz = simd_oprsz(desc); 2765 uint16_t *d = vd, *n = vn, *m = vm; 2766 2767 for (i = 0; i < opr_sz / 2; ++i) { 2768 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2769 } 2770 clear_tail(d, opr_sz, simd_maxsz(desc)); 2771 } 2772 2773 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2774 
{ 2775 intptr_t i, opr_sz = simd_oprsz(desc); 2776 uint32_t *d = vd, *n = vn, *m = vm; 2777 2778 for (i = 0; i < opr_sz / 4; ++i) { 2779 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2780 } 2781 clear_tail(d, opr_sz, simd_maxsz(desc)); 2782 } 2783 2784 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2785 { 2786 intptr_t i, opr_sz = simd_oprsz(desc); 2787 uint64_t *d = vd, *n = vn, *m = vm; 2788 uint64_t discard; 2789 2790 for (i = 0; i < opr_sz / 8; ++i) { 2791 mulu64(&discard, &d[i], n[i], m[i]); 2792 } 2793 clear_tail(d, opr_sz, simd_maxsz(desc)); 2794 } 2795 2796 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2797 { 2798 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2799 int shr = simd_data(desc); 2800 uint64_t *d = vd, *n = vn, *m = vm; 2801 2802 for (i = 0; i < opr_sz; ++i) { 2803 d[i] = ror64(n[i] ^ m[i], shr); 2804 } 2805 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2806 } 2807 2808 /* 2809 * Integer matrix-multiply accumulate 2810 */ 2811 2812 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2813 { 2814 int8_t *n = vn, *m = vm; 2815 2816 for (intptr_t k = 0; k < 8; ++k) { 2817 sum += n[H1(k)] * m[H1(k)]; 2818 } 2819 return sum; 2820 } 2821 2822 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2823 { 2824 uint8_t *n = vn, *m = vm; 2825 2826 for (intptr_t k = 0; k < 8; ++k) { 2827 sum += n[H1(k)] * m[H1(k)]; 2828 } 2829 return sum; 2830 } 2831 2832 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2833 { 2834 uint8_t *n = vn; 2835 int8_t *m = vm; 2836 2837 for (intptr_t k = 0; k < 8; ++k) { 2838 sum += n[H1(k)] * m[H1(k)]; 2839 } 2840 return sum; 2841 } 2842 2843 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2844 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2845 { 2846 intptr_t seg, opr_sz = simd_oprsz(desc); 2847 2848 for (seg = 0; seg < opr_sz; seg += 16) { 2849 uint32_t *d = vd + seg; 2850 uint32_t *a = va + seg; 2851 uint32_t sum0, sum1, sum2, sum3; 2852 2853 /* 2854 * Process the entire segment at once, writing back the 2855 * results only after we've consumed all of the inputs. 2856 * 2857 * Key to indices by column: 2858 * i j i j 2859 */ 2860 sum0 = a[H4(0 + 0)]; 2861 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2862 sum1 = a[H4(0 + 1)]; 2863 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2864 sum2 = a[H4(2 + 0)]; 2865 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2866 sum3 = a[H4(2 + 1)]; 2867 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2868 2869 d[H4(0)] = sum0; 2870 d[H4(1)] = sum1; 2871 d[H4(2)] = sum2; 2872 d[H4(3)] = sum3; 2873 } 2874 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2875 } 2876 2877 #define DO_MMLA_B(NAME, INNER) \ 2878 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2879 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2880 2881 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2882 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2883 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2884 2885 /* 2886 * BFloat16 Dot Product 2887 */ 2888 2889 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2890 { 2891 /* 2892 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2893 * For EBF = 0, we ignore the FPCR bits which determine rounding 2894 * mode and denormal-flushing, and we do unfused multiplies and 2895 * additions with intermediate rounding of all products and sums. 
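 * (Explanatory aside, not in the original source: the EBF = 0 case
 * matches bfdotadd() below, where two float32_mul results are combined
 * by two float32_add steps, each individually rounded.)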
2896 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2897 * and we perform a fused two-way sum-of-products without intermediate 2898 * rounding of the products. 2899 * In either case, we don't set fp exception flags. 2900 * 2901 * EBF is AArch64 only, so even if it's set in the FPCR it has 2902 * no effect on AArch32 instructions. 2903 */ 2904 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2905 2906 *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2907 set_default_nan_mode(true, statusp); 2908 2909 if (ebf) { 2910 /* EBF=1 needs to do a step with round-to-odd semantics */ 2911 *oddstatusp = *statusp; 2912 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2913 } else { 2914 set_flush_to_zero(true, statusp); 2915 set_flush_inputs_to_zero(true, statusp); 2916 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2917 } 2918 return ebf; 2919 } 2920 2921 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2922 { 2923 float32 t1, t2; 2924 2925 /* 2926 * Extract each BFloat16 from the element pair, and shift 2927 * them such that they become float32. 2928 */ 2929 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2930 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2931 t1 = float32_add(t1, t2, fpst); 2932 t1 = float32_add(sum, t1, fpst); 2933 2934 return t1; 2935 } 2936 2937 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2938 float_status *fpst, float_status *fpst_odd) 2939 { 2940 /* 2941 * Compare f16_dotadd() in sme_helper.c, but here we have 2942 * bfloat16 inputs. In particular that means that we do not 2943 * want the FPCR.FZ16 flush semantics, so we use the normal 2944 * float_status for the input handling here. 2945 */ 2946 float64 e1r = float32_to_float64(e1 << 16, fpst); 2947 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2948 float64 e2r = float32_to_float64(e2 << 16, fpst); 2949 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2950 float64 t64; 2951 float32 t32; 2952 2953 /* 2954 * The ARM pseudocode function FPDot performs both multiplies 2955 * and the add with a single rounding operation. Emulate this 2956 * by performing the first multiply in round-to-odd, then doing 2957 * the second multiply as fused multiply-add, and rounding to 2958 * float32 all in one step. 2959 */ 2960 t64 = float64_mul(e1r, e2r, fpst_odd); 2961 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2962 2963 /* This conversion is exact, because we've already rounded. */ 2964 t32 = float64_to_float32(t64, fpst); 2965 2966 /* The final accumulation step is not fused. 
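 * That is, the value returned is sum + (e1r * e2r + e1c * e2c) with one
 * rounding for the two-way dot product computed above and a second,
 * separate rounding for this addition (explanatory note, not in the
 * original source).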
*/ 2967 return float32_add(sum, t32, fpst); 2968 } 2969 2970 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2971 CPUARMState *env, uint32_t desc) 2972 { 2973 intptr_t i, opr_sz = simd_oprsz(desc); 2974 float32 *d = vd, *a = va; 2975 uint32_t *n = vn, *m = vm; 2976 float_status fpst, fpst_odd; 2977 2978 if (is_ebf(env, &fpst, &fpst_odd)) { 2979 for (i = 0; i < opr_sz / 4; ++i) { 2980 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2981 } 2982 } else { 2983 for (i = 0; i < opr_sz / 4; ++i) { 2984 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2985 } 2986 } 2987 clear_tail(d, opr_sz, simd_maxsz(desc)); 2988 } 2989 2990 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2991 void *va, CPUARMState *env, uint32_t desc) 2992 { 2993 intptr_t i, j, opr_sz = simd_oprsz(desc); 2994 intptr_t index = simd_data(desc); 2995 intptr_t elements = opr_sz / 4; 2996 intptr_t eltspersegment = MIN(16 / 4, elements); 2997 float32 *d = vd, *a = va; 2998 uint32_t *n = vn, *m = vm; 2999 float_status fpst, fpst_odd; 3000 3001 if (is_ebf(env, &fpst, &fpst_odd)) { 3002 for (i = 0; i < elements; i += eltspersegment) { 3003 uint32_t m_idx = m[i + H4(index)]; 3004 3005 for (j = i; j < i + eltspersegment; j++) { 3006 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 3007 } 3008 } 3009 } else { 3010 for (i = 0; i < elements; i += eltspersegment) { 3011 uint32_t m_idx = m[i + H4(index)]; 3012 3013 for (j = i; j < i + eltspersegment; j++) { 3014 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 3015 } 3016 } 3017 } 3018 clear_tail(d, opr_sz, simd_maxsz(desc)); 3019 } 3020 3021 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 3022 CPUARMState *env, uint32_t desc) 3023 { 3024 intptr_t s, opr_sz = simd_oprsz(desc); 3025 float32 *d = vd, *a = va; 3026 uint32_t *n = vn, *m = vm; 3027 float_status fpst, fpst_odd; 3028 3029 if (is_ebf(env, &fpst, &fpst_odd)) { 3030 for (s = 0; s < opr_sz / 4; s += 4) { 3031 float32 sum00, sum01, sum10, sum11; 3032 3033 /* 3034 * Process the entire segment at once, writing back the 3035 * results only after we've consumed all of the inputs. 3036 * 3037 * Key to indices by column: 3038 * i j i k j k 3039 */ 3040 sum00 = a[s + H4(0 + 0)]; 3041 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3042 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3043 3044 sum01 = a[s + H4(0 + 1)]; 3045 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3046 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3047 3048 sum10 = a[s + H4(2 + 0)]; 3049 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 3050 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 3051 3052 sum11 = a[s + H4(2 + 1)]; 3053 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3054 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3055 3056 d[s + H4(0 + 0)] = sum00; 3057 d[s + H4(0 + 1)] = sum01; 3058 d[s + H4(2 + 0)] = sum10; 3059 d[s + H4(2 + 1)] = sum11; 3060 } 3061 } else { 3062 for (s = 0; s < opr_sz / 4; s += 4) { 3063 float32 sum00, sum01, sum10, sum11; 3064 3065 /* 3066 * Process the entire segment at once, writing back the 3067 * results only after we've consumed all of the inputs. 
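 * (Explanatory aside, not in the original source: each 16-byte segment
 * holds a 2x2 tile of float32 sums, and sum_ij accumulates the dot
 * product of row i of N with row j of M, each row being four bfloat16
 * values packed into two uint32_t elements.)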
3068 * 3069 * Key to indices by column: 3070 * i j i k j k 3071 */ 3072 sum00 = a[s + H4(0 + 0)]; 3073 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3074 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3075 3076 sum01 = a[s + H4(0 + 1)]; 3077 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3078 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3079 3080 sum10 = a[s + H4(2 + 0)]; 3081 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3082 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3083 3084 sum11 = a[s + H4(2 + 1)]; 3085 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3086 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3087 3088 d[s + H4(0 + 0)] = sum00; 3089 d[s + H4(0 + 1)] = sum01; 3090 d[s + H4(2 + 0)] = sum10; 3091 d[s + H4(2 + 1)] = sum11; 3092 } 3093 } 3094 clear_tail(d, opr_sz, simd_maxsz(desc)); 3095 } 3096 3097 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3098 float_status *stat, uint32_t desc) 3099 { 3100 intptr_t i, opr_sz = simd_oprsz(desc); 3101 intptr_t sel = simd_data(desc); 3102 float32 *d = vd, *a = va; 3103 bfloat16 *n = vn, *m = vm; 3104 3105 for (i = 0; i < opr_sz / 4; ++i) { 3106 float32 nn = n[H2(i * 2 + sel)] << 16; 3107 float32 mm = m[H2(i * 2 + sel)] << 16; 3108 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3109 } 3110 clear_tail(d, opr_sz, simd_maxsz(desc)); 3111 } 3112 3113 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3114 void *va, float_status *stat, uint32_t desc) 3115 { 3116 intptr_t i, j, opr_sz = simd_oprsz(desc); 3117 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3118 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3119 intptr_t elements = opr_sz / 4; 3120 intptr_t eltspersegment = MIN(16 / 4, elements); 3121 float32 *d = vd, *a = va; 3122 bfloat16 *n = vn, *m = vm; 3123 3124 for (i = 0; i < elements; i += eltspersegment) { 3125 float32 m_idx = m[H2(2 * i + index)] << 16; 3126 3127 for (j = i; j < i + eltspersegment; j++) { 3128 float32 n_j = n[H2(2 * j + sel)] << 16; 3129 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3130 } 3131 } 3132 clear_tail(d, opr_sz, simd_maxsz(desc)); 3133 } 3134 3135 #define DO_CLAMP(NAME, TYPE) \ 3136 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3137 { \ 3138 intptr_t i, opr_sz = simd_oprsz(desc); \ 3139 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3140 TYPE aa = *(TYPE *)(a + i); \ 3141 TYPE nn = *(TYPE *)(n + i); \ 3142 TYPE mm = *(TYPE *)(m + i); \ 3143 TYPE dd = MIN(MAX(aa, nn), mm); \ 3144 *(TYPE *)(d + i) = dd; \ 3145 } \ 3146 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3147 } 3148 3149 DO_CLAMP(gvec_sclamp_b, int8_t) 3150 DO_CLAMP(gvec_sclamp_h, int16_t) 3151 DO_CLAMP(gvec_sclamp_s, int32_t) 3152 DO_CLAMP(gvec_sclamp_d, int64_t) 3153 3154 DO_CLAMP(gvec_uclamp_b, uint8_t) 3155 DO_CLAMP(gvec_uclamp_h, uint16_t) 3156 DO_CLAMP(gvec_uclamp_s, uint32_t) 3157 DO_CLAMP(gvec_uclamp_d, uint64_t) 3158 3159 /* Bit count in each 8-bit word. 
*/ 3160 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3161 { 3162 intptr_t i, opr_sz = simd_oprsz(desc); 3163 uint8_t *d = vd, *n = vn; 3164 3165 for (i = 0; i < opr_sz; ++i) { 3166 d[i] = ctpop8(n[i]); 3167 } 3168 clear_tail(d, opr_sz, simd_maxsz(desc)); 3169 } 3170 3171 /* Reverse bits in each 8 bit word */ 3172 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3173 { 3174 intptr_t i, opr_sz = simd_oprsz(desc); 3175 uint64_t *d = vd, *n = vn; 3176 3177 for (i = 0; i < opr_sz / 8; ++i) { 3178 d[i] = revbit64(bswap64(n[i])); 3179 } 3180 clear_tail(d, opr_sz, simd_maxsz(desc)); 3181 } 3182 3183 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3184 { 3185 intptr_t i, opr_sz = simd_oprsz(desc); 3186 uint32_t *d = vd, *n = vn; 3187 3188 for (i = 0; i < opr_sz / 4; ++i) { 3189 d[i] = helper_recpe_u32(n[i]); 3190 } 3191 clear_tail(d, opr_sz, simd_maxsz(desc)); 3192 } 3193 3194 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3195 { 3196 intptr_t i, opr_sz = simd_oprsz(desc); 3197 uint32_t *d = vd, *n = vn; 3198 3199 for (i = 0; i < opr_sz / 4; ++i) { 3200 d[i] = helper_rsqrte_u32(n[i]); 3201 } 3202 clear_tail(d, opr_sz, simd_maxsz(desc)); 3203 } 3204
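
/*
 * Illustrative note (not part of the original source): gvec_rbit_b above
 * reverses the bits within each byte by composing two whole-register
 * reversals: bswap64() reverses the byte order and revbit64() then
 * reverses all 64 bits, so every byte lands back in its original
 * position with its bits reversed.  A hypothetical per-byte reference
 * implementation, for comparison only:
 *
 *   static uint8_t rbit8_ref(uint8_t x)
 *   {
 *       uint8_t r = 0;
 *       for (int i = 0; i < 8; i++) {
 *           r = (r << 1) | ((x >> i) & 1);
 *       }
 *       return r;
 *   }
 */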