/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
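
/*
 * A predicate byte indexes these tables to give a 64-bit mask with all
 * bits of each active element set, e.g. expand_pred_b_data[0x05] is
 * 0x0000000000ff00ff (byte elements 0 and 2 active).
 */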

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}
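
/*
 * Worked example of the simplification above: with e1 = e2 = 0x40,
 * a3 = 0 and rounding, (64 * 64 + (1 << 6)) >> 7 == 32, which matches
 * ((64 * 64 * 2) + (1 << 7)) >> 8 from the unsimplified form.  The
 * classic saturating case is e1 = e2 = INT8_MIN with a3 = 0, where the
 * shifted result is 128 and is clamped to INT8_MAX.
 */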

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}

/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */
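
/*
 * Each widened result element d[i] accumulates the four products of the
 * corresponding narrow elements, n[4i..4i+3] * m[4i..4i+3], on top of
 * the accumulator a[i].
 */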
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m = vm; \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
        d[i] = (a[i] + \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
    } \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)

#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i = 0, opr_sz = simd_oprsz(desc); \
    intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
    /* \
     * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \
     * first iteration might not be a full 16 byte segment. But \
     * for vector lengths beyond that this must be SVE and we know \
     * opr_sz is a multiple of 16, so we need not clamp segend \
     * to opr_sz_n when we advance it at the end of the loop. \
     */ \
    intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
    intptr_t index = simd_data(desc); \
    TYPED *d = vd, *a = va; \
    TYPEN *n = vn; \
    TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
    do { \
        TYPED m0 = m_indexed[i * 4 + 0]; \
        TYPED m1 = m_indexed[i * 4 + 1]; \
        TYPED m2 = m_indexed[i * 4 + 2]; \
        TYPED m3 = m_indexed[i * 4 + 3]; \
        do { \
            d[i] = (a[i] + \
                    n[i * 4 + 0] * m0 + \
                    n[i * 4 + 1] * m1 + \
                    n[i * 4 + 2] * m2 + \
                    n[i * 4 + 3] * m3); \
        } while (++i < segend); \
        segend = i + (16 / sizeof(TYPED)); \
    } while (i < opr_sz_n); \
    clear_tail(d, opr_sz, simd_maxsz(desc)); \
}

DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
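
/*
 * Floating-point complex add: elements are processed as (real, imag)
 * pairs, and the single desc data bit selects the rotation (90 or 270
 * degrees) by choosing which operand of m is negated before the add.
 */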
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}
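
/*
 * Expand helpers that apply a unary FUNC to each element of Vn with the
 * supplied float_status, then clear any tail beyond the operation size.
 */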
#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */
static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat)
{
    float16 r = float16_sub(op1, op2, stat);
    return float16_is_any_nan(r) ? r : float16_abs(r);
}

static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat)
{
    float32 r = float32_sub(op1, op2, stat);
    return float32_is_any_nan(r) ? r : float32_abs(r);
}

static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat)
{
    float64 r = float64_sub(op1, op2, stat);
    return float64_is_any_nan(r) ? r : float64_abs(r);
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_two;
    }
    return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
}

static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_two;
    }
    return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
}

/* Reciprocal square-root step. AArch32 non-fused semantics. */
static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
{
    op1 = float16_squash_input_denormal(op1, stat);
    op2 = float16_squash_input_denormal(op2, stat);

    if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
        (float16_is_infinity(op2) && float16_is_zero(op1))) {
        return float16_one_point_five;
    }
    op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
    return float16_div(op1, float16_two, stat);
}

static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
{
    op1 = float32_squash_input_denormal(op1, stat);
    op2 = float32_squash_input_denormal(op2, stat);

    if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
        (float32_is_infinity(op2) && float32_is_zero(op1))) {
        return float32_one_point_five;
    }
    op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
    return float32_div(op1, float32_two, stat);
}

#define DO_3OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_3OP(gvec_fadd_h, float16_add, float16)
DO_3OP(gvec_fadd_s, float32_add, float32)
DO_3OP(gvec_fadd_d, float64_add, float64)

DO_3OP(gvec_fsub_h, float16_sub, float16)
DO_3OP(gvec_fsub_s, float32_sub, float32)
DO_3OP(gvec_fsub_d, float64_sub, float64)

DO_3OP(gvec_fmul_h, float16_mul, float16)
DO_3OP(gvec_fmul_s, float32_mul, float32)
DO_3OP(gvec_fmul_d, float64_mul, float64)

DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)

DO_3OP(gvec_fabd_h, float16_abd, float16)
DO_3OP(gvec_fabd_s, float32_abd, float32)
DO_3OP(gvec_fabd_d, float64_abd, float64)

DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16)
DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32)
DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64)

DO_3OP(gvec_fceq_h, float16_ceq, float16)
DO_3OP(gvec_fceq_s, float32_ceq, float32)
DO_3OP(gvec_fceq_d, float64_ceq, float64)

DO_3OP(gvec_fcge_h, float16_cge, float16)
DO_3OP(gvec_fcge_s, float32_cge, float32)
DO_3OP(gvec_fcge_d, float64_cge, float64)

DO_3OP(gvec_fcgt_h, float16_cgt, float16)
DO_3OP(gvec_fcgt_s, float32_cgt, float32)
DO_3OP(gvec_fcgt_d, float64_cgt, float64)

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16)
DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32)
DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64)

DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16)
DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32)
DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)

DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)

DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)

DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX)
DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX)
DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX)

DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX)
DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX)
DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX)

#undef DO_SAT
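
/*
 * The 64-bit saturating helpers below are open-coded rather than using
 * DO_SAT, since there is no convenient wider integer type for the
 * intermediate result; overflow is instead detected from the carry out
 * or from the operand and result signs.
 */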
int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1742 1743 #undef DO_SAT 1744 1745 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1746 void *vm, uint32_t desc) 1747 { 1748 intptr_t i, oprsz = simd_oprsz(desc); 1749 uint64_t *d = vd, *n = vn, *m = vm; 1750 bool q = false; 1751 1752 for (i = 0; i < oprsz / 8; i++) { 1753 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1754 if (dd < nn) { 1755 dd = UINT64_MAX; 1756 q = true; 1757 } 1758 d[i] = dd; 1759 } 1760 if (q) { 1761 uint32_t *qc = vq; 1762 qc[0] = 1; 1763 } 1764 clear_tail(d, oprsz, simd_maxsz(desc)); 1765 } 1766 1767 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1768 void *vm, uint32_t desc) 1769 { 1770 intptr_t i, oprsz = simd_oprsz(desc); 1771 uint64_t *d = vd, *n = vn, *m = vm; 1772 bool q = false; 1773 1774 for (i = 0; i < oprsz / 8; i++) { 1775 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1776 if (nn < mm) { 1777 dd = 0; 1778 q = true; 1779 } 1780 d[i] = dd; 1781 } 1782 if (q) { 1783 uint32_t *qc = vq; 1784 qc[0] = 1; 1785 } 1786 clear_tail(d, oprsz, simd_maxsz(desc)); 1787 } 1788 1789 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1790 void *vm, uint32_t desc) 1791 { 1792 intptr_t i, oprsz = simd_oprsz(desc); 1793 int64_t *d = vd, *n = vn, *m = vm; 1794 bool q = false; 1795 1796 for (i = 0; i < oprsz / 8; i++) { 1797 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1798 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1799 dd = (nn >> 63) ^ ~INT64_MIN; 1800 q = true; 1801 } 1802 d[i] = dd; 1803 } 1804 if (q) { 1805 uint32_t *qc = vq; 1806 qc[0] = 1; 1807 } 1808 clear_tail(d, oprsz, simd_maxsz(desc)); 1809 } 1810 1811 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1812 void *vm, uint32_t desc) 1813 { 1814 intptr_t i, oprsz = simd_oprsz(desc); 1815 int64_t *d = vd, *n = vn, *m = vm; 1816 bool q = false; 1817 1818 for (i = 0; i < oprsz / 8; i++) { 1819 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1820 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1821 dd = (nn >> 63) ^ ~INT64_MIN; 1822 q = true; 1823 } 1824 d[i] = dd; 1825 } 1826 if (q) { 1827 uint32_t *qc = vq; 1828 qc[0] = 1; 1829 } 1830 clear_tail(d, oprsz, simd_maxsz(desc)); 1831 } 1832 1833 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1834 void *vm, uint32_t desc) 1835 { 1836 intptr_t i, oprsz = simd_oprsz(desc); 1837 uint64_t *d = vd, *n = vn, *m = vm; 1838 bool q = false; 1839 1840 for (i = 0; i < oprsz / 8; i++) { 1841 uint64_t nn = n[i]; 1842 int64_t mm = m[i]; 1843 uint64_t dd = nn + mm; 1844 1845 if (mm < 0) { 1846 if (nn < (uint64_t)-mm) { 1847 dd = 0; 1848 q = true; 1849 } 1850 } else { 1851 if (dd < nn) { 1852 dd = UINT64_MAX; 1853 q = true; 1854 } 1855 } 1856 d[i] = dd; 1857 } 1858 if (q) { 1859 uint32_t *qc = vq; 1860 qc[0] = 1; 1861 } 1862 clear_tail(d, oprsz, simd_maxsz(desc)); 1863 } 1864 1865 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1866 void *vm, uint32_t desc) 1867 { 1868 intptr_t i, oprsz = simd_oprsz(desc); 1869 uint64_t *d = vd, *n = vn, *m = vm; 1870 bool q = false; 1871 1872 for (i = 0; i < oprsz / 8; i++) { 1873 int64_t nn = n[i]; 1874 uint64_t mm = m[i]; 1875 int64_t dd = nn + mm; 1876 1877 if (mm > (uint64_t)(INT64_MAX - nn)) { 1878 dd = INT64_MAX; 1879 q = true; 1880 } 1881 d[i] = dd; 1882 } 1883 if (q) { 1884 uint32_t *qc = vq; 1885 qc[0] = 1; 1886 } 1887 clear_tail(d, oprsz, simd_maxsz(desc)); 1888 } 1889 1890 #define DO_SRA(NAME, TYPE) \ 1891 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1892 { \ 1893 intptr_t i, oprsz = simd_oprsz(desc); \ 1894 int shift = simd_data(desc); \ 1895 TYPE *d = vd, *n 
= vn; \ 1896 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1897 d[i] += n[i] >> shift; \ 1898 } \ 1899 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1900 } 1901 1902 DO_SRA(gvec_ssra_b, int8_t) 1903 DO_SRA(gvec_ssra_h, int16_t) 1904 DO_SRA(gvec_ssra_s, int32_t) 1905 DO_SRA(gvec_ssra_d, int64_t) 1906 1907 DO_SRA(gvec_usra_b, uint8_t) 1908 DO_SRA(gvec_usra_h, uint16_t) 1909 DO_SRA(gvec_usra_s, uint32_t) 1910 DO_SRA(gvec_usra_d, uint64_t) 1911 1912 #undef DO_SRA 1913 1914 #define DO_RSHR(NAME, TYPE) \ 1915 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1916 { \ 1917 intptr_t i, oprsz = simd_oprsz(desc); \ 1918 int shift = simd_data(desc); \ 1919 TYPE *d = vd, *n = vn; \ 1920 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1921 TYPE tmp = n[i] >> (shift - 1); \ 1922 d[i] = (tmp >> 1) + (tmp & 1); \ 1923 } \ 1924 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1925 } 1926 1927 DO_RSHR(gvec_srshr_b, int8_t) 1928 DO_RSHR(gvec_srshr_h, int16_t) 1929 DO_RSHR(gvec_srshr_s, int32_t) 1930 DO_RSHR(gvec_srshr_d, int64_t) 1931 1932 DO_RSHR(gvec_urshr_b, uint8_t) 1933 DO_RSHR(gvec_urshr_h, uint16_t) 1934 DO_RSHR(gvec_urshr_s, uint32_t) 1935 DO_RSHR(gvec_urshr_d, uint64_t) 1936 1937 #undef DO_RSHR 1938 1939 #define DO_RSRA(NAME, TYPE) \ 1940 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1941 { \ 1942 intptr_t i, oprsz = simd_oprsz(desc); \ 1943 int shift = simd_data(desc); \ 1944 TYPE *d = vd, *n = vn; \ 1945 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1946 TYPE tmp = n[i] >> (shift - 1); \ 1947 d[i] += (tmp >> 1) + (tmp & 1); \ 1948 } \ 1949 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1950 } 1951 1952 DO_RSRA(gvec_srsra_b, int8_t) 1953 DO_RSRA(gvec_srsra_h, int16_t) 1954 DO_RSRA(gvec_srsra_s, int32_t) 1955 DO_RSRA(gvec_srsra_d, int64_t) 1956 1957 DO_RSRA(gvec_ursra_b, uint8_t) 1958 DO_RSRA(gvec_ursra_h, uint16_t) 1959 DO_RSRA(gvec_ursra_s, uint32_t) 1960 DO_RSRA(gvec_ursra_d, uint64_t) 1961 1962 #undef DO_RSRA 1963 1964 #define DO_SRI(NAME, TYPE) \ 1965 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1966 { \ 1967 intptr_t i, oprsz = simd_oprsz(desc); \ 1968 int shift = simd_data(desc); \ 1969 TYPE *d = vd, *n = vn; \ 1970 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1971 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1972 } \ 1973 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1974 } 1975 1976 DO_SRI(gvec_sri_b, uint8_t) 1977 DO_SRI(gvec_sri_h, uint16_t) 1978 DO_SRI(gvec_sri_s, uint32_t) 1979 DO_SRI(gvec_sri_d, uint64_t) 1980 1981 #undef DO_SRI 1982 1983 #define DO_SLI(NAME, TYPE) \ 1984 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1985 { \ 1986 intptr_t i, oprsz = simd_oprsz(desc); \ 1987 int shift = simd_data(desc); \ 1988 TYPE *d = vd, *n = vn; \ 1989 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1990 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1991 } \ 1992 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1993 } 1994 1995 DO_SLI(gvec_sli_b, uint8_t) 1996 DO_SLI(gvec_sli_h, uint16_t) 1997 DO_SLI(gvec_sli_s, uint32_t) 1998 DO_SLI(gvec_sli_d, uint64_t) 1999 2000 #undef DO_SLI 2001 2002 /* 2003 * Convert float16 to float32, raising no exceptions and 2004 * preserving exceptional values, including SNaN. 2005 * This is effectively an unpack+repack operation. 
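 *
 * As a concrete example of the repack: float16 0x3c00 (+1.0) has exp 15
 * and frac 0; rebiasing gives exp 15 + (127 - 15) = 127, so the result is
 * 0x3f800000 (+1.0 as float32). With fz16 clear, the smallest denormal
 * 0x0001 has clz32(1) - 21 = 10, giving frac 0 and
 * exp 127 - 15 - 10 + 1 = 103, i.e. 2^-24 as a normal float32.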
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once.
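     * For example, XORing 0x8000 into each 16-bit lane flips only the sign
     * bit, so float16 +2.0 (0x4000) becomes -2.0 (0xc000), and NaN payloads
     * are left intact.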
*/ 2078 if (is_s) { 2079 n_4 ^= 0x8000800080008000ull; 2080 } 2081 2082 for (i = 0; i < oprsz / 4; i++) { 2083 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2084 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2085 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2086 } 2087 clear_tail(d, oprsz, simd_maxsz(desc)); 2088 } 2089 2090 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2091 CPUARMState *env, uint32_t desc) 2092 { 2093 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2094 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2095 } 2096 2097 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2098 CPUARMState *env, uint32_t desc) 2099 { 2100 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2101 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2102 } 2103 2104 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2105 CPUARMState *env, uint32_t desc) 2106 { 2107 intptr_t i, oprsz = simd_oprsz(desc); 2108 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2109 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2110 float_status *status = &env->vfp.fp_status_a64; 2111 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2112 2113 for (i = 0; i < oprsz; i += sizeof(float32)) { 2114 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2115 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2116 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2117 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2118 float32 aa = *(float32 *)(va + H1_4(i)); 2119 2120 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2121 } 2122 } 2123 2124 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2125 uint32_t desc, bool fz16) 2126 { 2127 intptr_t i, oprsz = simd_oprsz(desc); 2128 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2129 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2130 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2131 int is_q = oprsz == 16; 2132 uint64_t n_4; 2133 float32 m_1; 2134 2135 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2136 n_4 = load4_f16(vn, is_q, is_2); 2137 2138 /* Negate all inputs for FMLSL at once. 
*/ 2139 if (is_s) { 2140 n_4 ^= 0x8000800080008000ull; 2141 } 2142 2143 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2144 2145 for (i = 0; i < oprsz / 4; i++) { 2146 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2147 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2148 } 2149 clear_tail(d, oprsz, simd_maxsz(desc)); 2150 } 2151 2152 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2153 CPUARMState *env, uint32_t desc) 2154 { 2155 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2156 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2157 } 2158 2159 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2160 CPUARMState *env, uint32_t desc) 2161 { 2162 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2163 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2164 } 2165 2166 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2167 CPUARMState *env, uint32_t desc) 2168 { 2169 intptr_t i, j, oprsz = simd_oprsz(desc); 2170 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2171 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2172 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2173 float_status *status = &env->vfp.fp_status_a64; 2174 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2175 2176 for (i = 0; i < oprsz; i += 16) { 2177 float16 mm_16 = *(float16 *)(vm + i + idx); 2178 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2179 2180 for (j = 0; j < 16; j += sizeof(float32)) { 2181 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2182 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2183 float32 aa = *(float32 *)(va + H1_4(i + j)); 2184 2185 *(float32 *)(vd + H1_4(i + j)) = 2186 float32_muladd(nn, mm, aa, 0, status); 2187 } 2188 } 2189 } 2190 2191 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2192 { 2193 intptr_t i, opr_sz = simd_oprsz(desc); 2194 int8_t *d = vd, *n = vn, *m = vm; 2195 2196 for (i = 0; i < opr_sz; ++i) { 2197 int8_t mm = m[i]; 2198 int8_t nn = n[i]; 2199 int8_t res = 0; 2200 if (mm >= 0) { 2201 if (mm < 8) { 2202 res = nn << mm; 2203 } 2204 } else { 2205 res = nn >> (mm > -8 ? -mm : 7); 2206 } 2207 d[i] = res; 2208 } 2209 clear_tail(d, opr_sz, simd_maxsz(desc)); 2210 } 2211 2212 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2213 { 2214 intptr_t i, opr_sz = simd_oprsz(desc); 2215 int16_t *d = vd, *n = vn, *m = vm; 2216 2217 for (i = 0; i < opr_sz / 2; ++i) { 2218 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2219 int16_t nn = n[i]; 2220 int16_t res = 0; 2221 if (mm >= 0) { 2222 if (mm < 16) { 2223 res = nn << mm; 2224 } 2225 } else { 2226 res = nn >> (mm > -16 ? 
-mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i]; /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
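 * As a small illustration of the carry-less arithmetic: 0b11 * 0b11 is
 * 0b101 (5) rather than 0b1001 (9), because the partial products 0b11 and
 * 0b110 are XORed instead of added. Each even/odd destination lane pair
 * below receives the low and high halves of one 128-bit product.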
2303 */ 2304 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2305 { 2306 intptr_t i, opr_sz = simd_oprsz(desc); 2307 intptr_t hi = simd_data(desc); 2308 uint64_t *d = vd, *n = vn, *m = vm; 2309 2310 for (i = 0; i < opr_sz / 8; i += 2) { 2311 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2312 d[i] = int128_getlo(r); 2313 d[i + 1] = int128_gethi(r); 2314 } 2315 clear_tail(d, opr_sz, simd_maxsz(desc)); 2316 } 2317 2318 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2319 { 2320 int hi = simd_data(desc); 2321 uint64_t *d = vd, *n = vn, *m = vm; 2322 uint64_t nn = n[hi], mm = m[hi]; 2323 2324 d[0] = clmul_8x4_packed(nn, mm); 2325 nn >>= 32; 2326 mm >>= 32; 2327 d[1] = clmul_8x4_packed(nn, mm); 2328 2329 clear_tail(d, 16, simd_maxsz(desc)); 2330 } 2331 2332 #ifdef TARGET_AARCH64 2333 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2334 { 2335 int shift = simd_data(desc) * 8; 2336 intptr_t i, opr_sz = simd_oprsz(desc); 2337 uint64_t *d = vd, *n = vn, *m = vm; 2338 2339 for (i = 0; i < opr_sz / 8; ++i) { 2340 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2341 } 2342 } 2343 2344 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2345 { 2346 intptr_t sel = H4(simd_data(desc)); 2347 intptr_t i, opr_sz = simd_oprsz(desc); 2348 uint32_t *n = vn, *m = vm; 2349 uint64_t *d = vd; 2350 2351 for (i = 0; i < opr_sz / 8; ++i) { 2352 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2353 } 2354 } 2355 #endif 2356 2357 #define DO_CMP0(NAME, TYPE, OP) \ 2358 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2359 { \ 2360 intptr_t i, opr_sz = simd_oprsz(desc); \ 2361 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2362 TYPE nn = *(TYPE *)(vn + i); \ 2363 *(TYPE *)(vd + i) = -(nn OP 0); \ 2364 } \ 2365 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2366 } 2367 2368 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2369 DO_CMP0(gvec_clt0_b, int8_t, <) 2370 DO_CMP0(gvec_cle0_b, int8_t, <=) 2371 DO_CMP0(gvec_cgt0_b, int8_t, >) 2372 DO_CMP0(gvec_cge0_b, int8_t, >=) 2373 2374 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2375 DO_CMP0(gvec_clt0_h, int16_t, <) 2376 DO_CMP0(gvec_cle0_h, int16_t, <=) 2377 DO_CMP0(gvec_cgt0_h, int16_t, >) 2378 DO_CMP0(gvec_cge0_h, int16_t, >=) 2379 2380 #undef DO_CMP0 2381 2382 #define DO_ABD(NAME, TYPE) \ 2383 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2384 { \ 2385 intptr_t i, opr_sz = simd_oprsz(desc); \ 2386 TYPE *d = vd, *n = vn, *m = vm; \ 2387 \ 2388 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2389 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2390 } \ 2391 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2392 } 2393 2394 DO_ABD(gvec_sabd_b, int8_t) 2395 DO_ABD(gvec_sabd_h, int16_t) 2396 DO_ABD(gvec_sabd_s, int32_t) 2397 DO_ABD(gvec_sabd_d, int64_t) 2398 2399 DO_ABD(gvec_uabd_b, uint8_t) 2400 DO_ABD(gvec_uabd_h, uint16_t) 2401 DO_ABD(gvec_uabd_s, uint32_t) 2402 DO_ABD(gvec_uabd_d, uint64_t) 2403 2404 #undef DO_ABD 2405 2406 #define DO_ABA(NAME, TYPE) \ 2407 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2408 { \ 2409 intptr_t i, opr_sz = simd_oprsz(desc); \ 2410 TYPE *d = vd, *n = vn, *m = vm; \ 2411 \ 2412 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2413 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2414 } \ 2415 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2416 } 2417 2418 DO_ABA(gvec_saba_b, int8_t) 2419 DO_ABA(gvec_saba_h, int16_t) 2420 DO_ABA(gvec_saba_s, int32_t) 2421 DO_ABA(gvec_saba_d, int64_t) 2422 2423 DO_ABA(gvec_uaba_b, uint8_t) 2424 DO_ABA(gvec_uaba_h, uint16_t) 2425 DO_ABA(gvec_uaba_s, uint32_t) 2426 DO_ABA(gvec_uaba_d, uint64_t) 2427 2428 #undef DO_ABA 2429 2430 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2431 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2432 float_status *stat, uint32_t desc) \ 2433 { \ 2434 ARMVectorReg scratch; \ 2435 intptr_t oprsz = simd_oprsz(desc); \ 2436 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2437 TYPE *d = vd, *n = vn, *m = vm; \ 2438 if (unlikely(d == m)) { \ 2439 m = memcpy(&scratch, m, oprsz); \ 2440 } \ 2441 for (intptr_t i = 0; i < half; ++i) { \ 2442 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2443 } \ 2444 for (intptr_t i = 0; i < half; ++i) { \ 2445 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2446 } \ 2447 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2448 } 2449 2450 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2451 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2452 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2453 2454 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2455 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2456 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2457 2458 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2459 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2460 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2461 2462 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2463 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2464 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2465 2466 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2467 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2468 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2469 2470 #ifdef TARGET_AARCH64 2471 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2472 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2473 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2474 2475 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2476 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2477 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2478 #endif 2479 2480 #undef DO_3OP_PAIR 2481 2482 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2483 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2484 { \ 2485 ARMVectorReg scratch; \ 2486 intptr_t oprsz = simd_oprsz(desc); \ 2487 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2488 TYPE *d = vd, *n = vn, *m = vm; \ 2489 if (unlikely(d == m)) { \ 2490 m = memcpy(&scratch, m, oprsz); \ 2491 } \ 2492 for (intptr_t i = 0; i < half; ++i) { \ 2493 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2494 } \ 2495 for (intptr_t i = 0; i < half; ++i) { \ 2496 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2497 } \ 2498 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2499 } 2500 2501 #define ADD(A, B) (A + B) 2502 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2503 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2504 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2505 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2506 #undef ADD 2507 2508 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2509 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2510 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2511 2512 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2513 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2514 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2515 2516 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2517 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2518 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2519 2520 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2521 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2522 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2523 2524 #undef DO_3OP_PAIR 2525 2526 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2527 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2528 { \ 2529 intptr_t i, oprsz = simd_oprsz(desc); \ 2530 int shift = simd_data(desc); \ 2531 TYPE *d = vd, *n = vn; \ 2532 float_status *fpst = stat; \ 2533 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2534 d[i] = FUNC(n[i], shift, fpst); \ 2535 } \ 2536 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2537 } 2538 2539 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2540 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2541 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2542 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2543 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2544 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2545 2546 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2547 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2548 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2549 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2550 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2551 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2552 2553 #undef DO_VCVT_FIXED 2554 2555 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2556 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2557 { \ 2558 intptr_t i, oprsz = simd_oprsz(desc); \ 2559 uint32_t rmode = simd_data(desc); \ 2560 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2561 TYPE *d = vd, *n = vn; \ 2562 set_float_rounding_mode(rmode, fpst); \ 2563 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2564 d[i] = FUNC(n[i], 0, fpst); \ 2565 } \ 2566 set_float_rounding_mode(prev_rmode, fpst); \ 2567 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2568 } 2569 2570 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2571 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2572 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2573 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2574 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2575 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2576 2577 #undef DO_VCVT_RMODE 2578 2579 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2580 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2581 { \ 2582 intptr_t i, oprsz = simd_oprsz(desc); \ 2583 uint32_t rmode = simd_data(desc); \ 2584 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2585 TYPE *d = vd, *n = vn; \ 2586 set_float_rounding_mode(rmode, fpst); \ 2587 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2588 d[i] = FUNC(n[i], fpst); \ 2589 } \ 2590 set_float_rounding_mode(prev_rmode, fpst); \ 2591 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2592 } 2593 2594 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2595 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2596 2597 #undef DO_VRINT_RMODE 2598 2599 #ifdef TARGET_AARCH64 2600 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2601 { 2602 const uint8_t *indices = vm; 2603 size_t oprsz = simd_oprsz(desc); 2604 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2605 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2606 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2607 union { 2608 uint8_t b[16]; 2609 uint64_t d[2]; 2610 } result; 2611 2612 /* 2613 * We must construct the final result in a temp, lest the output 2614 * overlaps the input table. For TBL, begin with zero; for TBX, 2615 * begin with the original register contents. Note that we always 2616 * copy 16 bytes here to avoid an extra branch; clearing the high 2617 * bits of the register for oprsz == 8 is handled below. 2618 */ 2619 if (is_tbx) { 2620 memcpy(&result, vd, 16); 2621 } else { 2622 memset(&result, 0, 16); 2623 } 2624 2625 for (size_t i = 0; i < oprsz; ++i) { 2626 uint32_t index = indices[H1(i)]; 2627 2628 if (index < table_len) { 2629 /* 2630 * Convert index (a byte offset into the virtual table 2631 * which is a series of 128-bit vectors concatenated) 2632 * into the correct register element, bearing in mind 2633 * that the table can wrap around from V31 to V0. 2634 */ 2635 const uint8_t *table = (const uint8_t *) 2636 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2637 result.b[H1(i)] = table[H1(index % 16)]; 2638 } 2639 } 2640 2641 memcpy(vd, &result, 16); 2642 clear_tail(vd, oprsz, simd_maxsz(desc)); 2643 } 2644 #endif 2645 2646 /* 2647 * NxN -> N highpart multiply 2648 * 2649 * TODO: expose this as a generic vector operation. 2650 */ 2651 2652 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2653 { 2654 intptr_t i, opr_sz = simd_oprsz(desc); 2655 int8_t *d = vd, *n = vn, *m = vm; 2656 2657 for (i = 0; i < opr_sz; ++i) { 2658 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2659 } 2660 clear_tail(d, opr_sz, simd_maxsz(desc)); 2661 } 2662 2663 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2664 { 2665 intptr_t i, opr_sz = simd_oprsz(desc); 2666 int16_t *d = vd, *n = vn, *m = vm; 2667 2668 for (i = 0; i < opr_sz / 2; ++i) { 2669 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2670 } 2671 clear_tail(d, opr_sz, simd_maxsz(desc)); 2672 } 2673 2674 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2675 { 2676 intptr_t i, opr_sz = simd_oprsz(desc); 2677 int32_t *d = vd, *n = vn, *m = vm; 2678 2679 for (i = 0; i < opr_sz / 4; ++i) { 2680 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2681 } 2682 clear_tail(d, opr_sz, simd_maxsz(desc)); 2683 } 2684 2685 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2686 { 2687 intptr_t i, opr_sz = simd_oprsz(desc); 2688 uint64_t *d = vd, *n = vn, *m = vm; 2689 uint64_t discard; 2690 2691 for (i = 0; i < opr_sz / 8; ++i) { 2692 muls64(&discard, &d[i], n[i], m[i]); 2693 } 2694 clear_tail(d, opr_sz, simd_maxsz(desc)); 2695 } 2696 2697 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2698 { 2699 intptr_t i, opr_sz = simd_oprsz(desc); 2700 uint8_t *d = vd, *n = vn, *m = vm; 2701 2702 for (i = 0; i < opr_sz; ++i) { 2703 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2704 } 2705 clear_tail(d, opr_sz, simd_maxsz(desc)); 2706 } 2707 2708 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2709 { 2710 intptr_t i, opr_sz = simd_oprsz(desc); 2711 uint16_t *d = vd, *n = vn, *m = vm; 2712 2713 for (i = 0; i < opr_sz / 2; ++i) { 2714 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2715 } 2716 clear_tail(d, opr_sz, simd_maxsz(desc)); 2717 } 2718 2719 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2720 
{ 2721 intptr_t i, opr_sz = simd_oprsz(desc); 2722 uint32_t *d = vd, *n = vn, *m = vm; 2723 2724 for (i = 0; i < opr_sz / 4; ++i) { 2725 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2726 } 2727 clear_tail(d, opr_sz, simd_maxsz(desc)); 2728 } 2729 2730 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2731 { 2732 intptr_t i, opr_sz = simd_oprsz(desc); 2733 uint64_t *d = vd, *n = vn, *m = vm; 2734 uint64_t discard; 2735 2736 for (i = 0; i < opr_sz / 8; ++i) { 2737 mulu64(&discard, &d[i], n[i], m[i]); 2738 } 2739 clear_tail(d, opr_sz, simd_maxsz(desc)); 2740 } 2741 2742 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2743 { 2744 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2745 int shr = simd_data(desc); 2746 uint64_t *d = vd, *n = vn, *m = vm; 2747 2748 for (i = 0; i < opr_sz; ++i) { 2749 d[i] = ror64(n[i] ^ m[i], shr); 2750 } 2751 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2752 } 2753 2754 /* 2755 * Integer matrix-multiply accumulate 2756 */ 2757 2758 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2759 { 2760 int8_t *n = vn, *m = vm; 2761 2762 for (intptr_t k = 0; k < 8; ++k) { 2763 sum += n[H1(k)] * m[H1(k)]; 2764 } 2765 return sum; 2766 } 2767 2768 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2769 { 2770 uint8_t *n = vn, *m = vm; 2771 2772 for (intptr_t k = 0; k < 8; ++k) { 2773 sum += n[H1(k)] * m[H1(k)]; 2774 } 2775 return sum; 2776 } 2777 2778 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2779 { 2780 uint8_t *n = vn; 2781 int8_t *m = vm; 2782 2783 for (intptr_t k = 0; k < 8; ++k) { 2784 sum += n[H1(k)] * m[H1(k)]; 2785 } 2786 return sum; 2787 } 2788 2789 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2790 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2791 { 2792 intptr_t seg, opr_sz = simd_oprsz(desc); 2793 2794 for (seg = 0; seg < opr_sz; seg += 16) { 2795 uint32_t *d = vd + seg; 2796 uint32_t *a = va + seg; 2797 uint32_t sum0, sum1, sum2, sum3; 2798 2799 /* 2800 * Process the entire segment at once, writing back the 2801 * results only after we've consumed all of the inputs. 2802 * 2803 * Key to indices by column: 2804 * i j i j 2805 */ 2806 sum0 = a[H4(0 + 0)]; 2807 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2808 sum1 = a[H4(0 + 1)]; 2809 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2810 sum2 = a[H4(2 + 0)]; 2811 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2812 sum3 = a[H4(2 + 1)]; 2813 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2814 2815 d[H4(0)] = sum0; 2816 d[H4(1)] = sum1; 2817 d[H4(2)] = sum2; 2818 d[H4(3)] = sum3; 2819 } 2820 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2821 } 2822 2823 #define DO_MMLA_B(NAME, INNER) \ 2824 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2825 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2826 2827 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2828 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2829 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2830 2831 /* 2832 * BFloat16 Dot Product 2833 */ 2834 2835 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2836 { 2837 /* 2838 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2839 * For EBF = 0, we ignore the FPCR bits which determine rounding 2840 * mode and denormal-flushing, and we do unfused multiplies and 2841 * additions with intermediate rounding of all products and sums. 
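     * (The unfused EBF = 0 arithmetic itself is done by bfdotadd() below;
     * here we only set up the float_status values it will use.)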
2842 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2843 * and we perform a fused two-way sum-of-products without intermediate 2844 * rounding of the products. 2845 * In either case, we don't set fp exception flags. 2846 * 2847 * EBF is AArch64 only, so even if it's set in the FPCR it has 2848 * no effect on AArch32 instructions. 2849 */ 2850 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2851 2852 *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2853 set_default_nan_mode(true, statusp); 2854 2855 if (ebf) { 2856 /* EBF=1 needs to do a step with round-to-odd semantics */ 2857 *oddstatusp = *statusp; 2858 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2859 } else { 2860 set_flush_to_zero(true, statusp); 2861 set_flush_inputs_to_zero(true, statusp); 2862 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2863 } 2864 return ebf; 2865 } 2866 2867 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2868 { 2869 float32 t1, t2; 2870 2871 /* 2872 * Extract each BFloat16 from the element pair, and shift 2873 * them such that they become float32. 2874 */ 2875 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2876 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2877 t1 = float32_add(t1, t2, fpst); 2878 t1 = float32_add(sum, t1, fpst); 2879 2880 return t1; 2881 } 2882 2883 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2884 float_status *fpst, float_status *fpst_odd) 2885 { 2886 /* 2887 * Compare f16_dotadd() in sme_helper.c, but here we have 2888 * bfloat16 inputs. In particular that means that we do not 2889 * want the FPCR.FZ16 flush semantics, so we use the normal 2890 * float_status for the input handling here. 2891 */ 2892 float64 e1r = float32_to_float64(e1 << 16, fpst); 2893 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2894 float64 e2r = float32_to_float64(e2 << 16, fpst); 2895 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2896 float64 t64; 2897 float32 t32; 2898 2899 /* 2900 * The ARM pseudocode function FPDot performs both multiplies 2901 * and the add with a single rounding operation. Emulate this 2902 * by performing the first multiply in round-to-odd, then doing 2903 * the second multiply as fused multiply-add, and rounding to 2904 * float32 all in one step. 2905 */ 2906 t64 = float64_mul(e1r, e2r, fpst_odd); 2907 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2908 2909 /* This conversion is exact, because we've already rounded. */ 2910 t32 = float64_to_float32(t64, fpst); 2911 2912 /* The final accumulation step is not fused. 
*/ 2913 return float32_add(sum, t32, fpst); 2914 } 2915 2916 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2917 CPUARMState *env, uint32_t desc) 2918 { 2919 intptr_t i, opr_sz = simd_oprsz(desc); 2920 float32 *d = vd, *a = va; 2921 uint32_t *n = vn, *m = vm; 2922 float_status fpst, fpst_odd; 2923 2924 if (is_ebf(env, &fpst, &fpst_odd)) { 2925 for (i = 0; i < opr_sz / 4; ++i) { 2926 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2927 } 2928 } else { 2929 for (i = 0; i < opr_sz / 4; ++i) { 2930 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2931 } 2932 } 2933 clear_tail(d, opr_sz, simd_maxsz(desc)); 2934 } 2935 2936 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2937 void *va, CPUARMState *env, uint32_t desc) 2938 { 2939 intptr_t i, j, opr_sz = simd_oprsz(desc); 2940 intptr_t index = simd_data(desc); 2941 intptr_t elements = opr_sz / 4; 2942 intptr_t eltspersegment = MIN(16 / 4, elements); 2943 float32 *d = vd, *a = va; 2944 uint32_t *n = vn, *m = vm; 2945 float_status fpst, fpst_odd; 2946 2947 if (is_ebf(env, &fpst, &fpst_odd)) { 2948 for (i = 0; i < elements; i += eltspersegment) { 2949 uint32_t m_idx = m[i + H4(index)]; 2950 2951 for (j = i; j < i + eltspersegment; j++) { 2952 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2953 } 2954 } 2955 } else { 2956 for (i = 0; i < elements; i += eltspersegment) { 2957 uint32_t m_idx = m[i + H4(index)]; 2958 2959 for (j = i; j < i + eltspersegment; j++) { 2960 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2961 } 2962 } 2963 } 2964 clear_tail(d, opr_sz, simd_maxsz(desc)); 2965 } 2966 2967 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2968 CPUARMState *env, uint32_t desc) 2969 { 2970 intptr_t s, opr_sz = simd_oprsz(desc); 2971 float32 *d = vd, *a = va; 2972 uint32_t *n = vn, *m = vm; 2973 float_status fpst, fpst_odd; 2974 2975 if (is_ebf(env, &fpst, &fpst_odd)) { 2976 for (s = 0; s < opr_sz / 4; s += 4) { 2977 float32 sum00, sum01, sum10, sum11; 2978 2979 /* 2980 * Process the entire segment at once, writing back the 2981 * results only after we've consumed all of the inputs. 2982 * 2983 * Key to indices by column: 2984 * i j i k j k 2985 */ 2986 sum00 = a[s + H4(0 + 0)]; 2987 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2988 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2989 2990 sum01 = a[s + H4(0 + 1)]; 2991 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2992 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2993 2994 sum10 = a[s + H4(2 + 0)]; 2995 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2996 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2997 2998 sum11 = a[s + H4(2 + 1)]; 2999 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 3000 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 3001 3002 d[s + H4(0 + 0)] = sum00; 3003 d[s + H4(0 + 1)] = sum01; 3004 d[s + H4(2 + 0)] = sum10; 3005 d[s + H4(2 + 1)] = sum11; 3006 } 3007 } else { 3008 for (s = 0; s < opr_sz / 4; s += 4) { 3009 float32 sum00, sum01, sum10, sum11; 3010 3011 /* 3012 * Process the entire segment at once, writing back the 3013 * results only after we've consumed all of the inputs. 
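             * Each uint32_t element of n and m holds a pair of bfloat16
             * values, so each of the four sums below accumulates two
             * bfdotadd() steps, covering a 4-element dot product per
             * result element.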
3014 * 3015 * Key to indices by column: 3016 * i j i k j k 3017 */ 3018 sum00 = a[s + H4(0 + 0)]; 3019 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 3020 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 3021 3022 sum01 = a[s + H4(0 + 1)]; 3023 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3024 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3025 3026 sum10 = a[s + H4(2 + 0)]; 3027 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3028 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3029 3030 sum11 = a[s + H4(2 + 1)]; 3031 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3032 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3033 3034 d[s + H4(0 + 0)] = sum00; 3035 d[s + H4(0 + 1)] = sum01; 3036 d[s + H4(2 + 0)] = sum10; 3037 d[s + H4(2 + 1)] = sum11; 3038 } 3039 } 3040 clear_tail(d, opr_sz, simd_maxsz(desc)); 3041 } 3042 3043 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3044 float_status *stat, uint32_t desc) 3045 { 3046 intptr_t i, opr_sz = simd_oprsz(desc); 3047 intptr_t sel = simd_data(desc); 3048 float32 *d = vd, *a = va; 3049 bfloat16 *n = vn, *m = vm; 3050 3051 for (i = 0; i < opr_sz / 4; ++i) { 3052 float32 nn = n[H2(i * 2 + sel)] << 16; 3053 float32 mm = m[H2(i * 2 + sel)] << 16; 3054 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3055 } 3056 clear_tail(d, opr_sz, simd_maxsz(desc)); 3057 } 3058 3059 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3060 void *va, float_status *stat, uint32_t desc) 3061 { 3062 intptr_t i, j, opr_sz = simd_oprsz(desc); 3063 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3064 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3065 intptr_t elements = opr_sz / 4; 3066 intptr_t eltspersegment = MIN(16 / 4, elements); 3067 float32 *d = vd, *a = va; 3068 bfloat16 *n = vn, *m = vm; 3069 3070 for (i = 0; i < elements; i += eltspersegment) { 3071 float32 m_idx = m[H2(2 * i + index)] << 16; 3072 3073 for (j = i; j < i + eltspersegment; j++) { 3074 float32 n_j = n[H2(2 * j + sel)] << 16; 3075 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3076 } 3077 } 3078 clear_tail(d, opr_sz, simd_maxsz(desc)); 3079 } 3080 3081 #define DO_CLAMP(NAME, TYPE) \ 3082 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3083 { \ 3084 intptr_t i, opr_sz = simd_oprsz(desc); \ 3085 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3086 TYPE aa = *(TYPE *)(a + i); \ 3087 TYPE nn = *(TYPE *)(n + i); \ 3088 TYPE mm = *(TYPE *)(m + i); \ 3089 TYPE dd = MIN(MAX(aa, nn), mm); \ 3090 *(TYPE *)(d + i) = dd; \ 3091 } \ 3092 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3093 } 3094 3095 DO_CLAMP(gvec_sclamp_b, int8_t) 3096 DO_CLAMP(gvec_sclamp_h, int16_t) 3097 DO_CLAMP(gvec_sclamp_s, int32_t) 3098 DO_CLAMP(gvec_sclamp_d, int64_t) 3099 3100 DO_CLAMP(gvec_uclamp_b, uint8_t) 3101 DO_CLAMP(gvec_uclamp_h, uint16_t) 3102 DO_CLAMP(gvec_uclamp_s, uint32_t) 3103 DO_CLAMP(gvec_uclamp_d, uint64_t) 3104 3105 /* Bit count in each 8-bit word. 
 */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
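/*
 * Illustrative sketch (not built): a host-side check of the gvec_rbit_b
 * trick above, showing that revbit64(bswap64(x)) is the same as reversing
 * the bits within each byte of x while leaving the bytes in place. The
 * ref_* functions are local stand-ins for the real bswap64()/revbit64(),
 * so the snippet stands alone outside of QEMU.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t ref_bswap64(uint64_t x)
{
    uint64_t r = 0;
    for (int i = 0; i < 8; i++) {
        r = (r << 8) | ((x >> (i * 8)) & 0xff);   /* byte 0 ends up on top */
    }
    return r;
}

static uint64_t ref_revbit64(uint64_t x)
{
    uint64_t r = 0;
    for (int i = 0; i < 64; i++) {
        r = (r << 1) | ((x >> i) & 1);            /* bit 0 ends up at bit 63 */
    }
    return r;
}

static uint8_t ref_revbit8(uint8_t b)
{
    uint8_t r = 0;
    for (int i = 0; i < 8; i++) {
        r = (r << 1) | ((b >> i) & 1);
    }
    return r;
}

static void check_rbit_b(uint64_t x)
{
    uint64_t want = 0;
    for (int i = 0; i < 8; i++) {
        want |= (uint64_t)ref_revbit8((uint8_t)(x >> (i * 8))) << (i * 8);
    }
    /* e.g. x == 0x01: bswap gives 0x0100...00, revbit gives 0x80 */
    assert(ref_revbit64(ref_bswap64(x)) == want);
}
#endif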