1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 
133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? 
INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, 315 void *vq, uint32_t desc) 316 { 317 intptr_t i, j, opr_sz = simd_oprsz(desc); 318 int idx = simd_data(desc); 319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 320 intptr_t elements = opr_sz / 2; 321 intptr_t eltspersegment = MIN(16 / 2, elements); 322 323 for (i = 0; i < elements; i += 16 / 2) { 324 int16_t mm = m[i]; 325 for (j = 0; j < eltspersegment; ++j) { 326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); 327 } 328 } 329 clear_tail(d, opr_sz, simd_maxsz(desc)); 330 } 331 332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, 333 void *vq, uint32_t desc) 334 { 335 intptr_t i, j, opr_sz = simd_oprsz(desc); 336 int idx = simd_data(desc); 337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 338 intptr_t elements = opr_sz / 2; 339 intptr_t eltspersegment = MIN(16 / 2, elements); 340 341 for (i = 0; i < elements; i += 16 / 2) { 342 int16_t mm = m[i]; 343 for (j = 0; j < eltspersegment; ++j) { 344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); 345 } 346 } 347 clear_tail(d, opr_sz, simd_maxsz(desc)); 348 } 349 350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm, 351 void *vq, uint32_t desc) 352 { 353 intptr_t i, j, opr_sz = simd_oprsz(desc); 354 int idx = simd_data(desc); 355 
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 356 intptr_t elements = opr_sz / 2; 357 intptr_t eltspersegment = MIN(16 / 2, elements); 358 359 for (i = 0; i < elements; i += 16 / 2) { 360 int16_t mm = m[i]; 361 for (j = 0; j < eltspersegment; ++j) { 362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq); 363 } 364 } 365 clear_tail(d, opr_sz, simd_maxsz(desc)); 366 } 367 368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm, 369 void *vq, uint32_t desc) 370 { 371 intptr_t i, j, opr_sz = simd_oprsz(desc); 372 int idx = simd_data(desc); 373 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 374 intptr_t elements = opr_sz / 2; 375 intptr_t eltspersegment = MIN(16 / 2, elements); 376 377 for (i = 0; i < elements; i += 16 / 2) { 378 int16_t mm = m[i]; 379 for (j = 0; j < eltspersegment; ++j) { 380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq); 381 } 382 } 383 clear_tail(d, opr_sz, simd_maxsz(desc)); 384 } 385 386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 387 void *va, uint32_t desc) 388 { 389 intptr_t i, opr_sz = simd_oprsz(desc); 390 int16_t *d = vd, *n = vn, *m = vm, *a = va; 391 uint32_t discard; 392 393 for (i = 0; i < opr_sz / 2; ++i) { 394 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 395 } 396 } 397 398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 399 void *va, uint32_t desc) 400 { 401 intptr_t i, opr_sz = simd_oprsz(desc); 402 int16_t *d = vd, *n = vn, *m = vm, *a = va; 403 uint32_t discard; 404 405 for (i = 0; i < opr_sz / 2; ++i) { 406 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 407 } 408 } 409 410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 411 { 412 intptr_t i, opr_sz = simd_oprsz(desc); 413 int16_t *d = vd, *n = vn, *m = vm; 414 uint32_t discard; 415 416 for (i = 0; i < opr_sz / 2; ++i) { 417 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 418 } 419 } 420 421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 422 { 423 intptr_t i, opr_sz = simd_oprsz(desc); 424 int16_t *d = vd, *n = vn, *m = vm; 425 uint32_t discard; 426 427 for (i = 0; i < opr_sz / 2; ++i) { 428 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 429 } 430 } 431 432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 433 { 434 intptr_t i, j, opr_sz = simd_oprsz(desc); 435 int idx = simd_data(desc); 436 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 437 uint32_t discard; 438 439 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 440 int16_t mm = m[i]; 441 for (j = 0; j < 16 / 2; ++j) { 442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 443 } 444 } 445 } 446 447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 448 { 449 intptr_t i, j, opr_sz = simd_oprsz(desc); 450 int idx = simd_data(desc); 451 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 452 uint32_t discard; 453 454 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 455 int16_t mm = m[i]; 456 for (j = 0; j < 16 / 2; ++j) { 457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 458 } 459 } 460 } 461 462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 464 bool neg, bool round, uint32_t *sat) 465 { 466 /* Simplify similarly to do_sqrdmlah_b above. 
*/ 467 int64_t ret = (int64_t)src1 * src2; 468 if (neg) { 469 ret = -ret; 470 } 471 ret += ((int64_t)src3 << 31) + (round << 30); 472 ret >>= 31; 473 474 if (ret != (int32_t)ret) { 475 *sat = 1; 476 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 477 } 478 return ret; 479 } 480 481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 482 int32_t src2, int32_t src3) 483 { 484 uint32_t *sat = &env->vfp.qc[0]; 485 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 486 } 487 488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 489 void *vq, uint32_t desc) 490 { 491 uintptr_t opr_sz = simd_oprsz(desc); 492 int32_t *d = vd; 493 int32_t *n = vn; 494 int32_t *m = vm; 495 uintptr_t i; 496 497 for (i = 0; i < opr_sz / 4; ++i) { 498 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 499 } 500 clear_tail(d, opr_sz, simd_maxsz(desc)); 501 } 502 503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 504 int32_t src2, int32_t src3) 505 { 506 uint32_t *sat = &env->vfp.qc[0]; 507 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 508 } 509 510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 511 void *vq, uint32_t desc) 512 { 513 uintptr_t opr_sz = simd_oprsz(desc); 514 int32_t *d = vd; 515 int32_t *n = vn; 516 int32_t *m = vm; 517 uintptr_t i; 518 519 for (i = 0; i < opr_sz / 4; ++i) { 520 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 521 } 522 clear_tail(d, opr_sz, simd_maxsz(desc)); 523 } 524 525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 526 void *vq, uint32_t desc) 527 { 528 intptr_t i, opr_sz = simd_oprsz(desc); 529 int32_t *d = vd, *n = vn, *m = vm; 530 531 for (i = 0; i < opr_sz / 4; ++i) { 532 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 533 } 534 clear_tail(d, opr_sz, simd_maxsz(desc)); 535 } 536 537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 538 void *vq, uint32_t desc) 539 { 540 intptr_t i, opr_sz = simd_oprsz(desc); 541 int32_t *d = vd, *n = vn, *m = vm; 542 543 for (i = 0; i < opr_sz / 4; ++i) { 544 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 545 } 546 clear_tail(d, opr_sz, simd_maxsz(desc)); 547 } 548 549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, 550 void *vq, uint32_t desc) 551 { 552 intptr_t i, j, opr_sz = simd_oprsz(desc); 553 int idx = simd_data(desc); 554 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 555 intptr_t elements = opr_sz / 4; 556 intptr_t eltspersegment = MIN(16 / 4, elements); 557 558 for (i = 0; i < elements; i += 16 / 4) { 559 int32_t mm = m[i]; 560 for (j = 0; j < eltspersegment; ++j) { 561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); 562 } 563 } 564 clear_tail(d, opr_sz, simd_maxsz(desc)); 565 } 566 567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, 568 void *vq, uint32_t desc) 569 { 570 intptr_t i, j, opr_sz = simd_oprsz(desc); 571 int idx = simd_data(desc); 572 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 573 intptr_t elements = opr_sz / 4; 574 intptr_t eltspersegment = MIN(16 / 4, elements); 575 576 for (i = 0; i < elements; i += 16 / 4) { 577 int32_t mm = m[i]; 578 for (j = 0; j < eltspersegment; ++j) { 579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); 580 } 581 } 582 clear_tail(d, opr_sz, simd_maxsz(desc)); 583 } 584 585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm, 586 void *vq, uint32_t desc) 587 { 588 intptr_t i, j, opr_sz = simd_oprsz(desc); 589 int idx = simd_data(desc); 590 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 591 
intptr_t elements = opr_sz / 4; 592 intptr_t eltspersegment = MIN(16 / 4, elements); 593 594 for (i = 0; i < elements; i += 16 / 4) { 595 int32_t mm = m[i]; 596 for (j = 0; j < eltspersegment; ++j) { 597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq); 598 } 599 } 600 clear_tail(d, opr_sz, simd_maxsz(desc)); 601 } 602 603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm, 604 void *vq, uint32_t desc) 605 { 606 intptr_t i, j, opr_sz = simd_oprsz(desc); 607 int idx = simd_data(desc); 608 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 609 intptr_t elements = opr_sz / 4; 610 intptr_t eltspersegment = MIN(16 / 4, elements); 611 612 for (i = 0; i < elements; i += 16 / 4) { 613 int32_t mm = m[i]; 614 for (j = 0; j < eltspersegment; ++j) { 615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq); 616 } 617 } 618 clear_tail(d, opr_sz, simd_maxsz(desc)); 619 } 620 621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 622 void *va, uint32_t desc) 623 { 624 intptr_t i, opr_sz = simd_oprsz(desc); 625 int32_t *d = vd, *n = vn, *m = vm, *a = va; 626 uint32_t discard; 627 628 for (i = 0; i < opr_sz / 4; ++i) { 629 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 630 } 631 } 632 633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 634 void *va, uint32_t desc) 635 { 636 intptr_t i, opr_sz = simd_oprsz(desc); 637 int32_t *d = vd, *n = vn, *m = vm, *a = va; 638 uint32_t discard; 639 640 for (i = 0; i < opr_sz / 4; ++i) { 641 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 642 } 643 } 644 645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 646 { 647 intptr_t i, opr_sz = simd_oprsz(desc); 648 int32_t *d = vd, *n = vn, *m = vm; 649 uint32_t discard; 650 651 for (i = 0; i < opr_sz / 4; ++i) { 652 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 653 } 654 } 655 656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 657 { 658 intptr_t i, opr_sz = simd_oprsz(desc); 659 int32_t *d = vd, *n = vn, *m = vm; 660 uint32_t discard; 661 662 for (i = 0; i < opr_sz / 4; ++i) { 663 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 664 } 665 } 666 667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 668 { 669 intptr_t i, j, opr_sz = simd_oprsz(desc); 670 int idx = simd_data(desc); 671 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 672 uint32_t discard; 673 674 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 675 int32_t mm = m[i]; 676 for (j = 0; j < 16 / 4; ++j) { 677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 678 } 679 } 680 } 681 682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 683 { 684 intptr_t i, j, opr_sz = simd_oprsz(desc); 685 int idx = simd_data(desc); 686 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 687 uint32_t discard; 688 689 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 690 int32_t mm = m[i]; 691 for (j = 0; j < 16 / 4; ++j) { 692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 693 } 694 } 695 } 696 697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 698 static int64_t do_sat128_d(Int128 r) 699 { 700 int64_t ls = int128_getlo(r); 701 int64_t hs = int128_gethi(r); 702 703 if (unlikely(hs != (ls >> 63))) { 704 return hs < 0 ? 
INT64_MIN : INT64_MAX; 705 } 706 return ls; 707 } 708 709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 710 { 711 uint64_t l, h; 712 Int128 r, t; 713 714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 715 muls64(&l, &h, m, n); 716 r = int128_make128(l, h); 717 if (neg) { 718 r = int128_neg(r); 719 } 720 if (a) { 721 t = int128_exts64(a); 722 t = int128_lshift(t, 63); 723 r = int128_add(r, t); 724 } 725 if (round) { 726 t = int128_exts64(1ll << 62); 727 r = int128_add(r, t); 728 } 729 r = int128_rshift(r, 63); 730 731 return do_sat128_d(r); 732 } 733 734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 735 void *va, uint32_t desc) 736 { 737 intptr_t i, opr_sz = simd_oprsz(desc); 738 int64_t *d = vd, *n = vn, *m = vm, *a = va; 739 740 for (i = 0; i < opr_sz / 8; ++i) { 741 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 742 } 743 } 744 745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 746 void *va, uint32_t desc) 747 { 748 intptr_t i, opr_sz = simd_oprsz(desc); 749 int64_t *d = vd, *n = vn, *m = vm, *a = va; 750 751 for (i = 0; i < opr_sz / 8; ++i) { 752 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 753 } 754 } 755 756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 757 { 758 intptr_t i, opr_sz = simd_oprsz(desc); 759 int64_t *d = vd, *n = vn, *m = vm; 760 761 for (i = 0; i < opr_sz / 8; ++i) { 762 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 763 } 764 } 765 766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 767 { 768 intptr_t i, opr_sz = simd_oprsz(desc); 769 int64_t *d = vd, *n = vn, *m = vm; 770 771 for (i = 0; i < opr_sz / 8; ++i) { 772 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 773 } 774 } 775 776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 777 { 778 intptr_t i, j, opr_sz = simd_oprsz(desc); 779 int idx = simd_data(desc); 780 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 781 782 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 783 int64_t mm = m[i]; 784 for (j = 0; j < 16 / 8; ++j) { 785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 786 } 787 } 788 } 789 790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 791 { 792 intptr_t i, j, opr_sz = simd_oprsz(desc); 793 int idx = simd_data(desc); 794 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 795 796 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 797 int64_t mm = m[i]; 798 for (j = 0; j < 16 / 8; ++j) { 799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 800 } 801 } 802 } 803 804 /* Integer 8 and 16-bit dot-product. 805 * 806 * Note that for the loops herein, host endianness does not matter 807 * with respect to the ordering of data within the quad-width lanes. 808 * All elements are treated equally, no matter where they are. 
809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 874 875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 876 float_status *fpst, uint32_t desc) 877 { 878 uintptr_t opr_sz = simd_oprsz(desc); 879 float16 *d = vd; 880 float16 *n = vn; 881 float16 *m = vm; 882 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 883 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); 884 uintptr_t i; 885 886 for (i = 0; i < opr_sz / 2; i += 2) { 887 float16 e0 = n[H2(i)]; 888 float16 e1 = m[H2(i + 1)]; 889 float16 e2 = n[H2(i + 1)]; 890 float16 e3 = m[H2(i)]; 891 892 if (rot) { 893 e3 = float16_maybe_ah_chs(e3, fpcr_ah); 894 } else { 895 e1 = float16_maybe_ah_chs(e1, fpcr_ah); 896 } 897 898 d[H2(i)] = float16_add(e0, e1, fpst); 899 d[H2(i + 1)] = float16_add(e2, e3, fpst); 900 } 901 clear_tail(d, opr_sz, simd_maxsz(desc)); 902 } 903 904 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 905 float_status *fpst, uint32_t desc) 906 { 907 uintptr_t opr_sz = simd_oprsz(desc); 908 float32 *d = vd; 909 float32 *n = vn; 910 float32 *m = vm; 911 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 912 bool fpcr_ah = extract64(desc, 
SIMD_DATA_SHIFT + 1, 1); 913 uintptr_t i; 914 915 for (i = 0; i < opr_sz / 4; i += 2) { 916 float32 e0 = n[H4(i)]; 917 float32 e1 = m[H4(i + 1)]; 918 float32 e2 = n[H4(i + 1)]; 919 float32 e3 = m[H4(i)]; 920 921 if (rot) { 922 e3 = float32_maybe_ah_chs(e3, fpcr_ah); 923 } else { 924 e1 = float32_maybe_ah_chs(e1, fpcr_ah); 925 } 926 927 d[H4(i)] = float32_add(e0, e1, fpst); 928 d[H4(i + 1)] = float32_add(e2, e3, fpst); 929 } 930 clear_tail(d, opr_sz, simd_maxsz(desc)); 931 } 932 933 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 934 float_status *fpst, uint32_t desc) 935 { 936 uintptr_t opr_sz = simd_oprsz(desc); 937 float64 *d = vd; 938 float64 *n = vn; 939 float64 *m = vm; 940 bool rot = extract32(desc, SIMD_DATA_SHIFT, 1); 941 bool fpcr_ah = extract64(desc, SIMD_DATA_SHIFT + 1, 1); 942 uintptr_t i; 943 944 for (i = 0; i < opr_sz / 8; i += 2) { 945 float64 e0 = n[i]; 946 float64 e1 = m[i + 1]; 947 float64 e2 = n[i + 1]; 948 float64 e3 = m[i]; 949 950 if (rot) { 951 e3 = float64_maybe_ah_chs(e3, fpcr_ah); 952 } else { 953 e1 = float64_maybe_ah_chs(e1, fpcr_ah); 954 } 955 956 d[i] = float64_add(e0, e1, fpst); 957 d[i + 1] = float64_add(e2, e3, fpst); 958 } 959 clear_tail(d, opr_sz, simd_maxsz(desc)); 960 } 961 962 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 963 float_status *fpst, uint32_t desc) 964 { 965 uintptr_t opr_sz = simd_oprsz(desc); 966 float16 *d = vd, *n = vn, *m = vm, *a = va; 967 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 968 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 969 uint32_t neg_real = flip ^ neg_imag; 970 uintptr_t i; 971 972 /* Shift boolean to the sign bit so we can xor to negate. */ 973 neg_real <<= 15; 974 neg_imag <<= 15; 975 976 for (i = 0; i < opr_sz / 2; i += 2) { 977 float16 e2 = n[H2(i + flip)]; 978 float16 e1 = m[H2(i + flip)] ^ neg_real; 979 float16 e4 = e2; 980 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 981 982 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 983 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 984 } 985 clear_tail(d, opr_sz, simd_maxsz(desc)); 986 } 987 988 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 989 float_status *fpst, uint32_t desc) 990 { 991 uintptr_t opr_sz = simd_oprsz(desc); 992 float16 *d = vd, *n = vn, *m = vm, *a = va; 993 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 994 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 995 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 996 uint32_t neg_real = flip ^ neg_imag; 997 intptr_t elements = opr_sz / sizeof(float16); 998 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); 999 intptr_t i, j; 1000 1001 /* Shift boolean to the sign bit so we can xor to negate. */ 1002 neg_real <<= 15; 1003 neg_imag <<= 15; 1004 1005 for (i = 0; i < elements; i += eltspersegment) { 1006 float16 mr = m[H2(i + 2 * index + 0)]; 1007 float16 mi = m[H2(i + 2 * index + 1)]; 1008 float16 e1 = neg_real ^ (flip ? mi : mr); 1009 float16 e3 = neg_imag ^ (flip ? 
mr : mi); 1010 1011 for (j = i; j < i + eltspersegment; j += 2) { 1012 float16 e2 = n[H2(j + flip)]; 1013 float16 e4 = e2; 1014 1015 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 1016 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 1017 } 1018 } 1019 clear_tail(d, opr_sz, simd_maxsz(desc)); 1020 } 1021 1022 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 1023 float_status *fpst, uint32_t desc) 1024 { 1025 uintptr_t opr_sz = simd_oprsz(desc); 1026 float32 *d = vd, *n = vn, *m = vm, *a = va; 1027 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1028 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1029 uint32_t neg_real = flip ^ neg_imag; 1030 uintptr_t i; 1031 1032 /* Shift boolean to the sign bit so we can xor to negate. */ 1033 neg_real <<= 31; 1034 neg_imag <<= 31; 1035 1036 for (i = 0; i < opr_sz / 4; i += 2) { 1037 float32 e2 = n[H4(i + flip)]; 1038 float32 e1 = m[H4(i + flip)] ^ neg_real; 1039 float32 e4 = e2; 1040 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 1041 1042 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 1043 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 1044 } 1045 clear_tail(d, opr_sz, simd_maxsz(desc)); 1046 } 1047 1048 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 1049 float_status *fpst, uint32_t desc) 1050 { 1051 uintptr_t opr_sz = simd_oprsz(desc); 1052 float32 *d = vd, *n = vn, *m = vm, *a = va; 1053 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1054 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1055 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1056 uint32_t neg_real = flip ^ neg_imag; 1057 intptr_t elements = opr_sz / sizeof(float32); 1058 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); 1059 intptr_t i, j; 1060 1061 /* Shift boolean to the sign bit so we can xor to negate. */ 1062 neg_real <<= 31; 1063 neg_imag <<= 31; 1064 1065 for (i = 0; i < elements; i += eltspersegment) { 1066 float32 mr = m[H4(i + 2 * index + 0)]; 1067 float32 mi = m[H4(i + 2 * index + 1)]; 1068 float32 e1 = neg_real ^ (flip ? mi : mr); 1069 float32 e3 = neg_imag ^ (flip ? mr : mi); 1070 1071 for (j = i; j < i + eltspersegment; j += 2) { 1072 float32 e2 = n[H4(j + flip)]; 1073 float32 e4 = e2; 1074 1075 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 1076 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 1077 } 1078 } 1079 clear_tail(d, opr_sz, simd_maxsz(desc)); 1080 } 1081 1082 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 1083 float_status *fpst, uint32_t desc) 1084 { 1085 uintptr_t opr_sz = simd_oprsz(desc); 1086 float64 *d = vd, *n = vn, *m = vm, *a = va; 1087 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1088 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1089 uint64_t neg_real = flip ^ neg_imag; 1090 uintptr_t i; 1091 1092 /* Shift boolean to the sign bit so we can xor to negate. */ 1093 neg_real <<= 63; 1094 neg_imag <<= 63; 1095 1096 for (i = 0; i < opr_sz / 8; i += 2) { 1097 float64 e2 = n[i + flip]; 1098 float64 e1 = m[i + flip] ^ neg_real; 1099 float64 e4 = e2; 1100 float64 e3 = m[i + 1 - flip] ^ neg_imag; 1101 1102 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 1103 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 1104 } 1105 clear_tail(d, opr_sz, simd_maxsz(desc)); 1106 } 1107 1108 /* 1109 * Floating point comparisons producing an integer result (all 1s or all 0s). 1110 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 
1111 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1112 */ 1113 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 1114 { 1115 return -float16_eq_quiet(op1, op2, stat); 1116 } 1117 1118 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 1119 { 1120 return -float32_eq_quiet(op1, op2, stat); 1121 } 1122 1123 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat) 1124 { 1125 return -float64_eq_quiet(op1, op2, stat); 1126 } 1127 1128 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 1129 { 1130 return -float16_le(op2, op1, stat); 1131 } 1132 1133 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 1134 { 1135 return -float32_le(op2, op1, stat); 1136 } 1137 1138 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat) 1139 { 1140 return -float64_le(op2, op1, stat); 1141 } 1142 1143 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 1144 { 1145 return -float16_lt(op2, op1, stat); 1146 } 1147 1148 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 1149 { 1150 return -float32_lt(op2, op1, stat); 1151 } 1152 1153 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat) 1154 { 1155 return -float64_lt(op2, op1, stat); 1156 } 1157 1158 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 1159 { 1160 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 1161 } 1162 1163 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1164 { 1165 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1166 } 1167 1168 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat) 1169 { 1170 return -float64_le(float64_abs(op2), float64_abs(op1), stat); 1171 } 1172 1173 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1174 { 1175 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1176 } 1177 1178 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1179 { 1180 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1181 } 1182 1183 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat) 1184 { 1185 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); 1186 } 1187 1188 static int16_t vfp_tosszh(float16 x, float_status *fpst) 1189 { 1190 if (float16_is_any_nan(x)) { 1191 float_raise(float_flag_invalid, fpst); 1192 return 0; 1193 } 1194 return float16_to_int16_round_to_zero(x, fpst); 1195 } 1196 1197 static uint16_t vfp_touszh(float16 x, float_status *fpst) 1198 { 1199 if (float16_is_any_nan(x)) { 1200 float_raise(float_flag_invalid, fpst); 1201 return 0; 1202 } 1203 return float16_to_uint16_round_to_zero(x, fpst); 1204 } 1205 1206 #define DO_2OP(NAME, FUNC, TYPE) \ 1207 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 1208 { \ 1209 intptr_t i, oprsz = simd_oprsz(desc); \ 1210 TYPE *d = vd, *n = vn; \ 1211 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1212 d[i] = FUNC(n[i], stat); \ 1213 } \ 1214 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1215 } 1216 1217 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1218 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1219 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1220 1221 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1222 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1223 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1224 1225 DO_2OP(gvec_vrintx_h, 
float16_round_to_int, float16) 1226 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1227 1228 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1229 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1230 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1231 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1232 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1233 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1234 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1235 DO_2OP(gvec_touszh, vfp_touszh, float16) 1236 1237 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1238 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1239 { \ 1240 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1241 } 1242 1243 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1244 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1245 { \ 1246 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1247 } 1248 1249 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1250 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1251 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1252 WRAP_CMP0_##DIRN(FN, CMPOP, float64) \ 1253 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1254 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \ 1255 DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64) 1256 1257 DO_2OP_CMP0(cgt, cgt, FWD) 1258 DO_2OP_CMP0(cge, cge, FWD) 1259 DO_2OP_CMP0(ceq, ceq, FWD) 1260 DO_2OP_CMP0(clt, cgt, REV) 1261 DO_2OP_CMP0(cle, cge, REV) 1262 1263 #undef DO_2OP 1264 #undef DO_2OP_CMP0 1265 1266 /* Floating-point trigonometric starting value. 1267 * See the ARM ARM pseudocode function FPTrigSMul. 1268 */ 1269 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1270 { 1271 float16 result = float16_mul(op1, op1, stat); 1272 if (!float16_is_any_nan(result)) { 1273 result = float16_set_sign(result, op2 & 1); 1274 } 1275 return result; 1276 } 1277 1278 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1279 { 1280 float32 result = float32_mul(op1, op1, stat); 1281 if (!float32_is_any_nan(result)) { 1282 result = float32_set_sign(result, op2 & 1); 1283 } 1284 return result; 1285 } 1286 1287 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1288 { 1289 float64 result = float64_mul(op1, op1, stat); 1290 if (!float64_is_any_nan(result)) { 1291 result = float64_set_sign(result, op2 & 1); 1292 } 1293 return result; 1294 } 1295 1296 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1297 { 1298 return float16_abs(float16_sub(op1, op2, stat)); 1299 } 1300 1301 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1302 { 1303 return float32_abs(float32_sub(op1, op2, stat)); 1304 } 1305 1306 static float64 float64_abd(float64 op1, float64 op2, float_status *stat) 1307 { 1308 return float64_abs(float64_sub(op1, op2, stat)); 1309 } 1310 1311 /* ABD when FPCR.AH = 1: avoid flipping sign bit of a NaN result */ 1312 static float16 float16_ah_abd(float16 op1, float16 op2, float_status *stat) 1313 { 1314 float16 r = float16_sub(op1, op2, stat); 1315 return float16_is_any_nan(r) ? r : float16_abs(r); 1316 } 1317 1318 static float32 float32_ah_abd(float32 op1, float32 op2, float_status *stat) 1319 { 1320 float32 r = float32_sub(op1, op2, stat); 1321 return float32_is_any_nan(r) ? r : float32_abs(r); 1322 } 1323 1324 static float64 float64_ah_abd(float64 op1, float64 op2, float_status *stat) 1325 { 1326 float64 r = float64_sub(op1, op2, stat); 1327 return float64_is_any_nan(r) ? r : float64_abs(r); 1328 } 1329 1330 /* 1331 * Reciprocal step. 
These are the AArch32 version which uses a 1332 * non-fused multiply-and-subtract. 1333 */ 1334 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1335 { 1336 op1 = float16_squash_input_denormal(op1, stat); 1337 op2 = float16_squash_input_denormal(op2, stat); 1338 1339 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1340 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1341 return float16_two; 1342 } 1343 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1344 } 1345 1346 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1347 { 1348 op1 = float32_squash_input_denormal(op1, stat); 1349 op2 = float32_squash_input_denormal(op2, stat); 1350 1351 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1352 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1353 return float32_two; 1354 } 1355 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1356 } 1357 1358 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1359 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1360 { 1361 op1 = float16_squash_input_denormal(op1, stat); 1362 op2 = float16_squash_input_denormal(op2, stat); 1363 1364 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1365 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1366 return float16_one_point_five; 1367 } 1368 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1369 return float16_div(op1, float16_two, stat); 1370 } 1371 1372 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1373 { 1374 op1 = float32_squash_input_denormal(op1, stat); 1375 op2 = float32_squash_input_denormal(op2, stat); 1376 1377 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1378 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1379 return float32_one_point_five; 1380 } 1381 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1382 return float32_div(op1, float32_two, stat); 1383 } 1384 1385 #define DO_3OP(NAME, FUNC, TYPE) \ 1386 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1387 float_status *stat, uint32_t desc) \ 1388 { \ 1389 intptr_t i, oprsz = simd_oprsz(desc); \ 1390 TYPE *d = vd, *n = vn, *m = vm; \ 1391 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1392 d[i] = FUNC(n[i], m[i], stat); \ 1393 } \ 1394 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1395 } 1396 1397 DO_3OP(gvec_fadd_h, float16_add, float16) 1398 DO_3OP(gvec_fadd_s, float32_add, float32) 1399 DO_3OP(gvec_fadd_d, float64_add, float64) 1400 1401 DO_3OP(gvec_fsub_h, float16_sub, float16) 1402 DO_3OP(gvec_fsub_s, float32_sub, float32) 1403 DO_3OP(gvec_fsub_d, float64_sub, float64) 1404 1405 DO_3OP(gvec_fmul_h, float16_mul, float16) 1406 DO_3OP(gvec_fmul_s, float32_mul, float32) 1407 DO_3OP(gvec_fmul_d, float64_mul, float64) 1408 1409 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1410 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1411 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1412 1413 DO_3OP(gvec_fabd_h, float16_abd, float16) 1414 DO_3OP(gvec_fabd_s, float32_abd, float32) 1415 DO_3OP(gvec_fabd_d, float64_abd, float64) 1416 1417 DO_3OP(gvec_ah_fabd_h, float16_ah_abd, float16) 1418 DO_3OP(gvec_ah_fabd_s, float32_ah_abd, float32) 1419 DO_3OP(gvec_ah_fabd_d, float64_ah_abd, float64) 1420 1421 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1422 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1423 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1424 1425 DO_3OP(gvec_fcge_h, float16_cge, float16) 1426 
DO_3OP(gvec_fcge_s, float32_cge, float32) 1427 DO_3OP(gvec_fcge_d, float64_cge, float64) 1428 1429 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1430 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1431 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1432 1433 DO_3OP(gvec_facge_h, float16_acge, float16) 1434 DO_3OP(gvec_facge_s, float32_acge, float32) 1435 DO_3OP(gvec_facge_d, float64_acge, float64) 1436 1437 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1438 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1439 DO_3OP(gvec_facgt_d, float64_acgt, float64) 1440 1441 DO_3OP(gvec_fmax_h, float16_max, float16) 1442 DO_3OP(gvec_fmax_s, float32_max, float32) 1443 DO_3OP(gvec_fmax_d, float64_max, float64) 1444 1445 DO_3OP(gvec_fmin_h, float16_min, float16) 1446 DO_3OP(gvec_fmin_s, float32_min, float32) 1447 DO_3OP(gvec_fmin_d, float64_min, float64) 1448 1449 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1450 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1451 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64) 1452 1453 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1454 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1455 DO_3OP(gvec_fminnum_d, float64_minnum, float64) 1456 1457 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1458 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1459 1460 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1461 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1462 1463 #ifdef TARGET_AARCH64 1464 DO_3OP(gvec_fdiv_h, float16_div, float16) 1465 DO_3OP(gvec_fdiv_s, float32_div, float32) 1466 DO_3OP(gvec_fdiv_d, float64_div, float64) 1467 1468 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1469 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1470 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1471 1472 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1473 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1474 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1475 1476 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1477 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1478 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1479 1480 DO_3OP(gvec_ah_recps_h, helper_recpsf_ah_f16, float16) 1481 DO_3OP(gvec_ah_recps_s, helper_recpsf_ah_f32, float32) 1482 DO_3OP(gvec_ah_recps_d, helper_recpsf_ah_f64, float64) 1483 1484 DO_3OP(gvec_ah_rsqrts_h, helper_rsqrtsf_ah_f16, float16) 1485 DO_3OP(gvec_ah_rsqrts_s, helper_rsqrtsf_ah_f32, float32) 1486 DO_3OP(gvec_ah_rsqrts_d, helper_rsqrtsf_ah_f64, float64) 1487 1488 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16) 1489 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32) 1490 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64) 1491 1492 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16) 1493 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32) 1494 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64) 1495 1496 #endif 1497 #undef DO_3OP 1498 1499 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1500 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1501 float_status *stat) 1502 { 1503 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1504 } 1505 1506 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1507 float_status *stat) 1508 { 1509 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1510 } 1511 1512 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1513 float_status *stat) 1514 { 1515 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1516 } 1517 1518 static float32 float32_mulsub_nf(float32 dest, float32 op1, 
float32 op2, 1519 float_status *stat) 1520 { 1521 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1522 } 1523 1524 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1525 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1526 float_status *stat) 1527 { 1528 return float16_muladd(op1, op2, dest, 0, stat); 1529 } 1530 1531 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1532 float_status *stat) 1533 { 1534 return float32_muladd(op1, op2, dest, 0, stat); 1535 } 1536 1537 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2, 1538 float_status *stat) 1539 { 1540 return float64_muladd(op1, op2, dest, 0, stat); 1541 } 1542 1543 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1544 float_status *stat) 1545 { 1546 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1547 } 1548 1549 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1550 float_status *stat) 1551 { 1552 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1553 } 1554 1555 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, 1556 float_status *stat) 1557 { 1558 return float64_muladd(float64_chs(op1), op2, dest, 0, stat); 1559 } 1560 1561 #define DO_MULADD(NAME, FUNC, TYPE) \ 1562 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1563 float_status *stat, uint32_t desc) \ 1564 { \ 1565 intptr_t i, oprsz = simd_oprsz(desc); \ 1566 TYPE *d = vd, *n = vn, *m = vm; \ 1567 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1568 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1569 } \ 1570 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1571 } 1572 1573 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1574 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1575 1576 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1577 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1578 1579 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1580 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1581 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1582 1583 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1584 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1585 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1586 1587 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1588 * For AdvSIMD, there is of course only one such vector segment. 
1589 */ 1590 1591 #define DO_MUL_IDX(NAME, TYPE, H) \ 1592 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1593 { \ 1594 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1595 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1596 intptr_t idx = simd_data(desc); \ 1597 TYPE *d = vd, *n = vn, *m = vm; \ 1598 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1599 TYPE mm = m[H(i + idx)]; \ 1600 for (j = 0; j < segment; j++) { \ 1601 d[i + j] = n[i + j] * mm; \ 1602 } \ 1603 } \ 1604 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1605 } 1606 1607 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1608 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1609 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1610 1611 #undef DO_MUL_IDX 1612 1613 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1614 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1615 { \ 1616 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1617 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1618 intptr_t idx = simd_data(desc); \ 1619 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1620 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1621 TYPE mm = m[H(i + idx)]; \ 1622 for (j = 0; j < segment; j++) { \ 1623 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1624 } \ 1625 } \ 1626 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1627 } 1628 1629 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1630 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1631 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1632 1633 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1634 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1635 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1636 1637 #undef DO_MLA_IDX 1638 1639 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1640 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1641 float_status *stat, uint32_t desc) \ 1642 { \ 1643 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1644 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1645 intptr_t idx = simd_data(desc); \ 1646 TYPE *d = vd, *n = vn, *m = vm; \ 1647 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1648 TYPE mm = m[H(i + idx)]; \ 1649 for (j = 0; j < segment; j++) { \ 1650 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1651 } \ 1652 } \ 1653 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1654 } 1655 1656 #define nop(N, M, S) (M) 1657 1658 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1659 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1660 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1661 1662 #ifdef TARGET_AARCH64 1663 1664 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1665 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1666 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1667 1668 #endif 1669 1670 #undef nop 1671 1672 /* 1673 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1674 * the fused ops below they assume accumulate both from and into Vd. 
1675 */ 1676 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1677 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1678 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1679 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1680 1681 #undef DO_FMUL_IDX 1682 1683 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1684 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1685 float_status *stat, uint32_t desc) \ 1686 { \ 1687 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1688 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1689 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1690 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1691 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1692 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1693 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1694 TYPE mm = m[H(i + idx)]; \ 1695 for (j = 0; j < segment; j++) { \ 1696 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1697 mm, a[i + j], 0, stat); \ 1698 } \ 1699 } \ 1700 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1701 } 1702 1703 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1704 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1705 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1706 1707 #undef DO_FMLA_IDX 1708 1709 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1710 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1711 { \ 1712 intptr_t i, oprsz = simd_oprsz(desc); \ 1713 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1714 bool q = false; \ 1715 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1716 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1717 if (dd < MIN) { \ 1718 dd = MIN; \ 1719 q = true; \ 1720 } else if (dd > MAX) { \ 1721 dd = MAX; \ 1722 q = true; \ 1723 } \ 1724 d[i] = dd; \ 1725 } \ 1726 if (q) { \ 1727 uint32_t *qc = vq; \ 1728 qc[0] = 1; \ 1729 } \ 1730 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1731 } 1732 1733 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1734 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1735 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1736 1737 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1738 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1739 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1740 1741 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1742 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1743 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1744 1745 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1746 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1747 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1748 1749 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1750 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1751 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1752 1753 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1754 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1755 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1756 1757 #undef DO_SAT 1758 1759 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1760 void *vm, uint32_t desc) 1761 { 1762 intptr_t i, oprsz = simd_oprsz(desc); 1763 uint64_t *d = vd, *n = vn, *m = vm; 1764 bool q = false; 1765 1766 for (i = 0; i < oprsz / 8; i++) { 1767 uint64_t nn = n[i], mm = m[i], 
dd = nn + mm; 1768 if (dd < nn) { 1769 dd = UINT64_MAX; 1770 q = true; 1771 } 1772 d[i] = dd; 1773 } 1774 if (q) { 1775 uint32_t *qc = vq; 1776 qc[0] = 1; 1777 } 1778 clear_tail(d, oprsz, simd_maxsz(desc)); 1779 } 1780 1781 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1782 void *vm, uint32_t desc) 1783 { 1784 intptr_t i, oprsz = simd_oprsz(desc); 1785 uint64_t *d = vd, *n = vn, *m = vm; 1786 bool q = false; 1787 1788 for (i = 0; i < oprsz / 8; i++) { 1789 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1790 if (nn < mm) { 1791 dd = 0; 1792 q = true; 1793 } 1794 d[i] = dd; 1795 } 1796 if (q) { 1797 uint32_t *qc = vq; 1798 qc[0] = 1; 1799 } 1800 clear_tail(d, oprsz, simd_maxsz(desc)); 1801 } 1802 1803 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1804 void *vm, uint32_t desc) 1805 { 1806 intptr_t i, oprsz = simd_oprsz(desc); 1807 int64_t *d = vd, *n = vn, *m = vm; 1808 bool q = false; 1809 1810 for (i = 0; i < oprsz / 8; i++) { 1811 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1812 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1813 dd = (nn >> 63) ^ ~INT64_MIN; 1814 q = true; 1815 } 1816 d[i] = dd; 1817 } 1818 if (q) { 1819 uint32_t *qc = vq; 1820 qc[0] = 1; 1821 } 1822 clear_tail(d, oprsz, simd_maxsz(desc)); 1823 } 1824 1825 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1826 void *vm, uint32_t desc) 1827 { 1828 intptr_t i, oprsz = simd_oprsz(desc); 1829 int64_t *d = vd, *n = vn, *m = vm; 1830 bool q = false; 1831 1832 for (i = 0; i < oprsz / 8; i++) { 1833 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1834 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1835 dd = (nn >> 63) ^ ~INT64_MIN; 1836 q = true; 1837 } 1838 d[i] = dd; 1839 } 1840 if (q) { 1841 uint32_t *qc = vq; 1842 qc[0] = 1; 1843 } 1844 clear_tail(d, oprsz, simd_maxsz(desc)); 1845 } 1846 1847 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1848 void *vm, uint32_t desc) 1849 { 1850 intptr_t i, oprsz = simd_oprsz(desc); 1851 uint64_t *d = vd, *n = vn, *m = vm; 1852 bool q = false; 1853 1854 for (i = 0; i < oprsz / 8; i++) { 1855 uint64_t nn = n[i]; 1856 int64_t mm = m[i]; 1857 uint64_t dd = nn + mm; 1858 1859 if (mm < 0) { 1860 if (nn < (uint64_t)-mm) { 1861 dd = 0; 1862 q = true; 1863 } 1864 } else { 1865 if (dd < nn) { 1866 dd = UINT64_MAX; 1867 q = true; 1868 } 1869 } 1870 d[i] = dd; 1871 } 1872 if (q) { 1873 uint32_t *qc = vq; 1874 qc[0] = 1; 1875 } 1876 clear_tail(d, oprsz, simd_maxsz(desc)); 1877 } 1878 1879 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1880 void *vm, uint32_t desc) 1881 { 1882 intptr_t i, oprsz = simd_oprsz(desc); 1883 uint64_t *d = vd, *n = vn, *m = vm; 1884 bool q = false; 1885 1886 for (i = 0; i < oprsz / 8; i++) { 1887 int64_t nn = n[i]; 1888 uint64_t mm = m[i]; 1889 int64_t dd = nn + mm; 1890 1891 if (mm > (uint64_t)(INT64_MAX - nn)) { 1892 dd = INT64_MAX; 1893 q = true; 1894 } 1895 d[i] = dd; 1896 } 1897 if (q) { 1898 uint32_t *qc = vq; 1899 qc[0] = 1; 1900 } 1901 clear_tail(d, oprsz, simd_maxsz(desc)); 1902 } 1903 1904 #define DO_SRA(NAME, TYPE) \ 1905 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1906 { \ 1907 intptr_t i, oprsz = simd_oprsz(desc); \ 1908 int shift = simd_data(desc); \ 1909 TYPE *d = vd, *n = vn; \ 1910 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1911 d[i] += n[i] >> shift; \ 1912 } \ 1913 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1914 } 1915 1916 DO_SRA(gvec_ssra_b, int8_t) 1917 DO_SRA(gvec_ssra_h, int16_t) 1918 DO_SRA(gvec_ssra_s, int32_t) 1919 DO_SRA(gvec_ssra_d, int64_t) 1920 1921 DO_SRA(gvec_usra_b, uint8_t) 1922 
DO_SRA(gvec_usra_h, uint16_t) 1923 DO_SRA(gvec_usra_s, uint32_t) 1924 DO_SRA(gvec_usra_d, uint64_t) 1925 1926 #undef DO_SRA 1927 1928 #define DO_RSHR(NAME, TYPE) \ 1929 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1930 { \ 1931 intptr_t i, oprsz = simd_oprsz(desc); \ 1932 int shift = simd_data(desc); \ 1933 TYPE *d = vd, *n = vn; \ 1934 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1935 TYPE tmp = n[i] >> (shift - 1); \ 1936 d[i] = (tmp >> 1) + (tmp & 1); \ 1937 } \ 1938 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1939 } 1940 1941 DO_RSHR(gvec_srshr_b, int8_t) 1942 DO_RSHR(gvec_srshr_h, int16_t) 1943 DO_RSHR(gvec_srshr_s, int32_t) 1944 DO_RSHR(gvec_srshr_d, int64_t) 1945 1946 DO_RSHR(gvec_urshr_b, uint8_t) 1947 DO_RSHR(gvec_urshr_h, uint16_t) 1948 DO_RSHR(gvec_urshr_s, uint32_t) 1949 DO_RSHR(gvec_urshr_d, uint64_t) 1950 1951 #undef DO_RSHR 1952 1953 #define DO_RSRA(NAME, TYPE) \ 1954 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1955 { \ 1956 intptr_t i, oprsz = simd_oprsz(desc); \ 1957 int shift = simd_data(desc); \ 1958 TYPE *d = vd, *n = vn; \ 1959 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1960 TYPE tmp = n[i] >> (shift - 1); \ 1961 d[i] += (tmp >> 1) + (tmp & 1); \ 1962 } \ 1963 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1964 } 1965 1966 DO_RSRA(gvec_srsra_b, int8_t) 1967 DO_RSRA(gvec_srsra_h, int16_t) 1968 DO_RSRA(gvec_srsra_s, int32_t) 1969 DO_RSRA(gvec_srsra_d, int64_t) 1970 1971 DO_RSRA(gvec_ursra_b, uint8_t) 1972 DO_RSRA(gvec_ursra_h, uint16_t) 1973 DO_RSRA(gvec_ursra_s, uint32_t) 1974 DO_RSRA(gvec_ursra_d, uint64_t) 1975 1976 #undef DO_RSRA 1977 1978 #define DO_SRI(NAME, TYPE) \ 1979 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1980 { \ 1981 intptr_t i, oprsz = simd_oprsz(desc); \ 1982 int shift = simd_data(desc); \ 1983 TYPE *d = vd, *n = vn; \ 1984 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1985 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1986 } \ 1987 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1988 } 1989 1990 DO_SRI(gvec_sri_b, uint8_t) 1991 DO_SRI(gvec_sri_h, uint16_t) 1992 DO_SRI(gvec_sri_s, uint32_t) 1993 DO_SRI(gvec_sri_d, uint64_t) 1994 1995 #undef DO_SRI 1996 1997 #define DO_SLI(NAME, TYPE) \ 1998 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1999 { \ 2000 intptr_t i, oprsz = simd_oprsz(desc); \ 2001 int shift = simd_data(desc); \ 2002 TYPE *d = vd, *n = vn; \ 2003 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2004 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 2005 } \ 2006 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2007 } 2008 2009 DO_SLI(gvec_sli_b, uint8_t) 2010 DO_SLI(gvec_sli_h, uint16_t) 2011 DO_SLI(gvec_sli_s, uint32_t) 2012 DO_SLI(gvec_sli_d, uint64_t) 2013 2014 #undef DO_SLI 2015 2016 /* 2017 * Convert float16 to float32, raising no exceptions and 2018 * preserving exceptional values, including SNaN. 2019 * This is effectively an unpack+repack operation. 2020 */ 2021 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 2022 { 2023 const int f16_bias = 15; 2024 const int f32_bias = 127; 2025 uint32_t sign = extract32(f16, 15, 1); 2026 uint32_t exp = extract32(f16, 10, 5); 2027 uint32_t frac = extract32(f16, 0, 10); 2028 2029 if (exp == 0x1f) { 2030 /* Inf or NaN */ 2031 exp = 0xff; 2032 } else if (exp == 0) { 2033 /* Zero or denormal. */ 2034 if (frac != 0) { 2035 if (fz16) { 2036 frac = 0; 2037 } else { 2038 /* 2039 * Denormal; these are all normal float32. 
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32.  Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}

static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    /*
     * Branchless load of u32[0], u64[0], u32[1], or u64[1].
     * Load the 2nd qword iff is_q & is_2.
     * Shift to the 2nd dword iff !is_q & is_2.
     * For !is_q & !is_2, the upper bits of the result are garbage.
     */
    return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
}
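
/*
 * Two worked examples of the conversion above, shown only as documentation
 * (values traced by hand from the code):
 *
 *   float16_to_float32_by_bits(0x3c00, false):
 *     exp = 15, frac = 0 (normal 1.0); exp += 127 - 15 = 112, giving 127;
 *     result 0x3f800000, which is float32 1.0.
 *
 *   float16_to_float32_by_bits(0x0001, false):
 *     exp = 0, frac = 1 (smallest subnormal, 2^-24); shift = clz32(1) - 21
 *     = 10, frac becomes 0, exp = 127 - 15 - 10 + 1 = 103;
 *     result 0x33800000, which is float32 2^-24, the same value normalized.
 *
 * Similarly, load4_f16(ptr, 0, 1) evaluates to ptr[0] >> 32 (the second
 * dword of the first qword), while load4_f16(ptr, 1, 1) evaluates to ptr[1].
 */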

/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there are not yet SVE versions that might use blocking.
 */

static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
}

void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            CPUARMState *env, uint32_t desc)
{
    do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
}

void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    float_status *status = &env->vfp.fp_status_a64;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}

static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once. */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}

void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32));
}

void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                CPUARMState *env, uint32_t desc)
{
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64));
}

void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    float_status *status = &env->vfp.fp_status_a64;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64);

    for (i = 0; i < oprsz; i += 16) {
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}
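
/*
 * Layout of the SIMD_DATA field of desc as consumed by the FMLAL/FMLSL
 * helpers above (the translator is expected to pack it this way):
 *   bit 0     - 1 to negate the first operand (the FMLSL forms)
 *   bit 1     - 0 to use the low f16 elements, 1 the high elements
 *   bits 2-4  - f16 lane index, for the by-element forms only
 * For example, an indexed FMLSL2 of lane 5 would use
 * data = (5 << 2) | (1 << 1) | 1.
 */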

void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        int8_t nn = n[i];
        int8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -8 ? -mm : 7);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        int8_t mm = m[i];
        uint8_t nn = n[i];
        uint8_t res = 0;
        if (mm >= 0) {
            if (mm < 8) {
                res = nn << mm;
            }
        } else {
            if (mm > -8) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.  For example, 0b11 * 0b11 is
 * 0b101 rather than 0b1001, because the two middle partial-product bits
 * cancel under XOR.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * 64x64->128 polynomial multiply.
 * Because the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
2317 */ 2318 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2319 { 2320 intptr_t i, opr_sz = simd_oprsz(desc); 2321 intptr_t hi = simd_data(desc); 2322 uint64_t *d = vd, *n = vn, *m = vm; 2323 2324 for (i = 0; i < opr_sz / 8; i += 2) { 2325 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2326 d[i] = int128_getlo(r); 2327 d[i + 1] = int128_gethi(r); 2328 } 2329 clear_tail(d, opr_sz, simd_maxsz(desc)); 2330 } 2331 2332 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2333 { 2334 int hi = simd_data(desc); 2335 uint64_t *d = vd, *n = vn, *m = vm; 2336 uint64_t nn = n[hi], mm = m[hi]; 2337 2338 d[0] = clmul_8x4_packed(nn, mm); 2339 nn >>= 32; 2340 mm >>= 32; 2341 d[1] = clmul_8x4_packed(nn, mm); 2342 2343 clear_tail(d, 16, simd_maxsz(desc)); 2344 } 2345 2346 #ifdef TARGET_AARCH64 2347 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2348 { 2349 int shift = simd_data(desc) * 8; 2350 intptr_t i, opr_sz = simd_oprsz(desc); 2351 uint64_t *d = vd, *n = vn, *m = vm; 2352 2353 for (i = 0; i < opr_sz / 8; ++i) { 2354 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2355 } 2356 } 2357 2358 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2359 { 2360 intptr_t sel = H4(simd_data(desc)); 2361 intptr_t i, opr_sz = simd_oprsz(desc); 2362 uint32_t *n = vn, *m = vm; 2363 uint64_t *d = vd; 2364 2365 for (i = 0; i < opr_sz / 8; ++i) { 2366 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2367 } 2368 } 2369 #endif 2370 2371 #define DO_CMP0(NAME, TYPE, OP) \ 2372 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2373 { \ 2374 intptr_t i, opr_sz = simd_oprsz(desc); \ 2375 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2376 TYPE nn = *(TYPE *)(vn + i); \ 2377 *(TYPE *)(vd + i) = -(nn OP 0); \ 2378 } \ 2379 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2380 } 2381 2382 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2383 DO_CMP0(gvec_clt0_b, int8_t, <) 2384 DO_CMP0(gvec_cle0_b, int8_t, <=) 2385 DO_CMP0(gvec_cgt0_b, int8_t, >) 2386 DO_CMP0(gvec_cge0_b, int8_t, >=) 2387 2388 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2389 DO_CMP0(gvec_clt0_h, int16_t, <) 2390 DO_CMP0(gvec_cle0_h, int16_t, <=) 2391 DO_CMP0(gvec_cgt0_h, int16_t, >) 2392 DO_CMP0(gvec_cge0_h, int16_t, >=) 2393 2394 #undef DO_CMP0 2395 2396 #define DO_ABD(NAME, TYPE) \ 2397 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2398 { \ 2399 intptr_t i, opr_sz = simd_oprsz(desc); \ 2400 TYPE *d = vd, *n = vn, *m = vm; \ 2401 \ 2402 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2403 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2404 } \ 2405 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2406 } 2407 2408 DO_ABD(gvec_sabd_b, int8_t) 2409 DO_ABD(gvec_sabd_h, int16_t) 2410 DO_ABD(gvec_sabd_s, int32_t) 2411 DO_ABD(gvec_sabd_d, int64_t) 2412 2413 DO_ABD(gvec_uabd_b, uint8_t) 2414 DO_ABD(gvec_uabd_h, uint16_t) 2415 DO_ABD(gvec_uabd_s, uint32_t) 2416 DO_ABD(gvec_uabd_d, uint64_t) 2417 2418 #undef DO_ABD 2419 2420 #define DO_ABA(NAME, TYPE) \ 2421 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2422 { \ 2423 intptr_t i, opr_sz = simd_oprsz(desc); \ 2424 TYPE *d = vd, *n = vn, *m = vm; \ 2425 \ 2426 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2427 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2428 } \ 2429 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2430 } 2431 2432 DO_ABA(gvec_saba_b, int8_t) 2433 DO_ABA(gvec_saba_h, int16_t) 2434 DO_ABA(gvec_saba_s, int32_t) 2435 DO_ABA(gvec_saba_d, int64_t) 2436 2437 DO_ABA(gvec_uaba_b, uint8_t) 2438 DO_ABA(gvec_uaba_h, uint16_t) 2439 DO_ABA(gvec_uaba_s, uint32_t) 2440 DO_ABA(gvec_uaba_d, uint64_t) 2441 2442 #undef DO_ABA 2443 2444 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2445 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2446 float_status *stat, uint32_t desc) \ 2447 { \ 2448 ARMVectorReg scratch; \ 2449 intptr_t oprsz = simd_oprsz(desc); \ 2450 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2451 TYPE *d = vd, *n = vn, *m = vm; \ 2452 if (unlikely(d == m)) { \ 2453 m = memcpy(&scratch, m, oprsz); \ 2454 } \ 2455 for (intptr_t i = 0; i < half; ++i) { \ 2456 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2457 } \ 2458 for (intptr_t i = 0; i < half; ++i) { \ 2459 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2460 } \ 2461 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2462 } 2463 2464 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2465 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2466 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2467 2468 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2469 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2470 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2471 2472 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2473 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2474 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2475 2476 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2477 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2478 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2479 2480 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2481 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2482 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2483 2484 #ifdef TARGET_AARCH64 2485 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2486 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2487 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2488 2489 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2490 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2491 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2492 #endif 2493 2494 #undef DO_3OP_PAIR 2495 2496 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2497 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2498 { \ 2499 ARMVectorReg scratch; \ 2500 intptr_t oprsz = simd_oprsz(desc); \ 2501 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2502 TYPE *d = vd, *n = vn, *m = vm; \ 2503 if (unlikely(d == m)) { \ 2504 m = memcpy(&scratch, m, oprsz); \ 2505 } \ 2506 for (intptr_t i = 0; i < half; ++i) { \ 2507 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2508 } \ 2509 for (intptr_t i = 0; i < half; ++i) { \ 2510 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2511 } \ 2512 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2513 } 2514 2515 #define ADD(A, B) (A + B) 2516 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2517 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2518 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2519 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2520 #undef ADD 2521 2522 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2523 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2524 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2525 2526 DO_3OP_PAIR(gvec_umaxp_b, MAX, 
uint8_t, H1) 2527 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2528 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2529 2530 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2531 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2532 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2533 2534 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2535 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2536 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2537 2538 #undef DO_3OP_PAIR 2539 2540 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2541 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2542 { \ 2543 intptr_t i, oprsz = simd_oprsz(desc); \ 2544 int shift = simd_data(desc); \ 2545 TYPE *d = vd, *n = vn; \ 2546 float_status *fpst = stat; \ 2547 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2548 d[i] = FUNC(n[i], shift, fpst); \ 2549 } \ 2550 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2551 } 2552 2553 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2554 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2555 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2556 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2557 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2558 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2559 2560 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2561 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2562 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2563 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2564 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2565 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2566 2567 #undef DO_VCVT_FIXED 2568 2569 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2570 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2571 { \ 2572 intptr_t i, oprsz = simd_oprsz(desc); \ 2573 uint32_t rmode = simd_data(desc); \ 2574 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2575 TYPE *d = vd, *n = vn; \ 2576 set_float_rounding_mode(rmode, fpst); \ 2577 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2578 d[i] = FUNC(n[i], 0, fpst); \ 2579 } \ 2580 set_float_rounding_mode(prev_rmode, fpst); \ 2581 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2582 } 2583 2584 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2585 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2586 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2587 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2588 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2589 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2590 2591 #undef DO_VCVT_RMODE 2592 2593 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2594 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2595 { \ 2596 intptr_t i, oprsz = simd_oprsz(desc); \ 2597 uint32_t rmode = simd_data(desc); \ 2598 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2599 TYPE *d = vd, *n = vn; \ 2600 set_float_rounding_mode(rmode, fpst); \ 2601 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2602 d[i] = FUNC(n[i], fpst); \ 2603 } \ 2604 set_float_rounding_mode(prev_rmode, fpst); \ 2605 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2606 } 2607 2608 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2609 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2610 2611 #undef DO_VRINT_RMODE 2612 2613 #ifdef TARGET_AARCH64 2614 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState 
*env, uint32_t desc) 2615 { 2616 const uint8_t *indices = vm; 2617 size_t oprsz = simd_oprsz(desc); 2618 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2619 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2620 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2621 union { 2622 uint8_t b[16]; 2623 uint64_t d[2]; 2624 } result; 2625 2626 /* 2627 * We must construct the final result in a temp, lest the output 2628 * overlaps the input table. For TBL, begin with zero; for TBX, 2629 * begin with the original register contents. Note that we always 2630 * copy 16 bytes here to avoid an extra branch; clearing the high 2631 * bits of the register for oprsz == 8 is handled below. 2632 */ 2633 if (is_tbx) { 2634 memcpy(&result, vd, 16); 2635 } else { 2636 memset(&result, 0, 16); 2637 } 2638 2639 for (size_t i = 0; i < oprsz; ++i) { 2640 uint32_t index = indices[H1(i)]; 2641 2642 if (index < table_len) { 2643 /* 2644 * Convert index (a byte offset into the virtual table 2645 * which is a series of 128-bit vectors concatenated) 2646 * into the correct register element, bearing in mind 2647 * that the table can wrap around from V31 to V0. 2648 */ 2649 const uint8_t *table = (const uint8_t *) 2650 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2651 result.b[H1(i)] = table[H1(index % 16)]; 2652 } 2653 } 2654 2655 memcpy(vd, &result, 16); 2656 clear_tail(vd, oprsz, simd_maxsz(desc)); 2657 } 2658 #endif 2659 2660 /* 2661 * NxN -> N highpart multiply 2662 * 2663 * TODO: expose this as a generic vector operation. 2664 */ 2665 2666 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2667 { 2668 intptr_t i, opr_sz = simd_oprsz(desc); 2669 int8_t *d = vd, *n = vn, *m = vm; 2670 2671 for (i = 0; i < opr_sz; ++i) { 2672 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2673 } 2674 clear_tail(d, opr_sz, simd_maxsz(desc)); 2675 } 2676 2677 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2678 { 2679 intptr_t i, opr_sz = simd_oprsz(desc); 2680 int16_t *d = vd, *n = vn, *m = vm; 2681 2682 for (i = 0; i < opr_sz / 2; ++i) { 2683 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2684 } 2685 clear_tail(d, opr_sz, simd_maxsz(desc)); 2686 } 2687 2688 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2689 { 2690 intptr_t i, opr_sz = simd_oprsz(desc); 2691 int32_t *d = vd, *n = vn, *m = vm; 2692 2693 for (i = 0; i < opr_sz / 4; ++i) { 2694 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2695 } 2696 clear_tail(d, opr_sz, simd_maxsz(desc)); 2697 } 2698 2699 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2700 { 2701 intptr_t i, opr_sz = simd_oprsz(desc); 2702 uint64_t *d = vd, *n = vn, *m = vm; 2703 uint64_t discard; 2704 2705 for (i = 0; i < opr_sz / 8; ++i) { 2706 muls64(&discard, &d[i], n[i], m[i]); 2707 } 2708 clear_tail(d, opr_sz, simd_maxsz(desc)); 2709 } 2710 2711 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2712 { 2713 intptr_t i, opr_sz = simd_oprsz(desc); 2714 uint8_t *d = vd, *n = vn, *m = vm; 2715 2716 for (i = 0; i < opr_sz; ++i) { 2717 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2718 } 2719 clear_tail(d, opr_sz, simd_maxsz(desc)); 2720 } 2721 2722 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2723 { 2724 intptr_t i, opr_sz = simd_oprsz(desc); 2725 uint16_t *d = vd, *n = vn, *m = vm; 2726 2727 for (i = 0; i < opr_sz / 2; ++i) { 2728 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2729 } 2730 clear_tail(d, opr_sz, simd_maxsz(desc)); 2731 } 2732 2733 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2734 
{ 2735 intptr_t i, opr_sz = simd_oprsz(desc); 2736 uint32_t *d = vd, *n = vn, *m = vm; 2737 2738 for (i = 0; i < opr_sz / 4; ++i) { 2739 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2740 } 2741 clear_tail(d, opr_sz, simd_maxsz(desc)); 2742 } 2743 2744 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2745 { 2746 intptr_t i, opr_sz = simd_oprsz(desc); 2747 uint64_t *d = vd, *n = vn, *m = vm; 2748 uint64_t discard; 2749 2750 for (i = 0; i < opr_sz / 8; ++i) { 2751 mulu64(&discard, &d[i], n[i], m[i]); 2752 } 2753 clear_tail(d, opr_sz, simd_maxsz(desc)); 2754 } 2755 2756 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2757 { 2758 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2759 int shr = simd_data(desc); 2760 uint64_t *d = vd, *n = vn, *m = vm; 2761 2762 for (i = 0; i < opr_sz; ++i) { 2763 d[i] = ror64(n[i] ^ m[i], shr); 2764 } 2765 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2766 } 2767 2768 /* 2769 * Integer matrix-multiply accumulate 2770 */ 2771 2772 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2773 { 2774 int8_t *n = vn, *m = vm; 2775 2776 for (intptr_t k = 0; k < 8; ++k) { 2777 sum += n[H1(k)] * m[H1(k)]; 2778 } 2779 return sum; 2780 } 2781 2782 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2783 { 2784 uint8_t *n = vn, *m = vm; 2785 2786 for (intptr_t k = 0; k < 8; ++k) { 2787 sum += n[H1(k)] * m[H1(k)]; 2788 } 2789 return sum; 2790 } 2791 2792 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2793 { 2794 uint8_t *n = vn; 2795 int8_t *m = vm; 2796 2797 for (intptr_t k = 0; k < 8; ++k) { 2798 sum += n[H1(k)] * m[H1(k)]; 2799 } 2800 return sum; 2801 } 2802 2803 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2804 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2805 { 2806 intptr_t seg, opr_sz = simd_oprsz(desc); 2807 2808 for (seg = 0; seg < opr_sz; seg += 16) { 2809 uint32_t *d = vd + seg; 2810 uint32_t *a = va + seg; 2811 uint32_t sum0, sum1, sum2, sum3; 2812 2813 /* 2814 * Process the entire segment at once, writing back the 2815 * results only after we've consumed all of the inputs. 2816 * 2817 * Key to indices by column: 2818 * i j i j 2819 */ 2820 sum0 = a[H4(0 + 0)]; 2821 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2822 sum1 = a[H4(0 + 1)]; 2823 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2824 sum2 = a[H4(2 + 0)]; 2825 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2826 sum3 = a[H4(2 + 1)]; 2827 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2828 2829 d[H4(0)] = sum0; 2830 d[H4(1)] = sum1; 2831 d[H4(2)] = sum2; 2832 d[H4(3)] = sum3; 2833 } 2834 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2835 } 2836 2837 #define DO_MMLA_B(NAME, INNER) \ 2838 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2839 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2840 2841 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2842 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2843 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2844 2845 /* 2846 * BFloat16 Dot Product 2847 */ 2848 2849 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2850 { 2851 /* 2852 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2853 * For EBF = 0, we ignore the FPCR bits which determine rounding 2854 * mode and denormal-flushing, and we do unfused multiplies and 2855 * additions with intermediate rounding of all products and sums. 
     * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits,
     * and we perform a fused two-way sum-of-products without intermediate
     * rounding of the products.
     * In either case, we don't set fp exception flags.
     *
     * EBF is AArch64 only, so even if it's set in the FPCR it has
     * no effect on AArch32 instructions.
     */
    bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF;

    *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32;
    set_default_nan_mode(true, statusp);

    if (ebf) {
        /* EBF=1 needs to do a step with round-to-odd semantics */
        *oddstatusp = *statusp;
        set_float_rounding_mode(float_round_to_odd, oddstatusp);
    } else {
        set_flush_to_zero(true, statusp);
        set_flush_inputs_to_zero(true, statusp);
        set_float_rounding_mode(float_round_to_odd_inf, statusp);
    }
    return ebf;
}

float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst)
{
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.
     */
    t1 = float32_mul(e1 << 16, e2 << 16, fpst);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst);
    t1 = float32_add(t1, t2, fpst);
    t1 = float32_add(sum, t1, fpst);

    return t1;
}
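
/*
 * Concrete example for the EBF = 0 step above: with e1 = 0x3f804000 and
 * e2 = 0x3f803f80, the low halves are bfloat16 2.0 (0x4000) and 1.0
 * (0x3f80), the high halves are both 1.0, so bfdotadd(0, e1, e2)
 * accumulates 2.0 * 1.0 + 1.0 * 1.0 = 3.0.  The "<< 16" works because a
 * bfloat16 is exactly the high half of the float32 with the same sign,
 * exponent and (truncated) fraction.
 */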

float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2,
                     float_status *fpst, float_status *fpst_odd)
{
    /*
     * Compare f16_dotadd() in sme_helper.c, but here we have
     * bfloat16 inputs. In particular that means that we do not
     * want the FPCR.FZ16 flush semantics, so we use the normal
     * float_status for the input handling here.
     */
    float64 e1r = float32_to_float64(e1 << 16, fpst);
    float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst);
    float64 e2r = float32_to_float64(e2 << 16, fpst);
    float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst);
    float64 t64;
    float32 t32;

    /*
     * The ARM pseudocode function FPDot performs both multiplies
     * and the add with a single rounding operation.  Emulate this
     * by performing the first multiply in round-to-odd, then doing
     * the second multiply as fused multiply-add, and rounding to
     * float32 all in one step.
     */
    t64 = float64_mul(e1r, e2r, fpst_odd);
    t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst);

    /* This conversion is exact, because we've already rounded. */
    t32 = float64_to_float32(t64, fpst);

    /* The final accumulation step is not fused. */
    return float32_add(sum, t32, fpst);
}

void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va,
                        CPUARMState *env, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd);
        }
    } else {
        for (i = 0; i < opr_sz / 4; ++i) {
            d[i] = bfdotadd(a[i], n[i], m[i], &fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
                            void *va, CPUARMState *env, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t index = simd_data(desc);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd);
            }
        }
    } else {
        for (i = 0; i < elements; i += eltspersegment) {
            uint32_t m_idx = m[i + H4(index)];

            for (j = i; j < i + eltspersegment; j++) {
                d[j] = bfdotadd(a[j], n[j], m_idx, &fpst);
            }
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
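
/*
 * gvec_bfdot_idx above works in 128-bit segments of four 32-bit element
 * pairs, reloading the selected lane of m once per segment.  Ignoring the
 * H4() big-endian adjustment, a 256-bit SVE vector with index 2 therefore
 * uses m[2] for output elements 0-3 and m[6] for output elements 4-7.
 */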

void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va,
                         CPUARMState *env, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;
    float_status fpst, fpst_odd;

    if (is_ebf(env, &fpst, &fpst_odd)) {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *          i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd);
            sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd);
            sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    } else {
        for (s = 0; s < opr_sz / 4; s += 4) {
            float32 sum00, sum01, sum10, sum11;

            /*
             * Process the entire segment at once, writing back the
             * results only after we've consumed all of the inputs.
             *
             * Key to indices by column:
             *          i   j           i   k             j   k
             */
            sum00 = a[s + H4(0 + 0)];
            sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst);
            sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst);

            sum01 = a[s + H4(0 + 1)];
            sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst);
            sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst);

            sum10 = a[s + H4(2 + 0)];
            sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst);
            sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst);

            sum11 = a[s + H4(2 + 1)];
            sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst);
            sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst);

            d[s + H4(0 + 0)] = sum00;
            d[s + H4(0 + 1)] = sum01;
            d[s + H4(2 + 0)] = sum10;
            d[s + H4(2 + 1)] = sum11;
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
                         float_status *stat, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    intptr_t sel = simd_data(desc);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        float32 nn = n[H2(i * 2 + sel)] << 16;
        float32 mm = m[H2(i * 2 + sel)] << 16;
        d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
                             void *va, float_status *stat, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);
    float32 *d = vd, *a = va;
    bfloat16 *n = vn, *m = vm;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 m_idx = m[H2(2 * i + index)] << 16;

        for (j = i; j < i + eltspersegment; j++) {
            float32 n_j = n[H2(2 * j + sel)] << 16;
            d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

#define DO_CLAMP(NAME, TYPE) \
void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc)    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE aa = *(TYPE *)(a + i);                                     \
        TYPE nn = *(TYPE *)(n + i);                                     \
        TYPE mm = *(TYPE *)(m + i);                                     \
        TYPE dd = MIN(MAX(aa, nn), mm);                                 \
        *(TYPE *)(d + i) = dd;                                          \
    }                                                                   \
    clear_tail(d, opr_sz, simd_maxsz(desc));                            \
}

DO_CLAMP(gvec_sclamp_b, int8_t)
DO_CLAMP(gvec_sclamp_h, int16_t)
DO_CLAMP(gvec_sclamp_s, int32_t)
DO_CLAMP(gvec_sclamp_d, int64_t)

DO_CLAMP(gvec_uclamp_b, uint8_t)
DO_CLAMP(gvec_uclamp_h, uint16_t)
DO_CLAMP(gvec_uclamp_s, uint32_t)
DO_CLAMP(gvec_uclamp_d, uint64_t)
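
/*
 * Each CLAMP helper computes MIN(MAX(a, n), m) per element, i.e. it clamps
 * the a operand into the inclusive range [n, m] (assuming n <= m).  For
 * example, gvec_sclamp_b with a = -100, n = -20, m = 50 produces -20.
 */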

/* Bit count in each 8-bit word. */
void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ctpop8(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Reverse bits in each 8-bit word. */
void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = revbit64(bswap64(n[i]));
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_recpe_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_rsqrte_u32(n[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
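
/*
 * General note on the helpers above: desc is normally built by the
 * translator with simd_desc(oprsz, maxsz, data) from tcg-gvec-desc.h, so
 * simd_oprsz() is the number of bytes actually operated on and
 * simd_maxsz() is the full register size.  The trailing clear_tail()
 * calls zero the bytes between the two, implementing the requirement
 * that an AdvSIMD write clears the unused high bytes of the destination
 * vector register.
 */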