/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
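
/*
 * Worked example (illustration only, derived from the generator loop
 * above): predicate byte 0x05 has bits 0 and 2 set, so
 * expand_pred_b_data[0x05] == 0x0000000000ff00ff, i.e. bytes 0 and 2
 * of the resulting 64-bit mask are active.
 */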

/*
 * Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};

/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * Simplify:
     * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
     * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
     */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 7) + (round << 6);
    ret >>= 7;

    if (ret != (int8_t)ret) {
        ret = (ret < 0 ? INT8_MIN : INT8_MAX);
    }
    return ret;
}

void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}
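
/*
 * Worked example of the formula above (illustration only, not used by
 * the code): do_sqrdmlah_b(0x40, 0x40, 0, false, true) computes
 * (2 * 64 * 64 + 128) >> 8 = 32, while do_sqrdmlah_b(-128, -128, 0,
 * false, true) would produce +128, which does not fit in int8_t and
 * therefore saturates to INT8_MAX (127).
 */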

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int32_t ret = (int32_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int32_t)src3 << 15) + (round << 14);
    ret >>= 15;

    if (ret != (int16_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT16_MIN : INT16_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
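
/*
 * Note (illustration only) on the neon_*_idx_* helpers above and below:
 * AdvSIMD operands are 8 or 16 bytes, so there is at most one 128-bit
 * segment; the MIN() clamp on eltspersegment only matters for the
 * 8-byte (64-bit vector) case, where the single segment is half-sized.
 */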

void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    intptr_t elements = opr_sz / 2;
    intptr_t eltspersegment = MIN(16 / 2, elements);

    for (i = 0; i < elements; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /* Simplify similarly to do_sqrdmlah_b above. */
    int64_t ret = (int64_t)src1 * src2;
    if (neg) {
        ret = -ret;
    }
    ret += ((int64_t)src3 << 31) + (round << 30);
    ret >>= 31;

    if (ret != (int32_t)ret) {
        *sat = 1;
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
                                void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
                                 void *vq, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    intptr_t elements = opr_sz / 4;
    intptr_t eltspersegment = MIN(16 / 4, elements);

    for (i = 0; i < elements; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < eltspersegment; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}

void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}
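
/*
 * Illustration only: with round set, do_sqrdmlah_d computes
 * ((a << 64) + 2 * n * m + 2^63) >> 64, saturated to 64 bits.
 * For example SQRDMULH of INT64_MIN by INT64_MIN would produce +2^63,
 * which do_sat128_d() clamps to INT64_MAX.
 */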
809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 874 875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 876 float_status *fpst, uint32_t desc) 877 { 878 uintptr_t opr_sz = simd_oprsz(desc); 879 float16 *d = vd; 880 float16 *n = vn; 881 float16 *m = vm; 882 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 883 uint32_t neg_imag = neg_real ^ 1; 884 uintptr_t i; 885 886 /* Shift boolean to the sign bit so we can xor to negate. 

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
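
/*
 * For reference (derivable from the loop above, illustration only):
 * with (flip, neg_imag) = (0,0), (1,0), (0,1), (1,1) the fcmla helpers
 * compute the FCMLA rotations of 0, 90, 180 and 270 degrees
 * respectively, since neg_real = flip ^ neg_imag.
 */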

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float16);
    intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < elements; i += eltspersegment) {
        float16 mr = m[H2(i + 2 * index + 0)];
        float16 mi = m[H2(i + 2 * index + 1)];
        float16 e1 = neg_real ^ (flip ? mi : mr);
        float16 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float16 e2 = n[H2(j + flip)];
            float16 e4 = e2;

            d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
            d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
                             float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
    uint32_t neg_real = flip ^ neg_imag;
    intptr_t elements = opr_sz / sizeof(float32);
    intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
    intptr_t i, j;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < elements; i += eltspersegment) {
        float32 mr = m[H4(i + 2 * index + 0)];
        float32 mi = m[H4(i + 2 * index + 1)];
        float32 e1 = neg_real ^ (flip ? mi : mr);
        float32 e3 = neg_imag ^ (flip ? mr : mi);

        for (j = i; j < i + eltspersegment; j += 2) {
            float32 e2 = n[H4(j + flip)];
            float32 e4 = e2;

            d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
            d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
        }
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
                         float_status *fpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd, *n = vn, *m = vm, *a = va;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/*
 * Floating point comparisons producing an integer result (all 1s or all 0s).
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
{
    return -float16_eq_quiet(op1, op2, stat);
}

static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
{
    return -float32_eq_quiet(op1, op2, stat);
}

static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat)
{
    return -float64_eq_quiet(op1, op2, stat);
}

static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(op2, op1, stat);
}

static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(op2, op1, stat);
}

static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(op2, op1, stat);
}

static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(op2, op1, stat);
}

static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(op2, op1, stat);
}

static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(op2, op1, stat);
}

static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
{
    return -float16_le(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
{
    return -float32_le(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat)
{
    return -float64_le(float64_abs(op2), float64_abs(op1), stat);
}

static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
{
    return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
}

static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
{
    return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
}

static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat)
{
    return -float64_lt(float64_abs(op2), float64_abs(op1), stat);
}

static int16_t vfp_tosszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_int16_round_to_zero(x, fpst);
}

static uint16_t vfp_touszh(float16 x, float_status *fpst)
{
    if (float16_is_any_nan(x)) {
        float_raise(float_flag_invalid, fpst);
        return 0;
    }
    return float16_to_uint16_round_to_zero(x, fpst);
}

#define DO_2OP(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(n[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)

DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)

DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)

DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
DO_2OP(gvec_tosszh, vfp_tosszh, float16)
DO_2OP(gvec_touszh, vfp_touszh, float16)

#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
    }

#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
    static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
    { \
        return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
    }

#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
    WRAP_CMP0_##DIRN(FN, CMPOP, float64) \
    DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
    DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \
    DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64)

DO_2OP_CMP0(cgt, cgt, FWD)
DO_2OP_CMP0(cge, cge, FWD)
DO_2OP_CMP0(ceq, ceq, FWD)
DO_2OP_CMP0(clt, cgt, REV)
DO_2OP_CMP0(cle, cge, REV)

#undef DO_2OP
#undef DO_2OP_CMP0

/* Floating-point trigonometric starting value.
 * See the ARM ARM pseudocode function FPTrigSMul.
 */
static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
{
    float16 result = float16_mul(op1, op1, stat);
    if (!float16_is_any_nan(result)) {
        result = float16_set_sign(result, op2 & 1);
    }
    return result;
}

static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
{
    float32 result = float32_mul(op1, op1, stat);
    if (!float32_is_any_nan(result)) {
        result = float32_set_sign(result, op2 & 1);
    }
    return result;
}

static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
{
    float64 result = float64_mul(op1, op1, stat);
    if (!float64_is_any_nan(result)) {
        result = float64_set_sign(result, op2 & 1);
    }
    return result;
}

static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
{
    return float16_abs(float16_sub(op1, op2, stat));
}

static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
{
    return float32_abs(float32_sub(op1, op2, stat));
}

static float64 float64_abd(float64 op1, float64 op2, float_status *stat)
{
    return float64_abs(float64_sub(op1, op2, stat));
}

/*
 * Reciprocal step. These are the AArch32 version which uses a
 * non-fused multiply-and-subtract.
 */
1308 */ 1309 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1310 { 1311 op1 = float16_squash_input_denormal(op1, stat); 1312 op2 = float16_squash_input_denormal(op2, stat); 1313 1314 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1315 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1316 return float16_two; 1317 } 1318 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1319 } 1320 1321 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1322 { 1323 op1 = float32_squash_input_denormal(op1, stat); 1324 op2 = float32_squash_input_denormal(op2, stat); 1325 1326 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1327 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1328 return float32_two; 1329 } 1330 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1331 } 1332 1333 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1334 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1335 { 1336 op1 = float16_squash_input_denormal(op1, stat); 1337 op2 = float16_squash_input_denormal(op2, stat); 1338 1339 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1340 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1341 return float16_one_point_five; 1342 } 1343 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1344 return float16_div(op1, float16_two, stat); 1345 } 1346 1347 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1348 { 1349 op1 = float32_squash_input_denormal(op1, stat); 1350 op2 = float32_squash_input_denormal(op2, stat); 1351 1352 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1353 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1354 return float32_one_point_five; 1355 } 1356 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1357 return float32_div(op1, float32_two, stat); 1358 } 1359 1360 #define DO_3OP(NAME, FUNC, TYPE) \ 1361 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1362 float_status *stat, uint32_t desc) \ 1363 { \ 1364 intptr_t i, oprsz = simd_oprsz(desc); \ 1365 TYPE *d = vd, *n = vn, *m = vm; \ 1366 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1367 d[i] = FUNC(n[i], m[i], stat); \ 1368 } \ 1369 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1370 } 1371 1372 DO_3OP(gvec_fadd_h, float16_add, float16) 1373 DO_3OP(gvec_fadd_s, float32_add, float32) 1374 DO_3OP(gvec_fadd_d, float64_add, float64) 1375 1376 DO_3OP(gvec_fsub_h, float16_sub, float16) 1377 DO_3OP(gvec_fsub_s, float32_sub, float32) 1378 DO_3OP(gvec_fsub_d, float64_sub, float64) 1379 1380 DO_3OP(gvec_fmul_h, float16_mul, float16) 1381 DO_3OP(gvec_fmul_s, float32_mul, float32) 1382 DO_3OP(gvec_fmul_d, float64_mul, float64) 1383 1384 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1385 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1386 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1387 1388 DO_3OP(gvec_fabd_h, float16_abd, float16) 1389 DO_3OP(gvec_fabd_s, float32_abd, float32) 1390 DO_3OP(gvec_fabd_d, float64_abd, float64) 1391 1392 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1393 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1394 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1395 1396 DO_3OP(gvec_fcge_h, float16_cge, float16) 1397 DO_3OP(gvec_fcge_s, float32_cge, float32) 1398 DO_3OP(gvec_fcge_d, float64_cge, float64) 1399 1400 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1401 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1402 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1403 1404 

DO_3OP(gvec_facge_h, float16_acge, float16)
DO_3OP(gvec_facge_s, float32_acge, float32)
DO_3OP(gvec_facge_d, float64_acge, float64)

DO_3OP(gvec_facgt_h, float16_acgt, float16)
DO_3OP(gvec_facgt_s, float32_acgt, float32)
DO_3OP(gvec_facgt_d, float64_acgt, float64)

DO_3OP(gvec_fmax_h, float16_max, float16)
DO_3OP(gvec_fmax_s, float32_max, float32)
DO_3OP(gvec_fmax_d, float64_max, float64)

DO_3OP(gvec_fmin_h, float16_min, float16)
DO_3OP(gvec_fmin_s, float32_min, float32)
DO_3OP(gvec_fmin_d, float64_min, float64)

DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64)

DO_3OP(gvec_fminnum_h, float16_minnum, float16)
DO_3OP(gvec_fminnum_s, float32_minnum, float32)
DO_3OP(gvec_fminnum_d, float64_minnum, float64)

DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)

DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)

#ifdef TARGET_AARCH64
DO_3OP(gvec_fdiv_h, float16_div, float16)
DO_3OP(gvec_fdiv_s, float32_div, float32)
DO_3OP(gvec_fdiv_d, float64_div, float64)

DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16)
DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32)
DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64)

DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)

DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)

#endif
#undef DO_3OP

/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_add(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_add(dest, float32_mul(op1, op2, stat), stat);
}

static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
                                 float_status *stat)
{
    return float16_sub(dest, float16_mul(op1, op2, stat), stat);
}

static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
                                 float_status *stat)
{
    return float32_sub(dest, float32_mul(op1, op2, stat), stat);
}

/* Fused versions; these have the semantics Neon VFMA/VFMS want */
static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(op1, op2, dest, 0, stat);
}

static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(op1, op2, dest, 0, stat);
}

static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(op1, op2, dest, 0, stat);
}

static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
                                float_status *stat)
{
    return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
}

static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
                                float_status *stat)
{
    return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
}

static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2,
                                float_status *stat)
{
    return float64_muladd(float64_chs(op1), op2, dest, 0, stat);
}

#define DO_MULADD(NAME, FUNC, TYPE) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
        d[i] = FUNC(d[i], n[i], m[i], stat); \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)

DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)

DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
DO_MULADD(gvec_vfma_d, float64_muladd_f, float64)

DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64)

/* For the indexed ops, SVE applies the index per 128-bit vector segment.
 * For AdvSIMD, there is of course only one such vector segment.
 */

#define DO_MUL_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)

#undef DO_MUL_IDX

#define DO_MLA_IDX(NAME, TYPE, OP, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = a[i + j] OP n[i + j] * mm; \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)

DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)

#undef DO_MLA_IDX
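
/*
 * Example of the per-segment indexing above (illustrative only): for
 * gvec_mul_idx_s with idx == 1 on a 32-byte SVE vector,
 * d[0..3] = n[0..3] * m[1] and d[4..7] = n[4..7] * m[5].
 */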

#define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    intptr_t idx = simd_data(desc); \
    TYPE *d = vd, *n = vn, *m = vm; \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

#define nop(N, M, S) (M)

DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8)

#ifdef TARGET_AARCH64

DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2)
DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4)
DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8)

#endif

#undef nop

/*
 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
 * the fused ops below they assume accumulate both from and into Vd.
 */
DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4)
DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2)
DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4)

#undef DO_FMUL_IDX

#define DO_FMLA_IDX(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
                  float_status *stat, uint32_t desc) \
{ \
    intptr_t i, j, oprsz = simd_oprsz(desc); \
    intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
    TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
    intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
    TYPE *d = vd, *n = vn, *m = vm, *a = va; \
    op1_neg <<= (8 * sizeof(TYPE) - 1); \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
        TYPE mm = m[H(i + idx)]; \
        for (j = 0; j < segment; j++) { \
            d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
                                     mm, a[i + j], 0, stat); \
        } \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}

DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)

#undef DO_FMLA_IDX

#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, oprsz = simd_oprsz(desc); \
    TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
    bool q = false; \
    for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
        WTYPE dd = (WTYPE)n[i] OP m[i]; \
        if (dd < MIN) { \
            dd = MIN; \
            q = true; \
        } else if (dd > MAX) { \
            dd = MAX; \
            q = true; \
        } \
        d[i] = dd; \
    } \
    if (q) { \
        uint32_t *qc = vq; \
        qc[0] = 1; \
    } \
    clear_tail(d, oprsz, simd_maxsz(desc)); \
}
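
/*
 * Illustration of the widening trick above: the sum or difference is
 * computed in WTYPE, which is wide enough to hold any intermediate
 * result, and then clamped; e.g. for gvec_uqadd_b, 200 + 100 = 300
 * exceeds UINT8_MAX, so 255 is stored and the QC flag is set.
 */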
DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1703 1704 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1705 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1706 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1707 1708 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1709 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1710 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1711 1712 #undef DO_SAT 1713 1714 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1715 void *vm, uint32_t desc) 1716 { 1717 intptr_t i, oprsz = simd_oprsz(desc); 1718 uint64_t *d = vd, *n = vn, *m = vm; 1719 bool q = false; 1720 1721 for (i = 0; i < oprsz / 8; i++) { 1722 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1723 if (dd < nn) { 1724 dd = UINT64_MAX; 1725 q = true; 1726 } 1727 d[i] = dd; 1728 } 1729 if (q) { 1730 uint32_t *qc = vq; 1731 qc[0] = 1; 1732 } 1733 clear_tail(d, oprsz, simd_maxsz(desc)); 1734 } 1735 1736 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1737 void *vm, uint32_t desc) 1738 { 1739 intptr_t i, oprsz = simd_oprsz(desc); 1740 uint64_t *d = vd, *n = vn, *m = vm; 1741 bool q = false; 1742 1743 for (i = 0; i < oprsz / 8; i++) { 1744 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1745 if (nn < mm) { 1746 dd = 0; 1747 q = true; 1748 } 1749 d[i] = dd; 1750 } 1751 if (q) { 1752 uint32_t *qc = vq; 1753 qc[0] = 1; 1754 } 1755 clear_tail(d, oprsz, simd_maxsz(desc)); 1756 } 1757 1758 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1759 void *vm, uint32_t desc) 1760 { 1761 intptr_t i, oprsz = simd_oprsz(desc); 1762 int64_t *d = vd, *n = vn, *m = vm; 1763 bool q = false; 1764 1765 for (i = 0; i < oprsz / 8; i++) { 1766 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1767 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1768 dd = (nn >> 63) ^ ~INT64_MIN; 1769 q = true; 1770 } 1771 d[i] = dd; 1772 } 1773 if (q) { 1774 uint32_t *qc = vq; 1775 qc[0] = 1; 1776 } 1777 clear_tail(d, oprsz, simd_maxsz(desc)); 1778 } 1779 1780 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1781 void *vm, uint32_t desc) 1782 { 1783 intptr_t i, oprsz = simd_oprsz(desc); 1784 int64_t *d = vd, *n = vn, *m = vm; 1785 bool q = false; 1786 1787 for (i = 0; i < oprsz / 8; i++) { 1788 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1789 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1790 dd = (nn >> 63) ^ ~INT64_MIN; 1791 q = true; 1792 } 1793 d[i] = dd; 1794 } 1795 if (q) { 1796 uint32_t *qc = vq; 1797 qc[0] = 1; 1798 } 1799 clear_tail(d, oprsz, simd_maxsz(desc)); 1800 } 1801 1802 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1803 void *vm, uint32_t desc) 1804 { 1805 intptr_t i, oprsz = simd_oprsz(desc); 1806 uint64_t *d = vd, *n = vn, *m = vm; 1807 bool q = false; 1808 1809 for (i = 0; i < oprsz / 8; i++) { 1810 uint64_t nn = n[i]; 1811 int64_t mm = m[i]; 1812 uint64_t dd = nn + mm; 1813 1814 if (mm < 0) { 1815 if (nn < (uint64_t)-mm) { 1816 dd = 0; 1817 q = true; 1818 } 1819 } else { 1820 if (dd < nn) { 1821 dd = UINT64_MAX; 1822 q = true; 1823 } 1824 } 1825 d[i] = dd; 1826 } 1827 if (q) { 1828 uint32_t *qc = vq; 1829 qc[0] = 1; 1830 } 1831 clear_tail(d, oprsz, simd_maxsz(desc)); 1832 } 1833 1834 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1835 void *vm, uint32_t desc) 1836 { 1837 intptr_t i, oprsz = simd_oprsz(desc); 1838 uint64_t *d = vd, *n = vn, *m = vm; 1839 bool q = false; 1840 1841 for (i = 0; i < oprsz / 8; i++) { 1842 int64_t nn = n[i]; 1843 uint64_t 
mm = m[i]; 1844 int64_t dd = nn + mm; 1845 1846 if (mm > (uint64_t)(INT64_MAX - nn)) { 1847 dd = INT64_MAX; 1848 q = true; 1849 } 1850 d[i] = dd; 1851 } 1852 if (q) { 1853 uint32_t *qc = vq; 1854 qc[0] = 1; 1855 } 1856 clear_tail(d, oprsz, simd_maxsz(desc)); 1857 } 1858 1859 #define DO_SRA(NAME, TYPE) \ 1860 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1861 { \ 1862 intptr_t i, oprsz = simd_oprsz(desc); \ 1863 int shift = simd_data(desc); \ 1864 TYPE *d = vd, *n = vn; \ 1865 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1866 d[i] += n[i] >> shift; \ 1867 } \ 1868 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1869 } 1870 1871 DO_SRA(gvec_ssra_b, int8_t) 1872 DO_SRA(gvec_ssra_h, int16_t) 1873 DO_SRA(gvec_ssra_s, int32_t) 1874 DO_SRA(gvec_ssra_d, int64_t) 1875 1876 DO_SRA(gvec_usra_b, uint8_t) 1877 DO_SRA(gvec_usra_h, uint16_t) 1878 DO_SRA(gvec_usra_s, uint32_t) 1879 DO_SRA(gvec_usra_d, uint64_t) 1880 1881 #undef DO_SRA 1882 1883 #define DO_RSHR(NAME, TYPE) \ 1884 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1885 { \ 1886 intptr_t i, oprsz = simd_oprsz(desc); \ 1887 int shift = simd_data(desc); \ 1888 TYPE *d = vd, *n = vn; \ 1889 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1890 TYPE tmp = n[i] >> (shift - 1); \ 1891 d[i] = (tmp >> 1) + (tmp & 1); \ 1892 } \ 1893 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1894 } 1895 1896 DO_RSHR(gvec_srshr_b, int8_t) 1897 DO_RSHR(gvec_srshr_h, int16_t) 1898 DO_RSHR(gvec_srshr_s, int32_t) 1899 DO_RSHR(gvec_srshr_d, int64_t) 1900 1901 DO_RSHR(gvec_urshr_b, uint8_t) 1902 DO_RSHR(gvec_urshr_h, uint16_t) 1903 DO_RSHR(gvec_urshr_s, uint32_t) 1904 DO_RSHR(gvec_urshr_d, uint64_t) 1905 1906 #undef DO_RSHR 1907 1908 #define DO_RSRA(NAME, TYPE) \ 1909 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1910 { \ 1911 intptr_t i, oprsz = simd_oprsz(desc); \ 1912 int shift = simd_data(desc); \ 1913 TYPE *d = vd, *n = vn; \ 1914 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1915 TYPE tmp = n[i] >> (shift - 1); \ 1916 d[i] += (tmp >> 1) + (tmp & 1); \ 1917 } \ 1918 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1919 } 1920 1921 DO_RSRA(gvec_srsra_b, int8_t) 1922 DO_RSRA(gvec_srsra_h, int16_t) 1923 DO_RSRA(gvec_srsra_s, int32_t) 1924 DO_RSRA(gvec_srsra_d, int64_t) 1925 1926 DO_RSRA(gvec_ursra_b, uint8_t) 1927 DO_RSRA(gvec_ursra_h, uint16_t) 1928 DO_RSRA(gvec_ursra_s, uint32_t) 1929 DO_RSRA(gvec_ursra_d, uint64_t) 1930 1931 #undef DO_RSRA 1932 1933 #define DO_SRI(NAME, TYPE) \ 1934 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1935 { \ 1936 intptr_t i, oprsz = simd_oprsz(desc); \ 1937 int shift = simd_data(desc); \ 1938 TYPE *d = vd, *n = vn; \ 1939 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1940 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1941 } \ 1942 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1943 } 1944 1945 DO_SRI(gvec_sri_b, uint8_t) 1946 DO_SRI(gvec_sri_h, uint16_t) 1947 DO_SRI(gvec_sri_s, uint32_t) 1948 DO_SRI(gvec_sri_d, uint64_t) 1949 1950 #undef DO_SRI 1951 1952 #define DO_SLI(NAME, TYPE) \ 1953 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1954 { \ 1955 intptr_t i, oprsz = simd_oprsz(desc); \ 1956 int shift = simd_data(desc); \ 1957 TYPE *d = vd, *n = vn; \ 1958 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1959 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1960 } \ 1961 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1962 } 1963 1964 DO_SLI(gvec_sli_b, uint8_t) 1965 DO_SLI(gvec_sli_h, uint16_t) 1966 DO_SLI(gvec_sli_s, uint32_t) 1967 DO_SLI(gvec_sli_d, uint64_t) 1968 1969 #undef DO_SLI 
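/*
 * For reference: the DO_RSHR/DO_RSRA helpers above compute the rounded
 * shift (n + (1 << (shift - 1))) >> shift without needing a wider
 * intermediate type, by splitting the shift in two:
 *
 *   tmp = n >> (shift - 1);
 *   res = (tmp >> 1) + (tmp & 1);
 *
 * E.g. for n = 13, shift = 2: tmp = 6, res = 3 + 0 = 3, which matches
 * (13 + 2) >> 2.  An illustrative standalone check of the identity
 * (a sketch only, not built as part of this file) would be:
 *
 *   for (int n = -128; n < 128; n++) {
 *       for (int shift = 1; shift <= 8; shift++) {
 *           int tmp = n >> (shift - 1);
 *           assert(((tmp >> 1) + (tmp & 1)) ==
 *                  ((n + (1 << (shift - 1))) >> shift));
 *       }
 *   }
 */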
1970 1971 /* 1972 * Convert float16 to float32, raising no exceptions and 1973 * preserving exceptional values, including SNaN. 1974 * This is effectively an unpack+repack operation. 1975 */ 1976 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1977 { 1978 const int f16_bias = 15; 1979 const int f32_bias = 127; 1980 uint32_t sign = extract32(f16, 15, 1); 1981 uint32_t exp = extract32(f16, 10, 5); 1982 uint32_t frac = extract32(f16, 0, 10); 1983 1984 if (exp == 0x1f) { 1985 /* Inf or NaN */ 1986 exp = 0xff; 1987 } else if (exp == 0) { 1988 /* Zero or denormal. */ 1989 if (frac != 0) { 1990 if (fz16) { 1991 frac = 0; 1992 } else { 1993 /* 1994 * Denormal; these are all normal float32. 1995 * Shift the fraction so that the msb is at bit 11, 1996 * then remove bit 11 as the implicit bit of the 1997 * normalized float32. Note that we still go through 1998 * the shift for normal numbers below, to put the 1999 * float32 fraction at the right place. 2000 */ 2001 int shift = clz32(frac) - 21; 2002 frac = (frac << shift) & 0x3ff; 2003 exp = f32_bias - f16_bias - shift + 1; 2004 } 2005 } 2006 } else { 2007 /* Normal number; adjust the bias. */ 2008 exp += f32_bias - f16_bias; 2009 } 2010 sign <<= 31; 2011 exp <<= 23; 2012 frac <<= 23 - 10; 2013 2014 return sign | exp | frac; 2015 } 2016 2017 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 2018 { 2019 /* 2020 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 2021 * Load the 2nd qword iff is_q & is_2. 2022 * Shift to the 2nd dword iff !is_q & is_2. 2023 * For !is_q & !is_2, the upper bits of the result are garbage. 2024 */ 2025 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 2026 } 2027 2028 /* 2029 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 2030 * as there are not yet SVE versions that might use blocking. 2031 */ 2032 2033 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 2034 uint32_t desc, bool fz16) 2035 { 2036 intptr_t i, oprsz = simd_oprsz(desc); 2037 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2038 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2039 int is_q = oprsz == 16; 2040 uint64_t n_4, m_4; 2041 2042 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2043 n_4 = load4_f16(vn, is_q, is_2); 2044 m_4 = load4_f16(vm, is_q, is_2); 2045 2046 /* Negate all inputs for FMLSL at once. 
*/ 2047 if (is_s) { 2048 n_4 ^= 0x8000800080008000ull; 2049 } 2050 2051 for (i = 0; i < oprsz / 4; i++) { 2052 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2053 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2054 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2055 } 2056 clear_tail(d, oprsz, simd_maxsz(desc)); 2057 } 2058 2059 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2060 CPUARMState *env, uint32_t desc) 2061 { 2062 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2063 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2064 } 2065 2066 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2067 CPUARMState *env, uint32_t desc) 2068 { 2069 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2070 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2071 } 2072 2073 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2074 CPUARMState *env, uint32_t desc) 2075 { 2076 intptr_t i, oprsz = simd_oprsz(desc); 2077 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2078 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2079 float_status *status = &env->vfp.fp_status_a64; 2080 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2081 2082 for (i = 0; i < oprsz; i += sizeof(float32)) { 2083 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2084 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2085 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2086 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2087 float32 aa = *(float32 *)(va + H1_4(i)); 2088 2089 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2090 } 2091 } 2092 2093 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2094 uint32_t desc, bool fz16) 2095 { 2096 intptr_t i, oprsz = simd_oprsz(desc); 2097 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2098 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2099 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2100 int is_q = oprsz == 16; 2101 uint64_t n_4; 2102 float32 m_1; 2103 2104 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2105 n_4 = load4_f16(vn, is_q, is_2); 2106 2107 /* Negate all inputs for FMLSL at once. 
*/ 2108 if (is_s) { 2109 n_4 ^= 0x8000800080008000ull; 2110 } 2111 2112 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2113 2114 for (i = 0; i < oprsz / 4; i++) { 2115 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2116 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2117 } 2118 clear_tail(d, oprsz, simd_maxsz(desc)); 2119 } 2120 2121 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2122 CPUARMState *env, uint32_t desc) 2123 { 2124 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2125 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2126 } 2127 2128 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2129 CPUARMState *env, uint32_t desc) 2130 { 2131 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2132 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2133 } 2134 2135 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2136 CPUARMState *env, uint32_t desc) 2137 { 2138 intptr_t i, j, oprsz = simd_oprsz(desc); 2139 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2140 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2141 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2142 float_status *status = &env->vfp.fp_status_a64; 2143 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2144 2145 for (i = 0; i < oprsz; i += 16) { 2146 float16 mm_16 = *(float16 *)(vm + i + idx); 2147 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2148 2149 for (j = 0; j < 16; j += sizeof(float32)) { 2150 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2151 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2152 float32 aa = *(float32 *)(va + H1_4(i + j)); 2153 2154 *(float32 *)(vd + H1_4(i + j)) = 2155 float32_muladd(nn, mm, aa, 0, status); 2156 } 2157 } 2158 } 2159 2160 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2161 { 2162 intptr_t i, opr_sz = simd_oprsz(desc); 2163 int8_t *d = vd, *n = vn, *m = vm; 2164 2165 for (i = 0; i < opr_sz; ++i) { 2166 int8_t mm = m[i]; 2167 int8_t nn = n[i]; 2168 int8_t res = 0; 2169 if (mm >= 0) { 2170 if (mm < 8) { 2171 res = nn << mm; 2172 } 2173 } else { 2174 res = nn >> (mm > -8 ? -mm : 7); 2175 } 2176 d[i] = res; 2177 } 2178 clear_tail(d, opr_sz, simd_maxsz(desc)); 2179 } 2180 2181 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2182 { 2183 intptr_t i, opr_sz = simd_oprsz(desc); 2184 int16_t *d = vd, *n = vn, *m = vm; 2185 2186 for (i = 0; i < opr_sz / 2; ++i) { 2187 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2188 int16_t nn = n[i]; 2189 int16_t res = 0; 2190 if (mm >= 0) { 2191 if (mm < 16) { 2192 res = nn << mm; 2193 } 2194 } else { 2195 res = nn >> (mm > -16 ? 
-mm : 15); 2196 } 2197 d[i] = res; 2198 } 2199 clear_tail(d, opr_sz, simd_maxsz(desc)); 2200 } 2201 2202 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2203 { 2204 intptr_t i, opr_sz = simd_oprsz(desc); 2205 uint8_t *d = vd, *n = vn, *m = vm; 2206 2207 for (i = 0; i < opr_sz; ++i) { 2208 int8_t mm = m[i]; 2209 uint8_t nn = n[i]; 2210 uint8_t res = 0; 2211 if (mm >= 0) { 2212 if (mm < 8) { 2213 res = nn << mm; 2214 } 2215 } else { 2216 if (mm > -8) { 2217 res = nn >> -mm; 2218 } 2219 } 2220 d[i] = res; 2221 } 2222 clear_tail(d, opr_sz, simd_maxsz(desc)); 2223 } 2224 2225 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2226 { 2227 intptr_t i, opr_sz = simd_oprsz(desc); 2228 uint16_t *d = vd, *n = vn, *m = vm; 2229 2230 for (i = 0; i < opr_sz / 2; ++i) { 2231 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2232 uint16_t nn = n[i]; 2233 uint16_t res = 0; 2234 if (mm >= 0) { 2235 if (mm < 16) { 2236 res = nn << mm; 2237 } 2238 } else { 2239 if (mm > -16) { 2240 res = nn >> -mm; 2241 } 2242 } 2243 d[i] = res; 2244 } 2245 clear_tail(d, opr_sz, simd_maxsz(desc)); 2246 } 2247 2248 /* 2249 * 8x8->8 polynomial multiply. 2250 * 2251 * Polynomial multiplication is like integer multiplication except the 2252 * partial products are XORed, not added. 2253 * 2254 * TODO: expose this as a generic vector operation, as it is a common 2255 * crypto building block. 2256 */ 2257 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2258 { 2259 intptr_t i, opr_sz = simd_oprsz(desc); 2260 uint64_t *d = vd, *n = vn, *m = vm; 2261 2262 for (i = 0; i < opr_sz / 8; ++i) { 2263 d[i] = clmul_8x8_low(n[i], m[i]); 2264 } 2265 clear_tail(d, opr_sz, simd_maxsz(desc)); 2266 } 2267 2268 /* 2269 * 64x64->128 polynomial multiply. 2270 * Because the lanes are not accessed in strict columns, 2271 * this probably cannot be turned into a generic helper. 
2272 */ 2273 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2274 { 2275 intptr_t i, opr_sz = simd_oprsz(desc); 2276 intptr_t hi = simd_data(desc); 2277 uint64_t *d = vd, *n = vn, *m = vm; 2278 2279 for (i = 0; i < opr_sz / 8; i += 2) { 2280 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2281 d[i] = int128_getlo(r); 2282 d[i + 1] = int128_gethi(r); 2283 } 2284 clear_tail(d, opr_sz, simd_maxsz(desc)); 2285 } 2286 2287 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2288 { 2289 int hi = simd_data(desc); 2290 uint64_t *d = vd, *n = vn, *m = vm; 2291 uint64_t nn = n[hi], mm = m[hi]; 2292 2293 d[0] = clmul_8x4_packed(nn, mm); 2294 nn >>= 32; 2295 mm >>= 32; 2296 d[1] = clmul_8x4_packed(nn, mm); 2297 2298 clear_tail(d, 16, simd_maxsz(desc)); 2299 } 2300 2301 #ifdef TARGET_AARCH64 2302 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2303 { 2304 int shift = simd_data(desc) * 8; 2305 intptr_t i, opr_sz = simd_oprsz(desc); 2306 uint64_t *d = vd, *n = vn, *m = vm; 2307 2308 for (i = 0; i < opr_sz / 8; ++i) { 2309 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2310 } 2311 } 2312 2313 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2314 { 2315 intptr_t sel = H4(simd_data(desc)); 2316 intptr_t i, opr_sz = simd_oprsz(desc); 2317 uint32_t *n = vn, *m = vm; 2318 uint64_t *d = vd; 2319 2320 for (i = 0; i < opr_sz / 8; ++i) { 2321 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2322 } 2323 } 2324 #endif 2325 2326 #define DO_CMP0(NAME, TYPE, OP) \ 2327 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2328 { \ 2329 intptr_t i, opr_sz = simd_oprsz(desc); \ 2330 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2331 TYPE nn = *(TYPE *)(vn + i); \ 2332 *(TYPE *)(vd + i) = -(nn OP 0); \ 2333 } \ 2334 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2335 } 2336 2337 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2338 DO_CMP0(gvec_clt0_b, int8_t, <) 2339 DO_CMP0(gvec_cle0_b, int8_t, <=) 2340 DO_CMP0(gvec_cgt0_b, int8_t, >) 2341 DO_CMP0(gvec_cge0_b, int8_t, >=) 2342 2343 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2344 DO_CMP0(gvec_clt0_h, int16_t, <) 2345 DO_CMP0(gvec_cle0_h, int16_t, <=) 2346 DO_CMP0(gvec_cgt0_h, int16_t, >) 2347 DO_CMP0(gvec_cge0_h, int16_t, >=) 2348 2349 #undef DO_CMP0 2350 2351 #define DO_ABD(NAME, TYPE) \ 2352 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2353 { \ 2354 intptr_t i, opr_sz = simd_oprsz(desc); \ 2355 TYPE *d = vd, *n = vn, *m = vm; \ 2356 \ 2357 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2358 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2359 } \ 2360 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2361 } 2362 2363 DO_ABD(gvec_sabd_b, int8_t) 2364 DO_ABD(gvec_sabd_h, int16_t) 2365 DO_ABD(gvec_sabd_s, int32_t) 2366 DO_ABD(gvec_sabd_d, int64_t) 2367 2368 DO_ABD(gvec_uabd_b, uint8_t) 2369 DO_ABD(gvec_uabd_h, uint16_t) 2370 DO_ABD(gvec_uabd_s, uint32_t) 2371 DO_ABD(gvec_uabd_d, uint64_t) 2372 2373 #undef DO_ABD 2374 2375 #define DO_ABA(NAME, TYPE) \ 2376 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2377 { \ 2378 intptr_t i, opr_sz = simd_oprsz(desc); \ 2379 TYPE *d = vd, *n = vn, *m = vm; \ 2380 \ 2381 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2382 d[i] += n[i] < m[i] ? 
m[i] - n[i] : n[i] - m[i]; \ 2383 } \ 2384 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2385 } 2386 2387 DO_ABA(gvec_saba_b, int8_t) 2388 DO_ABA(gvec_saba_h, int16_t) 2389 DO_ABA(gvec_saba_s, int32_t) 2390 DO_ABA(gvec_saba_d, int64_t) 2391 2392 DO_ABA(gvec_uaba_b, uint8_t) 2393 DO_ABA(gvec_uaba_h, uint16_t) 2394 DO_ABA(gvec_uaba_s, uint32_t) 2395 DO_ABA(gvec_uaba_d, uint64_t) 2396 2397 #undef DO_ABA 2398 2399 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2400 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2401 float_status *stat, uint32_t desc) \ 2402 { \ 2403 ARMVectorReg scratch; \ 2404 intptr_t oprsz = simd_oprsz(desc); \ 2405 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2406 TYPE *d = vd, *n = vn, *m = vm; \ 2407 if (unlikely(d == m)) { \ 2408 m = memcpy(&scratch, m, oprsz); \ 2409 } \ 2410 for (intptr_t i = 0; i < half; ++i) { \ 2411 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2412 } \ 2413 for (intptr_t i = 0; i < half; ++i) { \ 2414 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2415 } \ 2416 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2417 } 2418 2419 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2420 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2421 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2422 2423 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2424 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2425 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2426 2427 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2428 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2429 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2430 2431 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2432 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2433 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2434 2435 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2436 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2437 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2438 2439 #undef DO_3OP_PAIR 2440 2441 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2442 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2443 { \ 2444 ARMVectorReg scratch; \ 2445 intptr_t oprsz = simd_oprsz(desc); \ 2446 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2447 TYPE *d = vd, *n = vn, *m = vm; \ 2448 if (unlikely(d == m)) { \ 2449 m = memcpy(&scratch, m, oprsz); \ 2450 } \ 2451 for (intptr_t i = 0; i < half; ++i) { \ 2452 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2453 } \ 2454 for (intptr_t i = 0; i < half; ++i) { \ 2455 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2456 } \ 2457 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2458 } 2459 2460 #define ADD(A, B) (A + B) 2461 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2462 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2463 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2464 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2465 #undef ADD 2466 2467 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2468 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2469 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2470 2471 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2472 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2473 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2474 2475 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2476 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2477 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2478 2479 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2480 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2481 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2482 2483 #undef DO_3OP_PAIR 2484 
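/*
 * For reference, a worked example of the pairwise layout produced by the
 * DO_3OP_PAIR helpers above (values are illustrative only): for ADDP on
 * eight byte elements with
 *
 *   n = { 1, 2, 3, 4, 5, 6, 7, 8 }
 *   m = { 10, 20, 30, 40, 50, 60, 70, 80 }
 *
 * the result is
 *
 *   d = { 3, 7, 11, 15, 30, 70, 110, 150 }
 *
 * i.e. adjacent pairs of Vn fill the low half of Vd and adjacent pairs
 * of Vm fill the high half.  This is presumably why only d == m needs
 * the scratch copy: the first loop overwrites only elements of Vn that
 * have already been consumed.
 */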
2485 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2486 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2487 { \ 2488 intptr_t i, oprsz = simd_oprsz(desc); \ 2489 int shift = simd_data(desc); \ 2490 TYPE *d = vd, *n = vn; \ 2491 float_status *fpst = stat; \ 2492 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2493 d[i] = FUNC(n[i], shift, fpst); \ 2494 } \ 2495 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2496 } 2497 2498 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2499 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2500 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2501 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2502 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2503 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2504 2505 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2506 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2507 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2508 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2509 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2510 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2511 2512 #undef DO_VCVT_FIXED 2513 2514 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2515 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2516 { \ 2517 intptr_t i, oprsz = simd_oprsz(desc); \ 2518 uint32_t rmode = simd_data(desc); \ 2519 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2520 TYPE *d = vd, *n = vn; \ 2521 set_float_rounding_mode(rmode, fpst); \ 2522 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2523 d[i] = FUNC(n[i], 0, fpst); \ 2524 } \ 2525 set_float_rounding_mode(prev_rmode, fpst); \ 2526 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2527 } 2528 2529 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2530 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2531 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2532 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2533 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2534 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2535 2536 #undef DO_VCVT_RMODE 2537 2538 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2539 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2540 { \ 2541 intptr_t i, oprsz = simd_oprsz(desc); \ 2542 uint32_t rmode = simd_data(desc); \ 2543 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2544 TYPE *d = vd, *n = vn; \ 2545 set_float_rounding_mode(rmode, fpst); \ 2546 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2547 d[i] = FUNC(n[i], fpst); \ 2548 } \ 2549 set_float_rounding_mode(prev_rmode, fpst); \ 2550 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2551 } 2552 2553 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2554 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2555 2556 #undef DO_VRINT_RMODE 2557 2558 #ifdef TARGET_AARCH64 2559 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc) 2560 { 2561 const uint8_t *indices = vm; 2562 size_t oprsz = simd_oprsz(desc); 2563 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2564 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2565 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2566 union { 2567 uint8_t b[16]; 2568 uint64_t d[2]; 2569 } result; 2570 2571 /* 2572 * We must construct the final result in a temp, lest the output 2573 * overlaps the input 
table. For TBL, begin with zero; for TBX, 2574 * begin with the original register contents. Note that we always 2575 * copy 16 bytes here to avoid an extra branch; clearing the high 2576 * bits of the register for oprsz == 8 is handled below. 2577 */ 2578 if (is_tbx) { 2579 memcpy(&result, vd, 16); 2580 } else { 2581 memset(&result, 0, 16); 2582 } 2583 2584 for (size_t i = 0; i < oprsz; ++i) { 2585 uint32_t index = indices[H1(i)]; 2586 2587 if (index < table_len) { 2588 /* 2589 * Convert index (a byte offset into the virtual table 2590 * which is a series of 128-bit vectors concatenated) 2591 * into the correct register element, bearing in mind 2592 * that the table can wrap around from V31 to V0. 2593 */ 2594 const uint8_t *table = (const uint8_t *) 2595 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2596 result.b[H1(i)] = table[H1(index % 16)]; 2597 } 2598 } 2599 2600 memcpy(vd, &result, 16); 2601 clear_tail(vd, oprsz, simd_maxsz(desc)); 2602 } 2603 #endif 2604 2605 /* 2606 * NxN -> N highpart multiply 2607 * 2608 * TODO: expose this as a generic vector operation. 2609 */ 2610 2611 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2612 { 2613 intptr_t i, opr_sz = simd_oprsz(desc); 2614 int8_t *d = vd, *n = vn, *m = vm; 2615 2616 for (i = 0; i < opr_sz; ++i) { 2617 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2618 } 2619 clear_tail(d, opr_sz, simd_maxsz(desc)); 2620 } 2621 2622 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2623 { 2624 intptr_t i, opr_sz = simd_oprsz(desc); 2625 int16_t *d = vd, *n = vn, *m = vm; 2626 2627 for (i = 0; i < opr_sz / 2; ++i) { 2628 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2629 } 2630 clear_tail(d, opr_sz, simd_maxsz(desc)); 2631 } 2632 2633 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2634 { 2635 intptr_t i, opr_sz = simd_oprsz(desc); 2636 int32_t *d = vd, *n = vn, *m = vm; 2637 2638 for (i = 0; i < opr_sz / 4; ++i) { 2639 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2640 } 2641 clear_tail(d, opr_sz, simd_maxsz(desc)); 2642 } 2643 2644 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2645 { 2646 intptr_t i, opr_sz = simd_oprsz(desc); 2647 uint64_t *d = vd, *n = vn, *m = vm; 2648 uint64_t discard; 2649 2650 for (i = 0; i < opr_sz / 8; ++i) { 2651 muls64(&discard, &d[i], n[i], m[i]); 2652 } 2653 clear_tail(d, opr_sz, simd_maxsz(desc)); 2654 } 2655 2656 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2657 { 2658 intptr_t i, opr_sz = simd_oprsz(desc); 2659 uint8_t *d = vd, *n = vn, *m = vm; 2660 2661 for (i = 0; i < opr_sz; ++i) { 2662 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2663 } 2664 clear_tail(d, opr_sz, simd_maxsz(desc)); 2665 } 2666 2667 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2668 { 2669 intptr_t i, opr_sz = simd_oprsz(desc); 2670 uint16_t *d = vd, *n = vn, *m = vm; 2671 2672 for (i = 0; i < opr_sz / 2; ++i) { 2673 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2674 } 2675 clear_tail(d, opr_sz, simd_maxsz(desc)); 2676 } 2677 2678 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2679 { 2680 intptr_t i, opr_sz = simd_oprsz(desc); 2681 uint32_t *d = vd, *n = vn, *m = vm; 2682 2683 for (i = 0; i < opr_sz / 4; ++i) { 2684 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2685 } 2686 clear_tail(d, opr_sz, simd_maxsz(desc)); 2687 } 2688 2689 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2690 { 2691 intptr_t i, opr_sz = simd_oprsz(desc); 2692 uint64_t *d = vd, *n = vn, *m = vm; 2693 uint64_t discard; 2694 2695 for (i 
= 0; i < opr_sz / 8; ++i) { 2696 mulu64(&discard, &d[i], n[i], m[i]); 2697 } 2698 clear_tail(d, opr_sz, simd_maxsz(desc)); 2699 } 2700 2701 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2702 { 2703 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2704 int shr = simd_data(desc); 2705 uint64_t *d = vd, *n = vn, *m = vm; 2706 2707 for (i = 0; i < opr_sz; ++i) { 2708 d[i] = ror64(n[i] ^ m[i], shr); 2709 } 2710 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2711 } 2712 2713 /* 2714 * Integer matrix-multiply accumulate 2715 */ 2716 2717 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2718 { 2719 int8_t *n = vn, *m = vm; 2720 2721 for (intptr_t k = 0; k < 8; ++k) { 2722 sum += n[H1(k)] * m[H1(k)]; 2723 } 2724 return sum; 2725 } 2726 2727 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2728 { 2729 uint8_t *n = vn, *m = vm; 2730 2731 for (intptr_t k = 0; k < 8; ++k) { 2732 sum += n[H1(k)] * m[H1(k)]; 2733 } 2734 return sum; 2735 } 2736 2737 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2738 { 2739 uint8_t *n = vn; 2740 int8_t *m = vm; 2741 2742 for (intptr_t k = 0; k < 8; ++k) { 2743 sum += n[H1(k)] * m[H1(k)]; 2744 } 2745 return sum; 2746 } 2747 2748 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2749 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2750 { 2751 intptr_t seg, opr_sz = simd_oprsz(desc); 2752 2753 for (seg = 0; seg < opr_sz; seg += 16) { 2754 uint32_t *d = vd + seg; 2755 uint32_t *a = va + seg; 2756 uint32_t sum0, sum1, sum2, sum3; 2757 2758 /* 2759 * Process the entire segment at once, writing back the 2760 * results only after we've consumed all of the inputs. 2761 * 2762 * Key to indices by column: 2763 * i j i j 2764 */ 2765 sum0 = a[H4(0 + 0)]; 2766 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2767 sum1 = a[H4(0 + 1)]; 2768 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2769 sum2 = a[H4(2 + 0)]; 2770 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2771 sum3 = a[H4(2 + 1)]; 2772 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2773 2774 d[H4(0)] = sum0; 2775 d[H4(1)] = sum1; 2776 d[H4(2)] = sum2; 2777 d[H4(3)] = sum3; 2778 } 2779 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2780 } 2781 2782 #define DO_MMLA_B(NAME, INNER) \ 2783 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2784 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2785 2786 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2787 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2788 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2789 2790 /* 2791 * BFloat16 Dot Product 2792 */ 2793 2794 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2795 { 2796 /* 2797 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2798 * For EBF = 0, we ignore the FPCR bits which determine rounding 2799 * mode and denormal-flushing, and we do unfused multiplies and 2800 * additions with intermediate rounding of all products and sums. 2801 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2802 * and we perform a fused two-way sum-of-products without intermediate 2803 * rounding of the products. 2804 * In either case, we don't set fp exception flags. 2805 * 2806 * EBF is AArch64 only, so even if it's set in the FPCR it has 2807 * no effect on AArch32 instructions. 2808 */ 2809 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2810 2811 *statusp = is_a64(env) ? 
env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2812 set_default_nan_mode(true, statusp); 2813 2814 if (ebf) { 2815 /* EBF=1 needs to do a step with round-to-odd semantics */ 2816 *oddstatusp = *statusp; 2817 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2818 } else { 2819 set_flush_to_zero(true, statusp); 2820 set_flush_inputs_to_zero(true, statusp); 2821 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2822 } 2823 return ebf; 2824 } 2825 2826 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2827 { 2828 float32 t1, t2; 2829 2830 /* 2831 * Extract each BFloat16 from the element pair, and shift 2832 * them such that they become float32. 2833 */ 2834 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2835 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2836 t1 = float32_add(t1, t2, fpst); 2837 t1 = float32_add(sum, t1, fpst); 2838 2839 return t1; 2840 } 2841 2842 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2843 float_status *fpst, float_status *fpst_odd) 2844 { 2845 /* 2846 * Compare f16_dotadd() in sme_helper.c, but here we have 2847 * bfloat16 inputs. In particular that means that we do not 2848 * want the FPCR.FZ16 flush semantics, so we use the normal 2849 * float_status for the input handling here. 2850 */ 2851 float64 e1r = float32_to_float64(e1 << 16, fpst); 2852 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2853 float64 e2r = float32_to_float64(e2 << 16, fpst); 2854 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2855 float64 t64; 2856 float32 t32; 2857 2858 /* 2859 * The ARM pseudocode function FPDot performs both multiplies 2860 * and the add with a single rounding operation. Emulate this 2861 * by performing the first multiply in round-to-odd, then doing 2862 * the second multiply as fused multiply-add, and rounding to 2863 * float32 all in one step. 2864 */ 2865 t64 = float64_mul(e1r, e2r, fpst_odd); 2866 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2867 2868 /* This conversion is exact, because we've already rounded. */ 2869 t32 = float64_to_float32(t64, fpst); 2870 2871 /* The final accumulation step is not fused. 
*/ 2872 return float32_add(sum, t32, fpst); 2873 } 2874 2875 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2876 CPUARMState *env, uint32_t desc) 2877 { 2878 intptr_t i, opr_sz = simd_oprsz(desc); 2879 float32 *d = vd, *a = va; 2880 uint32_t *n = vn, *m = vm; 2881 float_status fpst, fpst_odd; 2882 2883 if (is_ebf(env, &fpst, &fpst_odd)) { 2884 for (i = 0; i < opr_sz / 4; ++i) { 2885 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2886 } 2887 } else { 2888 for (i = 0; i < opr_sz / 4; ++i) { 2889 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2890 } 2891 } 2892 clear_tail(d, opr_sz, simd_maxsz(desc)); 2893 } 2894 2895 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2896 void *va, CPUARMState *env, uint32_t desc) 2897 { 2898 intptr_t i, j, opr_sz = simd_oprsz(desc); 2899 intptr_t index = simd_data(desc); 2900 intptr_t elements = opr_sz / 4; 2901 intptr_t eltspersegment = MIN(16 / 4, elements); 2902 float32 *d = vd, *a = va; 2903 uint32_t *n = vn, *m = vm; 2904 float_status fpst, fpst_odd; 2905 2906 if (is_ebf(env, &fpst, &fpst_odd)) { 2907 for (i = 0; i < elements; i += eltspersegment) { 2908 uint32_t m_idx = m[i + H4(index)]; 2909 2910 for (j = i; j < i + eltspersegment; j++) { 2911 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2912 } 2913 } 2914 } else { 2915 for (i = 0; i < elements; i += eltspersegment) { 2916 uint32_t m_idx = m[i + H4(index)]; 2917 2918 for (j = i; j < i + eltspersegment; j++) { 2919 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2920 } 2921 } 2922 } 2923 clear_tail(d, opr_sz, simd_maxsz(desc)); 2924 } 2925 2926 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2927 CPUARMState *env, uint32_t desc) 2928 { 2929 intptr_t s, opr_sz = simd_oprsz(desc); 2930 float32 *d = vd, *a = va; 2931 uint32_t *n = vn, *m = vm; 2932 float_status fpst, fpst_odd; 2933 2934 if (is_ebf(env, &fpst, &fpst_odd)) { 2935 for (s = 0; s < opr_sz / 4; s += 4) { 2936 float32 sum00, sum01, sum10, sum11; 2937 2938 /* 2939 * Process the entire segment at once, writing back the 2940 * results only after we've consumed all of the inputs. 2941 * 2942 * Key to indices by column: 2943 * i j i k j k 2944 */ 2945 sum00 = a[s + H4(0 + 0)]; 2946 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2947 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2948 2949 sum01 = a[s + H4(0 + 1)]; 2950 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2951 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2952 2953 sum10 = a[s + H4(2 + 0)]; 2954 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2955 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2956 2957 sum11 = a[s + H4(2 + 1)]; 2958 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2959 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2960 2961 d[s + H4(0 + 0)] = sum00; 2962 d[s + H4(0 + 1)] = sum01; 2963 d[s + H4(2 + 0)] = sum10; 2964 d[s + H4(2 + 1)] = sum11; 2965 } 2966 } else { 2967 for (s = 0; s < opr_sz / 4; s += 4) { 2968 float32 sum00, sum01, sum10, sum11; 2969 2970 /* 2971 * Process the entire segment at once, writing back the 2972 * results only after we've consumed all of the inputs. 
2973 * 2974 * Key to indices by column: 2975 * i j i k j k 2976 */ 2977 sum00 = a[s + H4(0 + 0)]; 2978 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 2979 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 2980 2981 sum01 = a[s + H4(0 + 1)]; 2982 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 2983 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 2984 2985 sum10 = a[s + H4(2 + 0)]; 2986 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 2987 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 2988 2989 sum11 = a[s + H4(2 + 1)]; 2990 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 2991 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 2992 2993 d[s + H4(0 + 0)] = sum00; 2994 d[s + H4(0 + 1)] = sum01; 2995 d[s + H4(2 + 0)] = sum10; 2996 d[s + H4(2 + 1)] = sum11; 2997 } 2998 } 2999 clear_tail(d, opr_sz, simd_maxsz(desc)); 3000 } 3001 3002 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3003 float_status *stat, uint32_t desc) 3004 { 3005 intptr_t i, opr_sz = simd_oprsz(desc); 3006 intptr_t sel = simd_data(desc); 3007 float32 *d = vd, *a = va; 3008 bfloat16 *n = vn, *m = vm; 3009 3010 for (i = 0; i < opr_sz / 4; ++i) { 3011 float32 nn = n[H2(i * 2 + sel)] << 16; 3012 float32 mm = m[H2(i * 2 + sel)] << 16; 3013 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3014 } 3015 clear_tail(d, opr_sz, simd_maxsz(desc)); 3016 } 3017 3018 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3019 void *va, float_status *stat, uint32_t desc) 3020 { 3021 intptr_t i, j, opr_sz = simd_oprsz(desc); 3022 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3023 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3024 intptr_t elements = opr_sz / 4; 3025 intptr_t eltspersegment = MIN(16 / 4, elements); 3026 float32 *d = vd, *a = va; 3027 bfloat16 *n = vn, *m = vm; 3028 3029 for (i = 0; i < elements; i += eltspersegment) { 3030 float32 m_idx = m[H2(2 * i + index)] << 16; 3031 3032 for (j = i; j < i + eltspersegment; j++) { 3033 float32 n_j = n[H2(2 * j + sel)] << 16; 3034 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3035 } 3036 } 3037 clear_tail(d, opr_sz, simd_maxsz(desc)); 3038 } 3039 3040 #define DO_CLAMP(NAME, TYPE) \ 3041 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3042 { \ 3043 intptr_t i, opr_sz = simd_oprsz(desc); \ 3044 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3045 TYPE aa = *(TYPE *)(a + i); \ 3046 TYPE nn = *(TYPE *)(n + i); \ 3047 TYPE mm = *(TYPE *)(m + i); \ 3048 TYPE dd = MIN(MAX(aa, nn), mm); \ 3049 *(TYPE *)(d + i) = dd; \ 3050 } \ 3051 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3052 } 3053 3054 DO_CLAMP(gvec_sclamp_b, int8_t) 3055 DO_CLAMP(gvec_sclamp_h, int16_t) 3056 DO_CLAMP(gvec_sclamp_s, int32_t) 3057 DO_CLAMP(gvec_sclamp_d, int64_t) 3058 3059 DO_CLAMP(gvec_uclamp_b, uint8_t) 3060 DO_CLAMP(gvec_uclamp_h, uint16_t) 3061 DO_CLAMP(gvec_uclamp_s, uint32_t) 3062 DO_CLAMP(gvec_uclamp_d, uint64_t) 3063 3064 /* Bit count in each 8-bit word. 
*/ 3065 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3066 { 3067 intptr_t i, opr_sz = simd_oprsz(desc); 3068 uint8_t *d = vd, *n = vn; 3069 3070 for (i = 0; i < opr_sz; ++i) { 3071 d[i] = ctpop8(n[i]); 3072 } 3073 clear_tail(d, opr_sz, simd_maxsz(desc)); 3074 } 3075 3076 /* Reverse bits in each 8-bit word. */ 3077 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3078 { 3079 intptr_t i, opr_sz = simd_oprsz(desc); 3080 uint64_t *d = vd, *n = vn; 3081 3082 for (i = 0; i < opr_sz / 8; ++i) { 3083 d[i] = revbit64(bswap64(n[i])); 3084 } 3085 clear_tail(d, opr_sz, simd_maxsz(desc)); 3086 } 3087 3088 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3089 { 3090 intptr_t i, opr_sz = simd_oprsz(desc); 3091 uint32_t *d = vd, *n = vn; 3092 3093 for (i = 0; i < opr_sz / 4; ++i) { 3094 d[i] = helper_recpe_u32(n[i]); 3095 } 3096 clear_tail(d, opr_sz, simd_maxsz(desc)); 3097 } 3098 3099 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3100 { 3101 intptr_t i, opr_sz = simd_oprsz(desc); 3102 uint32_t *d = vd, *n = vn; 3103 3104 for (i = 0; i < opr_sz / 4; ++i) { 3105 d[i] = helper_rsqrte_u32(n[i]); 3106 } 3107 clear_tail(d, opr_sz, simd_maxsz(desc)); 3108 } 3109
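/*
 * For reference: gvec_rbit_b above relies on the fact that a full 64-bit
 * bit reversal also reverses the byte order, so following bswap64() with
 * revbit64() leaves each byte in place while reversing the bits inside it.
 * An illustrative per-byte cross-check (rbit8 is only an illustrative
 * name, not a helper defined elsewhere) would be:
 *
 *   uint8_t rbit8(uint8_t x)
 *   {
 *       uint8_t r = 0;
 *       for (int i = 0; i < 8; i++) {
 *           r |= ((x >> i) & 1) << (7 - i);
 *       }
 *       return r;
 *   }
 *
 * e.g. rbit8(0x01) == 0x80 and rbit8(0x0f) == 0xf0, matching the byte
 * lanes produced by revbit64(bswap64(x)).
 */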