1 /* 2 * ARM AdvSIMD / SVE Vector Operations 3 * 4 * Copyright (c) 2018 Linaro 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "qemu/osdep.h" 21 #include "cpu.h" 22 #include "exec/helper-proto.h" 23 #include "tcg/tcg-gvec-desc.h" 24 #include "fpu/softfloat.h" 25 #include "qemu/int128.h" 26 #include "crypto/clmul.h" 27 #include "vec_internal.h" 28 29 /* 30 * Data for expanding active predicate bits to bytes, for byte elements. 31 * 32 * for (i = 0; i < 256; ++i) { 33 * unsigned long m = 0; 34 * for (j = 0; j < 8; j++) { 35 * if ((i >> j) & 1) { 36 * m |= 0xfful << (j << 3); 37 * } 38 * } 39 * printf("0x%016lx,\n", m); 40 * } 41 */ 42 const uint64_t expand_pred_b_data[256] = { 43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, 44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, 45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, 46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, 47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, 48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, 49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, 50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, 51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, 52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, 53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, 54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, 55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, 56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, 57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, 58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, 59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, 60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, 61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, 62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, 63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, 64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, 65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, 66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, 67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, 68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, 69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, 70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, 71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, 73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, 74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, 75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, 76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, 77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, 78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, 79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, 80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, 81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, 82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, 83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, 84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, 85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, 86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, 87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, 88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, 89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, 90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, 91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, 92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, 93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, 94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, 95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, 96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, 97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, 98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, 99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, 101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, 102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, 103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, 104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, 105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, 106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, 107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, 108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, 109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, 110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, 111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, 112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, 113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, 114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, 115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, 116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, 117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, 118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, 119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, 120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, 121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, 122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, 123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, 124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, 125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, 126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, 127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, 128 0xffffffffffffffff, 129 }; 130 131 /* 132 * Similarly for half-word elements. 133 * for (i = 0; i < 256; ++i) { 134 * unsigned long m = 0; 135 * if (i & 0xaa) { 136 * continue; 137 * } 138 * for (j = 0; j < 8; j += 2) { 139 * if ((i >> j) & 1) { 140 * m |= 0xfffful << (j << 3); 141 * } 142 * } 143 * printf("[0x%x] = 0x%016lx,\n", i, m); 144 * } 145 */ 146 const uint64_t expand_pred_h_data[0x55 + 1] = { 147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, 148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, 149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, 150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, 151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, 152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, 153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, 154 [0x55] = 0xffffffffffffffff, 155 }; 156 157 /* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */ 158 int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3, 159 bool neg, bool round) 160 { 161 /* 162 * Simplify: 163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8 164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7 165 */ 166 int32_t ret = (int32_t)src1 * src2; 167 if (neg) { 168 ret = -ret; 169 } 170 ret += ((int32_t)src3 << 7) + (round << 6); 171 ret >>= 7; 172 173 if (ret != (int8_t)ret) { 174 ret = (ret < 0 ? INT8_MIN : INT8_MAX); 175 } 176 return ret; 177 } 178 179 void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm, 180 void *va, uint32_t desc) 181 { 182 intptr_t i, opr_sz = simd_oprsz(desc); 183 int8_t *d = vd, *n = vn, *m = vm, *a = va; 184 185 for (i = 0; i < opr_sz; ++i) { 186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true); 187 } 188 } 189 190 void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm, 191 void *va, uint32_t desc) 192 { 193 intptr_t i, opr_sz = simd_oprsz(desc); 194 int8_t *d = vd, *n = vn, *m = vm, *a = va; 195 196 for (i = 0; i < opr_sz; ++i) { 197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true); 198 } 199 } 200 201 void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 202 { 203 intptr_t i, opr_sz = simd_oprsz(desc); 204 int8_t *d = vd, *n = vn, *m = vm; 205 206 for (i = 0; i < opr_sz; ++i) { 207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false); 208 } 209 } 210 211 void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 212 { 213 intptr_t i, opr_sz = simd_oprsz(desc); 214 int8_t *d = vd, *n = vn, *m = vm; 215 216 for (i = 0; i < opr_sz; ++i) { 217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true); 218 } 219 } 220 221 /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ 222 int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3, 223 bool neg, bool round, uint32_t *sat) 224 { 225 /* Simplify similarly to do_sqrdmlah_b above. */ 226 int32_t ret = (int32_t)src1 * src2; 227 if (neg) { 228 ret = -ret; 229 } 230 ret += ((int32_t)src3 << 15) + (round << 14); 231 ret >>= 15; 232 233 if (ret != (int16_t)ret) { 234 *sat = 1; 235 ret = (ret < 0 ? INT16_MIN : INT16_MAX); 236 } 237 return ret; 238 } 239 240 uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, 241 uint32_t src2, uint32_t src3) 242 { 243 uint32_t *sat = &env->vfp.qc[0]; 244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat); 245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 246 false, true, sat); 247 return deposit32(e1, 16, 16, e2); 248 } 249 250 void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, 251 void *vq, uint32_t desc) 252 { 253 uintptr_t opr_sz = simd_oprsz(desc); 254 int16_t *d = vd; 255 int16_t *n = vn; 256 int16_t *m = vm; 257 uintptr_t i; 258 259 for (i = 0; i < opr_sz / 2; ++i) { 260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq); 261 } 262 clear_tail(d, opr_sz, simd_maxsz(desc)); 263 } 264 265 uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, 266 uint32_t src2, uint32_t src3) 267 { 268 uint32_t *sat = &env->vfp.qc[0]; 269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat); 270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16, 271 true, true, sat); 272 return deposit32(e1, 16, 16, e2); 273 } 274 275 void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, 276 void *vq, uint32_t desc) 277 { 278 uintptr_t opr_sz = simd_oprsz(desc); 279 int16_t *d = vd; 280 int16_t *n = vn; 281 int16_t *m = vm; 282 uintptr_t i; 283 284 for (i = 0; i < opr_sz / 2; ++i) { 285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq); 286 } 287 clear_tail(d, opr_sz, simd_maxsz(desc)); 288 } 289 290 void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm, 291 void *vq, uint32_t desc) 292 { 293 intptr_t i, opr_sz = simd_oprsz(desc); 294 int16_t *d = vd, *n = vn, *m = vm; 295 296 for (i = 0; i < opr_sz / 2; ++i) { 297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq); 298 } 299 clear_tail(d, opr_sz, simd_maxsz(desc)); 300 } 301 302 void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm, 303 void *vq, uint32_t desc) 304 { 305 intptr_t i, opr_sz = simd_oprsz(desc); 306 int16_t *d = vd, *n = vn, *m = vm; 307 308 for (i = 0; i < opr_sz / 2; ++i) { 309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq); 310 } 311 clear_tail(d, opr_sz, simd_maxsz(desc)); 312 } 313 314 void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm, 315 void *vq, uint32_t desc) 316 { 317 intptr_t i, j, opr_sz = simd_oprsz(desc); 318 int idx = simd_data(desc); 319 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 320 intptr_t elements = opr_sz / 2; 321 intptr_t eltspersegment = MIN(16 / 2, elements); 322 323 for (i = 0; i < elements; i += 16 / 2) { 324 int16_t mm = m[i]; 325 for (j = 0; j < eltspersegment; ++j) { 326 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq); 327 } 328 } 329 clear_tail(d, opr_sz, simd_maxsz(desc)); 330 } 331 332 void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, 333 void *vq, uint32_t desc) 334 { 335 intptr_t i, j, opr_sz = simd_oprsz(desc); 336 int idx = simd_data(desc); 337 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 338 intptr_t elements = opr_sz / 2; 339 intptr_t eltspersegment = MIN(16 / 2, elements); 340 341 for (i = 0; i < elements; i += 16 / 2) { 342 int16_t mm = m[i]; 343 for (j = 0; j < eltspersegment; ++j) { 344 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq); 345 } 346 } 347 clear_tail(d, opr_sz, simd_maxsz(desc)); 348 } 349 350 void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm, 351 void *vq, uint32_t desc) 352 { 353 intptr_t i, j, opr_sz = simd_oprsz(desc); 354 int idx = simd_data(desc); 355 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 356 intptr_t elements = opr_sz / 2; 357 intptr_t eltspersegment = MIN(16 / 2, elements); 358 359 for (i = 0; i < elements; i += 16 / 2) { 360 int16_t mm = m[i]; 361 for (j = 0; j < eltspersegment; ++j) { 362 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq); 363 } 364 } 365 clear_tail(d, opr_sz, simd_maxsz(desc)); 366 } 367 368 void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm, 369 void *vq, uint32_t desc) 370 { 371 intptr_t i, j, opr_sz = simd_oprsz(desc); 372 int idx = simd_data(desc); 373 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 374 intptr_t elements = opr_sz / 2; 375 intptr_t eltspersegment = MIN(16 / 2, elements); 376 377 for (i = 0; i < elements; i += 16 / 2) { 378 int16_t mm = m[i]; 379 for (j = 0; j < eltspersegment; ++j) { 380 d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq); 381 } 382 } 383 clear_tail(d, opr_sz, simd_maxsz(desc)); 384 } 385 386 void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm, 387 void *va, uint32_t desc) 388 { 389 intptr_t i, opr_sz = simd_oprsz(desc); 390 int16_t *d = vd, *n = vn, *m = vm, *a = va; 391 uint32_t discard; 392 393 for (i = 0; i < opr_sz / 2; ++i) { 394 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard); 395 } 396 } 397 398 void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm, 399 void *va, uint32_t desc) 400 { 401 intptr_t i, opr_sz = simd_oprsz(desc); 402 int16_t *d = vd, *n = vn, *m = vm, *a = va; 403 uint32_t discard; 404 405 for (i = 0; i < opr_sz / 2; ++i) { 406 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard); 407 } 408 } 409 410 void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 411 { 412 intptr_t i, opr_sz = simd_oprsz(desc); 413 int16_t *d = vd, *n = vn, *m = vm; 414 uint32_t discard; 415 416 for (i = 0; i < opr_sz / 2; ++i) { 417 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard); 418 } 419 } 420 421 void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 422 { 423 intptr_t i, opr_sz = simd_oprsz(desc); 424 int16_t *d = vd, *n = vn, *m = vm; 425 uint32_t discard; 426 427 for (i = 0; i < opr_sz / 2; ++i) { 428 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard); 429 } 430 } 431 432 void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 433 { 434 intptr_t i, j, opr_sz = simd_oprsz(desc); 435 int idx = simd_data(desc); 436 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 437 uint32_t discard; 438 439 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 440 int16_t mm = m[i]; 441 for (j = 0; j < 16 / 2; ++j) { 442 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard); 443 } 444 } 445 } 446 447 void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) 448 { 449 intptr_t i, j, opr_sz = simd_oprsz(desc); 450 int idx = simd_data(desc); 451 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx); 452 uint32_t discard; 453 454 for (i = 0; i < opr_sz / 2; i += 16 / 2) { 455 int16_t mm = m[i]; 456 for (j = 0; j < 16 / 2; ++j) { 457 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard); 458 } 459 } 460 } 461 462 /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ 463 int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3, 464 bool neg, bool round, uint32_t *sat) 465 { 466 /* Simplify similarly to do_sqrdmlah_b above. */ 467 int64_t ret = (int64_t)src1 * src2; 468 if (neg) { 469 ret = -ret; 470 } 471 ret += ((int64_t)src3 << 31) + (round << 30); 472 ret >>= 31; 473 474 if (ret != (int32_t)ret) { 475 *sat = 1; 476 ret = (ret < 0 ? INT32_MIN : INT32_MAX); 477 } 478 return ret; 479 } 480 481 uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, 482 int32_t src2, int32_t src3) 483 { 484 uint32_t *sat = &env->vfp.qc[0]; 485 return do_sqrdmlah_s(src1, src2, src3, false, true, sat); 486 } 487 488 void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, 489 void *vq, uint32_t desc) 490 { 491 uintptr_t opr_sz = simd_oprsz(desc); 492 int32_t *d = vd; 493 int32_t *n = vn; 494 int32_t *m = vm; 495 uintptr_t i; 496 497 for (i = 0; i < opr_sz / 4; ++i) { 498 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq); 499 } 500 clear_tail(d, opr_sz, simd_maxsz(desc)); 501 } 502 503 uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, 504 int32_t src2, int32_t src3) 505 { 506 uint32_t *sat = &env->vfp.qc[0]; 507 return do_sqrdmlah_s(src1, src2, src3, true, true, sat); 508 } 509 510 void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, 511 void *vq, uint32_t desc) 512 { 513 uintptr_t opr_sz = simd_oprsz(desc); 514 int32_t *d = vd; 515 int32_t *n = vn; 516 int32_t *m = vm; 517 uintptr_t i; 518 519 for (i = 0; i < opr_sz / 4; ++i) { 520 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq); 521 } 522 clear_tail(d, opr_sz, simd_maxsz(desc)); 523 } 524 525 void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm, 526 void *vq, uint32_t desc) 527 { 528 intptr_t i, opr_sz = simd_oprsz(desc); 529 int32_t *d = vd, *n = vn, *m = vm; 530 531 for (i = 0; i < opr_sz / 4; ++i) { 532 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq); 533 } 534 clear_tail(d, opr_sz, simd_maxsz(desc)); 535 } 536 537 void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm, 538 void *vq, uint32_t desc) 539 { 540 intptr_t i, opr_sz = simd_oprsz(desc); 541 int32_t *d = vd, *n = vn, *m = vm; 542 543 for (i = 0; i < opr_sz / 4; ++i) { 544 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq); 545 } 546 clear_tail(d, opr_sz, simd_maxsz(desc)); 547 } 548 549 void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm, 550 void *vq, uint32_t desc) 551 { 552 intptr_t i, j, opr_sz = simd_oprsz(desc); 553 int idx = simd_data(desc); 554 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 555 intptr_t elements = opr_sz / 4; 556 intptr_t eltspersegment = MIN(16 / 4, elements); 557 558 for (i = 0; i < elements; i += 16 / 4) { 559 int32_t mm = m[i]; 560 for (j = 0; j < eltspersegment; ++j) { 561 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq); 562 } 563 } 564 clear_tail(d, opr_sz, simd_maxsz(desc)); 565 } 566 567 void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, 568 void *vq, uint32_t desc) 569 { 570 intptr_t i, j, opr_sz = simd_oprsz(desc); 571 int idx = simd_data(desc); 572 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 573 intptr_t elements = opr_sz / 4; 574 intptr_t eltspersegment = MIN(16 / 4, elements); 575 576 for (i = 0; i < elements; i += 16 / 4) { 577 int32_t mm = m[i]; 578 for (j = 0; j < eltspersegment; ++j) { 579 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq); 580 } 581 } 582 clear_tail(d, opr_sz, simd_maxsz(desc)); 583 } 584 585 void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm, 586 void *vq, uint32_t desc) 587 { 588 intptr_t i, j, opr_sz = simd_oprsz(desc); 589 int idx = simd_data(desc); 590 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 591 intptr_t elements = opr_sz / 4; 592 intptr_t eltspersegment = MIN(16 / 4, elements); 593 594 for (i = 0; i < elements; i += 16 / 4) { 595 int32_t mm = m[i]; 596 for (j = 0; j < eltspersegment; ++j) { 597 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq); 598 } 599 } 600 clear_tail(d, opr_sz, simd_maxsz(desc)); 601 } 602 603 void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm, 604 void *vq, uint32_t desc) 605 { 606 intptr_t i, j, opr_sz = simd_oprsz(desc); 607 int idx = simd_data(desc); 608 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 609 intptr_t elements = opr_sz / 4; 610 intptr_t eltspersegment = MIN(16 / 4, elements); 611 612 for (i = 0; i < elements; i += 16 / 4) { 613 int32_t mm = m[i]; 614 for (j = 0; j < eltspersegment; ++j) { 615 d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq); 616 } 617 } 618 clear_tail(d, opr_sz, simd_maxsz(desc)); 619 } 620 621 void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm, 622 void *va, uint32_t desc) 623 { 624 intptr_t i, opr_sz = simd_oprsz(desc); 625 int32_t *d = vd, *n = vn, *m = vm, *a = va; 626 uint32_t discard; 627 628 for (i = 0; i < opr_sz / 4; ++i) { 629 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard); 630 } 631 } 632 633 void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm, 634 void *va, uint32_t desc) 635 { 636 intptr_t i, opr_sz = simd_oprsz(desc); 637 int32_t *d = vd, *n = vn, *m = vm, *a = va; 638 uint32_t discard; 639 640 for (i = 0; i < opr_sz / 4; ++i) { 641 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard); 642 } 643 } 644 645 void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 646 { 647 intptr_t i, opr_sz = simd_oprsz(desc); 648 int32_t *d = vd, *n = vn, *m = vm; 649 uint32_t discard; 650 651 for (i = 0; i < opr_sz / 4; ++i) { 652 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard); 653 } 654 } 655 656 void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 657 { 658 intptr_t i, opr_sz = simd_oprsz(desc); 659 int32_t *d = vd, *n = vn, *m = vm; 660 uint32_t discard; 661 662 for (i = 0; i < opr_sz / 4; ++i) { 663 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard); 664 } 665 } 666 667 void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 668 { 669 intptr_t i, j, opr_sz = simd_oprsz(desc); 670 int idx = simd_data(desc); 671 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 672 uint32_t discard; 673 674 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 675 int32_t mm = m[i]; 676 for (j = 0; j < 16 / 4; ++j) { 677 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard); 678 } 679 } 680 } 681 682 void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc) 683 { 684 intptr_t i, j, opr_sz = simd_oprsz(desc); 685 int idx = simd_data(desc); 686 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx); 687 uint32_t discard; 688 689 for (i = 0; i < opr_sz / 4; i += 16 / 4) { 690 int32_t mm = m[i]; 691 for (j = 0; j < 16 / 4; ++j) { 692 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard); 693 } 694 } 695 } 696 697 /* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */ 698 static int64_t do_sat128_d(Int128 r) 699 { 700 int64_t ls = int128_getlo(r); 701 int64_t hs = int128_gethi(r); 702 703 if (unlikely(hs != (ls >> 63))) { 704 return hs < 0 ? INT64_MIN : INT64_MAX; 705 } 706 return ls; 707 } 708 709 int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round) 710 { 711 uint64_t l, h; 712 Int128 r, t; 713 714 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */ 715 muls64(&l, &h, m, n); 716 r = int128_make128(l, h); 717 if (neg) { 718 r = int128_neg(r); 719 } 720 if (a) { 721 t = int128_exts64(a); 722 t = int128_lshift(t, 63); 723 r = int128_add(r, t); 724 } 725 if (round) { 726 t = int128_exts64(1ll << 62); 727 r = int128_add(r, t); 728 } 729 r = int128_rshift(r, 63); 730 731 return do_sat128_d(r); 732 } 733 734 void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm, 735 void *va, uint32_t desc) 736 { 737 intptr_t i, opr_sz = simd_oprsz(desc); 738 int64_t *d = vd, *n = vn, *m = vm, *a = va; 739 740 for (i = 0; i < opr_sz / 8; ++i) { 741 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true); 742 } 743 } 744 745 void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm, 746 void *va, uint32_t desc) 747 { 748 intptr_t i, opr_sz = simd_oprsz(desc); 749 int64_t *d = vd, *n = vn, *m = vm, *a = va; 750 751 for (i = 0; i < opr_sz / 8; ++i) { 752 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true); 753 } 754 } 755 756 void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 757 { 758 intptr_t i, opr_sz = simd_oprsz(desc); 759 int64_t *d = vd, *n = vn, *m = vm; 760 761 for (i = 0; i < opr_sz / 8; ++i) { 762 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false); 763 } 764 } 765 766 void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 767 { 768 intptr_t i, opr_sz = simd_oprsz(desc); 769 int64_t *d = vd, *n = vn, *m = vm; 770 771 for (i = 0; i < opr_sz / 8; ++i) { 772 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true); 773 } 774 } 775 776 void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 777 { 778 intptr_t i, j, opr_sz = simd_oprsz(desc); 779 int idx = simd_data(desc); 780 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 781 782 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 783 int64_t mm = m[i]; 784 for (j = 0; j < 16 / 8; ++j) { 785 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false); 786 } 787 } 788 } 789 790 void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc) 791 { 792 intptr_t i, j, opr_sz = simd_oprsz(desc); 793 int idx = simd_data(desc); 794 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx; 795 796 for (i = 0; i < opr_sz / 8; i += 16 / 8) { 797 int64_t mm = m[i]; 798 for (j = 0; j < 16 / 8; ++j) { 799 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true); 800 } 801 } 802 } 803 804 /* Integer 8 and 16-bit dot-product. 805 * 806 * Note that for the loops herein, host endianness does not matter 807 * with respect to the ordering of data within the quad-width lanes. 808 * All elements are treated equally, no matter where they are. 809 */ 810 811 #define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \ 812 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 813 { \ 814 intptr_t i, opr_sz = simd_oprsz(desc); \ 815 TYPED *d = vd, *a = va; \ 816 TYPEN *n = vn; \ 817 TYPEM *m = vm; \ 818 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \ 819 d[i] = (a[i] + \ 820 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \ 821 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \ 822 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \ 823 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \ 824 } \ 825 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 826 } 827 828 DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t) 829 DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t) 830 DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t) 831 DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t) 832 DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t) 833 834 #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ 835 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 836 { \ 837 intptr_t i = 0, opr_sz = simd_oprsz(desc); \ 838 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ 839 /* \ 840 * Special case: opr_sz == 8 from AA64/AA32 advsimd means the \ 841 * first iteration might not be a full 16 byte segment. But \ 842 * for vector lengths beyond that this must be SVE and we know \ 843 * opr_sz is a multiple of 16, so we need not clamp segend \ 844 * to opr_sz_n when we advance it at the end of the loop. \ 845 */ \ 846 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ 847 intptr_t index = simd_data(desc); \ 848 TYPED *d = vd, *a = va; \ 849 TYPEN *n = vn; \ 850 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ 851 do { \ 852 TYPED m0 = m_indexed[i * 4 + 0]; \ 853 TYPED m1 = m_indexed[i * 4 + 1]; \ 854 TYPED m2 = m_indexed[i * 4 + 2]; \ 855 TYPED m3 = m_indexed[i * 4 + 3]; \ 856 do { \ 857 d[i] = (a[i] + \ 858 n[i * 4 + 0] * m0 + \ 859 n[i * 4 + 1] * m1 + \ 860 n[i * 4 + 2] * m2 + \ 861 n[i * 4 + 3] * m3); \ 862 } while (++i < segend); \ 863 segend = i + (16 / sizeof(TYPED)); \ 864 } while (i < opr_sz_n); \ 865 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 866 } 867 868 DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) 869 DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) 870 DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) 871 DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) 872 DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) 873 DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) 874 875 void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, 876 float_status *fpst, uint32_t desc) 877 { 878 uintptr_t opr_sz = simd_oprsz(desc); 879 float16 *d = vd; 880 float16 *n = vn; 881 float16 *m = vm; 882 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 883 uint32_t neg_imag = neg_real ^ 1; 884 uintptr_t i; 885 886 /* Shift boolean to the sign bit so we can xor to negate. */ 887 neg_real <<= 15; 888 neg_imag <<= 15; 889 890 for (i = 0; i < opr_sz / 2; i += 2) { 891 float16 e0 = n[H2(i)]; 892 float16 e1 = m[H2(i + 1)] ^ neg_imag; 893 float16 e2 = n[H2(i + 1)]; 894 float16 e3 = m[H2(i)] ^ neg_real; 895 896 d[H2(i)] = float16_add(e0, e1, fpst); 897 d[H2(i + 1)] = float16_add(e2, e3, fpst); 898 } 899 clear_tail(d, opr_sz, simd_maxsz(desc)); 900 } 901 902 void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, 903 float_status *fpst, uint32_t desc) 904 { 905 uintptr_t opr_sz = simd_oprsz(desc); 906 float32 *d = vd; 907 float32 *n = vn; 908 float32 *m = vm; 909 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); 910 uint32_t neg_imag = neg_real ^ 1; 911 uintptr_t i; 912 913 /* Shift boolean to the sign bit so we can xor to negate. */ 914 neg_real <<= 31; 915 neg_imag <<= 31; 916 917 for (i = 0; i < opr_sz / 4; i += 2) { 918 float32 e0 = n[H4(i)]; 919 float32 e1 = m[H4(i + 1)] ^ neg_imag; 920 float32 e2 = n[H4(i + 1)]; 921 float32 e3 = m[H4(i)] ^ neg_real; 922 923 d[H4(i)] = float32_add(e0, e1, fpst); 924 d[H4(i + 1)] = float32_add(e2, e3, fpst); 925 } 926 clear_tail(d, opr_sz, simd_maxsz(desc)); 927 } 928 929 void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, 930 float_status *fpst, uint32_t desc) 931 { 932 uintptr_t opr_sz = simd_oprsz(desc); 933 float64 *d = vd; 934 float64 *n = vn; 935 float64 *m = vm; 936 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); 937 uint64_t neg_imag = neg_real ^ 1; 938 uintptr_t i; 939 940 /* Shift boolean to the sign bit so we can xor to negate. */ 941 neg_real <<= 63; 942 neg_imag <<= 63; 943 944 for (i = 0; i < opr_sz / 8; i += 2) { 945 float64 e0 = n[i]; 946 float64 e1 = m[i + 1] ^ neg_imag; 947 float64 e2 = n[i + 1]; 948 float64 e3 = m[i] ^ neg_real; 949 950 d[i] = float64_add(e0, e1, fpst); 951 d[i + 1] = float64_add(e2, e3, fpst); 952 } 953 clear_tail(d, opr_sz, simd_maxsz(desc)); 954 } 955 956 void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, 957 float_status *fpst, uint32_t desc) 958 { 959 uintptr_t opr_sz = simd_oprsz(desc); 960 float16 *d = vd, *n = vn, *m = vm, *a = va; 961 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 962 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 963 uint32_t neg_real = flip ^ neg_imag; 964 uintptr_t i; 965 966 /* Shift boolean to the sign bit so we can xor to negate. */ 967 neg_real <<= 15; 968 neg_imag <<= 15; 969 970 for (i = 0; i < opr_sz / 2; i += 2) { 971 float16 e2 = n[H2(i + flip)]; 972 float16 e1 = m[H2(i + flip)] ^ neg_real; 973 float16 e4 = e2; 974 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; 975 976 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); 977 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); 978 } 979 clear_tail(d, opr_sz, simd_maxsz(desc)); 980 } 981 982 void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, 983 float_status *fpst, uint32_t desc) 984 { 985 uintptr_t opr_sz = simd_oprsz(desc); 986 float16 *d = vd, *n = vn, *m = vm, *a = va; 987 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 988 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 989 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 990 uint32_t neg_real = flip ^ neg_imag; 991 intptr_t elements = opr_sz / sizeof(float16); 992 intptr_t eltspersegment = MIN(16 / sizeof(float16), elements); 993 intptr_t i, j; 994 995 /* Shift boolean to the sign bit so we can xor to negate. */ 996 neg_real <<= 15; 997 neg_imag <<= 15; 998 999 for (i = 0; i < elements; i += eltspersegment) { 1000 float16 mr = m[H2(i + 2 * index + 0)]; 1001 float16 mi = m[H2(i + 2 * index + 1)]; 1002 float16 e1 = neg_real ^ (flip ? mi : mr); 1003 float16 e3 = neg_imag ^ (flip ? mr : mi); 1004 1005 for (j = i; j < i + eltspersegment; j += 2) { 1006 float16 e2 = n[H2(j + flip)]; 1007 float16 e4 = e2; 1008 1009 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); 1010 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); 1011 } 1012 } 1013 clear_tail(d, opr_sz, simd_maxsz(desc)); 1014 } 1015 1016 void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, 1017 float_status *fpst, uint32_t desc) 1018 { 1019 uintptr_t opr_sz = simd_oprsz(desc); 1020 float32 *d = vd, *n = vn, *m = vm, *a = va; 1021 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1022 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1023 uint32_t neg_real = flip ^ neg_imag; 1024 uintptr_t i; 1025 1026 /* Shift boolean to the sign bit so we can xor to negate. */ 1027 neg_real <<= 31; 1028 neg_imag <<= 31; 1029 1030 for (i = 0; i < opr_sz / 4; i += 2) { 1031 float32 e2 = n[H4(i + flip)]; 1032 float32 e1 = m[H4(i + flip)] ^ neg_real; 1033 float32 e4 = e2; 1034 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; 1035 1036 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); 1037 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); 1038 } 1039 clear_tail(d, opr_sz, simd_maxsz(desc)); 1040 } 1041 1042 void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, 1043 float_status *fpst, uint32_t desc) 1044 { 1045 uintptr_t opr_sz = simd_oprsz(desc); 1046 float32 *d = vd, *n = vn, *m = vm, *a = va; 1047 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1048 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1049 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); 1050 uint32_t neg_real = flip ^ neg_imag; 1051 intptr_t elements = opr_sz / sizeof(float32); 1052 intptr_t eltspersegment = MIN(16 / sizeof(float32), elements); 1053 intptr_t i, j; 1054 1055 /* Shift boolean to the sign bit so we can xor to negate. */ 1056 neg_real <<= 31; 1057 neg_imag <<= 31; 1058 1059 for (i = 0; i < elements; i += eltspersegment) { 1060 float32 mr = m[H4(i + 2 * index + 0)]; 1061 float32 mi = m[H4(i + 2 * index + 1)]; 1062 float32 e1 = neg_real ^ (flip ? mi : mr); 1063 float32 e3 = neg_imag ^ (flip ? mr : mi); 1064 1065 for (j = i; j < i + eltspersegment; j += 2) { 1066 float32 e2 = n[H4(j + flip)]; 1067 float32 e4 = e2; 1068 1069 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); 1070 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); 1071 } 1072 } 1073 clear_tail(d, opr_sz, simd_maxsz(desc)); 1074 } 1075 1076 void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, 1077 float_status *fpst, uint32_t desc) 1078 { 1079 uintptr_t opr_sz = simd_oprsz(desc); 1080 float64 *d = vd, *n = vn, *m = vm, *a = va; 1081 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); 1082 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 1083 uint64_t neg_real = flip ^ neg_imag; 1084 uintptr_t i; 1085 1086 /* Shift boolean to the sign bit so we can xor to negate. */ 1087 neg_real <<= 63; 1088 neg_imag <<= 63; 1089 1090 for (i = 0; i < opr_sz / 8; i += 2) { 1091 float64 e2 = n[i + flip]; 1092 float64 e1 = m[i + flip] ^ neg_real; 1093 float64 e4 = e2; 1094 float64 e3 = m[i + 1 - flip] ^ neg_imag; 1095 1096 d[i] = float64_muladd(e2, e1, a[i], 0, fpst); 1097 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); 1098 } 1099 clear_tail(d, opr_sz, simd_maxsz(desc)); 1100 } 1101 1102 /* 1103 * Floating point comparisons producing an integer result (all 1s or all 0s). 1104 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. 1105 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. 1106 */ 1107 static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) 1108 { 1109 return -float16_eq_quiet(op1, op2, stat); 1110 } 1111 1112 static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) 1113 { 1114 return -float32_eq_quiet(op1, op2, stat); 1115 } 1116 1117 static uint64_t float64_ceq(float64 op1, float64 op2, float_status *stat) 1118 { 1119 return -float64_eq_quiet(op1, op2, stat); 1120 } 1121 1122 static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) 1123 { 1124 return -float16_le(op2, op1, stat); 1125 } 1126 1127 static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) 1128 { 1129 return -float32_le(op2, op1, stat); 1130 } 1131 1132 static uint64_t float64_cge(float64 op1, float64 op2, float_status *stat) 1133 { 1134 return -float64_le(op2, op1, stat); 1135 } 1136 1137 static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) 1138 { 1139 return -float16_lt(op2, op1, stat); 1140 } 1141 1142 static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) 1143 { 1144 return -float32_lt(op2, op1, stat); 1145 } 1146 1147 static uint64_t float64_cgt(float64 op1, float64 op2, float_status *stat) 1148 { 1149 return -float64_lt(op2, op1, stat); 1150 } 1151 1152 static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) 1153 { 1154 return -float16_le(float16_abs(op2), float16_abs(op1), stat); 1155 } 1156 1157 static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) 1158 { 1159 return -float32_le(float32_abs(op2), float32_abs(op1), stat); 1160 } 1161 1162 static uint64_t float64_acge(float64 op1, float64 op2, float_status *stat) 1163 { 1164 return -float64_le(float64_abs(op2), float64_abs(op1), stat); 1165 } 1166 1167 static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) 1168 { 1169 return -float16_lt(float16_abs(op2), float16_abs(op1), stat); 1170 } 1171 1172 static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) 1173 { 1174 return -float32_lt(float32_abs(op2), float32_abs(op1), stat); 1175 } 1176 1177 static uint64_t float64_acgt(float64 op1, float64 op2, float_status *stat) 1178 { 1179 return -float64_lt(float64_abs(op2), float64_abs(op1), stat); 1180 } 1181 1182 static int16_t vfp_tosszh(float16 x, float_status *fpst) 1183 { 1184 if (float16_is_any_nan(x)) { 1185 float_raise(float_flag_invalid, fpst); 1186 return 0; 1187 } 1188 return float16_to_int16_round_to_zero(x, fpst); 1189 } 1190 1191 static uint16_t vfp_touszh(float16 x, float_status *fpst) 1192 { 1193 if (float16_is_any_nan(x)) { 1194 float_raise(float_flag_invalid, fpst); 1195 return 0; 1196 } 1197 return float16_to_uint16_round_to_zero(x, fpst); 1198 } 1199 1200 #define DO_2OP(NAME, FUNC, TYPE) \ 1201 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 1202 { \ 1203 intptr_t i, oprsz = simd_oprsz(desc); \ 1204 TYPE *d = vd, *n = vn; \ 1205 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1206 d[i] = FUNC(n[i], stat); \ 1207 } \ 1208 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1209 } 1210 1211 DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) 1212 DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) 1213 DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) 1214 1215 DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) 1216 DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) 1217 DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) 1218 1219 DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) 1220 DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) 1221 1222 DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) 1223 DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) 1224 DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) 1225 DO_2OP(gvec_touizs, helper_vfp_touizs, float32) 1226 DO_2OP(gvec_sstoh, int16_to_float16, int16_t) 1227 DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) 1228 DO_2OP(gvec_tosszh, vfp_tosszh, float16) 1229 DO_2OP(gvec_touszh, vfp_touszh, float16) 1230 1231 #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ 1232 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1233 { \ 1234 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ 1235 } 1236 1237 #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ 1238 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ 1239 { \ 1240 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ 1241 } 1242 1243 #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ 1244 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ 1245 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ 1246 WRAP_CMP0_##DIRN(FN, CMPOP, float64) \ 1247 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ 1248 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) \ 1249 DO_2OP(gvec_f##FN##0_d, float64_##FN##0, float64) 1250 1251 DO_2OP_CMP0(cgt, cgt, FWD) 1252 DO_2OP_CMP0(cge, cge, FWD) 1253 DO_2OP_CMP0(ceq, ceq, FWD) 1254 DO_2OP_CMP0(clt, cgt, REV) 1255 DO_2OP_CMP0(cle, cge, REV) 1256 1257 #undef DO_2OP 1258 #undef DO_2OP_CMP0 1259 1260 /* Floating-point trigonometric starting value. 1261 * See the ARM ARM pseudocode function FPTrigSMul. 1262 */ 1263 static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) 1264 { 1265 float16 result = float16_mul(op1, op1, stat); 1266 if (!float16_is_any_nan(result)) { 1267 result = float16_set_sign(result, op2 & 1); 1268 } 1269 return result; 1270 } 1271 1272 static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) 1273 { 1274 float32 result = float32_mul(op1, op1, stat); 1275 if (!float32_is_any_nan(result)) { 1276 result = float32_set_sign(result, op2 & 1); 1277 } 1278 return result; 1279 } 1280 1281 static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) 1282 { 1283 float64 result = float64_mul(op1, op1, stat); 1284 if (!float64_is_any_nan(result)) { 1285 result = float64_set_sign(result, op2 & 1); 1286 } 1287 return result; 1288 } 1289 1290 static float16 float16_abd(float16 op1, float16 op2, float_status *stat) 1291 { 1292 return float16_abs(float16_sub(op1, op2, stat)); 1293 } 1294 1295 static float32 float32_abd(float32 op1, float32 op2, float_status *stat) 1296 { 1297 return float32_abs(float32_sub(op1, op2, stat)); 1298 } 1299 1300 static float64 float64_abd(float64 op1, float64 op2, float_status *stat) 1301 { 1302 return float64_abs(float64_sub(op1, op2, stat)); 1303 } 1304 1305 /* 1306 * Reciprocal step. These are the AArch32 version which uses a 1307 * non-fused multiply-and-subtract. 1308 */ 1309 static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) 1310 { 1311 op1 = float16_squash_input_denormal(op1, stat); 1312 op2 = float16_squash_input_denormal(op2, stat); 1313 1314 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1315 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1316 return float16_two; 1317 } 1318 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); 1319 } 1320 1321 static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) 1322 { 1323 op1 = float32_squash_input_denormal(op1, stat); 1324 op2 = float32_squash_input_denormal(op2, stat); 1325 1326 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1327 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1328 return float32_two; 1329 } 1330 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); 1331 } 1332 1333 /* Reciprocal square-root step. AArch32 non-fused semantics. */ 1334 static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) 1335 { 1336 op1 = float16_squash_input_denormal(op1, stat); 1337 op2 = float16_squash_input_denormal(op2, stat); 1338 1339 if ((float16_is_infinity(op1) && float16_is_zero(op2)) || 1340 (float16_is_infinity(op2) && float16_is_zero(op1))) { 1341 return float16_one_point_five; 1342 } 1343 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); 1344 return float16_div(op1, float16_two, stat); 1345 } 1346 1347 static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) 1348 { 1349 op1 = float32_squash_input_denormal(op1, stat); 1350 op2 = float32_squash_input_denormal(op2, stat); 1351 1352 if ((float32_is_infinity(op1) && float32_is_zero(op2)) || 1353 (float32_is_infinity(op2) && float32_is_zero(op1))) { 1354 return float32_one_point_five; 1355 } 1356 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); 1357 return float32_div(op1, float32_two, stat); 1358 } 1359 1360 #define DO_3OP(NAME, FUNC, TYPE) \ 1361 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1362 float_status *stat, uint32_t desc) \ 1363 { \ 1364 intptr_t i, oprsz = simd_oprsz(desc); \ 1365 TYPE *d = vd, *n = vn, *m = vm; \ 1366 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1367 d[i] = FUNC(n[i], m[i], stat); \ 1368 } \ 1369 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1370 } 1371 1372 DO_3OP(gvec_fadd_h, float16_add, float16) 1373 DO_3OP(gvec_fadd_s, float32_add, float32) 1374 DO_3OP(gvec_fadd_d, float64_add, float64) 1375 1376 DO_3OP(gvec_fsub_h, float16_sub, float16) 1377 DO_3OP(gvec_fsub_s, float32_sub, float32) 1378 DO_3OP(gvec_fsub_d, float64_sub, float64) 1379 1380 DO_3OP(gvec_fmul_h, float16_mul, float16) 1381 DO_3OP(gvec_fmul_s, float32_mul, float32) 1382 DO_3OP(gvec_fmul_d, float64_mul, float64) 1383 1384 DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) 1385 DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) 1386 DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) 1387 1388 DO_3OP(gvec_fabd_h, float16_abd, float16) 1389 DO_3OP(gvec_fabd_s, float32_abd, float32) 1390 DO_3OP(gvec_fabd_d, float64_abd, float64) 1391 1392 DO_3OP(gvec_fceq_h, float16_ceq, float16) 1393 DO_3OP(gvec_fceq_s, float32_ceq, float32) 1394 DO_3OP(gvec_fceq_d, float64_ceq, float64) 1395 1396 DO_3OP(gvec_fcge_h, float16_cge, float16) 1397 DO_3OP(gvec_fcge_s, float32_cge, float32) 1398 DO_3OP(gvec_fcge_d, float64_cge, float64) 1399 1400 DO_3OP(gvec_fcgt_h, float16_cgt, float16) 1401 DO_3OP(gvec_fcgt_s, float32_cgt, float32) 1402 DO_3OP(gvec_fcgt_d, float64_cgt, float64) 1403 1404 DO_3OP(gvec_facge_h, float16_acge, float16) 1405 DO_3OP(gvec_facge_s, float32_acge, float32) 1406 DO_3OP(gvec_facge_d, float64_acge, float64) 1407 1408 DO_3OP(gvec_facgt_h, float16_acgt, float16) 1409 DO_3OP(gvec_facgt_s, float32_acgt, float32) 1410 DO_3OP(gvec_facgt_d, float64_acgt, float64) 1411 1412 DO_3OP(gvec_fmax_h, float16_max, float16) 1413 DO_3OP(gvec_fmax_s, float32_max, float32) 1414 DO_3OP(gvec_fmax_d, float64_max, float64) 1415 1416 DO_3OP(gvec_fmin_h, float16_min, float16) 1417 DO_3OP(gvec_fmin_s, float32_min, float32) 1418 DO_3OP(gvec_fmin_d, float64_min, float64) 1419 1420 DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) 1421 DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) 1422 DO_3OP(gvec_fmaxnum_d, float64_maxnum, float64) 1423 1424 DO_3OP(gvec_fminnum_h, float16_minnum, float16) 1425 DO_3OP(gvec_fminnum_s, float32_minnum, float32) 1426 DO_3OP(gvec_fminnum_d, float64_minnum, float64) 1427 1428 DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) 1429 DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) 1430 1431 DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) 1432 DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) 1433 1434 #ifdef TARGET_AARCH64 1435 DO_3OP(gvec_fdiv_h, float16_div, float16) 1436 DO_3OP(gvec_fdiv_s, float32_div, float32) 1437 DO_3OP(gvec_fdiv_d, float64_div, float64) 1438 1439 DO_3OP(gvec_fmulx_h, helper_advsimd_mulxh, float16) 1440 DO_3OP(gvec_fmulx_s, helper_vfp_mulxs, float32) 1441 DO_3OP(gvec_fmulx_d, helper_vfp_mulxd, float64) 1442 1443 DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) 1444 DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) 1445 DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) 1446 1447 DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) 1448 DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) 1449 DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) 1450 1451 DO_3OP(gvec_ah_fmax_h, helper_vfp_ah_maxh, float16) 1452 DO_3OP(gvec_ah_fmax_s, helper_vfp_ah_maxs, float32) 1453 DO_3OP(gvec_ah_fmax_d, helper_vfp_ah_maxd, float64) 1454 1455 DO_3OP(gvec_ah_fmin_h, helper_vfp_ah_minh, float16) 1456 DO_3OP(gvec_ah_fmin_s, helper_vfp_ah_mins, float32) 1457 DO_3OP(gvec_ah_fmin_d, helper_vfp_ah_mind, float64) 1458 1459 #endif 1460 #undef DO_3OP 1461 1462 /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ 1463 static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, 1464 float_status *stat) 1465 { 1466 return float16_add(dest, float16_mul(op1, op2, stat), stat); 1467 } 1468 1469 static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, 1470 float_status *stat) 1471 { 1472 return float32_add(dest, float32_mul(op1, op2, stat), stat); 1473 } 1474 1475 static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, 1476 float_status *stat) 1477 { 1478 return float16_sub(dest, float16_mul(op1, op2, stat), stat); 1479 } 1480 1481 static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, 1482 float_status *stat) 1483 { 1484 return float32_sub(dest, float32_mul(op1, op2, stat), stat); 1485 } 1486 1487 /* Fused versions; these have the semantics Neon VFMA/VFMS want */ 1488 static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, 1489 float_status *stat) 1490 { 1491 return float16_muladd(op1, op2, dest, 0, stat); 1492 } 1493 1494 static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, 1495 float_status *stat) 1496 { 1497 return float32_muladd(op1, op2, dest, 0, stat); 1498 } 1499 1500 static float64 float64_muladd_f(float64 dest, float64 op1, float64 op2, 1501 float_status *stat) 1502 { 1503 return float64_muladd(op1, op2, dest, 0, stat); 1504 } 1505 1506 static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, 1507 float_status *stat) 1508 { 1509 return float16_muladd(float16_chs(op1), op2, dest, 0, stat); 1510 } 1511 1512 static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, 1513 float_status *stat) 1514 { 1515 return float32_muladd(float32_chs(op1), op2, dest, 0, stat); 1516 } 1517 1518 static float64 float64_mulsub_f(float64 dest, float64 op1, float64 op2, 1519 float_status *stat) 1520 { 1521 return float64_muladd(float64_chs(op1), op2, dest, 0, stat); 1522 } 1523 1524 #define DO_MULADD(NAME, FUNC, TYPE) \ 1525 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1526 float_status *stat, uint32_t desc) \ 1527 { \ 1528 intptr_t i, oprsz = simd_oprsz(desc); \ 1529 TYPE *d = vd, *n = vn, *m = vm; \ 1530 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1531 d[i] = FUNC(d[i], n[i], m[i], stat); \ 1532 } \ 1533 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1534 } 1535 1536 DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) 1537 DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) 1538 1539 DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) 1540 DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) 1541 1542 DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) 1543 DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) 1544 DO_MULADD(gvec_vfma_d, float64_muladd_f, float64) 1545 1546 DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) 1547 DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) 1548 DO_MULADD(gvec_vfms_d, float64_mulsub_f, float64) 1549 1550 /* For the indexed ops, SVE applies the index per 128-bit vector segment. 1551 * For AdvSIMD, there is of course only one such vector segment. 1552 */ 1553 1554 #define DO_MUL_IDX(NAME, TYPE, H) \ 1555 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 1556 { \ 1557 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1558 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1559 intptr_t idx = simd_data(desc); \ 1560 TYPE *d = vd, *n = vn, *m = vm; \ 1561 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1562 TYPE mm = m[H(i + idx)]; \ 1563 for (j = 0; j < segment; j++) { \ 1564 d[i + j] = n[i + j] * mm; \ 1565 } \ 1566 } \ 1567 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1568 } 1569 1570 DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) 1571 DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) 1572 DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) 1573 1574 #undef DO_MUL_IDX 1575 1576 #define DO_MLA_IDX(NAME, TYPE, OP, H) \ 1577 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 1578 { \ 1579 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1580 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1581 intptr_t idx = simd_data(desc); \ 1582 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1583 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1584 TYPE mm = m[H(i + idx)]; \ 1585 for (j = 0; j < segment; j++) { \ 1586 d[i + j] = a[i + j] OP n[i + j] * mm; \ 1587 } \ 1588 } \ 1589 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1590 } 1591 1592 DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) 1593 DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) 1594 DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) 1595 1596 DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) 1597 DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) 1598 DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) 1599 1600 #undef DO_MLA_IDX 1601 1602 #define DO_FMUL_IDX(NAME, ADD, MUL, TYPE, H) \ 1603 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 1604 float_status *stat, uint32_t desc) \ 1605 { \ 1606 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1607 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1608 intptr_t idx = simd_data(desc); \ 1609 TYPE *d = vd, *n = vn, *m = vm; \ 1610 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1611 TYPE mm = m[H(i + idx)]; \ 1612 for (j = 0; j < segment; j++) { \ 1613 d[i + j] = ADD(d[i + j], MUL(n[i + j], mm, stat), stat); \ 1614 } \ 1615 } \ 1616 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1617 } 1618 1619 #define nop(N, M, S) (M) 1620 1621 DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16_mul, float16, H2) 1622 DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32_mul, float32, H4) 1623 DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64_mul, float64, H8) 1624 1625 #ifdef TARGET_AARCH64 1626 1627 DO_FMUL_IDX(gvec_fmulx_idx_h, nop, helper_advsimd_mulxh, float16, H2) 1628 DO_FMUL_IDX(gvec_fmulx_idx_s, nop, helper_vfp_mulxs, float32, H4) 1629 DO_FMUL_IDX(gvec_fmulx_idx_d, nop, helper_vfp_mulxd, float64, H8) 1630 1631 #endif 1632 1633 #undef nop 1634 1635 /* 1636 * Non-fused multiply-accumulate operations, for Neon. NB that unlike 1637 * the fused ops below they assume accumulate both from and into Vd. 1638 */ 1639 DO_FMUL_IDX(gvec_fmla_nf_idx_h, float16_add, float16_mul, float16, H2) 1640 DO_FMUL_IDX(gvec_fmla_nf_idx_s, float32_add, float32_mul, float32, H4) 1641 DO_FMUL_IDX(gvec_fmls_nf_idx_h, float16_sub, float16_mul, float16, H2) 1642 DO_FMUL_IDX(gvec_fmls_nf_idx_s, float32_sub, float32_mul, float32, H4) 1643 1644 #undef DO_FMUL_IDX 1645 1646 #define DO_FMLA_IDX(NAME, TYPE, H) \ 1647 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ 1648 float_status *stat, uint32_t desc) \ 1649 { \ 1650 intptr_t i, j, oprsz = simd_oprsz(desc); \ 1651 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ 1652 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ 1653 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ 1654 TYPE *d = vd, *n = vn, *m = vm, *a = va; \ 1655 op1_neg <<= (8 * sizeof(TYPE) - 1); \ 1656 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ 1657 TYPE mm = m[H(i + idx)]; \ 1658 for (j = 0; j < segment; j++) { \ 1659 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ 1660 mm, a[i + j], 0, stat); \ 1661 } \ 1662 } \ 1663 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1664 } 1665 1666 DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) 1667 DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) 1668 DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) 1669 1670 #undef DO_FMLA_IDX 1671 1672 #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ 1673 void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ 1674 { \ 1675 intptr_t i, oprsz = simd_oprsz(desc); \ 1676 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ 1677 bool q = false; \ 1678 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ 1679 WTYPE dd = (WTYPE)n[i] OP m[i]; \ 1680 if (dd < MIN) { \ 1681 dd = MIN; \ 1682 q = true; \ 1683 } else if (dd > MAX) { \ 1684 dd = MAX; \ 1685 q = true; \ 1686 } \ 1687 d[i] = dd; \ 1688 } \ 1689 if (q) { \ 1690 uint32_t *qc = vq; \ 1691 qc[0] = 1; \ 1692 } \ 1693 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1694 } 1695 1696 DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) 1697 DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) 1698 DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) 1699 1700 DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) 1701 DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) 1702 DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) 1703 1704 DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) 1705 DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) 1706 DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) 1707 1708 DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) 1709 DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) 1710 DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) 1711 1712 DO_SAT(gvec_usqadd_b, int, uint8_t, int8_t, +, 0, UINT8_MAX) 1713 DO_SAT(gvec_usqadd_h, int, uint16_t, int16_t, +, 0, UINT16_MAX) 1714 DO_SAT(gvec_usqadd_s, int64_t, uint32_t, int32_t, +, 0, UINT32_MAX) 1715 1716 DO_SAT(gvec_suqadd_b, int, int8_t, uint8_t, +, INT8_MIN, INT8_MAX) 1717 DO_SAT(gvec_suqadd_h, int, int16_t, uint16_t, +, INT16_MIN, INT16_MAX) 1718 DO_SAT(gvec_suqadd_s, int64_t, int32_t, uint32_t, +, INT32_MIN, INT32_MAX) 1719 1720 #undef DO_SAT 1721 1722 void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, 1723 void *vm, uint32_t desc) 1724 { 1725 intptr_t i, oprsz = simd_oprsz(desc); 1726 uint64_t *d = vd, *n = vn, *m = vm; 1727 bool q = false; 1728 1729 for (i = 0; i < oprsz / 8; i++) { 1730 uint64_t nn = n[i], mm = m[i], dd = nn + mm; 1731 if (dd < nn) { 1732 dd = UINT64_MAX; 1733 q = true; 1734 } 1735 d[i] = dd; 1736 } 1737 if (q) { 1738 uint32_t *qc = vq; 1739 qc[0] = 1; 1740 } 1741 clear_tail(d, oprsz, simd_maxsz(desc)); 1742 } 1743 1744 void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, 1745 void *vm, uint32_t desc) 1746 { 1747 intptr_t i, oprsz = simd_oprsz(desc); 1748 uint64_t *d = vd, *n = vn, *m = vm; 1749 bool q = false; 1750 1751 for (i = 0; i < oprsz / 8; i++) { 1752 uint64_t nn = n[i], mm = m[i], dd = nn - mm; 1753 if (nn < mm) { 1754 dd = 0; 1755 q = true; 1756 } 1757 d[i] = dd; 1758 } 1759 if (q) { 1760 uint32_t *qc = vq; 1761 qc[0] = 1; 1762 } 1763 clear_tail(d, oprsz, simd_maxsz(desc)); 1764 } 1765 1766 void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, 1767 void *vm, uint32_t desc) 1768 { 1769 intptr_t i, oprsz = simd_oprsz(desc); 1770 int64_t *d = vd, *n = vn, *m = vm; 1771 bool q = false; 1772 1773 for (i = 0; i < oprsz / 8; i++) { 1774 int64_t nn = n[i], mm = m[i], dd = nn + mm; 1775 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { 1776 dd = (nn >> 63) ^ ~INT64_MIN; 1777 q = true; 1778 } 1779 d[i] = dd; 1780 } 1781 if (q) { 1782 uint32_t *qc = vq; 1783 qc[0] = 1; 1784 } 1785 clear_tail(d, oprsz, simd_maxsz(desc)); 1786 } 1787 1788 void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, 1789 void *vm, uint32_t desc) 1790 { 1791 intptr_t i, oprsz = simd_oprsz(desc); 1792 int64_t *d = vd, *n = vn, *m = vm; 1793 bool q = false; 1794 1795 for (i = 0; i < oprsz / 8; i++) { 1796 int64_t nn = n[i], mm = m[i], dd = nn - mm; 1797 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { 1798 dd = (nn >> 63) ^ ~INT64_MIN; 1799 q = true; 1800 } 1801 d[i] = dd; 1802 } 1803 if (q) { 1804 uint32_t *qc = vq; 1805 qc[0] = 1; 1806 } 1807 clear_tail(d, oprsz, simd_maxsz(desc)); 1808 } 1809 1810 void HELPER(gvec_usqadd_d)(void *vd, void *vq, void *vn, 1811 void *vm, uint32_t desc) 1812 { 1813 intptr_t i, oprsz = simd_oprsz(desc); 1814 uint64_t *d = vd, *n = vn, *m = vm; 1815 bool q = false; 1816 1817 for (i = 0; i < oprsz / 8; i++) { 1818 uint64_t nn = n[i]; 1819 int64_t mm = m[i]; 1820 uint64_t dd = nn + mm; 1821 1822 if (mm < 0) { 1823 if (nn < (uint64_t)-mm) { 1824 dd = 0; 1825 q = true; 1826 } 1827 } else { 1828 if (dd < nn) { 1829 dd = UINT64_MAX; 1830 q = true; 1831 } 1832 } 1833 d[i] = dd; 1834 } 1835 if (q) { 1836 uint32_t *qc = vq; 1837 qc[0] = 1; 1838 } 1839 clear_tail(d, oprsz, simd_maxsz(desc)); 1840 } 1841 1842 void HELPER(gvec_suqadd_d)(void *vd, void *vq, void *vn, 1843 void *vm, uint32_t desc) 1844 { 1845 intptr_t i, oprsz = simd_oprsz(desc); 1846 uint64_t *d = vd, *n = vn, *m = vm; 1847 bool q = false; 1848 1849 for (i = 0; i < oprsz / 8; i++) { 1850 int64_t nn = n[i]; 1851 uint64_t mm = m[i]; 1852 int64_t dd = nn + mm; 1853 1854 if (mm > (uint64_t)(INT64_MAX - nn)) { 1855 dd = INT64_MAX; 1856 q = true; 1857 } 1858 d[i] = dd; 1859 } 1860 if (q) { 1861 uint32_t *qc = vq; 1862 qc[0] = 1; 1863 } 1864 clear_tail(d, oprsz, simd_maxsz(desc)); 1865 } 1866 1867 #define DO_SRA(NAME, TYPE) \ 1868 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1869 { \ 1870 intptr_t i, oprsz = simd_oprsz(desc); \ 1871 int shift = simd_data(desc); \ 1872 TYPE *d = vd, *n = vn; \ 1873 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1874 d[i] += n[i] >> shift; \ 1875 } \ 1876 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1877 } 1878 1879 DO_SRA(gvec_ssra_b, int8_t) 1880 DO_SRA(gvec_ssra_h, int16_t) 1881 DO_SRA(gvec_ssra_s, int32_t) 1882 DO_SRA(gvec_ssra_d, int64_t) 1883 1884 DO_SRA(gvec_usra_b, uint8_t) 1885 DO_SRA(gvec_usra_h, uint16_t) 1886 DO_SRA(gvec_usra_s, uint32_t) 1887 DO_SRA(gvec_usra_d, uint64_t) 1888 1889 #undef DO_SRA 1890 1891 #define DO_RSHR(NAME, TYPE) \ 1892 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1893 { \ 1894 intptr_t i, oprsz = simd_oprsz(desc); \ 1895 int shift = simd_data(desc); \ 1896 TYPE *d = vd, *n = vn; \ 1897 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1898 TYPE tmp = n[i] >> (shift - 1); \ 1899 d[i] = (tmp >> 1) + (tmp & 1); \ 1900 } \ 1901 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1902 } 1903 1904 DO_RSHR(gvec_srshr_b, int8_t) 1905 DO_RSHR(gvec_srshr_h, int16_t) 1906 DO_RSHR(gvec_srshr_s, int32_t) 1907 DO_RSHR(gvec_srshr_d, int64_t) 1908 1909 DO_RSHR(gvec_urshr_b, uint8_t) 1910 DO_RSHR(gvec_urshr_h, uint16_t) 1911 DO_RSHR(gvec_urshr_s, uint32_t) 1912 DO_RSHR(gvec_urshr_d, uint64_t) 1913 1914 #undef DO_RSHR 1915 1916 #define DO_RSRA(NAME, TYPE) \ 1917 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1918 { \ 1919 intptr_t i, oprsz = simd_oprsz(desc); \ 1920 int shift = simd_data(desc); \ 1921 TYPE *d = vd, *n = vn; \ 1922 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1923 TYPE tmp = n[i] >> (shift - 1); \ 1924 d[i] += (tmp >> 1) + (tmp & 1); \ 1925 } \ 1926 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1927 } 1928 1929 DO_RSRA(gvec_srsra_b, int8_t) 1930 DO_RSRA(gvec_srsra_h, int16_t) 1931 DO_RSRA(gvec_srsra_s, int32_t) 1932 DO_RSRA(gvec_srsra_d, int64_t) 1933 1934 DO_RSRA(gvec_ursra_b, uint8_t) 1935 DO_RSRA(gvec_ursra_h, uint16_t) 1936 DO_RSRA(gvec_ursra_s, uint32_t) 1937 DO_RSRA(gvec_ursra_d, uint64_t) 1938 1939 #undef DO_RSRA 1940 1941 #define DO_SRI(NAME, TYPE) \ 1942 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1943 { \ 1944 intptr_t i, oprsz = simd_oprsz(desc); \ 1945 int shift = simd_data(desc); \ 1946 TYPE *d = vd, *n = vn; \ 1947 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1948 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \ 1949 } \ 1950 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1951 } 1952 1953 DO_SRI(gvec_sri_b, uint8_t) 1954 DO_SRI(gvec_sri_h, uint16_t) 1955 DO_SRI(gvec_sri_s, uint32_t) 1956 DO_SRI(gvec_sri_d, uint64_t) 1957 1958 #undef DO_SRI 1959 1960 #define DO_SLI(NAME, TYPE) \ 1961 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 1962 { \ 1963 intptr_t i, oprsz = simd_oprsz(desc); \ 1964 int shift = simd_data(desc); \ 1965 TYPE *d = vd, *n = vn; \ 1966 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 1967 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \ 1968 } \ 1969 clear_tail(d, oprsz, simd_maxsz(desc)); \ 1970 } 1971 1972 DO_SLI(gvec_sli_b, uint8_t) 1973 DO_SLI(gvec_sli_h, uint16_t) 1974 DO_SLI(gvec_sli_s, uint32_t) 1975 DO_SLI(gvec_sli_d, uint64_t) 1976 1977 #undef DO_SLI 1978 1979 /* 1980 * Convert float16 to float32, raising no exceptions and 1981 * preserving exceptional values, including SNaN. 1982 * This is effectively an unpack+repack operation. 1983 */ 1984 static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) 1985 { 1986 const int f16_bias = 15; 1987 const int f32_bias = 127; 1988 uint32_t sign = extract32(f16, 15, 1); 1989 uint32_t exp = extract32(f16, 10, 5); 1990 uint32_t frac = extract32(f16, 0, 10); 1991 1992 if (exp == 0x1f) { 1993 /* Inf or NaN */ 1994 exp = 0xff; 1995 } else if (exp == 0) { 1996 /* Zero or denormal. */ 1997 if (frac != 0) { 1998 if (fz16) { 1999 frac = 0; 2000 } else { 2001 /* 2002 * Denormal; these are all normal float32. 2003 * Shift the fraction so that the msb is at bit 11, 2004 * then remove bit 11 as the implicit bit of the 2005 * normalized float32. Note that we still go through 2006 * the shift for normal numbers below, to put the 2007 * float32 fraction at the right place. 2008 */ 2009 int shift = clz32(frac) - 21; 2010 frac = (frac << shift) & 0x3ff; 2011 exp = f32_bias - f16_bias - shift + 1; 2012 } 2013 } 2014 } else { 2015 /* Normal number; adjust the bias. */ 2016 exp += f32_bias - f16_bias; 2017 } 2018 sign <<= 31; 2019 exp <<= 23; 2020 frac <<= 23 - 10; 2021 2022 return sign | exp | frac; 2023 } 2024 2025 static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) 2026 { 2027 /* 2028 * Branchless load of u32[0], u64[0], u32[1], or u64[1]. 2029 * Load the 2nd qword iff is_q & is_2. 2030 * Shift to the 2nd dword iff !is_q & is_2. 2031 * For !is_q & !is_2, the upper bits of the result are garbage. 2032 */ 2033 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); 2034 } 2035 2036 /* 2037 * Note that FMLAL requires oprsz == 8 or oprsz == 16, 2038 * as there is not yet SVE versions that might use blocking. 2039 */ 2040 2041 static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, 2042 uint32_t desc, bool fz16) 2043 { 2044 intptr_t i, oprsz = simd_oprsz(desc); 2045 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2046 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2047 int is_q = oprsz == 16; 2048 uint64_t n_4, m_4; 2049 2050 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2051 n_4 = load4_f16(vn, is_q, is_2); 2052 m_4 = load4_f16(vm, is_q, is_2); 2053 2054 /* Negate all inputs for FMLSL at once. */ 2055 if (is_s) { 2056 n_4 ^= 0x8000800080008000ull; 2057 } 2058 2059 for (i = 0; i < oprsz / 4; i++) { 2060 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2061 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); 2062 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2063 } 2064 clear_tail(d, oprsz, simd_maxsz(desc)); 2065 } 2066 2067 void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, 2068 CPUARMState *env, uint32_t desc) 2069 { 2070 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2071 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2072 } 2073 2074 void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, 2075 CPUARMState *env, uint32_t desc) 2076 { 2077 do_fmlal(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2078 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2079 } 2080 2081 void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va, 2082 CPUARMState *env, uint32_t desc) 2083 { 2084 intptr_t i, oprsz = simd_oprsz(desc); 2085 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2086 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2087 float_status *status = &env->vfp.fp_status_a64; 2088 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2089 2090 for (i = 0; i < oprsz; i += sizeof(float32)) { 2091 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn; 2092 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel)); 2093 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2094 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2095 float32 aa = *(float32 *)(va + H1_4(i)); 2096 2097 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status); 2098 } 2099 } 2100 2101 static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, 2102 uint32_t desc, bool fz16) 2103 { 2104 intptr_t i, oprsz = simd_oprsz(desc); 2105 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); 2106 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); 2107 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); 2108 int is_q = oprsz == 16; 2109 uint64_t n_4; 2110 float32 m_1; 2111 2112 /* Pre-load all of the f16 data, avoiding overlap issues. */ 2113 n_4 = load4_f16(vn, is_q, is_2); 2114 2115 /* Negate all inputs for FMLSL at once. */ 2116 if (is_s) { 2117 n_4 ^= 0x8000800080008000ull; 2118 } 2119 2120 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); 2121 2122 for (i = 0; i < oprsz / 4; i++) { 2123 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); 2124 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); 2125 } 2126 clear_tail(d, oprsz, simd_maxsz(desc)); 2127 } 2128 2129 void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, 2130 CPUARMState *env, uint32_t desc) 2131 { 2132 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, 2133 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a32)); 2134 } 2135 2136 void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, 2137 CPUARMState *env, uint32_t desc) 2138 { 2139 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status_a64, desc, 2140 get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64)); 2141 } 2142 2143 void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va, 2144 CPUARMState *env, uint32_t desc) 2145 { 2146 intptr_t i, j, oprsz = simd_oprsz(desc); 2147 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15; 2148 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16); 2149 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16); 2150 float_status *status = &env->vfp.fp_status_a64; 2151 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16_a64); 2152 2153 for (i = 0; i < oprsz; i += 16) { 2154 float16 mm_16 = *(float16 *)(vm + i + idx); 2155 float32 mm = float16_to_float32_by_bits(mm_16, fz16); 2156 2157 for (j = 0; j < 16; j += sizeof(float32)) { 2158 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn; 2159 float32 nn = float16_to_float32_by_bits(nn_16, fz16); 2160 float32 aa = *(float32 *)(va + H1_4(i + j)); 2161 2162 *(float32 *)(vd + H1_4(i + j)) = 2163 float32_muladd(nn, mm, aa, 0, status); 2164 } 2165 } 2166 } 2167 2168 void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2169 { 2170 intptr_t i, opr_sz = simd_oprsz(desc); 2171 int8_t *d = vd, *n = vn, *m = vm; 2172 2173 for (i = 0; i < opr_sz; ++i) { 2174 int8_t mm = m[i]; 2175 int8_t nn = n[i]; 2176 int8_t res = 0; 2177 if (mm >= 0) { 2178 if (mm < 8) { 2179 res = nn << mm; 2180 } 2181 } else { 2182 res = nn >> (mm > -8 ? -mm : 7); 2183 } 2184 d[i] = res; 2185 } 2186 clear_tail(d, opr_sz, simd_maxsz(desc)); 2187 } 2188 2189 void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2190 { 2191 intptr_t i, opr_sz = simd_oprsz(desc); 2192 int16_t *d = vd, *n = vn, *m = vm; 2193 2194 for (i = 0; i < opr_sz / 2; ++i) { 2195 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2196 int16_t nn = n[i]; 2197 int16_t res = 0; 2198 if (mm >= 0) { 2199 if (mm < 16) { 2200 res = nn << mm; 2201 } 2202 } else { 2203 res = nn >> (mm > -16 ? -mm : 15); 2204 } 2205 d[i] = res; 2206 } 2207 clear_tail(d, opr_sz, simd_maxsz(desc)); 2208 } 2209 2210 void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) 2211 { 2212 intptr_t i, opr_sz = simd_oprsz(desc); 2213 uint8_t *d = vd, *n = vn, *m = vm; 2214 2215 for (i = 0; i < opr_sz; ++i) { 2216 int8_t mm = m[i]; 2217 uint8_t nn = n[i]; 2218 uint8_t res = 0; 2219 if (mm >= 0) { 2220 if (mm < 8) { 2221 res = nn << mm; 2222 } 2223 } else { 2224 if (mm > -8) { 2225 res = nn >> -mm; 2226 } 2227 } 2228 d[i] = res; 2229 } 2230 clear_tail(d, opr_sz, simd_maxsz(desc)); 2231 } 2232 2233 void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) 2234 { 2235 intptr_t i, opr_sz = simd_oprsz(desc); 2236 uint16_t *d = vd, *n = vn, *m = vm; 2237 2238 for (i = 0; i < opr_sz / 2; ++i) { 2239 int8_t mm = m[i]; /* only 8 bits of shift are significant */ 2240 uint16_t nn = n[i]; 2241 uint16_t res = 0; 2242 if (mm >= 0) { 2243 if (mm < 16) { 2244 res = nn << mm; 2245 } 2246 } else { 2247 if (mm > -16) { 2248 res = nn >> -mm; 2249 } 2250 } 2251 d[i] = res; 2252 } 2253 clear_tail(d, opr_sz, simd_maxsz(desc)); 2254 } 2255 2256 /* 2257 * 8x8->8 polynomial multiply. 2258 * 2259 * Polynomial multiplication is like integer multiplication except the 2260 * partial products are XORed, not added. 2261 * 2262 * TODO: expose this as a generic vector operation, as it is a common 2263 * crypto building block. 2264 */ 2265 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) 2266 { 2267 intptr_t i, opr_sz = simd_oprsz(desc); 2268 uint64_t *d = vd, *n = vn, *m = vm; 2269 2270 for (i = 0; i < opr_sz / 8; ++i) { 2271 d[i] = clmul_8x8_low(n[i], m[i]); 2272 } 2273 clear_tail(d, opr_sz, simd_maxsz(desc)); 2274 } 2275 2276 /* 2277 * 64x64->128 polynomial multiply. 2278 * Because of the lanes are not accessed in strict columns, 2279 * this probably cannot be turned into a generic helper. 2280 */ 2281 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) 2282 { 2283 intptr_t i, opr_sz = simd_oprsz(desc); 2284 intptr_t hi = simd_data(desc); 2285 uint64_t *d = vd, *n = vn, *m = vm; 2286 2287 for (i = 0; i < opr_sz / 8; i += 2) { 2288 Int128 r = clmul_64(n[i + hi], m[i + hi]); 2289 d[i] = int128_getlo(r); 2290 d[i + 1] = int128_gethi(r); 2291 } 2292 clear_tail(d, opr_sz, simd_maxsz(desc)); 2293 } 2294 2295 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2296 { 2297 int hi = simd_data(desc); 2298 uint64_t *d = vd, *n = vn, *m = vm; 2299 uint64_t nn = n[hi], mm = m[hi]; 2300 2301 d[0] = clmul_8x4_packed(nn, mm); 2302 nn >>= 32; 2303 mm >>= 32; 2304 d[1] = clmul_8x4_packed(nn, mm); 2305 2306 clear_tail(d, 16, simd_maxsz(desc)); 2307 } 2308 2309 #ifdef TARGET_AARCH64 2310 void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) 2311 { 2312 int shift = simd_data(desc) * 8; 2313 intptr_t i, opr_sz = simd_oprsz(desc); 2314 uint64_t *d = vd, *n = vn, *m = vm; 2315 2316 for (i = 0; i < opr_sz / 8; ++i) { 2317 d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); 2318 } 2319 } 2320 2321 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) 2322 { 2323 intptr_t sel = H4(simd_data(desc)); 2324 intptr_t i, opr_sz = simd_oprsz(desc); 2325 uint32_t *n = vn, *m = vm; 2326 uint64_t *d = vd; 2327 2328 for (i = 0; i < opr_sz / 8; ++i) { 2329 d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); 2330 } 2331 } 2332 #endif 2333 2334 #define DO_CMP0(NAME, TYPE, OP) \ 2335 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \ 2336 { \ 2337 intptr_t i, opr_sz = simd_oprsz(desc); \ 2338 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 2339 TYPE nn = *(TYPE *)(vn + i); \ 2340 *(TYPE *)(vd + i) = -(nn OP 0); \ 2341 } \ 2342 clear_tail(vd, opr_sz, simd_maxsz(desc)); \ 2343 } 2344 2345 DO_CMP0(gvec_ceq0_b, int8_t, ==) 2346 DO_CMP0(gvec_clt0_b, int8_t, <) 2347 DO_CMP0(gvec_cle0_b, int8_t, <=) 2348 DO_CMP0(gvec_cgt0_b, int8_t, >) 2349 DO_CMP0(gvec_cge0_b, int8_t, >=) 2350 2351 DO_CMP0(gvec_ceq0_h, int16_t, ==) 2352 DO_CMP0(gvec_clt0_h, int16_t, <) 2353 DO_CMP0(gvec_cle0_h, int16_t, <=) 2354 DO_CMP0(gvec_cgt0_h, int16_t, >) 2355 DO_CMP0(gvec_cge0_h, int16_t, >=) 2356 2357 #undef DO_CMP0 2358 2359 #define DO_ABD(NAME, TYPE) \ 2360 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2361 { \ 2362 intptr_t i, opr_sz = simd_oprsz(desc); \ 2363 TYPE *d = vd, *n = vn, *m = vm; \ 2364 \ 2365 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2366 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2367 } \ 2368 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2369 } 2370 2371 DO_ABD(gvec_sabd_b, int8_t) 2372 DO_ABD(gvec_sabd_h, int16_t) 2373 DO_ABD(gvec_sabd_s, int32_t) 2374 DO_ABD(gvec_sabd_d, int64_t) 2375 2376 DO_ABD(gvec_uabd_b, uint8_t) 2377 DO_ABD(gvec_uabd_h, uint16_t) 2378 DO_ABD(gvec_uabd_s, uint32_t) 2379 DO_ABD(gvec_uabd_d, uint64_t) 2380 2381 #undef DO_ABD 2382 2383 #define DO_ABA(NAME, TYPE) \ 2384 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2385 { \ 2386 intptr_t i, opr_sz = simd_oprsz(desc); \ 2387 TYPE *d = vd, *n = vn, *m = vm; \ 2388 \ 2389 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \ 2390 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \ 2391 } \ 2392 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 2393 } 2394 2395 DO_ABA(gvec_saba_b, int8_t) 2396 DO_ABA(gvec_saba_h, int16_t) 2397 DO_ABA(gvec_saba_s, int32_t) 2398 DO_ABA(gvec_saba_d, int64_t) 2399 2400 DO_ABA(gvec_uaba_b, uint8_t) 2401 DO_ABA(gvec_uaba_h, uint16_t) 2402 DO_ABA(gvec_uaba_s, uint32_t) 2403 DO_ABA(gvec_uaba_d, uint64_t) 2404 2405 #undef DO_ABA 2406 2407 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2408 void HELPER(NAME)(void *vd, void *vn, void *vm, \ 2409 float_status *stat, uint32_t desc) \ 2410 { \ 2411 ARMVectorReg scratch; \ 2412 intptr_t oprsz = simd_oprsz(desc); \ 2413 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2414 TYPE *d = vd, *n = vn, *m = vm; \ 2415 if (unlikely(d == m)) { \ 2416 m = memcpy(&scratch, m, oprsz); \ 2417 } \ 2418 for (intptr_t i = 0; i < half; ++i) { \ 2419 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)], stat); \ 2420 } \ 2421 for (intptr_t i = 0; i < half; ++i) { \ 2422 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)], stat); \ 2423 } \ 2424 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2425 } 2426 2427 DO_3OP_PAIR(gvec_faddp_h, float16_add, float16, H2) 2428 DO_3OP_PAIR(gvec_faddp_s, float32_add, float32, H4) 2429 DO_3OP_PAIR(gvec_faddp_d, float64_add, float64, ) 2430 2431 DO_3OP_PAIR(gvec_fmaxp_h, float16_max, float16, H2) 2432 DO_3OP_PAIR(gvec_fmaxp_s, float32_max, float32, H4) 2433 DO_3OP_PAIR(gvec_fmaxp_d, float64_max, float64, ) 2434 2435 DO_3OP_PAIR(gvec_fminp_h, float16_min, float16, H2) 2436 DO_3OP_PAIR(gvec_fminp_s, float32_min, float32, H4) 2437 DO_3OP_PAIR(gvec_fminp_d, float64_min, float64, ) 2438 2439 DO_3OP_PAIR(gvec_fmaxnump_h, float16_maxnum, float16, H2) 2440 DO_3OP_PAIR(gvec_fmaxnump_s, float32_maxnum, float32, H4) 2441 DO_3OP_PAIR(gvec_fmaxnump_d, float64_maxnum, float64, ) 2442 2443 DO_3OP_PAIR(gvec_fminnump_h, float16_minnum, float16, H2) 2444 DO_3OP_PAIR(gvec_fminnump_s, float32_minnum, float32, H4) 2445 DO_3OP_PAIR(gvec_fminnump_d, float64_minnum, float64, ) 2446 2447 #ifdef TARGET_AARCH64 2448 DO_3OP_PAIR(gvec_ah_fmaxp_h, helper_vfp_ah_maxh, float16, H2) 2449 DO_3OP_PAIR(gvec_ah_fmaxp_s, helper_vfp_ah_maxs, float32, H4) 2450 DO_3OP_PAIR(gvec_ah_fmaxp_d, helper_vfp_ah_maxd, float64, ) 2451 2452 DO_3OP_PAIR(gvec_ah_fminp_h, helper_vfp_ah_minh, float16, H2) 2453 DO_3OP_PAIR(gvec_ah_fminp_s, helper_vfp_ah_mins, float32, H4) 2454 DO_3OP_PAIR(gvec_ah_fminp_d, helper_vfp_ah_mind, float64, ) 2455 #endif 2456 2457 #undef DO_3OP_PAIR 2458 2459 #define DO_3OP_PAIR(NAME, FUNC, TYPE, H) \ 2460 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ 2461 { \ 2462 ARMVectorReg scratch; \ 2463 intptr_t oprsz = simd_oprsz(desc); \ 2464 intptr_t half = oprsz / sizeof(TYPE) / 2; \ 2465 TYPE *d = vd, *n = vn, *m = vm; \ 2466 if (unlikely(d == m)) { \ 2467 m = memcpy(&scratch, m, oprsz); \ 2468 } \ 2469 for (intptr_t i = 0; i < half; ++i) { \ 2470 d[H(i)] = FUNC(n[H(i * 2)], n[H(i * 2 + 1)]); \ 2471 } \ 2472 for (intptr_t i = 0; i < half; ++i) { \ 2473 d[H(i + half)] = FUNC(m[H(i * 2)], m[H(i * 2 + 1)]); \ 2474 } \ 2475 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2476 } 2477 2478 #define ADD(A, B) (A + B) 2479 DO_3OP_PAIR(gvec_addp_b, ADD, uint8_t, H1) 2480 DO_3OP_PAIR(gvec_addp_h, ADD, uint16_t, H2) 2481 DO_3OP_PAIR(gvec_addp_s, ADD, uint32_t, H4) 2482 DO_3OP_PAIR(gvec_addp_d, ADD, uint64_t, ) 2483 #undef ADD 2484 2485 DO_3OP_PAIR(gvec_smaxp_b, MAX, int8_t, H1) 2486 DO_3OP_PAIR(gvec_smaxp_h, MAX, int16_t, H2) 2487 DO_3OP_PAIR(gvec_smaxp_s, MAX, int32_t, H4) 2488 2489 DO_3OP_PAIR(gvec_umaxp_b, MAX, uint8_t, H1) 2490 DO_3OP_PAIR(gvec_umaxp_h, MAX, uint16_t, H2) 2491 DO_3OP_PAIR(gvec_umaxp_s, MAX, uint32_t, H4) 2492 2493 DO_3OP_PAIR(gvec_sminp_b, MIN, int8_t, H1) 2494 DO_3OP_PAIR(gvec_sminp_h, MIN, int16_t, H2) 2495 DO_3OP_PAIR(gvec_sminp_s, MIN, int32_t, H4) 2496 2497 DO_3OP_PAIR(gvec_uminp_b, MIN, uint8_t, H1) 2498 DO_3OP_PAIR(gvec_uminp_h, MIN, uint16_t, H2) 2499 DO_3OP_PAIR(gvec_uminp_s, MIN, uint32_t, H4) 2500 2501 #undef DO_3OP_PAIR 2502 2503 #define DO_VCVT_FIXED(NAME, FUNC, TYPE) \ 2504 void HELPER(NAME)(void *vd, void *vn, float_status *stat, uint32_t desc) \ 2505 { \ 2506 intptr_t i, oprsz = simd_oprsz(desc); \ 2507 int shift = simd_data(desc); \ 2508 TYPE *d = vd, *n = vn; \ 2509 float_status *fpst = stat; \ 2510 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2511 d[i] = FUNC(n[i], shift, fpst); \ 2512 } \ 2513 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2514 } 2515 2516 DO_VCVT_FIXED(gvec_vcvt_sd, helper_vfp_sqtod, uint64_t) 2517 DO_VCVT_FIXED(gvec_vcvt_ud, helper_vfp_uqtod, uint64_t) 2518 DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t) 2519 DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t) 2520 DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t) 2521 DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t) 2522 2523 DO_VCVT_FIXED(gvec_vcvt_rz_ds, helper_vfp_tosqd_round_to_zero, uint64_t) 2524 DO_VCVT_FIXED(gvec_vcvt_rz_du, helper_vfp_touqd_round_to_zero, uint64_t) 2525 DO_VCVT_FIXED(gvec_vcvt_rz_fs, helper_vfp_tosls_round_to_zero, uint32_t) 2526 DO_VCVT_FIXED(gvec_vcvt_rz_fu, helper_vfp_touls_round_to_zero, uint32_t) 2527 DO_VCVT_FIXED(gvec_vcvt_rz_hs, helper_vfp_toshh_round_to_zero, uint16_t) 2528 DO_VCVT_FIXED(gvec_vcvt_rz_hu, helper_vfp_touhh_round_to_zero, uint16_t) 2529 2530 #undef DO_VCVT_FIXED 2531 2532 #define DO_VCVT_RMODE(NAME, FUNC, TYPE) \ 2533 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2534 { \ 2535 intptr_t i, oprsz = simd_oprsz(desc); \ 2536 uint32_t rmode = simd_data(desc); \ 2537 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2538 TYPE *d = vd, *n = vn; \ 2539 set_float_rounding_mode(rmode, fpst); \ 2540 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2541 d[i] = FUNC(n[i], 0, fpst); \ 2542 } \ 2543 set_float_rounding_mode(prev_rmode, fpst); \ 2544 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2545 } 2546 2547 DO_VCVT_RMODE(gvec_vcvt_rm_sd, helper_vfp_tosqd, uint64_t) 2548 DO_VCVT_RMODE(gvec_vcvt_rm_ud, helper_vfp_touqd, uint64_t) 2549 DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t) 2550 DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t) 2551 DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t) 2552 DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t) 2553 2554 #undef DO_VCVT_RMODE 2555 2556 #define DO_VRINT_RMODE(NAME, FUNC, TYPE) \ 2557 void HELPER(NAME)(void *vd, void *vn, float_status *fpst, uint32_t desc) \ 2558 { \ 2559 intptr_t i, oprsz = simd_oprsz(desc); \ 2560 uint32_t rmode = simd_data(desc); \ 2561 uint32_t prev_rmode = get_float_rounding_mode(fpst); \ 2562 TYPE *d = vd, *n = vn; \ 2563 set_float_rounding_mode(rmode, fpst); \ 2564 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ 2565 d[i] = FUNC(n[i], fpst); \ 2566 } \ 2567 set_float_rounding_mode(prev_rmode, fpst); \ 2568 clear_tail(d, oprsz, simd_maxsz(desc)); \ 2569 } 2570 2571 DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) 2572 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) 2573 2574 #undef DO_VRINT_RMODE 2575 2576 #ifdef TARGET_AARCH64 2577 void HELPER(simd_tblx)(void *vd, void *vm, CPUARMState *env, uint32_t desc) 2578 { 2579 const uint8_t *indices = vm; 2580 size_t oprsz = simd_oprsz(desc); 2581 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); 2582 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); 2583 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); 2584 union { 2585 uint8_t b[16]; 2586 uint64_t d[2]; 2587 } result; 2588 2589 /* 2590 * We must construct the final result in a temp, lest the output 2591 * overlaps the input table. For TBL, begin with zero; for TBX, 2592 * begin with the original register contents. Note that we always 2593 * copy 16 bytes here to avoid an extra branch; clearing the high 2594 * bits of the register for oprsz == 8 is handled below. 2595 */ 2596 if (is_tbx) { 2597 memcpy(&result, vd, 16); 2598 } else { 2599 memset(&result, 0, 16); 2600 } 2601 2602 for (size_t i = 0; i < oprsz; ++i) { 2603 uint32_t index = indices[H1(i)]; 2604 2605 if (index < table_len) { 2606 /* 2607 * Convert index (a byte offset into the virtual table 2608 * which is a series of 128-bit vectors concatenated) 2609 * into the correct register element, bearing in mind 2610 * that the table can wrap around from V31 to V0. 2611 */ 2612 const uint8_t *table = (const uint8_t *) 2613 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); 2614 result.b[H1(i)] = table[H1(index % 16)]; 2615 } 2616 } 2617 2618 memcpy(vd, &result, 16); 2619 clear_tail(vd, oprsz, simd_maxsz(desc)); 2620 } 2621 #endif 2622 2623 /* 2624 * NxN -> N highpart multiply 2625 * 2626 * TODO: expose this as a generic vector operation. 2627 */ 2628 2629 void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2630 { 2631 intptr_t i, opr_sz = simd_oprsz(desc); 2632 int8_t *d = vd, *n = vn, *m = vm; 2633 2634 for (i = 0; i < opr_sz; ++i) { 2635 d[i] = ((int32_t)n[i] * m[i]) >> 8; 2636 } 2637 clear_tail(d, opr_sz, simd_maxsz(desc)); 2638 } 2639 2640 void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2641 { 2642 intptr_t i, opr_sz = simd_oprsz(desc); 2643 int16_t *d = vd, *n = vn, *m = vm; 2644 2645 for (i = 0; i < opr_sz / 2; ++i) { 2646 d[i] = ((int32_t)n[i] * m[i]) >> 16; 2647 } 2648 clear_tail(d, opr_sz, simd_maxsz(desc)); 2649 } 2650 2651 void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2652 { 2653 intptr_t i, opr_sz = simd_oprsz(desc); 2654 int32_t *d = vd, *n = vn, *m = vm; 2655 2656 for (i = 0; i < opr_sz / 4; ++i) { 2657 d[i] = ((int64_t)n[i] * m[i]) >> 32; 2658 } 2659 clear_tail(d, opr_sz, simd_maxsz(desc)); 2660 } 2661 2662 void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2663 { 2664 intptr_t i, opr_sz = simd_oprsz(desc); 2665 uint64_t *d = vd, *n = vn, *m = vm; 2666 uint64_t discard; 2667 2668 for (i = 0; i < opr_sz / 8; ++i) { 2669 muls64(&discard, &d[i], n[i], m[i]); 2670 } 2671 clear_tail(d, opr_sz, simd_maxsz(desc)); 2672 } 2673 2674 void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc) 2675 { 2676 intptr_t i, opr_sz = simd_oprsz(desc); 2677 uint8_t *d = vd, *n = vn, *m = vm; 2678 2679 for (i = 0; i < opr_sz; ++i) { 2680 d[i] = ((uint32_t)n[i] * m[i]) >> 8; 2681 } 2682 clear_tail(d, opr_sz, simd_maxsz(desc)); 2683 } 2684 2685 void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc) 2686 { 2687 intptr_t i, opr_sz = simd_oprsz(desc); 2688 uint16_t *d = vd, *n = vn, *m = vm; 2689 2690 for (i = 0; i < opr_sz / 2; ++i) { 2691 d[i] = ((uint32_t)n[i] * m[i]) >> 16; 2692 } 2693 clear_tail(d, opr_sz, simd_maxsz(desc)); 2694 } 2695 2696 void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc) 2697 { 2698 intptr_t i, opr_sz = simd_oprsz(desc); 2699 uint32_t *d = vd, *n = vn, *m = vm; 2700 2701 for (i = 0; i < opr_sz / 4; ++i) { 2702 d[i] = ((uint64_t)n[i] * m[i]) >> 32; 2703 } 2704 clear_tail(d, opr_sz, simd_maxsz(desc)); 2705 } 2706 2707 void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc) 2708 { 2709 intptr_t i, opr_sz = simd_oprsz(desc); 2710 uint64_t *d = vd, *n = vn, *m = vm; 2711 uint64_t discard; 2712 2713 for (i = 0; i < opr_sz / 8; ++i) { 2714 mulu64(&discard, &d[i], n[i], m[i]); 2715 } 2716 clear_tail(d, opr_sz, simd_maxsz(desc)); 2717 } 2718 2719 void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc) 2720 { 2721 intptr_t i, opr_sz = simd_oprsz(desc) / 8; 2722 int shr = simd_data(desc); 2723 uint64_t *d = vd, *n = vn, *m = vm; 2724 2725 for (i = 0; i < opr_sz; ++i) { 2726 d[i] = ror64(n[i] ^ m[i], shr); 2727 } 2728 clear_tail(d, opr_sz * 8, simd_maxsz(desc)); 2729 } 2730 2731 /* 2732 * Integer matrix-multiply accumulate 2733 */ 2734 2735 static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm) 2736 { 2737 int8_t *n = vn, *m = vm; 2738 2739 for (intptr_t k = 0; k < 8; ++k) { 2740 sum += n[H1(k)] * m[H1(k)]; 2741 } 2742 return sum; 2743 } 2744 2745 static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm) 2746 { 2747 uint8_t *n = vn, *m = vm; 2748 2749 for (intptr_t k = 0; k < 8; ++k) { 2750 sum += n[H1(k)] * m[H1(k)]; 2751 } 2752 return sum; 2753 } 2754 2755 static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm) 2756 { 2757 uint8_t *n = vn; 2758 int8_t *m = vm; 2759 2760 for (intptr_t k = 0; k < 8; ++k) { 2761 sum += n[H1(k)] * m[H1(k)]; 2762 } 2763 return sum; 2764 } 2765 2766 static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc, 2767 uint32_t (*inner_loop)(uint32_t, void *, void *)) 2768 { 2769 intptr_t seg, opr_sz = simd_oprsz(desc); 2770 2771 for (seg = 0; seg < opr_sz; seg += 16) { 2772 uint32_t *d = vd + seg; 2773 uint32_t *a = va + seg; 2774 uint32_t sum0, sum1, sum2, sum3; 2775 2776 /* 2777 * Process the entire segment at once, writing back the 2778 * results only after we've consumed all of the inputs. 2779 * 2780 * Key to indices by column: 2781 * i j i j 2782 */ 2783 sum0 = a[H4(0 + 0)]; 2784 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0); 2785 sum1 = a[H4(0 + 1)]; 2786 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8); 2787 sum2 = a[H4(2 + 0)]; 2788 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0); 2789 sum3 = a[H4(2 + 1)]; 2790 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8); 2791 2792 d[H4(0)] = sum0; 2793 d[H4(1)] = sum1; 2794 d[H4(2)] = sum2; 2795 d[H4(3)] = sum3; 2796 } 2797 clear_tail(vd, opr_sz, simd_maxsz(desc)); 2798 } 2799 2800 #define DO_MMLA_B(NAME, INNER) \ 2801 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ 2802 { do_mmla_b(vd, vn, vm, va, desc, INNER); } 2803 2804 DO_MMLA_B(gvec_smmla_b, do_smmla_b) 2805 DO_MMLA_B(gvec_ummla_b, do_ummla_b) 2806 DO_MMLA_B(gvec_usmmla_b, do_usmmla_b) 2807 2808 /* 2809 * BFloat16 Dot Product 2810 */ 2811 2812 bool is_ebf(CPUARMState *env, float_status *statusp, float_status *oddstatusp) 2813 { 2814 /* 2815 * For BFDOT, BFMMLA, etc, the behaviour depends on FPCR.EBF. 2816 * For EBF = 0, we ignore the FPCR bits which determine rounding 2817 * mode and denormal-flushing, and we do unfused multiplies and 2818 * additions with intermediate rounding of all products and sums. 2819 * For EBF = 1, we honour FPCR rounding mode and denormal-flushing bits, 2820 * and we perform a fused two-way sum-of-products without intermediate 2821 * rounding of the products. 2822 * In either case, we don't set fp exception flags. 2823 * 2824 * EBF is AArch64 only, so even if it's set in the FPCR it has 2825 * no effect on AArch32 instructions. 2826 */ 2827 bool ebf = is_a64(env) && env->vfp.fpcr & FPCR_EBF; 2828 2829 *statusp = is_a64(env) ? env->vfp.fp_status_a64 : env->vfp.fp_status_a32; 2830 set_default_nan_mode(true, statusp); 2831 2832 if (ebf) { 2833 /* EBF=1 needs to do a step with round-to-odd semantics */ 2834 *oddstatusp = *statusp; 2835 set_float_rounding_mode(float_round_to_odd, oddstatusp); 2836 } else { 2837 set_flush_to_zero(true, statusp); 2838 set_flush_inputs_to_zero(true, statusp); 2839 set_float_rounding_mode(float_round_to_odd_inf, statusp); 2840 } 2841 return ebf; 2842 } 2843 2844 float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2, float_status *fpst) 2845 { 2846 float32 t1, t2; 2847 2848 /* 2849 * Extract each BFloat16 from the element pair, and shift 2850 * them such that they become float32. 2851 */ 2852 t1 = float32_mul(e1 << 16, e2 << 16, fpst); 2853 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, fpst); 2854 t1 = float32_add(t1, t2, fpst); 2855 t1 = float32_add(sum, t1, fpst); 2856 2857 return t1; 2858 } 2859 2860 float32 bfdotadd_ebf(float32 sum, uint32_t e1, uint32_t e2, 2861 float_status *fpst, float_status *fpst_odd) 2862 { 2863 /* 2864 * Compare f16_dotadd() in sme_helper.c, but here we have 2865 * bfloat16 inputs. In particular that means that we do not 2866 * want the FPCR.FZ16 flush semantics, so we use the normal 2867 * float_status for the input handling here. 2868 */ 2869 float64 e1r = float32_to_float64(e1 << 16, fpst); 2870 float64 e1c = float32_to_float64(e1 & 0xffff0000u, fpst); 2871 float64 e2r = float32_to_float64(e2 << 16, fpst); 2872 float64 e2c = float32_to_float64(e2 & 0xffff0000u, fpst); 2873 float64 t64; 2874 float32 t32; 2875 2876 /* 2877 * The ARM pseudocode function FPDot performs both multiplies 2878 * and the add with a single rounding operation. Emulate this 2879 * by performing the first multiply in round-to-odd, then doing 2880 * the second multiply as fused multiply-add, and rounding to 2881 * float32 all in one step. 2882 */ 2883 t64 = float64_mul(e1r, e2r, fpst_odd); 2884 t64 = float64r32_muladd(e1c, e2c, t64, 0, fpst); 2885 2886 /* This conversion is exact, because we've already rounded. */ 2887 t32 = float64_to_float32(t64, fpst); 2888 2889 /* The final accumulation step is not fused. */ 2890 return float32_add(sum, t32, fpst); 2891 } 2892 2893 void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, 2894 CPUARMState *env, uint32_t desc) 2895 { 2896 intptr_t i, opr_sz = simd_oprsz(desc); 2897 float32 *d = vd, *a = va; 2898 uint32_t *n = vn, *m = vm; 2899 float_status fpst, fpst_odd; 2900 2901 if (is_ebf(env, &fpst, &fpst_odd)) { 2902 for (i = 0; i < opr_sz / 4; ++i) { 2903 d[i] = bfdotadd_ebf(a[i], n[i], m[i], &fpst, &fpst_odd); 2904 } 2905 } else { 2906 for (i = 0; i < opr_sz / 4; ++i) { 2907 d[i] = bfdotadd(a[i], n[i], m[i], &fpst); 2908 } 2909 } 2910 clear_tail(d, opr_sz, simd_maxsz(desc)); 2911 } 2912 2913 void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, 2914 void *va, CPUARMState *env, uint32_t desc) 2915 { 2916 intptr_t i, j, opr_sz = simd_oprsz(desc); 2917 intptr_t index = simd_data(desc); 2918 intptr_t elements = opr_sz / 4; 2919 intptr_t eltspersegment = MIN(16 / 4, elements); 2920 float32 *d = vd, *a = va; 2921 uint32_t *n = vn, *m = vm; 2922 float_status fpst, fpst_odd; 2923 2924 if (is_ebf(env, &fpst, &fpst_odd)) { 2925 for (i = 0; i < elements; i += eltspersegment) { 2926 uint32_t m_idx = m[i + H4(index)]; 2927 2928 for (j = i; j < i + eltspersegment; j++) { 2929 d[j] = bfdotadd_ebf(a[j], n[j], m_idx, &fpst, &fpst_odd); 2930 } 2931 } 2932 } else { 2933 for (i = 0; i < elements; i += eltspersegment) { 2934 uint32_t m_idx = m[i + H4(index)]; 2935 2936 for (j = i; j < i + eltspersegment; j++) { 2937 d[j] = bfdotadd(a[j], n[j], m_idx, &fpst); 2938 } 2939 } 2940 } 2941 clear_tail(d, opr_sz, simd_maxsz(desc)); 2942 } 2943 2944 void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, 2945 CPUARMState *env, uint32_t desc) 2946 { 2947 intptr_t s, opr_sz = simd_oprsz(desc); 2948 float32 *d = vd, *a = va; 2949 uint32_t *n = vn, *m = vm; 2950 float_status fpst, fpst_odd; 2951 2952 if (is_ebf(env, &fpst, &fpst_odd)) { 2953 for (s = 0; s < opr_sz / 4; s += 4) { 2954 float32 sum00, sum01, sum10, sum11; 2955 2956 /* 2957 * Process the entire segment at once, writing back the 2958 * results only after we've consumed all of the inputs. 2959 * 2960 * Key to indices by column: 2961 * i j i k j k 2962 */ 2963 sum00 = a[s + H4(0 + 0)]; 2964 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2965 sum00 = bfdotadd_ebf(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2966 2967 sum01 = a[s + H4(0 + 1)]; 2968 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2969 sum01 = bfdotadd_ebf(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2970 2971 sum10 = a[s + H4(2 + 0)]; 2972 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst, &fpst_odd); 2973 sum10 = bfdotadd_ebf(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst, &fpst_odd); 2974 2975 sum11 = a[s + H4(2 + 1)]; 2976 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst, &fpst_odd); 2977 sum11 = bfdotadd_ebf(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst, &fpst_odd); 2978 2979 d[s + H4(0 + 0)] = sum00; 2980 d[s + H4(0 + 1)] = sum01; 2981 d[s + H4(2 + 0)] = sum10; 2982 d[s + H4(2 + 1)] = sum11; 2983 } 2984 } else { 2985 for (s = 0; s < opr_sz / 4; s += 4) { 2986 float32 sum00, sum01, sum10, sum11; 2987 2988 /* 2989 * Process the entire segment at once, writing back the 2990 * results only after we've consumed all of the inputs. 2991 * 2992 * Key to indices by column: 2993 * i j i k j k 2994 */ 2995 sum00 = a[s + H4(0 + 0)]; 2996 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)], &fpst); 2997 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)], &fpst); 2998 2999 sum01 = a[s + H4(0 + 1)]; 3000 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)], &fpst); 3001 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)], &fpst); 3002 3003 sum10 = a[s + H4(2 + 0)]; 3004 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)], &fpst); 3005 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)], &fpst); 3006 3007 sum11 = a[s + H4(2 + 1)]; 3008 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)], &fpst); 3009 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)], &fpst); 3010 3011 d[s + H4(0 + 0)] = sum00; 3012 d[s + H4(0 + 1)] = sum01; 3013 d[s + H4(2 + 0)] = sum10; 3014 d[s + H4(2 + 1)] = sum11; 3015 } 3016 } 3017 clear_tail(d, opr_sz, simd_maxsz(desc)); 3018 } 3019 3020 void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, 3021 float_status *stat, uint32_t desc) 3022 { 3023 intptr_t i, opr_sz = simd_oprsz(desc); 3024 intptr_t sel = simd_data(desc); 3025 float32 *d = vd, *a = va; 3026 bfloat16 *n = vn, *m = vm; 3027 3028 for (i = 0; i < opr_sz / 4; ++i) { 3029 float32 nn = n[H2(i * 2 + sel)] << 16; 3030 float32 mm = m[H2(i * 2 + sel)] << 16; 3031 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); 3032 } 3033 clear_tail(d, opr_sz, simd_maxsz(desc)); 3034 } 3035 3036 void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, 3037 void *va, float_status *stat, uint32_t desc) 3038 { 3039 intptr_t i, j, opr_sz = simd_oprsz(desc); 3040 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); 3041 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); 3042 intptr_t elements = opr_sz / 4; 3043 intptr_t eltspersegment = MIN(16 / 4, elements); 3044 float32 *d = vd, *a = va; 3045 bfloat16 *n = vn, *m = vm; 3046 3047 for (i = 0; i < elements; i += eltspersegment) { 3048 float32 m_idx = m[H2(2 * i + index)] << 16; 3049 3050 for (j = i; j < i + eltspersegment; j++) { 3051 float32 n_j = n[H2(2 * j + sel)] << 16; 3052 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); 3053 } 3054 } 3055 clear_tail(d, opr_sz, simd_maxsz(desc)); 3056 } 3057 3058 #define DO_CLAMP(NAME, TYPE) \ 3059 void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ 3060 { \ 3061 intptr_t i, opr_sz = simd_oprsz(desc); \ 3062 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ 3063 TYPE aa = *(TYPE *)(a + i); \ 3064 TYPE nn = *(TYPE *)(n + i); \ 3065 TYPE mm = *(TYPE *)(m + i); \ 3066 TYPE dd = MIN(MAX(aa, nn), mm); \ 3067 *(TYPE *)(d + i) = dd; \ 3068 } \ 3069 clear_tail(d, opr_sz, simd_maxsz(desc)); \ 3070 } 3071 3072 DO_CLAMP(gvec_sclamp_b, int8_t) 3073 DO_CLAMP(gvec_sclamp_h, int16_t) 3074 DO_CLAMP(gvec_sclamp_s, int32_t) 3075 DO_CLAMP(gvec_sclamp_d, int64_t) 3076 3077 DO_CLAMP(gvec_uclamp_b, uint8_t) 3078 DO_CLAMP(gvec_uclamp_h, uint16_t) 3079 DO_CLAMP(gvec_uclamp_s, uint32_t) 3080 DO_CLAMP(gvec_uclamp_d, uint64_t) 3081 3082 /* Bit count in each 8-bit word. */ 3083 void HELPER(gvec_cnt_b)(void *vd, void *vn, uint32_t desc) 3084 { 3085 intptr_t i, opr_sz = simd_oprsz(desc); 3086 uint8_t *d = vd, *n = vn; 3087 3088 for (i = 0; i < opr_sz; ++i) { 3089 d[i] = ctpop8(n[i]); 3090 } 3091 clear_tail(d, opr_sz, simd_maxsz(desc)); 3092 } 3093 3094 /* Reverse bits in each 8 bit word */ 3095 void HELPER(gvec_rbit_b)(void *vd, void *vn, uint32_t desc) 3096 { 3097 intptr_t i, opr_sz = simd_oprsz(desc); 3098 uint64_t *d = vd, *n = vn; 3099 3100 for (i = 0; i < opr_sz / 8; ++i) { 3101 d[i] = revbit64(bswap64(n[i])); 3102 } 3103 clear_tail(d, opr_sz, simd_maxsz(desc)); 3104 } 3105 3106 void HELPER(gvec_urecpe_s)(void *vd, void *vn, uint32_t desc) 3107 { 3108 intptr_t i, opr_sz = simd_oprsz(desc); 3109 uint32_t *d = vd, *n = vn; 3110 3111 for (i = 0; i < opr_sz / 4; ++i) { 3112 d[i] = helper_recpe_u32(n[i]); 3113 } 3114 clear_tail(d, opr_sz, simd_maxsz(desc)); 3115 } 3116 3117 void HELPER(gvec_ursqrte_s)(void *vd, void *vn, uint32_t desc) 3118 { 3119 intptr_t i, opr_sz = simd_oprsz(desc); 3120 uint32_t *d = vd, *n = vn; 3121 3122 for (i = 0; i < opr_sz / 4; ++i) { 3123 d[i] = helper_rsqrte_u32(n[i]); 3124 } 3125 clear_tail(d, opr_sz, simd_maxsz(desc)); 3126 } 3127