/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)  ((x) ^ 7)
#define H2(x)  ((x) ^ 3)
#define H4(x)  ((x) ^ 1)
#else
#define H1(x)  (x)
#define H2(x)  (x)
#define H4(x)  (x)
#endif

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
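
/*
 * Worked example for the saturation path above (illustrative only):
 * with src1 = src2 = INT16_MIN (0x8000) and src3 = 0, the product is
 * 0x4000_0000; adding the rounding constant (1 << 14) and shifting
 * right by 15 yields 0x8000 = 32768, which does not fit in int16_t,
 * so the result saturates to 0x7fff and QC is set.
 */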

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
     * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
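
/*
 * Floating-point complex add helpers.  Elements are handled as
 * (real, imaginary) pairs; the single desc data bit selects which of
 * the two addends taken from m is negated, which distinguishes the
 * two FCADD rotations (#90 and #270).  This summarizes the code
 * below; see the callers for how the bit is actually set.
 */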

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
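
/*
 * Floating-point complex multiply-add helpers.  Two desc data bits
 * control the operation: "flip" selects which element of each
 * (real, imaginary) pair feeds the multiplies, and the second bit,
 * combined with flip, supplies the sign pattern; the four bit
 * combinations correspond to the four FCMLA rotations (#0, #90,
 * #180, #270).  The _idx variants apply one fixed complex element of
 * m to every pair of n.  This summarizes the code below; see the
 * callers for how the bits are actually set.
 */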

void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e1 = m[H2(i + flip)] ^ neg_real;
        float16 e4 = e2;
        float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float16 e1 = m[H2(flip)];
    float16 e3 = m[H2(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e2 = n[H2(i + flip)];
        float16 e4 = e2;

        d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
        d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e1 = m[H4(i + flip)] ^ neg_real;
        float32 e4 = e2;
        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
                             void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t neg_real = flip ^ neg_imag;
    uintptr_t i;
    float32 e1 = m[H4(flip)];
    float32 e3 = m[H4(1 - flip)];

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;
    e1 ^= neg_real;
    e3 ^= neg_imag;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e2 = n[H4(i + flip)];
        float32 e4 = e2;

        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t neg_real = flip ^ neg_imag;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e2 = n[i + flip];
        float64 e1 = m[i + flip] ^ neg_real;
        float64 e4 = e2;
        float64 e3 = m[i + 1 - flip] ^ neg_imag;

        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}