/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors. Those that don't can emulate
 * them via GCC's generic vector extension. This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP2(X)   { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X)  X
#define DUP8(X)   X
#define DUP4(X)   X
#define DUP2(X)   X
#endif /* CONFIG_VECTOR16 */
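
/* Editor's illustration: under CONFIG_VECTOR16, DUPn(X) expands to a vector
 * initializer that replicates X into every lane, e.g.
 *     vec8 v = DUP16(0x7f);    -- sixteen bytes of 0x7f
 * while in the scalar fallback build the same expression degenerates to the
 * plain constant 0x7f, so code written with DUPn(X) compiles either way.
 */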

/* Zero the bytes of the destination between the operation size (oprsz)
 * and the maximum vector size (maxsz), so that any tail beyond the bytes
 * actually operated on is cleared.
 */
static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}

void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}
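
/* The 16-bit and 8-bit dups replicate the low bits of C across a 32-bit
 * constant by multiplication (e.g. 0x01010101 * 0xab == 0xabababab) and
 * then defer to the 32-bit dup above.
 */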
void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}

void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    int shift = simd_data(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift;
    }
    clear_high(d, oprsz, desc);
}

/* If vectors are enabled, the compiler fills in -1 for true.
   Otherwise, we must take care of this by hand. */
#ifdef CONFIG_VECTOR16
# define DO_CMP0(X)  X
#else
# define DO_CMP0(X)  -(X)
#endif

#define DO_CMP1(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
{                                                                          \
    intptr_t oprsz = simd_oprsz(desc);                                     \
    intptr_t i;                                                            \
    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
    }                                                                      \
    clear_high(d, oprsz, desc);                                            \
}

#define DO_CMP2(SZ)                          \
    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)        \
    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)        \
    DO_CMP1(gvec_lt##SZ, svec##SZ, <)        \
    DO_CMP1(gvec_le##SZ, svec##SZ, <=)       \
    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)        \
    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
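
/* Each comparison helper below stores an all-ones element (-1) where the
 * predicate holds and 0 where it does not.  For example, DO_CMP2(8) defines
 * gvec_eq8, gvec_ne8, gvec_lt8, gvec_le8, gvec_ltu8 and gvec_leu8.
 */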
DO_CMP2(8)
DO_CMP2(16)
DO_CMP2(32)
DO_CMP2(64)

#undef DO_CMP0
#undef DO_CMP1
#undef DO_CMP2