1*db432672SRichard Henderson /* 2*db432672SRichard Henderson * Generic vectorized operation runtime 3*db432672SRichard Henderson * 4*db432672SRichard Henderson * Copyright (c) 2018 Linaro 5*db432672SRichard Henderson * 6*db432672SRichard Henderson * This library is free software; you can redistribute it and/or 7*db432672SRichard Henderson * modify it under the terms of the GNU Lesser General Public 8*db432672SRichard Henderson * License as published by the Free Software Foundation; either 9*db432672SRichard Henderson * version 2 of the License, or (at your option) any later version. 10*db432672SRichard Henderson * 11*db432672SRichard Henderson * This library is distributed in the hope that it will be useful, 12*db432672SRichard Henderson * but WITHOUT ANY WARRANTY; without even the implied warranty of 13*db432672SRichard Henderson * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14*db432672SRichard Henderson * Lesser General Public License for more details. 15*db432672SRichard Henderson * 16*db432672SRichard Henderson * You should have received a copy of the GNU Lesser General Public 17*db432672SRichard Henderson * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18*db432672SRichard Henderson */ 19*db432672SRichard Henderson 20*db432672SRichard Henderson #include "qemu/osdep.h" 21*db432672SRichard Henderson #include "qemu/host-utils.h" 22*db432672SRichard Henderson #include "cpu.h" 23*db432672SRichard Henderson #include "exec/helper-proto.h" 24*db432672SRichard Henderson #include "tcg-gvec-desc.h" 25*db432672SRichard Henderson 26*db432672SRichard Henderson 27*db432672SRichard Henderson /* Virtually all hosts support 16-byte vectors. Those that don't can emulate 28*db432672SRichard Henderson * them via GCC's generic vector extension. This turns out to be simpler and 29*db432672SRichard Henderson * more reliable than getting the compiler to autovectorize. 30*db432672SRichard Henderson * 31*db432672SRichard Henderson * In tcg-op-gvec.c, we asserted that both the size and alignment of the data 32*db432672SRichard Henderson * are multiples of 16. 33*db432672SRichard Henderson * 34*db432672SRichard Henderson * When the compiler does not support all of the operations we require, the 35*db432672SRichard Henderson * loops are written so that we can always fall back on the base types. 36*db432672SRichard Henderson */ 37*db432672SRichard Henderson #ifdef CONFIG_VECTOR16 38*db432672SRichard Henderson typedef uint8_t vec8 __attribute__((vector_size(16))); 39*db432672SRichard Henderson typedef uint16_t vec16 __attribute__((vector_size(16))); 40*db432672SRichard Henderson typedef uint32_t vec32 __attribute__((vector_size(16))); 41*db432672SRichard Henderson typedef uint64_t vec64 __attribute__((vector_size(16))); 42*db432672SRichard Henderson 43*db432672SRichard Henderson typedef int8_t svec8 __attribute__((vector_size(16))); 44*db432672SRichard Henderson typedef int16_t svec16 __attribute__((vector_size(16))); 45*db432672SRichard Henderson typedef int32_t svec32 __attribute__((vector_size(16))); 46*db432672SRichard Henderson typedef int64_t svec64 __attribute__((vector_size(16))); 47*db432672SRichard Henderson 48*db432672SRichard Henderson #define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X } 49*db432672SRichard Henderson #define DUP8(X) { X, X, X, X, X, X, X, X } 50*db432672SRichard Henderson #define DUP4(X) { X, X, X, X } 51*db432672SRichard Henderson #define DUP2(X) { X, X } 52*db432672SRichard Henderson #else 53*db432672SRichard Henderson typedef uint8_t vec8; 54*db432672SRichard Henderson typedef uint16_t vec16; 55*db432672SRichard Henderson typedef uint32_t vec32; 56*db432672SRichard Henderson typedef uint64_t vec64; 57*db432672SRichard Henderson 58*db432672SRichard Henderson typedef int8_t svec8; 59*db432672SRichard Henderson typedef int16_t svec16; 60*db432672SRichard Henderson typedef int32_t svec32; 61*db432672SRichard Henderson typedef int64_t svec64; 62*db432672SRichard Henderson 63*db432672SRichard Henderson #define DUP16(X) X 64*db432672SRichard Henderson #define DUP8(X) X 65*db432672SRichard Henderson #define DUP4(X) X 66*db432672SRichard Henderson #define DUP2(X) X 67*db432672SRichard Henderson #endif /* CONFIG_VECTOR16 */ 68*db432672SRichard Henderson 69*db432672SRichard Henderson static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc) 70*db432672SRichard Henderson { 71*db432672SRichard Henderson intptr_t maxsz = simd_maxsz(desc); 72*db432672SRichard Henderson intptr_t i; 73*db432672SRichard Henderson 74*db432672SRichard Henderson if (unlikely(maxsz > oprsz)) { 75*db432672SRichard Henderson for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) { 76*db432672SRichard Henderson *(uint64_t *)(d + i) = 0; 77*db432672SRichard Henderson } 78*db432672SRichard Henderson } 79*db432672SRichard Henderson } 80*db432672SRichard Henderson 81*db432672SRichard Henderson void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc) 82*db432672SRichard Henderson { 83*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 84*db432672SRichard Henderson intptr_t i; 85*db432672SRichard Henderson 86*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec8)) { 87*db432672SRichard Henderson *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i); 88*db432672SRichard Henderson } 89*db432672SRichard Henderson clear_high(d, oprsz, desc); 90*db432672SRichard Henderson } 91*db432672SRichard Henderson 92*db432672SRichard Henderson void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc) 93*db432672SRichard Henderson { 94*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 95*db432672SRichard Henderson intptr_t i; 96*db432672SRichard Henderson 97*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec16)) { 98*db432672SRichard Henderson *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i); 99*db432672SRichard Henderson } 100*db432672SRichard Henderson clear_high(d, oprsz, desc); 101*db432672SRichard Henderson } 102*db432672SRichard Henderson 103*db432672SRichard Henderson void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc) 104*db432672SRichard Henderson { 105*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 106*db432672SRichard Henderson intptr_t i; 107*db432672SRichard Henderson 108*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec32)) { 109*db432672SRichard Henderson *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i); 110*db432672SRichard Henderson } 111*db432672SRichard Henderson clear_high(d, oprsz, desc); 112*db432672SRichard Henderson } 113*db432672SRichard Henderson 114*db432672SRichard Henderson void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc) 115*db432672SRichard Henderson { 116*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 117*db432672SRichard Henderson intptr_t i; 118*db432672SRichard Henderson 119*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 120*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i); 121*db432672SRichard Henderson } 122*db432672SRichard Henderson clear_high(d, oprsz, desc); 123*db432672SRichard Henderson } 124*db432672SRichard Henderson 125*db432672SRichard Henderson void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc) 126*db432672SRichard Henderson { 127*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 128*db432672SRichard Henderson intptr_t i; 129*db432672SRichard Henderson 130*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec8)) { 131*db432672SRichard Henderson *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i); 132*db432672SRichard Henderson } 133*db432672SRichard Henderson clear_high(d, oprsz, desc); 134*db432672SRichard Henderson } 135*db432672SRichard Henderson 136*db432672SRichard Henderson void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc) 137*db432672SRichard Henderson { 138*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 139*db432672SRichard Henderson intptr_t i; 140*db432672SRichard Henderson 141*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec16)) { 142*db432672SRichard Henderson *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i); 143*db432672SRichard Henderson } 144*db432672SRichard Henderson clear_high(d, oprsz, desc); 145*db432672SRichard Henderson } 146*db432672SRichard Henderson 147*db432672SRichard Henderson void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc) 148*db432672SRichard Henderson { 149*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 150*db432672SRichard Henderson intptr_t i; 151*db432672SRichard Henderson 152*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec32)) { 153*db432672SRichard Henderson *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i); 154*db432672SRichard Henderson } 155*db432672SRichard Henderson clear_high(d, oprsz, desc); 156*db432672SRichard Henderson } 157*db432672SRichard Henderson 158*db432672SRichard Henderson void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc) 159*db432672SRichard Henderson { 160*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 161*db432672SRichard Henderson intptr_t i; 162*db432672SRichard Henderson 163*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 164*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i); 165*db432672SRichard Henderson } 166*db432672SRichard Henderson clear_high(d, oprsz, desc); 167*db432672SRichard Henderson } 168*db432672SRichard Henderson 169*db432672SRichard Henderson void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc) 170*db432672SRichard Henderson { 171*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 172*db432672SRichard Henderson intptr_t i; 173*db432672SRichard Henderson 174*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec8)) { 175*db432672SRichard Henderson *(vec8 *)(d + i) = -*(vec8 *)(a + i); 176*db432672SRichard Henderson } 177*db432672SRichard Henderson clear_high(d, oprsz, desc); 178*db432672SRichard Henderson } 179*db432672SRichard Henderson 180*db432672SRichard Henderson void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc) 181*db432672SRichard Henderson { 182*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 183*db432672SRichard Henderson intptr_t i; 184*db432672SRichard Henderson 185*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec16)) { 186*db432672SRichard Henderson *(vec16 *)(d + i) = -*(vec16 *)(a + i); 187*db432672SRichard Henderson } 188*db432672SRichard Henderson clear_high(d, oprsz, desc); 189*db432672SRichard Henderson } 190*db432672SRichard Henderson 191*db432672SRichard Henderson void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc) 192*db432672SRichard Henderson { 193*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 194*db432672SRichard Henderson intptr_t i; 195*db432672SRichard Henderson 196*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec32)) { 197*db432672SRichard Henderson *(vec32 *)(d + i) = -*(vec32 *)(a + i); 198*db432672SRichard Henderson } 199*db432672SRichard Henderson clear_high(d, oprsz, desc); 200*db432672SRichard Henderson } 201*db432672SRichard Henderson 202*db432672SRichard Henderson void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc) 203*db432672SRichard Henderson { 204*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 205*db432672SRichard Henderson intptr_t i; 206*db432672SRichard Henderson 207*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 208*db432672SRichard Henderson *(vec64 *)(d + i) = -*(vec64 *)(a + i); 209*db432672SRichard Henderson } 210*db432672SRichard Henderson clear_high(d, oprsz, desc); 211*db432672SRichard Henderson } 212*db432672SRichard Henderson 213*db432672SRichard Henderson void HELPER(gvec_mov)(void *d, void *a, uint32_t desc) 214*db432672SRichard Henderson { 215*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 216*db432672SRichard Henderson 217*db432672SRichard Henderson memcpy(d, a, oprsz); 218*db432672SRichard Henderson clear_high(d, oprsz, desc); 219*db432672SRichard Henderson } 220*db432672SRichard Henderson 221*db432672SRichard Henderson void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c) 222*db432672SRichard Henderson { 223*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 224*db432672SRichard Henderson intptr_t i; 225*db432672SRichard Henderson 226*db432672SRichard Henderson if (c == 0) { 227*db432672SRichard Henderson oprsz = 0; 228*db432672SRichard Henderson } else { 229*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(uint64_t)) { 230*db432672SRichard Henderson *(uint64_t *)(d + i) = c; 231*db432672SRichard Henderson } 232*db432672SRichard Henderson } 233*db432672SRichard Henderson clear_high(d, oprsz, desc); 234*db432672SRichard Henderson } 235*db432672SRichard Henderson 236*db432672SRichard Henderson void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c) 237*db432672SRichard Henderson { 238*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 239*db432672SRichard Henderson intptr_t i; 240*db432672SRichard Henderson 241*db432672SRichard Henderson if (c == 0) { 242*db432672SRichard Henderson oprsz = 0; 243*db432672SRichard Henderson } else { 244*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(uint32_t)) { 245*db432672SRichard Henderson *(uint32_t *)(d + i) = c; 246*db432672SRichard Henderson } 247*db432672SRichard Henderson } 248*db432672SRichard Henderson clear_high(d, oprsz, desc); 249*db432672SRichard Henderson } 250*db432672SRichard Henderson 251*db432672SRichard Henderson void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c) 252*db432672SRichard Henderson { 253*db432672SRichard Henderson HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff)); 254*db432672SRichard Henderson } 255*db432672SRichard Henderson 256*db432672SRichard Henderson void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c) 257*db432672SRichard Henderson { 258*db432672SRichard Henderson HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff)); 259*db432672SRichard Henderson } 260*db432672SRichard Henderson 261*db432672SRichard Henderson void HELPER(gvec_not)(void *d, void *a, uint32_t desc) 262*db432672SRichard Henderson { 263*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 264*db432672SRichard Henderson intptr_t i; 265*db432672SRichard Henderson 266*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 267*db432672SRichard Henderson *(vec64 *)(d + i) = ~*(vec64 *)(a + i); 268*db432672SRichard Henderson } 269*db432672SRichard Henderson clear_high(d, oprsz, desc); 270*db432672SRichard Henderson } 271*db432672SRichard Henderson 272*db432672SRichard Henderson void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc) 273*db432672SRichard Henderson { 274*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 275*db432672SRichard Henderson intptr_t i; 276*db432672SRichard Henderson 277*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 278*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i); 279*db432672SRichard Henderson } 280*db432672SRichard Henderson clear_high(d, oprsz, desc); 281*db432672SRichard Henderson } 282*db432672SRichard Henderson 283*db432672SRichard Henderson void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc) 284*db432672SRichard Henderson { 285*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 286*db432672SRichard Henderson intptr_t i; 287*db432672SRichard Henderson 288*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 289*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i); 290*db432672SRichard Henderson } 291*db432672SRichard Henderson clear_high(d, oprsz, desc); 292*db432672SRichard Henderson } 293*db432672SRichard Henderson 294*db432672SRichard Henderson void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc) 295*db432672SRichard Henderson { 296*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 297*db432672SRichard Henderson intptr_t i; 298*db432672SRichard Henderson 299*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 300*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i); 301*db432672SRichard Henderson } 302*db432672SRichard Henderson clear_high(d, oprsz, desc); 303*db432672SRichard Henderson } 304*db432672SRichard Henderson 305*db432672SRichard Henderson void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc) 306*db432672SRichard Henderson { 307*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 308*db432672SRichard Henderson intptr_t i; 309*db432672SRichard Henderson 310*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 311*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i); 312*db432672SRichard Henderson } 313*db432672SRichard Henderson clear_high(d, oprsz, desc); 314*db432672SRichard Henderson } 315*db432672SRichard Henderson 316*db432672SRichard Henderson void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc) 317*db432672SRichard Henderson { 318*db432672SRichard Henderson intptr_t oprsz = simd_oprsz(desc); 319*db432672SRichard Henderson intptr_t i; 320*db432672SRichard Henderson 321*db432672SRichard Henderson for (i = 0; i < oprsz; i += sizeof(vec64)) { 322*db432672SRichard Henderson *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i); 323*db432672SRichard Henderson } 324*db432672SRichard Henderson clear_high(d, oprsz, desc); 325*db432672SRichard Henderson } 326