/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"


/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
#define H1(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

/* Zero the bytes between the operation size and the maximum vector size,
   so that the unused tail of the destination register is cleared.  */
static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
{
    uint64_t *d = vd + opr_sz;
    uintptr_t i;

    for (i = opr_sz; i < max_sz; i += 8) {
        *d++ = 0;
    }
}

/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Simplify:
     * = ((src3 << 16) + ((src1 * src2) << 1) + (1 << 15)) >> 16
     * = ((src3 << 15) + (src1 * src2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}
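
/*
 * Worked example for inl_qrdmlah_s16 above: with src1 = src2 = 0x4000
 * (0.5 in Q15) and src3 = 0, the product is 0x10000000; adding the
 * rounding constant (1 << 14) and shifting right by 15 gives 0x2000,
 * i.e. 0.25 in Q15, with no saturation.
 */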

uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
                                int16_t src2, int16_t src3)
{
    /* Similarly, using subtraction:
     * = ((src3 << 16) - ((src1 * src2) << 1) + (1 << 15)) >> 16
     * = ((src3 << 15) - (src1 * src2) + (1 << 14)) >> 15
     */
    int32_t ret = (int32_t)src1 * src2;
    ret = ((int32_t)src3 << 15) - ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        SET_QC();
        ret = (ret < 0 ? -0x8000 : 0x7fff);
    }
    return ret;
}

uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
    uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
    return deposit32(e1, 16, 16, e2);
}

void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlah_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) + ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    /* Simplify similarly to inl_qrdmlsh_s16 above.  */
    int64_t ret = (int64_t)src1 * src2;
    ret = ((int64_t)src3 << 31) - ret + (1 << 30);
    ret >>= 31;
    if (ret != (int32_t)ret) {
        SET_QC();
        ret = (ret < 0 ? INT32_MIN : INT32_MAX);
    }
    return ret;
}

void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *ve, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    CPUARMState *env = ve;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
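
/*
 * Floating-point complex add with rotate (the v8.3-A FCADD operation).
 * Elements are interleaved (real, imaginary) pairs; the single bit in
 * the desc data field selects the rotation (90 or 270 degrees), i.e.
 * which element of m has its sign bit flipped before the pairwise
 * additions below.
 */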

void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate.  */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}