/*
 * RISC-V Vector Crypto Extension Helpers for QEMU.
 *
 * Copyright (C) 2023 SiFive, Inc.
 * Written by Codethink Ltd and SiFive.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2 or later, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "qemu/bitops.h"
#include "qemu/bswap.h"
#include "cpu.h"
#include "crypto/aes.h"
#include "crypto/aes-round.h"
#include "crypto/sm4.h"
#include "exec/memop.h"
#include "exec/helper-proto.h"
#include "internals.h"
#include "vector_internals.h"

static uint64_t clmul64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;
    for (int j = 63; j >= 0; j--) {
        if ((y >> j) & 1) {
            result ^= (x << j);
        }
    }
    return result;
}

static uint64_t clmulh64(uint64_t y, uint64_t x)
{
    uint64_t result = 0;
    for (int j = 63; j >= 1; j--) {
        if ((y >> j) & 1) {
            result ^= (x >> (64 - j));
        }
    }
    return result;
}

RVVCALL(OPIVV2, vclmul_vv, OP_UUU_D, H8, H8, H8, clmul64)
GEN_VEXT_VV(vclmul_vv, 8)
RVVCALL(OPIVX2, vclmul_vx, OP_UUU_D, H8, H8, clmul64)
GEN_VEXT_VX(vclmul_vx, 8)
RVVCALL(OPIVV2, vclmulh_vv, OP_UUU_D, H8, H8, H8, clmulh64)
GEN_VEXT_VV(vclmulh_vv, 8)
RVVCALL(OPIVX2, vclmulh_vx, OP_UUU_D, H8, H8, clmulh64)
GEN_VEXT_VX(vclmulh_vx, 8)

RVVCALL(OPIVV2, vror_vv_b, OP_UUU_B, H1, H1, H1, ror8)
RVVCALL(OPIVV2, vror_vv_h, OP_UUU_H, H2, H2, H2, ror16)
RVVCALL(OPIVV2, vror_vv_w, OP_UUU_W, H4, H4, H4, ror32)
RVVCALL(OPIVV2, vror_vv_d, OP_UUU_D, H8, H8, H8, ror64)
GEN_VEXT_VV(vror_vv_b, 1)
GEN_VEXT_VV(vror_vv_h, 2)
GEN_VEXT_VV(vror_vv_w, 4)
GEN_VEXT_VV(vror_vv_d, 8)

RVVCALL(OPIVX2, vror_vx_b, OP_UUU_B, H1, H1, ror8)
RVVCALL(OPIVX2, vror_vx_h, OP_UUU_H, H2, H2, ror16)
RVVCALL(OPIVX2, vror_vx_w, OP_UUU_W, H4, H4, ror32)
RVVCALL(OPIVX2, vror_vx_d, OP_UUU_D, H8, H8, ror64)
GEN_VEXT_VX(vror_vx_b, 1)
GEN_VEXT_VX(vror_vx_h, 2)
GEN_VEXT_VX(vror_vx_w, 4)
GEN_VEXT_VX(vror_vx_d, 8)

RVVCALL(OPIVV2, vrol_vv_b, OP_UUU_B, H1, H1, H1, rol8)
RVVCALL(OPIVV2, vrol_vv_h, OP_UUU_H, H2, H2, H2, rol16)
RVVCALL(OPIVV2, vrol_vv_w, OP_UUU_W, H4, H4, H4, rol32)
RVVCALL(OPIVV2, vrol_vv_d, OP_UUU_D, H8, H8, H8, rol64)
GEN_VEXT_VV(vrol_vv_b, 1)
GEN_VEXT_VV(vrol_vv_h, 2)
GEN_VEXT_VV(vrol_vv_w, 4)
GEN_VEXT_VV(vrol_vv_d, 8)

RVVCALL(OPIVX2, vrol_vx_b, OP_UUU_B, H1, H1, rol8)
RVVCALL(OPIVX2, vrol_vx_h, OP_UUU_H, H2, H2, rol16)
RVVCALL(OPIVX2, vrol_vx_w, OP_UUU_W, H4, H4, rol32)
RVVCALL(OPIVX2, vrol_vx_d, OP_UUU_D, H8, H8, rol64)
GEN_VEXT_VX(vrol_vx_b, 1)
GEN_VEXT_VX(vrol_vx_h, 2)
GEN_VEXT_VX(vrol_vx_w, 4)
GEN_VEXT_VX(vrol_vx_d, 8)

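/*
 * Reverse the bits within each byte of a 64-bit value by swapping adjacent
 * bits, then bit pairs, then nibbles; the byte order itself is unchanged,
 * e.g. brev8(0x1234) == 0x482c.  Used by vbrev8 and by the GHASH helpers
 * further down.
 */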
static uint64_t brev8(uint64_t val)
{
    val = ((val & 0x5555555555555555ull) << 1) |
          ((val & 0xAAAAAAAAAAAAAAAAull) >> 1);
    val = ((val & 0x3333333333333333ull) << 2) |
          ((val & 0xCCCCCCCCCCCCCCCCull) >> 2);
    val = ((val & 0x0F0F0F0F0F0F0F0Full) << 4) |
          ((val & 0xF0F0F0F0F0F0F0F0ull) >> 4);

    return val;
}

RVVCALL(OPIVV1, vbrev8_v_b, OP_UU_B, H1, H1, brev8)
RVVCALL(OPIVV1, vbrev8_v_h, OP_UU_H, H2, H2, brev8)
RVVCALL(OPIVV1, vbrev8_v_w, OP_UU_W, H4, H4, brev8)
RVVCALL(OPIVV1, vbrev8_v_d, OP_UU_D, H8, H8, brev8)
GEN_VEXT_V(vbrev8_v_b, 1)
GEN_VEXT_V(vbrev8_v_h, 2)
GEN_VEXT_V(vbrev8_v_w, 4)
GEN_VEXT_V(vbrev8_v_d, 8)

#define DO_IDENTITY(a) (a)
RVVCALL(OPIVV1, vrev8_v_b, OP_UU_B, H1, H1, DO_IDENTITY)
RVVCALL(OPIVV1, vrev8_v_h, OP_UU_H, H2, H2, bswap16)
RVVCALL(OPIVV1, vrev8_v_w, OP_UU_W, H4, H4, bswap32)
RVVCALL(OPIVV1, vrev8_v_d, OP_UU_D, H8, H8, bswap64)
GEN_VEXT_V(vrev8_v_b, 1)
GEN_VEXT_V(vrev8_v_h, 2)
GEN_VEXT_V(vrev8_v_w, 4)
GEN_VEXT_V(vrev8_v_d, 8)

#define DO_ANDN(a, b) ((a) & ~(b))
RVVCALL(OPIVV2, vandn_vv_b, OP_UUU_B, H1, H1, H1, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_h, OP_UUU_H, H2, H2, H2, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_w, OP_UUU_W, H4, H4, H4, DO_ANDN)
RVVCALL(OPIVV2, vandn_vv_d, OP_UUU_D, H8, H8, H8, DO_ANDN)
GEN_VEXT_VV(vandn_vv_b, 1)
GEN_VEXT_VV(vandn_vv_h, 2)
GEN_VEXT_VV(vandn_vv_w, 4)
GEN_VEXT_VV(vandn_vv_d, 8)

RVVCALL(OPIVX2, vandn_vx_b, OP_UUU_B, H1, H1, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_h, OP_UUU_H, H2, H2, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_w, OP_UUU_W, H4, H4, DO_ANDN)
RVVCALL(OPIVX2, vandn_vx_d, OP_UUU_D, H8, H8, DO_ANDN)
GEN_VEXT_VX(vandn_vx_b, 1)
GEN_VEXT_VX(vandn_vx_h, 2)
GEN_VEXT_VX(vandn_vx_w, 4)
GEN_VEXT_VX(vandn_vx_d, 8)

RVVCALL(OPIVV1, vbrev_v_b, OP_UU_B, H1, H1, revbit8)
RVVCALL(OPIVV1, vbrev_v_h, OP_UU_H, H2, H2, revbit16)
RVVCALL(OPIVV1, vbrev_v_w, OP_UU_W, H4, H4, revbit32)
RVVCALL(OPIVV1, vbrev_v_d, OP_UU_D, H8, H8, revbit64)
GEN_VEXT_V(vbrev_v_b, 1)
GEN_VEXT_V(vbrev_v_h, 2)
GEN_VEXT_V(vbrev_v_w, 4)
GEN_VEXT_V(vbrev_v_d, 8)

RVVCALL(OPIVV1, vclz_v_b, OP_UU_B, H1, H1, clz8)
RVVCALL(OPIVV1, vclz_v_h, OP_UU_H, H2, H2, clz16)
RVVCALL(OPIVV1, vclz_v_w, OP_UU_W, H4, H4, clz32)
RVVCALL(OPIVV1, vclz_v_d, OP_UU_D, H8, H8, clz64)
GEN_VEXT_V(vclz_v_b, 1)
GEN_VEXT_V(vclz_v_h, 2)
GEN_VEXT_V(vclz_v_w, 4)
GEN_VEXT_V(vclz_v_d, 8)

RVVCALL(OPIVV1, vctz_v_b, OP_UU_B, H1, H1, ctz8)
RVVCALL(OPIVV1, vctz_v_h, OP_UU_H, H2, H2, ctz16)
RVVCALL(OPIVV1, vctz_v_w, OP_UU_W, H4, H4, ctz32)
RVVCALL(OPIVV1, vctz_v_d, OP_UU_D, H8, H8, ctz64)
GEN_VEXT_V(vctz_v_b, 1)
GEN_VEXT_V(vctz_v_h, 2)
GEN_VEXT_V(vctz_v_w, 4)
GEN_VEXT_V(vctz_v_d, 8)

RVVCALL(OPIVV1, vcpop_v_b, OP_UU_B, H1, H1, ctpop8)
RVVCALL(OPIVV1, vcpop_v_h, OP_UU_H, H2, H2, ctpop16)
RVVCALL(OPIVV1, vcpop_v_w, OP_UU_W, H4, H4, ctpop32)
RVVCALL(OPIVV1, vcpop_v_d, OP_UU_D, H8, H8, ctpop64)
GEN_VEXT_V(vcpop_v_b, 1)
GEN_VEXT_V(vcpop_v_h, 2)
GEN_VEXT_V(vcpop_v_w, 4)
GEN_VEXT_V(vcpop_v_d, 8)

#define DO_SLL(N, M) (N << (M & (sizeof(N) * 8 - 1)))
RVVCALL(OPIVV2, vwsll_vv_b, WOP_UUU_B, H2, H1, H1, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_h, WOP_UUU_H, H4, H2, H2, DO_SLL)
RVVCALL(OPIVV2, vwsll_vv_w, WOP_UUU_W, H8, H4, H4, DO_SLL)
GEN_VEXT_VV(vwsll_vv_b, 2)
GEN_VEXT_VV(vwsll_vv_h, 4)
GEN_VEXT_VV(vwsll_vv_w, 8)

RVVCALL(OPIVX2, vwsll_vx_b, WOP_UUU_B, H2, H1, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_h, WOP_UUU_H, H4, H2, DO_SLL)
RVVCALL(OPIVX2, vwsll_vx_w, WOP_UUU_W, H8, H4, DO_SLL)
GEN_VEXT_VX(vwsll_vx_b, 2)
GEN_VEXT_VX(vwsll_vx_h, 4)
GEN_VEXT_VX(vwsll_vx_w, 8)

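/*
 * Vector crypto instructions operate on element groups of EGS elements;
 * raise an illegal-instruction exception when either vl or vstart is not a
 * multiple of the element group size.
 */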
void HELPER(egs_check)(uint32_t egs, CPURISCVState *env)
{
    uint32_t vl = env->vl;
    uint32_t vstart = env->vstart;

    if (vl % egs != 0 || vstart % egs != 0) {
        riscv_raise_exception(env, RISCV_EXCP_ILLEGAL_INST, GETPC());
    }
}

static inline void xor_round_key(AESState *round_state, AESState *round_key)
{
    round_state->v = round_state->v ^ round_key->v;
}

#define GEN_ZVKNED_HELPER_VV(NAME, ...)                                   \
void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vl = env->vl;                                                \
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);            \
    uint32_t vta = vext_vta(desc);                                        \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
                                                                          \
    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {            \
        AESState round_key;                                               \
        round_key.d[0] = *((uint64_t *)vs2 + H8(i * 2 + 0));              \
        round_key.d[1] = *((uint64_t *)vs2 + H8(i * 2 + 1));              \
        AESState round_state;                                             \
        round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));             \
        round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));             \
        __VA_ARGS__;                                                      \
        *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];             \
        *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];             \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);                  \
}

#define GEN_ZVKNED_HELPER_VS(NAME, ...)                                   \
void HELPER(NAME)(void *vd, void *vs2, CPURISCVState *env,                \
                  uint32_t desc)                                          \
{                                                                         \
    uint32_t vl = env->vl;                                                \
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);            \
    uint32_t vta = vext_vta(desc);                                        \
                                                                          \
    VSTART_CHECK_EARLY_EXIT(env, vl);                                     \
                                                                          \
    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {            \
        AESState round_key;                                               \
        round_key.d[0] = *((uint64_t *)vs2 + H8(0));                      \
        round_key.d[1] = *((uint64_t *)vs2 + H8(1));                      \
        AESState round_state;                                             \
        round_state.d[0] = *((uint64_t *)vd + H8(i * 2 + 0));             \
        round_state.d[1] = *((uint64_t *)vd + H8(i * 2 + 1));             \
        __VA_ARGS__;                                                      \
        *((uint64_t *)vd + H8(i * 2 + 0)) = round_state.d[0];             \
        *((uint64_t *)vd + H8(i * 2 + 1)) = round_state.d[1];             \
    }                                                                     \
    env->vstart = 0;                                                      \
    /* set tail elements to 1s */                                         \
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);                  \
}

GEN_ZVKNED_HELPER_VV(vaesef_vv, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VS(vaesef_vs, aesenc_SB_SR_AK(&round_state,
                                                &round_state,
                                                &round_key,
                                                false);)
GEN_ZVKNED_HELPER_VV(vaesdf_vv, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VS(vaesdf_vs, aesdec_ISB_ISR_AK(&round_state,
                                                  &round_state,
                                                  &round_key,
                                                  false);)
GEN_ZVKNED_HELPER_VV(vaesem_vv, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VS(vaesem_vs, aesenc_SB_SR_MC_AK(&round_state,
                                                   &round_state,
                                                   &round_key,
                                                   false);)
GEN_ZVKNED_HELPER_VV(vaesdm_vv, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesdm_vs, aesdec_ISB_ISR_AK_IMC(&round_state,
                                                      &round_state,
                                                      &round_key,
                                                      false);)
GEN_ZVKNED_HELPER_VS(vaesz_vs, xor_round_key(&round_state, &round_key);)

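/*
 * vaeskf1.vi: AES-128 forward key schedule, one round per element group.
 * uimm[3:0] selects the round; out-of-range values (0 or greater than 10)
 * have bit 3 flipped so the rcon[] index always lands in 1..10.
 */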
void HELPER(vaeskf1_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, vl);

    uimm &= 0b1111;
    if (uimm > 10 || uimm == 0) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[8], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vs2[i * 4 + H4(0)];
        rk[1] = vs2[i * 4 + H4(1)];
        rk[2] = vs2[i * 4 + H4(2)];
        rk[3] = vs2[i * 4 + H4(3)];
        tmp = ror32(rk[3], 8);

        rk[4] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                         ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                         ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                         ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                      ^ rcon[uimm - 1];
        rk[5] = rk[1] ^ rk[4];
        rk[6] = rk[2] ^ rk[5];
        rk[7] = rk[3] ^ rk[6];

        vd[i * 4 + H4(0)] = rk[4];
        vd[i * 4 + H4(1)] = rk[5];
        vd[i * 4 + H4(2)] = rk[6];
        vd[i * 4 + H4(3)] = rk[7];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}

void HELPER(vaeskf2_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                        CPURISCVState *env, uint32_t desc)
{
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t vl = env->vl;
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, vl);

    uimm &= 0b1111;
    if (uimm > 14 || uimm < 2) {
        uimm ^= 0b1000;
    }

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint32_t rk[12], tmp;
        static const uint32_t rcon[] = {
            0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010,
            0x00000020, 0x00000040, 0x00000080, 0x0000001B, 0x00000036,
        };

        rk[0] = vd[i * 4 + H4(0)];
        rk[1] = vd[i * 4 + H4(1)];
        rk[2] = vd[i * 4 + H4(2)];
        rk[3] = vd[i * 4 + H4(3)];
        rk[4] = vs2[i * 4 + H4(0)];
        rk[5] = vs2[i * 4 + H4(1)];
        rk[6] = vs2[i * 4 + H4(2)];
        rk[7] = vs2[i * 4 + H4(3)];

        if (uimm % 2 == 0) {
            tmp = ror32(rk[7], 8);
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(tmp >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(tmp >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(tmp >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(tmp >> 0) & 0xff] << 0))
                          ^ rcon[(uimm - 1) / 2];
        } else {
            rk[8] = rk[0] ^ (((uint32_t)AES_sbox[(rk[7] >> 24) & 0xff] << 24) |
                             ((uint32_t)AES_sbox[(rk[7] >> 16) & 0xff] << 16) |
                             ((uint32_t)AES_sbox[(rk[7] >> 8) & 0xff] << 8) |
                             ((uint32_t)AES_sbox[(rk[7] >> 0) & 0xff] << 0));
        }
        rk[9] = rk[1] ^ rk[8];
        rk[10] = rk[2] ^ rk[9];
        rk[11] = rk[3] ^ rk[10];

        vd[i * 4 + H4(0)] = rk[8];
        vd[i * 4 + H4(1)] = rk[9];
        vd[i * 4 + H4(2)] = rk[10];
        vd[i * 4 + H4(3)] = rk[11];
    }
    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, vl * 4, total_elems * 4);
}

static inline uint32_t sig0_sha256(uint32_t x)
{
    return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sig1_sha256(uint32_t x)
{
    return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

static inline uint64_t sig0_sha512(uint64_t x)
{
    return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static inline uint64_t sig1_sha512(uint64_t x)
{
    return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

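/*
 * sig0/sig1 above are the FIPS 180-4 lowercase-sigma message-schedule
 * functions.  vsha2ms_e32/_e64 produce four new message-schedule words per
 * element group from earlier words held in vd, vs1 and vs2.
 */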
static inline void vsha2ms_e32(uint32_t *vd, uint32_t *vs1, uint32_t *vs2)
{
    uint32_t res[4];
    res[0] = sig1_sha256(vs1[H4(2)]) + vs2[H4(1)] + sig0_sha256(vd[H4(1)]) +
             vd[H4(0)];
    res[1] = sig1_sha256(vs1[H4(3)]) + vs2[H4(2)] + sig0_sha256(vd[H4(2)]) +
             vd[H4(1)];
    res[2] =
        sig1_sha256(res[0]) + vs2[H4(3)] + sig0_sha256(vd[H4(3)]) + vd[H4(2)];
    res[3] =
        sig1_sha256(res[1]) + vs1[H4(0)] + sig0_sha256(vs2[H4(0)]) + vd[H4(3)];
    vd[H4(3)] = res[3];
    vd[H4(2)] = res[2];
    vd[H4(1)] = res[1];
    vd[H4(0)] = res[0];
}

static inline void vsha2ms_e64(uint64_t *vd, uint64_t *vs1, uint64_t *vs2)
{
    uint64_t res[4];
    res[0] = sig1_sha512(vs1[2]) + vs2[1] + sig0_sha512(vd[1]) + vd[0];
    res[1] = sig1_sha512(vs1[3]) + vs2[2] + sig0_sha512(vd[2]) + vd[1];
    res[2] = sig1_sha512(res[0]) + vs2[3] + sig0_sha512(vd[3]) + vd[2];
    res[3] = sig1_sha512(res[1]) + vs1[0] + sig0_sha512(vs2[0]) + vd[3];
    vd[3] = res[3];
    vd[2] = res[2];
    vd[1] = res[1];
    vd[0] = res[0];
}

void HELPER(vsha2ms_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                        uint32_t desc)
{
    uint32_t sew = FIELD_EX64(env->vtype, VTYPE, VSEW);
    uint32_t esz = sew == MO_32 ? 4 : 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        if (sew == MO_32) {
            vsha2ms_e32(((uint32_t *)vd) + i * 4, ((uint32_t *)vs1) + i * 4,
                        ((uint32_t *)vs2) + i * 4);
        } else {
            /* If not 32 then SEW should be 64 */
            vsha2ms_e64(((uint64_t *)vd) + i * 4, ((uint64_t *)vs1) + i * 4,
                        ((uint64_t *)vs2) + i * 4);
        }
    }
    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

static inline uint64_t sum0_64(uint64_t x)
{
    return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39);
}

static inline uint32_t sum0_32(uint32_t x)
{
    return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22);
}

static inline uint64_t sum1_64(uint64_t x)
{
    return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41);
}

static inline uint32_t sum1_32(uint32_t x)
{
    return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25);
}

#define ch(x, y, z) ((x & y) ^ ((~x) & z))

#define maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))

static void vsha2c_64(uint64_t *vs2, uint64_t *vd, uint64_t *vs1)
{
    uint64_t a = vs2[3], b = vs2[2], e = vs2[1], f = vs2[0];
    uint64_t c = vd[3], d = vd[2], g = vd[1], h = vd[0];
    uint64_t W0 = vs1[0], W1 = vs1[1];
    uint64_t T1 = h + sum1_64(e) + ch(e, f, g) + W0;
    uint64_t T2 = sum0_64(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_64(e) + ch(e, f, g) + W1;
    T2 = sum0_64(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[0] = f;
    vd[1] = e;
    vd[2] = b;
    vd[3] = a;
}

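/*
 * SEW=32 variant of the two-round compression step above: run two rounds of
 * the SHA-256 compression function on the working state split across vs2
 * and vd, then write back {f, e, b, a}.
 */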
static void vsha2c_32(uint32_t *vs2, uint32_t *vd, uint32_t *vs1)
{
    uint32_t a = vs2[H4(3)], b = vs2[H4(2)], e = vs2[H4(1)], f = vs2[H4(0)];
    uint32_t c = vd[H4(3)], d = vd[H4(2)], g = vd[H4(1)], h = vd[H4(0)];
    uint32_t W0 = vs1[H4(0)], W1 = vs1[H4(1)];
    uint32_t T1 = h + sum1_32(e) + ch(e, f, g) + W0;
    uint32_t T2 = sum0_32(a) + maj(a, b, c);

    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    T1 = h + sum1_32(e) + ch(e, f, g) + W1;
    T2 = sum0_32(a) + maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + T1;
    d = c;
    c = b;
    b = a;
    a = T1 + T2;

    vd[H4(0)] = f;
    vd[H4(1)] = e;
    vd[H4(2)] = b;
    vd[H4(3)] = a;
}

void HELPER(vsha2ch32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  ((uint32_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2ch64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  ((uint64_t *)vs1) + 4 * i + 2);
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl32_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    const uint32_t esz = 4;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_32(((uint32_t *)vs2) + 4 * i, ((uint32_t *)vd) + 4 * i,
                  (((uint32_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

void HELPER(vsha2cl64_vv)(void *vd, void *vs1, void *vs2, CPURISCVState *env,
                          uint32_t desc)
{
    uint32_t esz = 8;
    uint32_t total_elems;
    uint32_t vta = vext_vta(desc);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        vsha2c_64(((uint64_t *)vs2) + 4 * i, ((uint64_t *)vd) + 4 * i,
                  (((uint64_t *)vs1) + 4 * i));
    }

    /* set tail elements to 1s */
    total_elems = vext_get_total_elems(env, desc, esz);
    vext_set_elems_1s(vd, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

static inline uint32_t p1(uint32_t x)
{
    return x ^ rol32(x, 15) ^ rol32(x, 23);
}

static inline uint32_t zvksh_w(uint32_t m16, uint32_t m9, uint32_t m3,
                               uint32_t m13, uint32_t m6)
{
    return p1(m16 ^ m9 ^ rol32(m3, 15)) ^ rol32(m13, 7) ^ m6;
}

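/*
 * vsm3me.vv: SM3 message expansion.  Each element group produces eight new
 * message words via zvksh_w() above; the 32-bit words are byte-swapped on
 * load and store.
 */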
void HELPER(vsm3me_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                       CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs1 = vs1_vptr;
    uint32_t *vs2 = vs2_vptr;

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        uint32_t w[24];
        for (int j = 0; j < 8; j++) {
            w[j] = bswap32(vs1[H4((i * 8) + j)]);
            w[j + 8] = bswap32(vs2[H4((i * 8) + j)]);
        }
        for (int j = 0; j < 8; j++) {
            w[j + 16] =
                zvksh_w(w[j], w[j + 7], w[j + 13], w[j + 3], w[j + 10]);
        }
        for (int j = 0; j < 8; j++) {
            vd[(i * 8) + j] = bswap32(w[H4(j + 16)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

static inline uint32_t ff1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t ff2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (x & z) | (y & z);
}

static inline uint32_t ff_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? ff1(x, y, z) : ff2(x, y, z);
}

static inline uint32_t gg1(uint32_t x, uint32_t y, uint32_t z)
{
    return x ^ y ^ z;
}

static inline uint32_t gg2(uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (~x & z);
}

static inline uint32_t gg_j(uint32_t x, uint32_t y, uint32_t z, uint32_t j)
{
    return (j <= 15) ? gg1(x, y, z) : gg2(x, y, z);
}

static inline uint32_t t_j(uint32_t j)
{
    return (j <= 15) ? 0x79cc4519 : 0x7a879d8a;
}

static inline uint32_t p_0(uint32_t x)
{
    return x ^ rol32(x, 9) ^ rol32(x, 17);
}

static void sm3c(uint32_t *vd, uint32_t *vs1, uint32_t *vs2, uint32_t uimm)
{
    uint32_t x0, x1;
    uint32_t j;
    uint32_t ss1, ss2, tt1, tt2;
    x0 = vs2[0] ^ vs2[4];
    x1 = vs2[1] ^ vs2[5];
    j = 2 * uimm;
    ss1 = rol32(rol32(vs1[0], 12) + vs1[4] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vs1[0], 12);
    tt1 = ff_j(vs1[0], vs1[1], vs1[2], j) + vs1[3] + ss2 + x0;
    tt2 = gg_j(vs1[4], vs1[5], vs1[6], j) + vs1[7] + ss1 + vs2[0];
    vs1[3] = vs1[2];
    vd[3] = rol32(vs1[1], 9);
    vs1[1] = vs1[0];
    vd[1] = tt1;
    vs1[7] = vs1[6];
    vd[7] = rol32(vs1[5], 19);
    vs1[5] = vs1[4];
    vd[5] = p_0(tt2);
    j = 2 * uimm + 1;
    ss1 = rol32(rol32(vd[1], 12) + vd[5] + rol32(t_j(j), j % 32), 7);
    ss2 = ss1 ^ rol32(vd[1], 12);
    tt1 = ff_j(vd[1], vs1[1], vd[3], j) + vs1[3] + ss2 + x1;
    tt2 = gg_j(vd[5], vs1[5], vd[7], j) + vs1[7] + ss1 + vs2[1];
    vd[2] = rol32(vs1[1], 9);
    vd[0] = tt1;
    vd[6] = rol32(vs1[5], 19);
    vd[4] = p_0(tt2);
}

void HELPER(vsm3c_vi)(void *vd_vptr, void *vs2_vptr, uint32_t uimm,
                      CPURISCVState *env, uint32_t desc)
{
    uint32_t esz = memop_size(FIELD_EX64(env->vtype, VTYPE, VSEW));
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);
    uint32_t vta = vext_vta(desc);
    uint32_t *vd = vd_vptr;
    uint32_t *vs2 = vs2_vptr;
    uint32_t v1[8], v2[8], v3[8];

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (int i = env->vstart / 8; i < env->vl / 8; i++) {
        for (int k = 0; k < 8; k++) {
            v2[k] = bswap32(vd[H4(i * 8 + k)]);
            v3[k] = bswap32(vs2[H4(i * 8 + k)]);
        }
        sm3c(v1, v2, v3, uimm);
        for (int k = 0; k < 8; k++) {
            vd[i * 8 + k] = bswap32(v1[H4(k)]);
        }
    }
    vext_set_elems_1s(vd_vptr, vta, env->vl * esz, total_elems * esz);
    env->vstart = 0;
}

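/*
 * vghsh.vv: one step of GHASH over GF(2^128).  brev8() converts between
 * GHASH's reflected bit order and the polynomial order used by the
 * shift-and-add multiply below; the 0x87 constant folds an overflowing
 * x^128 term back in as x^7 + x^2 + x + 1.
 */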
void HELPER(vghsh_vv)(void *vd_vptr, void *vs1_vptr, void *vs2_vptr,
                      CPURISCVState *env, uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs1 = vs1_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {vd[i * 2 + 0], vd[i * 2 + 1]};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t X[2] = {vs1[i * 2 + 0], vs1[i * 2 + 1]};
        uint64_t Z[2] = {0, 0};

        uint64_t S[2] = {brev8(Y[0] ^ X[0]), brev8(Y[1] ^ X[1])};

        for (int j = 0; j < 128; j++) {
            if ((S[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

void HELPER(vgmul_vv)(void *vd_vptr, void *vs2_vptr, CPURISCVState *env,
                      uint32_t desc)
{
    uint64_t *vd = vd_vptr;
    uint64_t *vs2 = vs2_vptr;
    uint32_t vta = vext_vta(desc);
    uint32_t total_elems = vext_get_total_elems(env, desc, 4);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = env->vstart / 4; i < env->vl / 4; i++) {
        uint64_t Y[2] = {brev8(vd[i * 2 + 0]), brev8(vd[i * 2 + 1])};
        uint64_t H[2] = {brev8(vs2[i * 2 + 0]), brev8(vs2[i * 2 + 1])};
        uint64_t Z[2] = {0, 0};

        for (int j = 0; j < 128; j++) {
            if ((Y[j / 64] >> (j % 64)) & 1) {
                Z[0] ^= H[0];
                Z[1] ^= H[1];
            }
            bool reduce = ((H[1] >> 63) & 1);
            H[1] = H[1] << 1 | H[0] >> 63;
            H[0] = H[0] << 1;
            if (reduce) {
                H[0] ^= 0x87;
            }
        }

        vd[i * 2 + 0] = brev8(Z[0]);
        vd[i * 2 + 1] = brev8(Z[1]);
    }
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vta, env->vl * 4, total_elems * 4);
    env->vstart = 0;
}

void HELPER(vsm4k_vi)(void *vd, void *vs2, uint32_t uimm5, CPURISCVState *env,
                      uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t rnd = uimm5 & 0x7;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = 0; j < egs; ++j) {
            tmp[j] = rk[j];
        }

        for (uint32_t j = 0; j < egs; ++j) {
            uint32_t b, s;
            b = tmp[j + 1] ^ tmp[j + 2] ^ tmp[j + 3] ^ sm4_ck[rnd * 4 + j];

            s = sm4_subword(b);

            tmp[j + 4] = tmp[j] ^ (s ^ rol32(s, 13) ^ rol32(s, 23));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

static void do_sm4_round(uint32_t *rk, uint32_t *buf)
{
    const uint32_t egs = 4;
    uint32_t s, b;

    for (uint32_t j = egs; j < egs * 2; ++j) {
        b = buf[j - 3] ^ buf[j - 2] ^ buf[j - 1] ^ rk[j - 4];

        s = sm4_subword(b);

        buf[j] = buf[j - 4] ^ (s ^ rol32(s, 2) ^ rol32(s, 10) ^ rol32(s, 18) ^
                               rol32(s, 24));
    }
}

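/*
 * vsm4r.vv/.vs: four SM4 rounds per element group using do_sm4_round()
 * above.  The .vv form takes each group's round keys from the corresponding
 * group of vs2, while the .vs form reuses the round keys from the first
 * element group of vs2 for every group.
 */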
void HELPER(vsm4r_vv)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = vstart; j < vend; ++j) {
            rk[j - vstart] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}

void HELPER(vsm4r_vs)(void *vd, void *vs2, CPURISCVState *env, uint32_t desc)
{
    const uint32_t egs = 4;
    uint32_t group_start = env->vstart / egs;
    uint32_t group_end = env->vl / egs;
    uint32_t esz = sizeof(uint32_t);
    uint32_t total_elems = vext_get_total_elems(env, desc, esz);

    VSTART_CHECK_EARLY_EXIT(env, env->vl);

    for (uint32_t i = group_start; i < group_end; ++i) {
        uint32_t vstart = i * egs;
        uint32_t vend = (i + 1) * egs;
        uint32_t rk[4] = {0};
        uint32_t tmp[8] = {0};

        for (uint32_t j = 0; j < egs; ++j) {
            rk[j] = *((uint32_t *)vs2 + H4(j));
        }

        for (uint32_t j = vstart; j < vend; ++j) {
            tmp[j - vstart] = *((uint32_t *)vd + H4(j));
        }

        do_sm4_round(rk, tmp);

        for (uint32_t j = vstart; j < vend; ++j) {
            *((uint32_t *)vd + H4(j)) = tmp[egs + (j - vstart)];
        }
    }

    env->vstart = 0;
    /* set tail elements to 1s */
    vext_set_elems_1s(vd, vext_vta(desc), env->vl * esz, total_elems * esz);
}