xref: /qemu/target/arm/tcg/crypto_helper.c (revision 2a8b545ffdb08fb2d5f37cbcc84d19629c419b3b)
1 /*
2  * crypto_helper.c - emulate v8 Crypto Extensions instructions
3  *
4  * Copyright (C) 2013 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  */
11 
12 #include "qemu/osdep.h"
13 
14 #include "cpu.h"
15 #include "exec/helper-proto.h"
16 #include "tcg/tcg-gvec-desc.h"
17 #include "crypto/aes.h"
18 #include "crypto/aes-round.h"
19 #include "crypto/sm4.h"
20 #include "vec_internal.h"
21 
/*
 * A 128-bit value that can be accessed as sixteen bytes, four 32-bit
 * words or two 64-bit halves.
 */
union CRYPTO_STATE {
    uint8_t bytes[16];
    uint32_t words[4];
    uint64_t l[2];
};

/*
 * Translate a logical element number into an index of the bytes[] or
 * words[] array.  On big-endian hosts the index is mirrored inside each
 * 64-bit half; on little-endian hosts it is used unchanged.
 */
#if HOST_BIG_ENDIAN
#define CR_ST_BYTE_IDX(i)      ((15 - (i)) ^ 8)
#define CR_ST_WORD_IDX(i)      ((3 - (i)) ^ 2)
#else
#define CR_ST_BYTE_IDX(i)      (i)
#define CR_ST_WORD_IDX(i)      (i)
#endif

/* Lvalue accessors for logical byte/word number i of a CRYPTO_STATE. */
#define CR_ST_BYTE(state, i)   ((state).bytes[CR_ST_BYTE_IDX(i)])
#define CR_ST_WORD(state, i)   ((state).words[CR_ST_WORD_IDX(i)])
35 
/*
 * Used by helpers that have not been converted to full gvec and
 * therefore only write the low 16 bytes of the vector register.
 * Checks that the operation size in @desc is exactly 16 and forwards
 * the operation/maximum sizes to clear_tail().
 */
static void clear_tail_16(void *vd, uint32_t desc)
{
    const int oprsz = simd_oprsz(desc);
    const int maxsz = simd_maxsz(desc);

    assert(oprsz == 16);
    clear_tail(vd, oprsz, maxsz);
}
48 
/*
 * All-zero state passed as the round-key argument of the AES round
 * helpers below, which perform the key xor themselves beforehand.
 */
static const AESState aes_zero = { };
50 
51 void HELPER(crypto_aese)(void *vd, void *vn, void *vm, uint32_t desc)
52 {
53     intptr_t i, opr_sz = simd_oprsz(desc);
54 
55     for (i = 0; i < opr_sz; i += 16) {
56         AESState *ad = (AESState *)(vd + i);
57         AESState *st = (AESState *)(vn + i);
58         AESState *rk = (AESState *)(vm + i);
59         AESState t;
60 
61         /*
62          * Our uint64_t are in the wrong order for big-endian.
63          * The Arm AddRoundKey comes first, while the API AddRoundKey
64          * comes last: perform the xor here, and provide zero to API.
65          */
66         if (HOST_BIG_ENDIAN) {
67             t.d[0] = st->d[1] ^ rk->d[1];
68             t.d[1] = st->d[0] ^ rk->d[0];
69             aesenc_SB_SR_AK(&t, &t, &aes_zero, false);
70             ad->d[0] = t.d[1];
71             ad->d[1] = t.d[0];
72         } else {
73             t.v = st->v ^ rk->v;
74             aesenc_SB_SR_AK(ad, &t, &aes_zero, false);
75         }
76     }
77     clear_tail(vd, opr_sz, simd_maxsz(desc));
78 }
79 
80 void HELPER(crypto_aesd)(void *vd, void *vn, void *vm, uint32_t desc)
81 {
82     intptr_t i, opr_sz = simd_oprsz(desc);
83 
84     for (i = 0; i < opr_sz; i += 16) {
85         AESState *ad = (AESState *)(vd + i);
86         AESState *st = (AESState *)(vn + i);
87         AESState *rk = (AESState *)(vm + i);
88         AESState t;
89 
90         /* Our uint64_t are in the wrong order for big-endian. */
91         if (HOST_BIG_ENDIAN) {
92             t.d[0] = st->d[1] ^ rk->d[1];
93             t.d[1] = st->d[0] ^ rk->d[0];
94             aesdec_ISB_ISR_AK(&t, &t, &aes_zero, false);
95             ad->d[0] = t.d[1];
96             ad->d[1] = t.d[0];
97         } else {
98             t.v = st->v ^ rk->v;
99             aesdec_ISB_ISR_AK(ad, &t, &aes_zero, false);
100         }
101     }
102     clear_tail(vd, opr_sz, simd_maxsz(desc));
103 }
104 
105 static void do_crypto_aesmc(uint64_t *rd, uint64_t *rm, const uint32_t *mc)
106 {
107     union CRYPTO_STATE st = { .l = { rm[0], rm[1] } };
108     int i;
109 
110     for (i = 0; i < 16; i += 4) {
111         CR_ST_WORD(st, i >> 2) =
112             mc[CR_ST_BYTE(st, i)] ^
113             rol32(mc[CR_ST_BYTE(st, i + 1)], 8) ^
114             rol32(mc[CR_ST_BYTE(st, i + 2)], 16) ^
115             rol32(mc[CR_ST_BYTE(st, i + 3)], 24);
116     }
117 
118     rd[0] = st.l[0];
119     rd[1] = st.l[1];
120 }
121 
122 void HELPER(crypto_aesmc)(void *vd, void *vm, uint32_t desc)
123 {
124     intptr_t i, opr_sz = simd_oprsz(desc);
125 
126     for (i = 0; i < opr_sz; i += 16) {
127         do_crypto_aesmc(vd + i, vm + i, AES_mc_rot);
128     }
129     clear_tail(vd, opr_sz, simd_maxsz(desc));
130 }
131 
132 void HELPER(crypto_aesimc)(void *vd, void *vm, uint32_t desc)
133 {
134     intptr_t i, opr_sz = simd_oprsz(desc);
135 
136     for (i = 0; i < opr_sz; i += 16) {
137         do_crypto_aesmc(vd + i, vm + i, AES_imc_rot);
138     }
139     clear_tail(vd, opr_sz, simd_maxsz(desc));
140 }
141 
/*
 * SHA-1 logical functions
 */

/* Bitwise choose: each result bit comes from y where x is 1, else from z. */
static uint32_t cho(uint32_t x, uint32_t y, uint32_t z)
{
    const uint32_t from_y = x & y;
    const uint32_t from_z = ~x & z;

    return from_y | from_z;
}
150 
/* Bitwise parity: xor of the three inputs. */
static uint32_t par(uint32_t x, uint32_t y, uint32_t z)
{
    uint32_t acc = x;

    acc ^= y;
    acc ^= z;
    return acc;
}
155 
/* Bitwise majority: a result bit is 1 when at least two input bits are 1. */
static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
    const uint32_t xy = x & y;
    const uint32_t xz = x & z;
    const uint32_t yz = y & z;

    return xy | xz | yz;
}
160 
161 void HELPER(crypto_sha1su0)(void *vd, void *vn, void *vm, uint32_t desc)
162 {
163     uint64_t *d = vd, *n = vn, *m = vm;
164     uint64_t d0, d1;
165 
166     d0 = d[1] ^ d[0] ^ m[0];
167     d1 = n[0] ^ d[1] ^ m[1];
168     d[0] = d0;
169     d[1] = d1;
170 
171     clear_tail_16(vd, desc);
172 }
173 
174 static inline void crypto_sha1_3reg(uint64_t *rd, uint64_t *rn,
175                                     uint64_t *rm, uint32_t desc,
176                                     uint32_t (*fn)(union CRYPTO_STATE *d))
177 {
178     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
179     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
180     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
181     int i;
182 
183     for (i = 0; i < 4; i++) {
184         uint32_t t = fn(&d);
185 
186         t += rol32(CR_ST_WORD(d, 0), 5) + CR_ST_WORD(n, 0)
187              + CR_ST_WORD(m, i);
188 
189         CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3);
190         CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
191         CR_ST_WORD(d, 2) = ror32(CR_ST_WORD(d, 1), 2);
192         CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
193         CR_ST_WORD(d, 0) = t;
194     }
195     rd[0] = d.l[0];
196     rd[1] = d.l[1];
197 
198     clear_tail_16(rd, desc);
199 }
200 
201 static uint32_t do_sha1c(union CRYPTO_STATE *d)
202 {
203     return cho(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
204 }
205 
206 void HELPER(crypto_sha1c)(void *vd, void *vn, void *vm, uint32_t desc)
207 {
208     crypto_sha1_3reg(vd, vn, vm, desc, do_sha1c);
209 }
210 
211 static uint32_t do_sha1p(union CRYPTO_STATE *d)
212 {
213     return par(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
214 }
215 
216 void HELPER(crypto_sha1p)(void *vd, void *vn, void *vm, uint32_t desc)
217 {
218     crypto_sha1_3reg(vd, vn, vm, desc, do_sha1p);
219 }
220 
221 static uint32_t do_sha1m(union CRYPTO_STATE *d)
222 {
223     return maj(CR_ST_WORD(*d, 1), CR_ST_WORD(*d, 2), CR_ST_WORD(*d, 3));
224 }
225 
226 void HELPER(crypto_sha1m)(void *vd, void *vn, void *vm, uint32_t desc)
227 {
228     crypto_sha1_3reg(vd, vn, vm, desc, do_sha1m);
229 }
230 
231 void HELPER(crypto_sha1h)(void *vd, void *vm, uint32_t desc)
232 {
233     uint64_t *rd = vd;
234     uint64_t *rm = vm;
235     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
236 
237     CR_ST_WORD(m, 0) = ror32(CR_ST_WORD(m, 0), 2);
238     CR_ST_WORD(m, 1) = CR_ST_WORD(m, 2) = CR_ST_WORD(m, 3) = 0;
239 
240     rd[0] = m.l[0];
241     rd[1] = m.l[1];
242 
243     clear_tail_16(vd, desc);
244 }
245 
246 void HELPER(crypto_sha1su1)(void *vd, void *vm, uint32_t desc)
247 {
248     uint64_t *rd = vd;
249     uint64_t *rm = vm;
250     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
251     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
252 
253     CR_ST_WORD(d, 0) = rol32(CR_ST_WORD(d, 0) ^ CR_ST_WORD(m, 1), 1);
254     CR_ST_WORD(d, 1) = rol32(CR_ST_WORD(d, 1) ^ CR_ST_WORD(m, 2), 1);
255     CR_ST_WORD(d, 2) = rol32(CR_ST_WORD(d, 2) ^ CR_ST_WORD(m, 3), 1);
256     CR_ST_WORD(d, 3) = rol32(CR_ST_WORD(d, 3) ^ CR_ST_WORD(d, 0), 1);
257 
258     rd[0] = d.l[0];
259     rd[1] = d.l[1];
260 
261     clear_tail_16(vd, desc);
262 }
263 
/*
 * The SHA-256 logical functions, according to
 * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
 */

/* Xor of x rotated right by 2, 13 and 22 bits. */
static uint32_t S0(uint32_t x)
{
    uint32_t acc = ror32(x, 2);

    acc ^= ror32(x, 13);
    acc ^= ror32(x, 22);
    return acc;
}
273 
/* Xor of x rotated right by 6, 11 and 25 bits. */
static uint32_t S1(uint32_t x)
{
    uint32_t acc = ror32(x, 6);

    acc ^= ror32(x, 11);
    acc ^= ror32(x, 25);
    return acc;
}
278 
/* Xor of x rotated right by 7 and 18 bits and x shifted right by 3. */
static uint32_t s0(uint32_t x)
{
    uint32_t acc = ror32(x, 7);

    acc ^= ror32(x, 18);
    acc ^= x >> 3;
    return acc;
}
283 
/* Xor of x rotated right by 17 and 19 bits and x shifted right by 10. */
static uint32_t s1(uint32_t x)
{
    uint32_t acc = ror32(x, 17);

    acc ^= ror32(x, 19);
    acc ^= x >> 10;
    return acc;
}
288 
289 void HELPER(crypto_sha256h)(void *vd, void *vn, void *vm, uint32_t desc)
290 {
291     uint64_t *rd = vd;
292     uint64_t *rn = vn;
293     uint64_t *rm = vm;
294     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
295     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
296     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
297     int i;
298 
299     for (i = 0; i < 4; i++) {
300         uint32_t t = cho(CR_ST_WORD(n, 0), CR_ST_WORD(n, 1), CR_ST_WORD(n, 2))
301                      + CR_ST_WORD(n, 3) + S1(CR_ST_WORD(n, 0))
302                      + CR_ST_WORD(m, i);
303 
304         CR_ST_WORD(n, 3) = CR_ST_WORD(n, 2);
305         CR_ST_WORD(n, 2) = CR_ST_WORD(n, 1);
306         CR_ST_WORD(n, 1) = CR_ST_WORD(n, 0);
307         CR_ST_WORD(n, 0) = CR_ST_WORD(d, 3) + t;
308 
309         t += maj(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
310              + S0(CR_ST_WORD(d, 0));
311 
312         CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
313         CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
314         CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
315         CR_ST_WORD(d, 0) = t;
316     }
317 
318     rd[0] = d.l[0];
319     rd[1] = d.l[1];
320 
321     clear_tail_16(vd, desc);
322 }
323 
324 void HELPER(crypto_sha256h2)(void *vd, void *vn, void *vm, uint32_t desc)
325 {
326     uint64_t *rd = vd;
327     uint64_t *rn = vn;
328     uint64_t *rm = vm;
329     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
330     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
331     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
332     int i;
333 
334     for (i = 0; i < 4; i++) {
335         uint32_t t = cho(CR_ST_WORD(d, 0), CR_ST_WORD(d, 1), CR_ST_WORD(d, 2))
336                      + CR_ST_WORD(d, 3) + S1(CR_ST_WORD(d, 0))
337                      + CR_ST_WORD(m, i);
338 
339         CR_ST_WORD(d, 3) = CR_ST_WORD(d, 2);
340         CR_ST_WORD(d, 2) = CR_ST_WORD(d, 1);
341         CR_ST_WORD(d, 1) = CR_ST_WORD(d, 0);
342         CR_ST_WORD(d, 0) = CR_ST_WORD(n, 3 - i) + t;
343     }
344 
345     rd[0] = d.l[0];
346     rd[1] = d.l[1];
347 
348     clear_tail_16(vd, desc);
349 }
350 
351 void HELPER(crypto_sha256su0)(void *vd, void *vm, uint32_t desc)
352 {
353     uint64_t *rd = vd;
354     uint64_t *rm = vm;
355     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
356     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
357 
358     CR_ST_WORD(d, 0) += s0(CR_ST_WORD(d, 1));
359     CR_ST_WORD(d, 1) += s0(CR_ST_WORD(d, 2));
360     CR_ST_WORD(d, 2) += s0(CR_ST_WORD(d, 3));
361     CR_ST_WORD(d, 3) += s0(CR_ST_WORD(m, 0));
362 
363     rd[0] = d.l[0];
364     rd[1] = d.l[1];
365 
366     clear_tail_16(vd, desc);
367 }
368 
369 void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm, uint32_t desc)
370 {
371     uint64_t *rd = vd;
372     uint64_t *rn = vn;
373     uint64_t *rm = vm;
374     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
375     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
376     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
377 
378     CR_ST_WORD(d, 0) += s1(CR_ST_WORD(m, 2)) + CR_ST_WORD(n, 1);
379     CR_ST_WORD(d, 1) += s1(CR_ST_WORD(m, 3)) + CR_ST_WORD(n, 2);
380     CR_ST_WORD(d, 2) += s1(CR_ST_WORD(d, 0)) + CR_ST_WORD(n, 3);
381     CR_ST_WORD(d, 3) += s1(CR_ST_WORD(d, 1)) + CR_ST_WORD(m, 0);
382 
383     rd[0] = d.l[0];
384     rd[1] = d.l[1];
385 
386     clear_tail_16(vd, desc);
387 }
388 
/*
 * The SHA-512 logical functions (same as above but using 64-bit operands)
 */

/* Bitwise choose: each result bit comes from y where x is 1, else from z. */
static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z)
{
    const uint64_t from_y = x & y;
    const uint64_t from_z = ~x & z;

    return from_y | from_z;
}
397 
/* Bitwise majority: a result bit is 1 when at least two input bits are 1. */
static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z)
{
    const uint64_t xy = x & y;
    const uint64_t xz = x & z;
    const uint64_t yz = y & z;

    return xy | xz | yz;
}
402 
/* Xor of x rotated right by 28, 34 and 39 bits. */
static uint64_t S0_512(uint64_t x)
{
    uint64_t acc = ror64(x, 28);

    acc ^= ror64(x, 34);
    acc ^= ror64(x, 39);
    return acc;
}
407 
/* Xor of x rotated right by 14, 18 and 41 bits. */
static uint64_t S1_512(uint64_t x)
{
    uint64_t acc = ror64(x, 14);

    acc ^= ror64(x, 18);
    acc ^= ror64(x, 41);
    return acc;
}
412 
/* Xor of x rotated right by 1 and 8 bits and x shifted right by 7. */
static uint64_t s0_512(uint64_t x)
{
    uint64_t acc = ror64(x, 1);

    acc ^= ror64(x, 8);
    acc ^= x >> 7;
    return acc;
}
417 
/* Xor of x rotated right by 19 and 61 bits and x shifted right by 6. */
static uint64_t s1_512(uint64_t x)
{
    uint64_t acc = ror64(x, 19);

    acc ^= ror64(x, 61);
    acc ^= x >> 6;
    return acc;
}
422 
423 void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm, uint32_t desc)
424 {
425     uint64_t *rd = vd;
426     uint64_t *rn = vn;
427     uint64_t *rm = vm;
428     uint64_t d0 = rd[0];
429     uint64_t d1 = rd[1];
430 
431     d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]);
432     d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]);
433 
434     rd[0] = d0;
435     rd[1] = d1;
436 
437     clear_tail_16(vd, desc);
438 }
439 
440 void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm, uint32_t desc)
441 {
442     uint64_t *rd = vd;
443     uint64_t *rn = vn;
444     uint64_t *rm = vm;
445     uint64_t d0 = rd[0];
446     uint64_t d1 = rd[1];
447 
448     d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]);
449     d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]);
450 
451     rd[0] = d0;
452     rd[1] = d1;
453 
454     clear_tail_16(vd, desc);
455 }
456 
457 void HELPER(crypto_sha512su0)(void *vd, void *vn, uint32_t desc)
458 {
459     uint64_t *rd = vd;
460     uint64_t *rn = vn;
461     uint64_t d0 = rd[0];
462     uint64_t d1 = rd[1];
463 
464     d0 += s0_512(rd[1]);
465     d1 += s0_512(rn[0]);
466 
467     rd[0] = d0;
468     rd[1] = d1;
469 
470     clear_tail_16(vd, desc);
471 }
472 
473 void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm, uint32_t desc)
474 {
475     uint64_t *rd = vd;
476     uint64_t *rn = vn;
477     uint64_t *rm = vm;
478 
479     rd[0] += s1_512(rn[0]) + rm[0];
480     rd[1] += s1_512(rn[1]) + rm[1];
481 
482     clear_tail_16(vd, desc);
483 }
484 
485 void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm, uint32_t desc)
486 {
487     uint64_t *rd = vd;
488     uint64_t *rn = vn;
489     uint64_t *rm = vm;
490     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
491     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
492     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
493     uint32_t t;
494 
495     t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17);
496     CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9);
497 
498     t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17);
499     CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9);
500 
501     t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17);
502     CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9);
503 
504     t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17);
505     CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9);
506 
507     rd[0] = d.l[0];
508     rd[1] = d.l[1];
509 
510     clear_tail_16(vd, desc);
511 }
512 
513 void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm, uint32_t desc)
514 {
515     uint64_t *rd = vd;
516     uint64_t *rn = vn;
517     uint64_t *rm = vm;
518     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
519     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
520     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
521     uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25);
522 
523     CR_ST_WORD(d, 0) ^= t;
524     CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25);
525     CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25);
526     CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^
527                         ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26);
528 
529     rd[0] = d.l[0];
530     rd[1] = d.l[1];
531 
532     clear_tail_16(vd, desc);
533 }
534 
535 static inline void QEMU_ALWAYS_INLINE
536 crypto_sm3tt(uint64_t *rd, uint64_t *rn, uint64_t *rm,
537              uint32_t desc, uint32_t opcode)
538 {
539     union CRYPTO_STATE d = { .l = { rd[0], rd[1] } };
540     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
541     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
542     uint32_t imm2 = simd_data(desc);
543     uint32_t t;
544 
545     assert(imm2 < 4);
546 
547     if (opcode == 0 || opcode == 2) {
548         /* SM3TT1A, SM3TT2A */
549         t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
550     } else if (opcode == 1) {
551         /* SM3TT1B */
552         t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
553     } else if (opcode == 3) {
554         /* SM3TT2B */
555         t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1));
556     } else {
557         qemu_build_not_reached();
558     }
559 
560     t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2);
561 
562     CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1);
563 
564     if (opcode < 2) {
565         /* SM3TT1A, SM3TT1B */
566         t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20);
567 
568         CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23);
569     } else {
570         /* SM3TT2A, SM3TT2B */
571         t += CR_ST_WORD(n, 3);
572         t ^= rol32(t, 9) ^ rol32(t, 17);
573 
574         CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13);
575     }
576 
577     CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3);
578     CR_ST_WORD(d, 3) = t;
579 
580     rd[0] = d.l[0];
581     rd[1] = d.l[1];
582 
583     clear_tail_16(rd, desc);
584 }
585 
586 #define DO_SM3TT(NAME, OPCODE) \
587     void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
588     { crypto_sm3tt(vd, vn, vm, desc, OPCODE); }
589 
590 DO_SM3TT(crypto_sm3tt1a, 0)
591 DO_SM3TT(crypto_sm3tt1b, 1)
592 DO_SM3TT(crypto_sm3tt2a, 2)
593 DO_SM3TT(crypto_sm3tt2b, 3)
594 
595 #undef DO_SM3TT
596 
597 static void do_crypto_sm4e(uint64_t *rd, uint64_t *rn, uint64_t *rm)
598 {
599     union CRYPTO_STATE d = { .l = { rn[0], rn[1] } };
600     union CRYPTO_STATE n = { .l = { rm[0], rm[1] } };
601     uint32_t t, i;
602 
603     for (i = 0; i < 4; i++) {
604         t = CR_ST_WORD(d, (i + 1) % 4) ^
605             CR_ST_WORD(d, (i + 2) % 4) ^
606             CR_ST_WORD(d, (i + 3) % 4) ^
607             CR_ST_WORD(n, i);
608 
609         t = sm4_sbox[t & 0xff] |
610             sm4_sbox[(t >> 8) & 0xff] << 8 |
611             sm4_sbox[(t >> 16) & 0xff] << 16 |
612             sm4_sbox[(t >> 24) & 0xff] << 24;
613 
614         CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^
615                             rol32(t, 24);
616     }
617 
618     rd[0] = d.l[0];
619     rd[1] = d.l[1];
620 }
621 
622 void HELPER(crypto_sm4e)(void *vd, void *vn, void *vm, uint32_t desc)
623 {
624     intptr_t i, opr_sz = simd_oprsz(desc);
625 
626     for (i = 0; i < opr_sz; i += 16) {
627         do_crypto_sm4e(vd + i, vn + i, vm + i);
628     }
629     clear_tail(vd, opr_sz, simd_maxsz(desc));
630 }
631 
632 static void do_crypto_sm4ekey(uint64_t *rd, uint64_t *rn, uint64_t *rm)
633 {
634     union CRYPTO_STATE d;
635     union CRYPTO_STATE n = { .l = { rn[0], rn[1] } };
636     union CRYPTO_STATE m = { .l = { rm[0], rm[1] } };
637     uint32_t t, i;
638 
639     d = n;
640     for (i = 0; i < 4; i++) {
641         t = CR_ST_WORD(d, (i + 1) % 4) ^
642             CR_ST_WORD(d, (i + 2) % 4) ^
643             CR_ST_WORD(d, (i + 3) % 4) ^
644             CR_ST_WORD(m, i);
645 
646         t = sm4_sbox[t & 0xff] |
647             sm4_sbox[(t >> 8) & 0xff] << 8 |
648             sm4_sbox[(t >> 16) & 0xff] << 16 |
649             sm4_sbox[(t >> 24) & 0xff] << 24;
650 
651         CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23);
652     }
653 
654     rd[0] = d.l[0];
655     rd[1] = d.l[1];
656 }
657 
658 void HELPER(crypto_sm4ekey)(void *vd, void *vn, void* vm, uint32_t desc)
659 {
660     intptr_t i, opr_sz = simd_oprsz(desc);
661 
662     for (i = 0; i < opr_sz; i += 16) {
663         do_crypto_sm4ekey(vd + i, vn + i, vm + i);
664     }
665     clear_tail(vd, opr_sz, simd_maxsz(desc));
666 }
667 
668 void HELPER(crypto_rax1)(void *vd, void *vn, void *vm, uint32_t desc)
669 {
670     intptr_t i, opr_sz = simd_oprsz(desc);
671     uint64_t *d = vd, *n = vn, *m = vm;
672 
673     for (i = 0; i < opr_sz / 8; ++i) {
674         d[i] = n[i] ^ rol64(m[i], 1);
675     }
676     clear_tail(vd, opr_sz, simd_maxsz(desc));
677 }
678