// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Context structure */

#define state_h0	0
#define state_h1	4
#define state_h2	8
#define state_h3	12
#define state_h4	16
#define state_h5	20
#define state_h6	24
#define state_h7	28

/* Stack structure */

#define STACK_W_SIZE	(32 * 2 * 3)

#define STACK_W		(0)
#define STACK_SIZE	(STACK_W + STACK_W_SIZE)

/* Register macros */

#define RSTATE	x0
#define RDATA	x1
#define RNBLKS	x2
#define RKPTR	x28
#define RFRAME	x29

#define ra	w3
#define rb	w4
#define rc	w5
#define rd	w6
#define re	w7
#define rf	w8
#define rg	w9
#define rh	w10

#define t0	w11
#define t1	w12
#define t2	w13
#define t3	w14
#define t4	w15
#define t5	w16
#define t6	w17

#define k_even	w19
#define k_odd	w20

#define addr0	x21
#define addr1	x22

#define s0	w23
#define s1	w24
#define s2	w25
#define s3	w26

#define W0	v0
#define W1	v1
#define W2	v2
#define W3	v3
#define W4	v4
#define W5	v5

#define XTMP0	v6
#define XTMP1	v7
#define XTMP2	v16
#define XTMP3	v17
#define XTMP4	v18
#define XTMP5	v19
#define XTMP6	v20

/* Helper macros. */

#define _(...) /*_*/

#define clear_vec(x) \
	movi x.8h, #0;

#define rolw(o, a, n) \
	ror o, a, #(32 - n);

/* Round function macros. */

#define GG1_1(x, y, z, o, t) \
	eor o, x, y;
#define GG1_2(x, y, z, o, t) \
	eor o, o, z;
#define GG1_3(x, y, z, o, t)

#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
#define FF1_2(x, y, z, o, t)
#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)

#define GG2_1(x, y, z, o, t) \
	bic o, z, x;
#define GG2_2(x, y, z, o, t) \
	and t, y, x;
#define GG2_3(x, y, z, o, t) \
	eor o, o, t;

#define FF2_1(x, y, z, o, t) \
	eor o, x, y;
#define FF2_2(x, y, z, o, t) \
	and t, x, y; \
	and o, o, z;
#define FF2_3(x, y, z, o, t) \
	eor o, o, t;

#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	K_LOAD(round); \
	ldr t5, [sp, #(wtype##_W1_ADDR(round, widx))]; \
	rolw(t0, a, 12);		/* rol(a, 12) => t0 */ \
	IOP(1, iop_param); \
	FF##i##_1(a, b, c, t1, t2); \
	ldr t6, [sp, #(wtype##_W1W2_ADDR(round, widx))]; \
	add k, k, e; \
	IOP(2, iop_param); \
	GG##i##_1(e, f, g, t3, t4); \
	FF##i##_2(a, b, c, t1, t2); \
	IOP(3, iop_param); \
	add k, k, t0; \
	add h, h, t5; \
	add d, d, t6;			/* w1w2 + d => d */ \
	IOP(4, iop_param); \
	rolw(k, k, 7);			/* rol(t0 + e + k, 7) => k */ \
	GG##i##_2(e, f, g, t3, t4); \
	add h, h, k;			/* h + w1 + k => h */ \
	IOP(5, iop_param); \
	FF##i##_3(a, b, c, t1, t2); \
	eor t0, t0, k;			/* k ^ t0 => t0 */ \
	GG##i##_3(e, f, g, t3, t4); \
	add d, d, t1;			/* FF(a,b,c) + d => d */ \
	IOP(6, iop_param); \
	add t3, t3, h;			/* GG(e,f,g) + h => t3 */ \
	rolw(b, b, 9);			/* rol(b, 9) => b */ \
	eor h, t3, t3, ror #(32-9); \
	IOP(7, iop_param); \
	add d, d, t0;			/* t0 + d => d */ \
	rolw(f, f, 19);			/* rol(f, 19) => f */ \
	IOP(8, iop_param); \
	eor h, h, t3, ror #(32-17);	/* P0(t3) => h */
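
/*
 * For reference, one SM3 round (GB/T 32905-2016) in plain C, which the
 * interleaved macro above computes with the eight variables kept in fixed
 * registers (callers rotate the register assignment between rounds instead
 * of shifting values). This is an illustrative sketch only: rol32() is the
 * kernel helper from <linux/bitops.h>, ff()/gg() stand for the FF_j/GG_j
 * boolean functions, and k is assumed to already hold the pre-rotated
 * constant rol32(T_j, j) loaded from .LKtable.
 *
 *	u32 a12 = rol32(a, 12);
 *	u32 ss1 = rol32(a12 + e + k, 7);
 *	u32 ss2 = ss1 ^ a12;
 *	u32 tt1 = ff(j, a, b, c) + d + ss2 + (w1 ^ w2);
 *	u32 tt2 = gg(j, e, f, g) + h + ss1 + w1;
 *	d = c; c = rol32(b, 9); b = a; a = tt1;
 *	h = g; g = rol32(f, 19); f = e;
 *	e = tt2 ^ rol32(tt2, 9) ^ rol32(tt2, 17);	(this is P0(tt2))
 *
 * The FF2_*/GG2_* fragments implement the j >= 16 functions in the
 * equivalent disjoint forms ((x ^ y) & z) ^ (x & y) and (~x & z) ^ (x & y),
 * which match the spec's majority and choice functions bit for bit.
 */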

#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)

#define KL(round) \
	ldp k_even, k_odd, [RKPTR, #(4*(round))];

/* Input expansion macros. */

/* Byte-swapped input address. */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))

/* Expanded input address. */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))

/* Rounds 1-12, byte-swapped input block addresses. */
#define IW_W1_ADDR(round, widx)		IW_W_ADDR(round, widx, 32)
#define IW_W1W2_ADDR(round, widx)	IW_W_ADDR(round, widx, 48)

/* Rounds 1-12, expanded input block addresses. */
#define XW_W1_ADDR(round, widx)		XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx)	XW_W_ADDR(round, widx, 16)

/* Input block loading.
 * Interleaving within round function needed for in-order CPUs. */
#define LOAD_W_VEC_1_1() \
	add addr0, sp, #IW_W1_ADDR(0, 0);
#define LOAD_W_VEC_1_2() \
	add addr1, sp, #IW_W1_ADDR(4, 0);
#define LOAD_W_VEC_1_3() \
	ld1 {W0.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_4() \
	ld1 {W1.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_5() \
	ld1 {W2.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_6() \
	ld1 {W3.16b}, [RDATA], #16;
#define LOAD_W_VEC_1_7() \
	rev32 XTMP0.16b, W0.16b;
#define LOAD_W_VEC_1_8() \
	rev32 XTMP1.16b, W1.16b;
#define LOAD_W_VEC_2_1() \
	rev32 XTMP2.16b, W2.16b;
#define LOAD_W_VEC_2_2() \
	rev32 XTMP3.16b, W3.16b;
#define LOAD_W_VEC_2_3() \
	eor XTMP4.16b, XTMP1.16b, XTMP0.16b;
#define LOAD_W_VEC_2_4() \
	eor XTMP5.16b, XTMP2.16b, XTMP1.16b;
#define LOAD_W_VEC_2_5() \
	st1 {XTMP0.16b}, [addr0], #16;
#define LOAD_W_VEC_2_6() \
	st1 {XTMP4.16b}, [addr0]; \
	add addr0, sp, #IW_W1_ADDR(8, 0);
#define LOAD_W_VEC_2_7() \
	eor XTMP6.16b, XTMP3.16b, XTMP2.16b;
#define LOAD_W_VEC_2_8() \
	ext W0.16b, XTMP0.16b, XTMP0.16b, #8;	/* W0: xx, w0, xx, xx */
#define LOAD_W_VEC_3_1() \
	mov W2.16b, XTMP1.16b;			/* W2: xx, w6, w5, w4 */
#define LOAD_W_VEC_3_2() \
	st1 {XTMP1.16b}, [addr1], #16;
#define LOAD_W_VEC_3_3() \
	st1 {XTMP5.16b}, [addr1]; \
	ext W1.16b, XTMP0.16b, XTMP0.16b, #4;	/* W1: xx, w3, w2, w1 */
#define LOAD_W_VEC_3_4() \
	ext W3.16b, XTMP1.16b, XTMP2.16b, #12;	/* W3: xx, w9, w8, w7 */
#define LOAD_W_VEC_3_5() \
	ext W4.16b, XTMP2.16b, XTMP3.16b, #8;	/* W4: xx, w12, w11, w10 */
#define LOAD_W_VEC_3_6() \
	st1 {XTMP2.16b}, [addr0], #16;
#define LOAD_W_VEC_3_7() \
	st1 {XTMP6.16b}, [addr0];
#define LOAD_W_VEC_3_8() \
	ext W5.16b, XTMP3.16b, XTMP3.16b, #4;	/* W5: xx, w15, w14, w13 */

#define LOAD_W_VEC_1(iop_num, ...) \
	LOAD_W_VEC_1_##iop_num()
#define LOAD_W_VEC_2(iop_num, ...) \
	LOAD_W_VEC_2_##iop_num()
#define LOAD_W_VEC_3(iop_num, ...) \
	LOAD_W_VEC_3_##iop_num()

/* Message scheduling. Note: 3 words per vector register.
 * Interleaving within round function needed for in-order CPUs. */
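
/*
 * For reference, the expansion the SCHED_W macros compute three words at a
 * time is, in illustrative C (rol32() is the kernel helper from
 * <linux/bitops.h>; this sketch is not part of the generated code):
 *
 *	for (i = 16; i < 68; i++) {
 *		u32 x = w[i - 16] ^ w[i - 9] ^ rol32(w[i - 3], 15);
 *		w[i] = P1(x) ^ rol32(w[i - 13], 7) ^ w[i - 6];
 *	}
 *
 * with P1(x) = x ^ rol32(x, 15) ^ rol32(x, 23). Each scheduling step also
 * stores w1w2 = w[i] ^ w[i + 4] next to w[i] in the stack ring buffer, so
 * the scalar round code can fetch both with plain ldr through the
 * XW_W1_ADDR/XW_W1W2_ADDR macros above.
 */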
#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 16]) => XTMP0 */ \
	/* Load (w[i - 13]) => XTMP5 */ \
	ext XTMP0.16b, w0.16b, w0.16b, #12;	/* XTMP0: w0, xx, xx, xx */
#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, w1.16b, w1.16b, #12;
#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP0.16b, XTMP0.16b, w1.16b, #12;	/* XTMP0: xx, w2, w1, w0 */
#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP5.16b, XTMP5.16b, w2.16b, #12;
#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 9] == w3 */ \
	/* W3 ^ XTMP0 => XTMP0 */ \
	eor XTMP0.16b, XTMP0.16b, w3.16b;
#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 3] == w5 */ \
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */ \
	/* rol(XTMP5, 7) => XTMP1 */ \
	add addr0, sp, #XW_W1_ADDR((round), 0); \
	shl XTMP2.4s, w5.4s, #15;
#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP1.4s, XTMP5.4s, #7;
#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP2.4s, w5.4s, #(32-15);
#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP1.4s, XTMP5.4s, #(32-7);
#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
	eor XTMP0.16b, XTMP0.16b, XTMP2.16b;
#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
	/* w[i - 6] == W4 */ \
	/* W4 ^ XTMP1 => XTMP1 */ \
	eor XTMP1.16b, XTMP1.16b, w4.16b;
#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
	/* P1(XTMP0) ^ XTMP1 => W0 */ \
	shl XTMP3.4s, XTMP0.4s, #15;
#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
	shl XTMP4.4s, XTMP0.4s, #23;
#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, XTMP1.16b, XTMP0.16b;
#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP3.4s, XTMP0.4s, #(32-15);
#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
	sri XTMP4.4s, XTMP0.4s, #(32-23);
#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP3.16b;
#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
	/* Load (w[i - 3]) => XTMP2 */ \
	ext XTMP2.16b, w4.16b, w4.16b, #12;
#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
	eor w0.16b, w0.16b, XTMP4.16b;
#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
	ext XTMP2.16b, XTMP2.16b, w5.16b, #12;
#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
	/* W1 ^ W2 => XTMP3 */ \
	eor XTMP3.16b, XTMP2.16b, w0.16b;
#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
	st1 {XTMP2.16b-XTMP3.16b}, [addr0];
#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)

#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)

#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)

#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)

#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)

#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)

#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)


	/*
	 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at 'data'.
	 *
	 * void sm3_neon_transform(struct sm3_block_state *state,
	 *                         const u8 *data, size_t nblocks)
	 */
	.text
.align 3
SYM_FUNC_START(sm3_neon_transform)
	ldp ra, rb, [RSTATE, #0]
	ldp rc, rd, [RSTATE, #8]
	ldp re, rf, [RSTATE, #16]
	ldp rg, rh, [RSTATE, #24]

	stp x28, x29, [sp, #-16]!
	stp x19, x20, [sp, #-16]!
	stp x21, x22, [sp, #-16]!
	stp x23, x24, [sp, #-16]!
	stp x25, x26, [sp, #-16]!
	mov RFRAME, sp

	sub addr0, sp, #STACK_SIZE
	adr_l RKPTR, .LKtable
	and sp, addr0, #(~63)

	/* Preload first block. */
	LOAD_W_VEC_1(1, 0)
	LOAD_W_VEC_1(2, 0)
	LOAD_W_VEC_1(3, 0)
	LOAD_W_VEC_1(4, 0)
	LOAD_W_VEC_1(5, 0)
	LOAD_W_VEC_1(6, 0)
	LOAD_W_VEC_1(7, 0)
	LOAD_W_VEC_1(8, 0)
	LOAD_W_VEC_2(1, 0)
	LOAD_W_VEC_2(2, 0)
	LOAD_W_VEC_2(3, 0)
	LOAD_W_VEC_2(4, 0)
	LOAD_W_VEC_2(5, 0)
	LOAD_W_VEC_2(6, 0)
	LOAD_W_VEC_2(7, 0)
	LOAD_W_VEC_2(8, 0)
	LOAD_W_VEC_3(1, 0)
	LOAD_W_VEC_3(2, 0)
	LOAD_W_VEC_3(3, 0)
	LOAD_W_VEC_3(4, 0)
	LOAD_W_VEC_3(5, 0)
	LOAD_W_VEC_3(6, 0)
	LOAD_W_VEC_3(7, 0)
	LOAD_W_VEC_3(8, 0)

.balign 16
.Loop:
	/* Transform 0-3 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 1, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 3, 3, IW, _, 0)

	/* Transform 4-7 + Precalc 12-14 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 5, 1, IW, _, 0)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)

	/* Transform 8-11 + Precalc 12-17 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)

	/* Transform 12-14 + Precalc 18-20 */
	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)

	/* Transform 15-17 + Precalc 21-23 */
	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)

	/* Transform 18-20 + Precalc 24-26 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)

	/* Transform 21-23 + Precalc 27-29 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)

	/* Transform 24-26 + Precalc 30-32 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)

	/* Transform 27-29 + Precalc 33-35 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)

	/* Transform 30-32 + Precalc 36-38 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)

	/* Transform 33-35 + Precalc 39-41 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)

	/* Transform 36-38 + Precalc 42-44 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)

	/* Transform 39-41 + Precalc 45-47 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)

	/* Transform 42-44 + Precalc 48-50 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)

	/* Transform 45-47 + Precalc 51-53 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)

	/* Transform 48-50 + Precalc 54-56 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)

	/* Transform 51-53 + Precalc 57-59 */
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)

	/* Transform 54-56 + Precalc 60-62 */
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)

	/* Transform 57-59 + Precalc 63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)

	/* Transform 60 */
	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
	subs RNBLKS, RNBLKS, #1
	b.eq .Lend

	/* Transform 61-63 + Preload next block */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, LOAD_W_VEC_1, _)
	ldp s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
	ldp s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, LOAD_W_VEC_3, _)

	/* Update the chaining variables. */
	eor ra, ra, s0
	eor rb, rb, s1
	ldp s0, s1, [RSTATE, #16]
	eor rc, rc, s2
	ldp k_even, k_odd, [RSTATE, #24]
	eor rd, rd, s3
	eor re, re, s0
	stp ra, rb, [RSTATE, #0]
	eor rf, rf, s1
	stp rc, rd, [RSTATE, #8]
	eor rg, rg, k_even
	stp re, rf, [RSTATE, #16]
	eor rh, rh, k_odd
	stp rg, rh, [RSTATE, #24]
	b .Loop

.Lend:
	/* Transform 61-63 */
	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd, _, 61, 1, XW, _, _)
	ldp s0, s1, [RSTATE, #0]
	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
	ldp s2, s3, [RSTATE, #8]
	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd, _, 63, 0, XW, _, _)
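
	/*
	 * Note: SM3 chains blocks with XOR (V_{i+1} = working state ^ V_i)
	 * rather than the modular addition used by SHA-2, hence the eor
	 * sequence below (and the one in the main loop above).
	 */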
	/* Update the chaining variables. */
	eor ra, ra, s0
	clear_vec(W0)
	eor rb, rb, s1
	clear_vec(W1)
	ldp s0, s1, [RSTATE, #16]
	clear_vec(W2)
	eor rc, rc, s2
	clear_vec(W3)
	ldp k_even, k_odd, [RSTATE, #24]
	clear_vec(W4)
	eor rd, rd, s3
	clear_vec(W5)
	eor re, re, s0
	clear_vec(XTMP0)
	stp ra, rb, [RSTATE, #0]
	clear_vec(XTMP1)
	eor rf, rf, s1
	clear_vec(XTMP2)
	stp rc, rd, [RSTATE, #8]
	clear_vec(XTMP3)
	eor rg, rg, k_even
	clear_vec(XTMP4)
	stp re, rf, [RSTATE, #16]
	clear_vec(XTMP5)
	eor rh, rh, k_odd
	clear_vec(XTMP6)
	stp rg, rh, [RSTATE, #24]

	/* Clear message expansion area */
	add addr0, sp, #STACK_W
	st1 {W0.16b-W3.16b}, [addr0], #64
	st1 {W0.16b-W3.16b}, [addr0], #64
	st1 {W0.16b-W3.16b}, [addr0]

	mov sp, RFRAME

	ldp x25, x26, [sp], #16
	ldp x23, x24, [sp], #16
	ldp x21, x22, [sp], #16
	ldp x19, x20, [sp], #16
	ldp x28, x29, [sp], #16

	ret
SYM_FUNC_END(sm3_neon_transform)


	.section ".rodata", "a"

	.align 4
.LKtable:
	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
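
/*
 * .LKtable holds the SM3 round constants pre-rotated by their round index,
 * K[j] = rol32(T_j, j mod 32) with T_j = 0x79cc4519 for rounds 0-15 and
 * 0x7a879d8a for rounds 16-63; KL() can then fetch two constants per ldp
 * and the round code avoids a per-round rotate. A sketch of the generator
 * (illustrative C with the kernel's rol32(), not part of this file):
 *
 *	static void sm3_gen_ktable(u32 k[64])
 *	{
 *		int j;
 *
 *		for (j = 0; j < 64; j++)
 *			k[j] = rol32(j < 16 ? 0x79cc4519 : 0x7a879d8a,
 *				     j & 31);
 *	}
 */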