/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE	$a0
#define OUT	$a1
#define IN	$a2
#define BYTES	$a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to modify the original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0	$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

#define IS_UNALIGNED	$s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define CPU_TO_LE32(n) \
	wsbh	n, n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store X(n).
 * Every jump table entry must be equal in size.
 */
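/* Explanatory note (not in the original source): each entry below assembles
 * to exactly 8 bytes, a branch plus the one addu selected into its delay
 * slot. The dispatch code in the .Lchacha_mips_no_full_block_* paths relies
 * on that fixed size: with $at holding the full-word byte count (a multiple
 * of 4, at most 60), "ins T0, $at, 1, 6" deposits $at << 1 into the low bits
 * of T0, i.e. it adds (words * 8), selecting the right entry relative to
 * .Lchacha_mips_jmptbl_*_0. This assumes the usual %hi/%lo pairing, where
 * lui has left the low half of T0 zero.
 */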
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Four parallel ChaCha quarter-round steps: add, xor, then rotate left by S.
 * MIPS only has rotate-right, hence "rotr ..., 32 - S".
 */
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotr	X(V), 32 - S; \
	rotr	X(W), 32 - S; \
	rotr	X(Y), 32 - S; \
	rotr	X(Z), 32 - S;

.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return if BYTES == 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test whether IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN, CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,   0(STATE)
	lw	X1,   4(STATE)
	lw	X2,   8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? If so, jump. */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0? No full block remains. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill the delay slot */
	addiu	NONCE_0, 1
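	/* Explanatory note (not in the original source): the bltz below is
	 * reached either by falling out of the full-block stores above, with
	 * BYTES == 0 when the input was an exact multiple of 64 bytes, or
	 * via a jump-table entry that branched into the store chain, in
	 * which case BYTES is zero or negative, holding minus the number of
	 * trailing bytes (set by "subu BYTES, $at, BYTES" in the
	 * no-full-block paths).
	 */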
	/* BYTES < 0? Handle the last bytes. */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to the right location in the state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words, as a byte count */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate the jump table offset in the lower half */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)

.Loop_chacha_unaligned:
	/* Set number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0? No full block remains. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to the right location in the state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill the delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get the number of full words, as a byte count */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of the jump table address */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate the jump table offset in the lower half */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half of the jump table address */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11 ($s6) */
	sw	X11, 0($sp)
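	/* Explanatory note (not in the original source): X12-X15 are remapped
	 * above onto scratch registers ($a3, $at, $v0) and onto STATE itself,
	 * so only X11 ($s6) needs saving here. Because STATE doubles as X15,
	 * the state pointer is read for the last time by the final load below
	 * and is never used again. After the rounds, only words 0-3 and
	 * 12-15 are written out, as the HChaCha construction requires; there
	 * is no feed-forward addition of the input state.
	 */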
	lw	X0,   0(STATE)
	lw	X1,   4(STATE)
	lw	X2,   8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	sw	X0,   0(OUT)
	sw	X1,   4(OUT)
	sw	X2,   8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at
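/* C-level view (a sketch, not part of the original file): under the MIPS o32
 * calling convention used above, with arguments in $a0-$a3 and the fifth
 * argument at 16($sp), these entry points correspond to prototypes along the
 * lines of
 *
 *	void chacha_crypt_arch(u32 *state, u8 *out, const u8 *in,
 *			       unsigned int bytes, int nrounds);
 *	void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
 *
 * The exact parameter types are an assumption; only the register and stack
 * layout is taken from the code above.
 */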