/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * GHASH working registers. Note that IN1 deliberately aliases XH
	 * (both v7): the two are never live at the same time.
	 */
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19

	.text
	.arch		armv8-a+crypto

	/*
	 * Load the powers of H (H^2..H^4 at x3 + 16) and precompute the
	 * folded Karatsuba terms (hi64 ^ lo64) used by the 4-way GHASH,
	 * plus the polynomial reduction MASK (0xe1 << 57).
	 */
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	/*
	 * Fold the 256-bit product in XH:XM:XL modulo the GHASH polynomial
	 * using MASK. Expects T1 to hold XL^XH (set up by the caller);
	 * leaves a partial result that the caller completes with two eors.
	 */
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	/*
	 * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
	 *			       u64 const h[4][2], const char *head)
	 *
	 * x0: number of 16-byte blocks, x1: digest in/out, x2: input,
	 * x3: H, H^2..H^4 table, x4: optional partial head block (or NULL).
	 * Processes 4 blocks per iteration via Karatsuba once the block
	 * count is a multiple of 4, single blocks until then.
	 */
SYM_FUNC_START(pmull_ghash_update_p64)
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_p64

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{T1.2d}, [x4]
	mov		x4, xzr
	b		3f

0:
	tbnz		w0, #0, 2f		// skip until #blocks is a
	tbnz		w0, #1, 2f		// round multiple of 4

1:	ld1		{XM3.16b-TT4.16b}, [x2], #64

	sub		w0, w0, #4

	rev64		T1.16b, XM3.16b
	rev64		T2.16b, XH3.16b
	rev64		TT4.16b, TT4.16b
	rev64		TT3.16b, TT3.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
	ext		XL3.16b, TT3.16b, TT3.16b, #8

	eor		TT4.16b, TT4.16b, IN1.16b
	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		TT3.16b, TT3.16b, XL3.16b
	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	ext		IN1.16b, T2.16b, T2.16b, #8
	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	eor		T2.16b, T2.16b, IN1.16b
	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XL2.16b, XL2.16b, XL3.16b
	eor		XH2.16b, XH2.16b, XH3.16b
	eor		XM2.16b, XM2.16b, XM3.16b

	ext		IN1.16b, T1.16b, T1.16b, #8
	ext		TT3.16b, XL.16b, XL.16b, #8
	eor		XL.16b, XL.16b, IN1.16b
	eor		T1.16b, T1.16b, TT3.16b

	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	eor		XL.16b, XL.16b, XL2.16b
	eor		XH.16b, XH.16b, XH2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w0, 5f
	b		1b

2:	ld1		{T1.2d}, [x2], #16
	sub		w0, w0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	pmull2		XH.1q, XL.2d, SHASH.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	pmull		XL.1q, XL.1d, SHASH.1d		// a0 * b0
	pmull		XM.1q, T1.1d, SHASH2.1d		// (a1 + a0)(b1 + b0)

4:	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		w0, 0b

5:	st1		{XL.2d}, [x1]
	ret
SYM_FUNC_END(pmull_ghash_update_p64)

	/* AES key schedule / state registers for the GCM en/decrypt path */
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31

	/*
	 * Load round keys 0-5 plus the final three (KK/KL/KM) for a
	 * schedule of \rounds rounds at \rk. K6-K9 are (re)loaded on
	 * demand by enc_block/enc_4x since they alias GHASH registers.
	 */
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4
	sub		\tmp, \tmp, #32
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm

	/* one AES round (SubBytes/ShiftRows/AddRoundKey + MixColumns) */
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	/* one AES round applied to four parallel states */
	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm

	/*
	 * Encrypt a single block in \state with a schedule of \rounds
	 * rounds at \rk. Handles 10/12/14 rounds (AES-128/192/256) by
	 * testing bits 2 and 1 of \rounds.
	 */
	.macro		enc_block, state, rounds, rk, tmp
	add		\tmp, \rk, #96
	ld1		{K6.4s-K7.4s}, [\tmp], #32
	.irp		key, K0, K1, K2, K3, K4, K5	// fix: comma was missing between K4 and K5
	enc_round	\state, \key
	.endr

	tbnz		\rounds, #2, .Lnot128_\@
.Lout256_\@:
	enc_round	\state, K6
	enc_round	\state, K7

.Lout192_\@:
	enc_round	\state, KK
	aese		\state\().16b, KL.16b
	eor		\state\().16b, \state\().16b, KM.16b

	.subsection	1
.Lnot128_\@:
	ld1		{K8.4s-K9.4s}, [\tmp], #32
	enc_round	\state, K6
	enc_round	\state, K7
	ld1		{K6.4s-K7.4s}, [\tmp]
	enc_round	\state, K8
	enc_round	\state, K9
	tbz		\rounds, #1, .Lout192_\@
	b		.Lout256_\@
	.previous
	.endm

	/*
	 * Combined GCM encrypt/decrypt body, instantiated below with
	 * \enc == 1 (encrypt) and \enc == 0 (decrypt). Register use:
	 * x0 bytes, x1 dst, x2 src, x3 H-table, x4 dg, x5 ctr, x6 rk,
	 * x7 rounds; tag/lengths arguments live on the stack.
	 */
	.align		6
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16
	ld1		{HH.2d-HH4.2d}, [x3]

	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi
	add		w8, w8, w9

	bmi		1f
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf
	csel		x19, x19, x15, ne
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]
	sub		x10, x1, x11
	sub		x11, x2, x11

	cmp		x0, #-16
	csel		x14, x15, xzr, gt
	cmp		x0, #-32
	csel		x15, x15, xzr, gt
	cmp		x0, #-48
	csel		x16, x19, xzr, gt
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f			// partial block at the tail?
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x
	.endif
	bne		0b

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11

	enc_block	KS0, x7, x6, x12

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b
	eor		XL.16b, XL.16b, KS0.16b

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]
	b		4b

6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f
	.subsection	1
7:	ld1		{INP2.16b}, [x1]
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b
	tbx		INP3.16b, {INP2.16b}, T2.16b
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm

	/*
	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
	 *			  u64 const h[4][2], u64 dg[], u8 ctr[],
	 *			  u32 const rk[], int rounds, u8 tag[])
	 */
SYM_FUNC_START(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
SYM_FUNC_END(pmull_gcm_encrypt)

	/*
	 * int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
	 *			 u64 const h[4][2], u64 dg[], u8 ctr[],
	 *			 u32 const rk[], int rounds, const u8 l[],
	 *			 const u8 tag[], u64 authsize)
	 */
SYM_FUNC_START(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
SYM_FUNC_END(pmull_gcm_decrypt)

	/*
	 * Fold 1-4 blocks (count in w9) held in INP0-INP3 into the GHASH
	 * accumulator XL, using H^1..H^4 so that a full 4-block group
	 * needs only one reduction.
	 */
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

	/*
	 * Generate 4 blocks of AES-CTR keystream for the counter ending at
	 * w8 and XOR them into INP0-INP3. x5 = upper counter bytes,
	 * x6 = round keys, x7 = rounds.
	 */
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4
	sub		w11, w8, #3
	sub		w12, w8, #2
	sub		w13, w8, #1
	rev		w10, w10
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)

	/*
	 * Permute table: indexed loads at varying offsets yield the
	 * shift/select vectors used for partial-block handling and for
	 * masking the tag to authsize bytes (0xff entries select zero/keep
	 * semantics under tbl/tbx).
	 */
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous