/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
//------------------------------------------------------------------------------
//
// This file contains x86_64 assembly implementations of AES-CTR and AES-XCTR
// using the following sets of CPU features:
//	- AES-NI && AVX
//	- VAES && AVX2
//	- VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4

.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f

// Sequence of 128-bit little endian block counter values {0, 1, 2, 3}.  The
// entries for 1 and 2 double as the standalone .Lone and .Ltwo constants.
.Lctr_pattern:
	.quad	0, 0
.Lone:
	.quad	1, 0
.Ltwo:
	.quad	2, 0
	.quad	3, 0

.Lfour:
	.quad	4, 0

.text

// Move a vector between memory and a register.
.macro _vmovdqu src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Move a vector between registers.
.macro _vmovdqa src, dst
.if VL < 64
	vmovdqa		\src, \dst
.else
	vmovdqa64	\src, \dst
.endif
.endm

// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
// register.
.macro _vbroadcast128 src, dst
.if VL == 16
	vmovdqu		\src, \dst
.elseif VL == 32
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
.macro _vpxor src1, src2, dst
.if VL < 64
	vpxor		\src1, \src2, \dst
.else
	vpxord		\src1, \src2, \dst
.endif
.endm

// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _load_partial_block src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jle		.Lle8\@

	// Load 9 <= LEN <= 15 bytes.
	vmovq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	neg		%ecx
	shl		$3, %ecx
	shr		%cl, %rax		// Discard overlapping bytes
	vpinsrq		$1, %rax, \dst, \dst
	jmp		.Ldone\@

.Lle8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
	jmp		.Lcombine\@

.Llt4\@:
	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	jl		.Lmovq\@
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
.Lcombine\@:
	shl		$3, %ecx
	shl		%cl, \tmp64
	or		\tmp64, %rax		// Combine the two parts
.Lmovq\@:
	vmovq		%rax, \dst
.Ldone\@:
.endm

// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _store_partial_block src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8
	jl		.Llt8\@

	// Store 8 <= LEN <= 15 bytes.
	vpextrq		$1, \src, %rax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %rax
	mov		%rax, (\dst, \tmp64)	// Store last LEN - 8 bytes
	vmovq		\src, (\dst)		// Store first 8 bytes
	jmp		.Ldone\@

.Llt8\@:
	add		$4, %ecx		// LEN - 4
	jl		.Llt4\@

	// Store 4 <= LEN <= 7 bytes.
	vpextrd		$1, \src, %eax
	mov		%ecx, \tmp32
	shl		$3, %ecx
	ror		%cl, %eax
	mov		%eax, (\dst, \tmp64)	// Store last LEN - 4 bytes
	vmovd		\src, (\dst)		// Store first 4 bytes
	jmp		.Ldone\@

.Llt4\@:
	// Store 1 <= LEN <= 3 bytes.
	vpextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	jl		.Ldone\@
	vpextrb		$1, \src, 1(\dst)
	je		.Ldone\@
	vpextrb		$2, \src, 2(\dst)
.Ldone\@:
.endm

// Prepare the next two vectors of AES inputs in AESDATA\i0 and AESDATA\i1, and
// XOR each with the zero-th round key.  Also update LE_CTR if !\final.
.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
.if \is_xctr
  .if USE_AVX512
	vmovdqa64	LE_CTR, AESDATA\i0
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i0
  .else
	vpxor		XCTR_IV, LE_CTR, AESDATA\i0
	vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
  .endif
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1

  .if USE_AVX512
	vpternlogd	$0x96, XCTR_IV, RNDKEY0, AESDATA\i1
  .else
	vpxor		XCTR_IV, AESDATA\i1, AESDATA\i1
	vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
  .endif
.else
	vpshufb		BSWAP_MASK, LE_CTR, AESDATA\i0
	_vpxor		RNDKEY0, AESDATA\i0, AESDATA\i0
	vpaddq		LE_CTR_INC1, LE_CTR, AESDATA\i1
	vpshufb		BSWAP_MASK, AESDATA\i1, AESDATA\i1
	_vpxor		RNDKEY0, AESDATA\i1, AESDATA\i1
.endif
.if !\final
	vpaddq		LE_CTR_INC2, LE_CTR, LE_CTR
.endif
.endm
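
// Note: in the AVX-512 path of _prepare_2_ctr_vecs above, vpternlogd with
// immediate 0x96 computes the bitwise XOR of its three operands (0x96 is the
// truth table of a three-input XOR), folding the two vpxor instructions of the
// AVX path into a single instruction.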

// Do all AES rounds on the data in the given AESDATA vectors, excluding the
// zero-th and last rounds.
.macro _aesenc_loop vecs:vararg
	mov		KEY, %rax
1:
	_vbroadcast128	(%rax), RNDKEY
.irp i, \vecs
	vaesenc		RNDKEY, AESDATA\i, AESDATA\i
.endr
	add		$16, %rax
	cmp		%rax, RNDKEYLAST_PTR
	jne		1b
.endm

// Finalize the keystream blocks in the given AESDATA vectors by doing the last
// AES round, then XOR those keystream blocks with the corresponding data.
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).  (This holds
// because the last AES round omits MixColumns: vaesenclast computes
// SubBytes(ShiftRows(a)) ^ key, so the trailing XOR with the data can be
// folded into the round key.)
.macro _aesenclast_and_xor vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), RNDKEYLAST, RNDKEY
	vaesenclast	RNDKEY, AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

// XOR the keystream blocks in the specified AESDATA vectors with the
// corresponding data.
.macro _xor_data vecs:vararg
.irp i, \vecs
	_vpxor		\i*VL(SRC), AESDATA\i, AESDATA\i
.endr
.irp i, \vecs
	_vmovdqu	AESDATA\i, \i*VL(DST)
.endr
.endm

.macro _aes_ctr_crypt is_xctr

	// Define register aliases V0-V15 that map to the xmm, ymm, or zmm
	// registers according to the selected Vector Length (VL).
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
	.if VL == 16
		.set	V\i, %xmm\i
	.elseif VL == 32
		.set	V\i, %ymm\i
	.elseif VL == 64
		.set	V\i, %zmm\i
	.else
		.error "Unsupported Vector Length (VL)"
	.endif
.endr

	// Function arguments
	.set	KEY,		%rdi	// Initially points to the start of the
					// crypto_aes_ctx, then is advanced to
					// point to the index 1 round key
	.set	KEY32,		%edi	// Available as temp register after all
					// keystream blocks have been generated
	.set	SRC,		%rsi	// Pointer to next source data
	.set	DST,		%rdx	// Pointer to next destination data
	.set	LEN,		%ecx	// Remaining length in bytes.
					// Note: _load_partial_block relies on
					// this being in %ecx.
	.set	LEN64,		%rcx	// Zero-extend LEN before using!
	.set	LEN8,		%cl
.if \is_xctr
	.set	XCTR_IV_PTR,	%r8	// const u8 iv[AES_BLOCK_SIZE];
	.set	XCTR_CTR,	%r9	// u64 ctr;
.else
	.set	LE_CTR_PTR,	%r8	// const u64 le_ctr[2];
.endif

	// Additional local variables
	.set	RNDKEYLAST_PTR,	%r10
	.set	AESDATA0,	V0
	.set	AESDATA0_XMM,	%xmm0
	.set	AESDATA1,	V1
	.set	AESDATA1_XMM,	%xmm1
	.set	AESDATA2,	V2
	.set	AESDATA3,	V3
	.set	AESDATA4,	V4
	.set	AESDATA5,	V5
	.set	AESDATA6,	V6
	.set	AESDATA7,	V7
.if \is_xctr
	.set	XCTR_IV,	V8
.else
	.set	BSWAP_MASK,	V8
.endif
	.set	LE_CTR,		V9
	.set	LE_CTR_XMM,	%xmm9
	.set	LE_CTR_INC1,	V10
	.set	LE_CTR_INC2,	V11
	.set	RNDKEY0,	V12
	.set	RNDKEYLAST,	V13
	.set	RNDKEY,		V14

	// Create the first vector of counters.
.if \is_xctr
  .if VL == 16
	vmovq		XCTR_CTR, LE_CTR
  .elseif VL == 32
	vmovq		XCTR_CTR, LE_CTR_XMM
	inc		XCTR_CTR
	vmovq		XCTR_CTR, AESDATA0_XMM
	vinserti128	$1, AESDATA0_XMM, LE_CTR, LE_CTR
  .else
	vpbroadcastq	XCTR_CTR, LE_CTR
	vpsrldq		$8, LE_CTR, LE_CTR
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	(XCTR_IV_PTR), XCTR_IV
.else
	_vbroadcast128	(LE_CTR_PTR), LE_CTR
  .if VL > 16
	vpaddq		.Lctr_pattern(%rip), LE_CTR, LE_CTR
  .endif
	_vbroadcast128	.Lbswap_mask(%rip), BSWAP_MASK
.endif

.if VL == 16
	_vbroadcast128	.Lone(%rip), LE_CTR_INC1
.elseif VL == 32
	_vbroadcast128	.Ltwo(%rip), LE_CTR_INC1
.else
	_vbroadcast128	.Lfour(%rip), LE_CTR_INC1
.endif
	vpsllq		$1, LE_CTR_INC1, LE_CTR_INC2

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	// (480 is the byte offset of the key_length field of crypto_aes_ctx.)
	movl		480(KEY), %eax

	// Compute the pointer to the last round key.  There are keylen/4 + 6
	// rounds, each with a 16-byte round key, so the last round key is at
	// byte offset 16*(keylen/4 + 6) = 4*keylen + 6*16.
	lea		6*16(KEY, %rax, 4), RNDKEYLAST_PTR

	// Load the zero-th and last round keys.
	_vbroadcast128	(KEY), RNDKEY0
	_vbroadcast128	(RNDKEYLAST_PTR), RNDKEYLAST

	// Make KEY point to the first round key.
	add		$16, KEY

	// This is the main loop, which encrypts 8 vectors of data at a time.
	add		$-8*VL, LEN
	jl		.Lloop_8x_done\@
.Lloop_8x\@:
	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor	0,1,2,3,4,5,6,7
	sub		$-8*VL, SRC
	sub		$-8*VL, DST
	add		$-8*VL, LEN
	jge		.Lloop_8x\@
.Lloop_8x_done\@:
	sub		$-8*VL, LEN
	jz		.Ldone\@

	// 1 <= LEN < 8*VL.  Generate 2, 4, or 8 more vectors of keystream
	// blocks, depending on the remaining LEN.

	_prepare_2_ctr_vecs	\is_xctr, 0, 1
	_prepare_2_ctr_vecs	\is_xctr, 2, 3
	cmp		$4*VL, LEN
	jle		.Lenc_tail_atmost4vecs\@

	// 4*VL < LEN < 8*VL.  Generate 8 vectors of keystream blocks.  Use the
	// first 4 to XOR 4 full vectors of data.  Then XOR the remaining data.
	_prepare_2_ctr_vecs	\is_xctr, 4, 5
	_prepare_2_ctr_vecs	\is_xctr, 6, 7, final=1
	_aesenc_loop	0,1,2,3,4,5,6,7
	_aesenclast_and_xor	0,1,2,3
	vaesenclast	RNDKEYLAST, AESDATA4, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA5, AESDATA1
	vaesenclast	RNDKEYLAST, AESDATA6, AESDATA2
	vaesenclast	RNDKEYLAST, AESDATA7, AESDATA3
	sub		$-4*VL, SRC
	sub		$-4*VL, DST
	add		$-4*VL, LEN
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	cmp		$3*VL-1, LEN
	jle		.Lxor_tail_partial_vec_2\@
	_xor_data	2
	cmp		$4*VL-1, LEN
	jle		.Lxor_tail_partial_vec_3\@
	_xor_data	3
	jmp		.Ldone\@

.Lenc_tail_atmost4vecs\@:
	cmp		$2*VL, LEN
	jle		.Lenc_tail_atmost2vecs\@

	// 2*VL < LEN <= 4*VL.  Generate 4 vectors of keystream blocks.  Use the
	// first 2 to XOR 2 full vectors of data.  Then XOR the remaining data.
	_aesenc_loop	0,1,2,3
	_aesenclast_and_xor	0,1
	vaesenclast	RNDKEYLAST, AESDATA2, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA3, AESDATA1
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	add		$-2*VL, LEN
	jmp		.Lxor_tail_upto2vecs\@

.Lenc_tail_atmost2vecs\@:
	// 1 <= LEN <= 2*VL.  Generate 2 vectors of keystream blocks.  Then XOR
	// the remaining data.
	_aesenc_loop	0,1
	vaesenclast	RNDKEYLAST, AESDATA0, AESDATA0
	vaesenclast	RNDKEYLAST, AESDATA1, AESDATA1

.Lxor_tail_upto2vecs\@:
	cmp		$1*VL-1, LEN
	jle		.Lxor_tail_partial_vec_0\@
	_xor_data	0
	cmp		$2*VL-1, LEN
	jle		.Lxor_tail_partial_vec_1\@
	_xor_data	1
	jmp		.Ldone\@

.Lxor_tail_partial_vec_1\@:
	add		$-1*VL, LEN
	jz		.Ldone\@
	sub		$-1*VL, SRC
	sub		$-1*VL, DST
	_vmovdqa	AESDATA1, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_2\@:
	add		$-2*VL, LEN
	jz		.Ldone\@
	sub		$-2*VL, SRC
	sub		$-2*VL, DST
	_vmovdqa	AESDATA2, AESDATA0
	jmp		.Lxor_tail_partial_vec_0\@

.Lxor_tail_partial_vec_3\@:
	add		$-3*VL, LEN
	jz		.Ldone\@
	sub		$-3*VL, SRC
	sub		$-3*VL, DST
	_vmovdqa	AESDATA3, AESDATA0

.Lxor_tail_partial_vec_0\@:
	// XOR the remaining 1 <= LEN < VL bytes.  It's easy if masked
	// loads/stores are available; otherwise it's a bit harder...
.if USE_AVX512
	mov		$-1, %rax
	bzhi		LEN64, %rax, %rax
	kmovq		%rax, %k1
	vmovdqu8	(SRC), AESDATA1{%k1}{z}
	vpxord		AESDATA1, AESDATA0, AESDATA0
	vmovdqu8	AESDATA0, (DST){%k1}
.else
  .if VL == 32
	cmp		$16, LEN
	jl		1f
	vpxor		(SRC), AESDATA0_XMM, AESDATA1_XMM
	vmovdqu		AESDATA1_XMM, (DST)
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jz		.Ldone\@
	vextracti128	$1, AESDATA0, AESDATA0_XMM
1:
  .endif
	mov		LEN, %r10d
	_load_partial_block	SRC, AESDATA1_XMM, KEY, KEY32
	vpxor		AESDATA1_XMM, AESDATA0_XMM, AESDATA0_XMM
	mov		%r10d, %ecx
	_store_partial_block	AESDATA0_XMM, DST, KEY, KEY32
.endif

.Ldone\@:
.if VL > 16
	vzeroupper
.endif
	RET
.endm

// Below are the definitions of the functions generated by the above macro.
// They have the following prototypes:
//
// void aes_ctr64_crypt_##suffix(const struct crypto_aes_ctx *key,
//				 const u8 *src, u8 *dst, int len,
//				 const u64 le_ctr[2]);
//
// void aes_xctr_crypt_##suffix(const struct crypto_aes_ctx *key,
//				const u8 *src, u8 *dst, int len,
//				const u8 iv[AES_BLOCK_SIZE], u64 ctr);
//
// Both functions generate |len| bytes of keystream, XOR it with the data from
// |src|, and write the result to |dst|.  On non-final calls, |len| must be a
// multiple of 16.  On the final call, |len| can be any value.
//
// aes_ctr64_crypt_* implement "regular" CTR, where the keystream is generated
// from a 128-bit big endian counter that increments by 1 for each AES block.
// HOWEVER, to keep the assembly code simple, some of the counter management is
// left to the caller.  aes_ctr64_crypt_* take the counter in little endian
// form, only increment the low 64 bits internally, do the conversion to big
// endian internally, and don't write the updated counter back to memory.  The
// caller is responsible for converting the starting IV to the little endian
// le_ctr, detecting the (very rare) case where a carry out of the low 64 bits
// is needed and splitting the operation at that point with a carry done in
// between, and updating le_ctr after each part if the message is multi-part.
//
// aes_xctr_crypt_* implement XCTR as specified in "Length-preserving encryption
// with HCTR2" (https://eprint.iacr.org/2021/1441.pdf).  XCTR is an
// easier-to-implement variant of CTR that uses little endian byte order and
// eliminates carries.  |ctr| is the per-message block counter starting at 1.
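//
// For illustration only, a caller of aes_ctr64_crypt_##suffix might handle the
// carry split described above roughly as follows.  This is a sketch, not the
// actual C glue code; |iv| (the big endian starting counter as a be128),
// |nbytes|, |src|, |dst|, and |key| are assumed names, and the lengths
// involved are assumed to fit in int:
//
//	u64 le_ctr[2] = { be64_to_cpu(iv.b), be64_to_cpu(iv.a) };
//	u64 nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
//	u64 blocks_before_carry = -le_ctr[0];	/* 0 means no carry can occur */
//
//	if (blocks_before_carry && nblocks > blocks_before_carry) {
//		int part = blocks_before_carry * AES_BLOCK_SIZE;
//
//		aes_ctr64_crypt_##suffix(key, src, dst, part, le_ctr);
//		le_ctr[0] = 0;
//		le_ctr[1]++;			/* do the carry by hand */
//		src += part;
//		dst += part;
//		nbytes -= part;
//	}
//	aes_ctr64_crypt_##suffix(key, src, dst, nbytes, le_ctr);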

.set	VL, 16
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xctr_crypt_aesni_avx)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)

.set	VL, 64
.set	USE_AVX512, 1
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
	_aes_ctr_crypt	0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
	_aes_ctr_crypt	1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ