#! /usr/bin/env perl
# Copyright (C) 2023 Intel Corporation
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ)
# from the Intel(R) Intelligent Storage Acceleration Library Crypto Version
# (https://github.com/intel/isa-l_crypto).
#
######################################################################
# The main building block of the loop is code that encrypts/decrypts
# 8/16 blocks of data, stitched with generation of the tweaks for the next
# 8/16 blocks, utilizing VAES and VPCLMULQDQ instructions at the full width
# of the ZMM registers. The main loop is selected based on the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel and is selected
# when input length >= 256 bytes (16 blocks or more);
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel and is selected
# when 128 bytes <= input length < 256 bytes (8-15 blocks);
# input length < 128 bytes (1-7 blocks) is handled by do_n_blocks.
#
# This implementation mainly uses vpshrdq from the AVX-512 VBMI2 extension
# together with vaesenc/vaesdec (VAES) and vpclmulqdq (VPCLMULQDQ) operating
# on full-width ZMM registers.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512vaes=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([0-9]+)\.([0-9]+)/) {
    my $ver = $1 + $2/100.0;    # 3.1->3.01, 3.10->3.10
    $avx512vaes = ($ver >= 2.30);
}

if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([0-9]+)\.([0-9]+)(?:\.([0-9]+))?/) {
    my $ver = $1 + $2/100.0 + $3/10000.0;    # 3.1.0->3.01, 3.10.1->3.1001
    $avx512vaes = ($ver >= 2.1108);
}

if (!$avx512vaes && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0;    # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512vaes = ($ver>=10.0001);
    } else {
        $avx512vaes = ($ver>=7.0);
    }
}

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

#======================================================================

if ($avx512vaes) {

  my $GP_STORAGE  = $win64 ? (16 * 18) : (16 * 8);    # offset of GP save area (rbx; plus rdi/rsi on win64)
  my $XMM_STORAGE = $win64 ? (16 * 8) : 0;            # offset of XMM save area (xmm6:xmm15, win64 only)
  my $VARIABLE_OFFSET = $win64 ? (16*8 + 16*10 + 8*3) :
                                 (16*8 + 8*1);

  # Offsets 0x00-0x7f of the frame hold the eight tweak values; everything
  # at or above 0x80 (128) is the register save area defined above. All
  # stack accesses must go through $TW; do not shadow it with any other
  # name or use %rsp directly.
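  # A minimal reference sketch (comments only, nothing is emitted): the
  # scalar shl/adc/cmovc/xor sequences used below to derive consecutive
  # tweaks implement a multiplication by x (alpha) in GF(2^128) with the
  # reduction polynomial 0x87, roughly equivalent to this hypothetical C
  # model of one tweak update (names are illustrative only):
  #
  #   void xts_mult_x(uint64_t t[2]) {             /* t = 128-bit tweak, little-endian halves */
  #       uint64_t carry = t[1] >> 63;              /* bit 127 about to be shifted out */
  #       t[1] = (t[1] << 1) | (t[0] >> 63);        /* shift high half, pull in low carry */
  #       t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);  /* shift low half, reduce mod polynomial */
  #   }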
  my $TW = "%rsp";
  my $TEMPHIGH = "%rbx";
  my $TEMPLOW = "%rax";
  my $ZPOLY = "%zmm25";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Function arguments abstraction
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  my ($key2, $key1, $tweak, $length, $input, $output);

  $input  = "%rdi";
  $output = "%rsi";
  $length = "%rdx";
  $key1   = "%rcx";
  $key2   = "%r8";
  $tweak  = "%r9";

  # arguments for temp parameters
  my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp);
  $tmp1            = "%r8";
  $gf_poly_8b      = "%r10";
  $gf_poly_8b_temp = "%r11";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Helper functions
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  # Generates "random" local labels
  sub random_string() {
    my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
    my $length = 15;
    my $str;
    map { $str .= $chars[rand(33)] } 1 .. $length;
    return $str;
  }

  # ; Seed the RNG so the labels are generated deterministically
  srand(12345);

  sub encrypt_tweak {
    my $state_tweak = $_[0];
    my $is_128      = $_[1];

    $code.=<<___;
    vpxor ($key2), $state_tweak, $state_tweak
    vaesenc 0x10($key2), $state_tweak, $state_tweak
    vaesenc 0x20($key2), $state_tweak, $state_tweak
    vaesenc 0x30($key2), $state_tweak, $state_tweak
    vaesenc 0x40($key2), $state_tweak, $state_tweak
    vaesenc 0x50($key2), $state_tweak, $state_tweak
    vaesenc 0x60($key2), $state_tweak, $state_tweak
    vaesenc 0x70($key2), $state_tweak, $state_tweak
    vaesenc 0x80($key2), $state_tweak, $state_tweak
    vaesenc 0x90($key2), $state_tweak, $state_tweak
___

    if ($is_128) {
      $code .= "vaesenclast 0xa0($key2), $state_tweak, $state_tweak\n";
    } else {
      $code .= "vaesenc 0xa0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenc 0xb0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenc 0xc0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenc 0xd0($key2), $state_tweak, $state_tweak\n";
      $code .= "vaesenclast 0xe0($key2), $state_tweak, $state_tweak\n";
    }
    $code .= "vmovdqa $state_tweak, ($TW)\n";
  }

  sub encrypt_final {
    my $st     = $_[0];
    my $tw     = $_[1];
    my $is_128 = $_[2];

    # xor Tweak value
    $code .= "vpxor $tw, $st, $st\n";
    $code .= "vpxor ($key1), $st, $st\n";

    my $rounds = $is_128 ?
10 : 14; 157 for (my $i = 1; $i < $rounds; $i++) { 158 $code .= "vaesenc 16*$i($key1), $st, $st\n"; 159 } 160 161 $code .=<<___; 162 vaesenclast 16*$rounds($key1), $st, $st 163 vpxor $tw, $st, $st 164___ 165 } 166 167 # decrypt initial blocks of AES 168 # 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted 169 # next 8 Tweak values are generated 170 sub decrypt_initial { 171 my @st; 172 $st[0] = $_[0]; 173 $st[1] = $_[1]; 174 $st[2] = $_[2]; 175 $st[3] = $_[3]; 176 $st[4] = $_[4]; 177 $st[5] = $_[5]; 178 $st[6] = $_[6]; 179 $st[7] = $_[7]; 180 181 my @tw; 182 $tw[0] = $_[8]; 183 $tw[1] = $_[9]; 184 $tw[2] = $_[10]; 185 $tw[3] = $_[11]; 186 $tw[4] = $_[12]; 187 $tw[5] = $_[13]; 188 $tw[6] = $_[14]; 189 my $t0 = $_[15]; 190 my $num_blocks = $_[16]; 191 my $lt128 = $_[17]; 192 my $is_128 = $_[18]; 193 194 # num_blocks blocks encrypted 195 # num_blocks can be 1, 2, 3, 4, 5, 6, 7 196 197 # xor Tweak value 198 for (my $i = 0; $i < $num_blocks; $i++) { 199 $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; 200 } 201 202 $code .= "vmovdqu ($key1), $t0\n"; 203 204 for (my $i = 0; $i < $num_blocks; $i++) { 205 $code .= "vpxor $t0, $st[$i], $st[$i]\n"; 206 } 207 208 if (0 == $lt128) { 209 $code .= <<___; 210 xor $gf_poly_8b_temp, $gf_poly_8b_temp 211 shl \$1, $TEMPLOW 212 adc $TEMPHIGH, $TEMPHIGH 213___ 214 } 215 # round 1 216 $code .= "vmovdqu 0x10($key1), $t0\n"; 217 218 for (my $i = 0; $i < $num_blocks; $i++) { 219 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 220 } 221 222 if (0 == $lt128) { 223 $code .= <<___; 224 cmovc $gf_poly_8b, $gf_poly_8b_temp 225 xor $gf_poly_8b_temp, $TEMPLOW 226 mov $TEMPLOW, ($TW) # next Tweak1 generated 227 mov $TEMPLOW, 0x08($TW) 228 xor $gf_poly_8b_temp, $gf_poly_8b_temp 229___ 230 } 231 232 # round 2 233 $code .= "vmovdqu 0x20($key1), $t0\n"; 234 235 for (my $i = 0; $i < $num_blocks; $i++) { 236 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 237 } 238 239 if (0 == $lt128) { 240 $code .= <<___; 241 shl \$1, $TEMPLOW 242 adc $TEMPHIGH, $TEMPHIGH 243 cmovc $gf_poly_8b, $gf_poly_8b_temp 244 xor $gf_poly_8b_temp, $TEMPLOW 245 mov $TEMPLOW, 0x10($TW) # next Tweak2 generated 246___ 247 } 248 249 # round 3 250 $code .= "vmovdqu 0x30($key1), $t0\n"; 251 252 for (my $i = 0; $i < $num_blocks; $i++) { 253 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 254 } 255 256 if (0 == $lt128) { 257 $code .= <<___; 258 mov $TEMPHIGH, 0x18($TW) 259 xor $gf_poly_8b_temp, $gf_poly_8b_temp 260 shl \$1, $TEMPLOW 261 adc $TEMPHIGH, $TEMPHIGH 262 cmovc $gf_poly_8b, $gf_poly_8b_temp 263___ 264 } 265 266 # round 4 267 $code .= "vmovdqu 0x40($key1), $t0\n"; 268 269 for (my $i = 0; $i < $num_blocks; $i++) { 270 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 271 } 272 273 if (0 == $lt128) { 274 $code .= <<___; 275 xor $gf_poly_8b_temp, $TEMPLOW 276 mov $TEMPLOW, 0x20($TW) # next Tweak3 generated 277 mov $TEMPHIGH, 0x28($TW) 278 xor $gf_poly_8b_temp, $gf_poly_8b_temp 279 shl \$1, $TEMPLOW 280___ 281 } 282 283 # round 5 284 $code .= "vmovdqu 0x50($key1), $t0\n"; 285 286 for (my $i = 0; $i < $num_blocks; $i++) { 287 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 288 } 289 290 if (0 == $lt128) { 291 $code .= <<___; 292 adc $TEMPHIGH, $TEMPHIGH 293 cmovc $gf_poly_8b, $gf_poly_8b_temp 294 xor $gf_poly_8b_temp, $TEMPLOW 295 mov $TEMPLOW, 0x30($TW) # next Tweak4 generated 296 mov $TEMPHIGH, 0x38($TW) 297___ 298 } 299 300 # round 6 301 $code .= "vmovdqu 0x60($key1), $t0\n"; 302 303 for (my $i = 0; $i < $num_blocks; $i++) { 304 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 305 } 306 307 if (0 == $lt128) { 308 $code .= <<___; 309 xor 
$gf_poly_8b_temp, $gf_poly_8b_temp 310 shl \$1, $TEMPLOW 311 adc $TEMPHIGH, $TEMPHIGH 312 cmovc $gf_poly_8b, $gf_poly_8b_temp 313 xor $gf_poly_8b_temp, $TEMPLOW 314 mov $TEMPLOW, 0x40($TW) # next Tweak5 generated 315 mov $TEMPHIGH, 0x48($TW) 316___ 317 } 318 319 # round 7 320 $code .= "vmovdqu 0x70($key1), $t0\n"; 321 322 for (my $i = 0; $i < $num_blocks; $i++) { 323 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 324 } 325 326 if (0 == $lt128) { 327 $code .= <<___; 328 xor $gf_poly_8b_temp, $gf_poly_8b_temp 329 shl \$1, $TEMPLOW 330 adc $TEMPHIGH, $TEMPHIGH 331 cmovc $gf_poly_8b, $gf_poly_8b_temp 332 xor $gf_poly_8b_temp, $TEMPLOW 333 mov $TEMPLOW, 0x50($TW) # next Tweak6 generated 334 mov $TEMPHIGH, 0x58($TW) 335___ 336 } 337 338 # round 8 339 $code .= "vmovdqu 0x80($key1), $t0\n"; 340 341 for (my $i = 0; $i < $num_blocks; $i++) { 342 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 343 } 344 345 if (0 == $lt128) { 346 $code .= <<___; 347 xor $gf_poly_8b_temp, $gf_poly_8b_temp 348 shl \$1, $TEMPLOW 349 adc $TEMPHIGH, $TEMPHIGH 350 cmovc $gf_poly_8b, $gf_poly_8b_temp 351 xor $gf_poly_8b_temp, $TEMPLOW 352 mov $TEMPLOW, 0x60($TW) # next Tweak7 generated 353 mov $TEMPHIGH, 0x68($TW) 354___ 355 } 356 357 # round 9 358 $code .= "vmovdqu 0x90($key1), $t0\n"; 359 360 for (my $i = 0; $i < $num_blocks; $i++) { 361 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 362 } 363 364 if (0 == $lt128) { 365 $code .= <<___; 366 xor $gf_poly_8b_temp, $gf_poly_8b_temp 367 shl \$1, $TEMPLOW 368 adc $TEMPHIGH, $TEMPHIGH 369 cmovc $gf_poly_8b, $gf_poly_8b_temp 370 xor $gf_poly_8b_temp, $TEMPLOW 371 mov $TEMPLOW, 0x70($TW) # next Tweak8 generated 372 mov $TEMPHIGH, 0x78($TW) 373___ 374 } 375 376 if ($is_128) { 377 # round 10 378 $code .= "vmovdqu 0xa0($key1), $t0\n"; 379 for (my $i = 0; $i < $num_blocks; $i++) { 380 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 381 } 382 } else { 383 # round 10 384 $code .= "vmovdqu 0xa0($key1), $t0\n"; 385 for (my $i = 0; $i < $num_blocks; $i++) { 386 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 387 } 388 389 # round 11 390 $code .= "vmovdqu 0xb0($key1), $t0\n"; 391 for (my $i = 0; $i < $num_blocks; $i++) { 392 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 393 } 394 395 # round 12 396 $code .= "vmovdqu 0xc0($key1), $t0\n"; 397 for (my $i = 0; $i < $num_blocks; $i++) { 398 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 399 } 400 401 # round 13 402 $code .= "vmovdqu 0xd0($key1), $t0\n"; 403 for (my $i = 0; $i < $num_blocks; $i++) { 404 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 405 } 406 407 # round 14 408 $code .= "vmovdqu 0xe0($key1), $t0\n"; 409 for (my $i = 0; $i < $num_blocks; $i++) { 410 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 411 } 412 } 413 414 # xor Tweak values 415 for (my $i = 0; $i < $num_blocks; $i++) { 416 $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n"; 417 } 418 419 if (0 == $lt128) { 420 # load next Tweak values 421 $code .= <<___; 422 vmovdqa ($TW), $tw1 423 vmovdqa 0x10($TW), $tw2 424 vmovdqa 0x20($TW), $tw3 425 vmovdqa 0x30($TW), $tw4 426 vmovdqa 0x40($TW), $tw5 427 vmovdqa 0x50($TW), $tw6 428 vmovdqa 0x60($TW), $tw7 429___ 430 } 431 } 432 433 sub initialize { 434 my @st; 435 $st[0] = $_[0]; 436 $st[1] = $_[1]; 437 $st[2] = $_[2]; 438 $st[3] = $_[3]; 439 $st[4] = $_[4]; 440 $st[5] = $_[5]; 441 $st[6] = $_[6]; 442 $st[7] = $_[7]; 443 444 my @tw; 445 $tw[0] = $_[8]; 446 $tw[1] = $_[9]; 447 $tw[2] = $_[10]; 448 $tw[3] = $_[11]; 449 $tw[4] = $_[12]; 450 $tw[5] = $_[13]; 451 $tw[6] = $_[14]; 452 my $num_initial_blocks = $_[15]; 453 454 $code .= <<___; 455 vmovdqa 0x0($TW), 
$tw[0] 456 mov 0x0($TW), $TEMPLOW 457 mov 0x08($TW), $TEMPHIGH 458 vmovdqu 0x0($input), $st[0] 459___ 460 461 if ($num_initial_blocks >= 2) { 462 for (my $i = 1; $i < $num_initial_blocks; $i++) { 463 $code .= "xor $gf_poly_8b_temp, $gf_poly_8b_temp\n"; 464 $code .= "shl \$1, $TEMPLOW\n"; 465 $code .= "adc $TEMPHIGH, $TEMPHIGH\n"; 466 $code .= "cmovc $gf_poly_8b, $gf_poly_8b_temp\n"; 467 $code .= "xor $gf_poly_8b_temp, $TEMPLOW\n"; 468 my $offset = $i * 16; 469 $code .= "mov $TEMPLOW, $offset($TW)\n"; 470 $code .= "mov $TEMPHIGH, $offset + 8($TW)\n"; 471 $code .= "vmovdqa $offset($TW), $tw[$i]\n"; 472 $code .= "vmovdqu $offset($input), $st[$i]\n"; 473 } 474 } 475 } 476 477 # Encrypt 4 blocks in parallel 478 sub encrypt_by_four { 479 my $st1 = $_[0]; # state 1 480 my $tw1 = $_[1]; # tweak 1 481 my $tmp = $_[2]; 482 my $is_128 = $_[3]; 483 484 $code .= "vbroadcasti32x4 ($key1), $tmp\n"; 485 $code .= "vpternlogq \$0x96, $tmp, $tw1, $st1\n"; 486 487 my $rounds = $is_128 ? 10 : 14; 488 for (my $i = 1; $i < $rounds; $i++) { 489 $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n"; 490 $code .= "vaesenc $tmp, $st1, $st1\n"; 491 } 492 493 $code .= "vbroadcasti32x4 16*$rounds($key1), $tmp\n"; 494 $code .= "vaesenclast $tmp, $st1, $st1\n"; 495 496 $code .= "vpxorq $tw1, $st1, $st1\n"; 497 } 498 499 # Encrypt 8 blocks in parallel 500 # generate next 8 tweak values 501 sub encrypt_by_eight_zmm { 502 my $st1 = $_[0]; 503 my $st2 = $_[1]; 504 my $tw1 = $_[2]; 505 my $tw2 = $_[3]; 506 my $t0 = $_[4]; 507 my $last_eight = $_[5]; 508 my $is_128 = $_[6]; 509 510 $code .= <<___; 511 vbroadcasti32x4 ($key1), $t0 512 vpternlogq \$0x96, $t0, $tw1, $st1 513 vpternlogq \$0x96, $t0, $tw2, $st2 514___ 515 516 if (0 == $last_eight) { 517 $code .= <<___; 518 vpsrldq \$0xf, $tw1, %zmm13 519 vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 520 vpslldq \$0x1, $tw1, %zmm15 521 vpxord %zmm14, %zmm15, %zmm15 522___ 523 } 524 # round 1 525 $code .= <<___; 526 vbroadcasti32x4 0x10($key1), $t0 527 vaesenc $t0, $st1, $st1 528 vaesenc $t0, $st2, $st2 529 530 # round 2 531 vbroadcasti32x4 0x20($key1), $t0 532 vaesenc $t0, $st1, $st1 533 vaesenc $t0, $st2, $st2 534 535 # round 3 536 vbroadcasti32x4 0x30($key1), $t0 537 vaesenc $t0, $st1, $st1 538 vaesenc $t0, $st2, $st2 539___ 540 541 if (0 == $last_eight) { 542 $code .= <<___; 543 vpsrldq \$0xf, $tw2, %zmm13 544 vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14 545 vpslldq \$0x1, $tw2, %zmm16 546 vpxord %zmm14, %zmm16, %zmm16 547___ 548 } 549 550 $code .= <<___; 551 # round 4 552 vbroadcasti32x4 0x40($key1), $t0 553 vaesenc $t0, $st1, $st1 554 vaesenc $t0, $st2, $st2 555 556 # round 5 557 vbroadcasti32x4 0x50($key1), $t0 558 vaesenc $t0, $st1, $st1 559 vaesenc $t0, $st2, $st2 560 561 # round 6 562 vbroadcasti32x4 0x60($key1), $t0 563 vaesenc $t0, $st1, $st1 564 vaesenc $t0, $st2, $st2 565 566 # round 7 567 vbroadcasti32x4 0x70($key1), $t0 568 vaesenc $t0, $st1, $st1 569 vaesenc $t0, $st2, $st2 570 571 # round 8 572 vbroadcasti32x4 0x80($key1), $t0 573 vaesenc $t0, $st1, $st1 574 vaesenc $t0, $st2, $st2 575 576 # round 9 577 vbroadcasti32x4 0x90($key1), $t0 578 vaesenc $t0, $st1, $st1 579 vaesenc $t0, $st2, $st2 580___ 581 582 if ($is_128) { 583 $code .= <<___; 584 # round 10 585 vbroadcasti32x4 0xa0($key1), $t0 586 vaesenclast $t0, $st1, $st1 587 vaesenclast $t0, $st2, $st2 588___ 589 } else { 590 $code .= <<___; 591 # round 10 592 vbroadcasti32x4 0xa0($key1), $t0 593 vaesenc $t0, $st1, $st1 594 vaesenc $t0, $st2, $st2 595 596 # round 11 597 vbroadcasti32x4 0xb0($key1), $t0 598 vaesenc $t0, $st1, $st1 
599 vaesenc $t0, $st2, $st2 600 601 # round 12 602 vbroadcasti32x4 0xc0($key1), $t0 603 vaesenc $t0, $st1, $st1 604 vaesenc $t0, $st2, $st2 605 606 # round 13 607 vbroadcasti32x4 0xd0($key1), $t0 608 vaesenc $t0, $st1, $st1 609 vaesenc $t0, $st2, $st2 610 611 # round 14 612 vbroadcasti32x4 0xe0($key1), $t0 613 vaesenclast $t0, $st1, $st1 614 vaesenclast $t0, $st2, $st2 615___ 616 } 617 618 # xor Tweak values 619 $code .= "vpxorq $tw1, $st1, $st1\n"; 620 $code .= "vpxorq $tw2, $st2, $st2\n"; 621 622 if (0 == $last_eight) { 623 # load next Tweak values 624 $code .= <<___; 625 vmovdqa32 %zmm15, $tw1 626 vmovdqa32 %zmm16, $tw2 627___ 628 } 629 } 630 631 # Decrypt 8 blocks in parallel 632 # generate next 8 tweak values 633 sub decrypt_by_eight_zmm { 634 my $st1 = $_[0]; 635 my $st2 = $_[1]; 636 my $tw1 = $_[2]; 637 my $tw2 = $_[3]; 638 my $t0 = $_[4]; 639 my $last_eight = $_[5]; 640 my $is_128 = $_[6]; 641 642 $code .= <<___; 643 # xor Tweak values 644 vpxorq $tw1, $st1, $st1 645 vpxorq $tw2, $st2, $st2 646 647 # ARK 648 vbroadcasti32x4 ($key1), $t0 649 vpxorq $t0, $st1, $st1 650 vpxorq $t0, $st2, $st2 651___ 652 653 if (0 == $last_eight) { 654 $code .= <<___; 655 vpsrldq \$0xf, $tw1, %zmm13 656 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 657 vpslldq \$0x1, $tw1, %zmm15 658 vpxord %zmm14, %zmm15, %zmm15 659___ 660 } 661 # round 1 662 $code .= <<___; 663 vbroadcasti32x4 0x10($key1), $t0 664 vaesdec $t0, $st1, $st1 665 vaesdec $t0, $st2, $st2 666 667 # round 2 668 vbroadcasti32x4 0x20($key1), $t0 669 vaesdec $t0, $st1, $st1 670 vaesdec $t0, $st2, $st2 671 672 # round 3 673 vbroadcasti32x4 0x30($key1), $t0 674 vaesdec $t0, $st1, $st1 675 vaesdec $t0, $st2, $st2 676___ 677 678 if (0 == $last_eight) { 679 $code .= <<___; 680 vpsrldq \$0xf, $tw2, %zmm13 681 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 682 vpslldq \$0x1, $tw2, %zmm16 683 vpxord %zmm14, %zmm16, %zmm16 684___ 685 } 686 687 $code .= <<___; 688 # round 4 689 vbroadcasti32x4 0x40($key1), $t0 690 vaesdec $t0, $st1, $st1 691 vaesdec $t0, $st2, $st2 692 693 # round 5 694 vbroadcasti32x4 0x50($key1), $t0 695 vaesdec $t0, $st1, $st1 696 vaesdec $t0, $st2, $st2 697 698 # round 6 699 vbroadcasti32x4 0x60($key1), $t0 700 vaesdec $t0, $st1, $st1 701 vaesdec $t0, $st2, $st2 702 703 # round 7 704 vbroadcasti32x4 0x70($key1), $t0 705 vaesdec $t0, $st1, $st1 706 vaesdec $t0, $st2, $st2 707 708 # round 8 709 vbroadcasti32x4 0x80($key1), $t0 710 vaesdec $t0, $st1, $st1 711 vaesdec $t0, $st2, $st2 712 713 # round 9 714 vbroadcasti32x4 0x90($key1), $t0 715 vaesdec $t0, $st1, $st1 716 vaesdec $t0, $st2, $st2 717 718___ 719 if ($is_128) { 720 $code .= <<___; 721 # round 10 722 vbroadcasti32x4 0xa0($key1), $t0 723 vaesdeclast $t0, $st1, $st1 724 vaesdeclast $t0, $st2, $st2 725___ 726 } else { 727 $code .= <<___; 728 # round 10 729 vbroadcasti32x4 0xa0($key1), $t0 730 vaesdec $t0, $st1, $st1 731 vaesdec $t0, $st2, $st2 732 733 # round 11 734 vbroadcasti32x4 0xb0($key1), $t0 735 vaesdec $t0, $st1, $st1 736 vaesdec $t0, $st2, $st2 737 738 # round 12 739 vbroadcasti32x4 0xc0($key1), $t0 740 vaesdec $t0, $st1, $st1 741 vaesdec $t0, $st2, $st2 742 743 # round 13 744 vbroadcasti32x4 0xd0($key1), $t0 745 vaesdec $t0, $st1, $st1 746 vaesdec $t0, $st2, $st2 747 748 # round 14 749 vbroadcasti32x4 0xe0($key1), $t0 750 vaesdeclast $t0, $st1, $st1 751 vaesdeclast $t0, $st2, $st2 752___ 753 } 754 755 $code .= <<___; 756 # xor Tweak values 757 vpxorq $tw1, $st1, $st1 758 vpxorq $tw2, $st2, $st2 759 760 # load next Tweak values 761 vmovdqa32 %zmm15, $tw1 762 vmovdqa32 %zmm16, $tw2 
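	# zmm15/zmm16 (computed above when more blocks follow) hold the
	# previous tweaks multiplied by x^8 in GF(2^128); copying them here
	# stages the tweak set for the next 8-block iteration.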
763___ 764 } 765 766 # Encrypt 16 blocks in parallel 767 # generate next 16 tweak values 768 sub encrypt_by_16_zmm { 769 my @st; 770 $st[0] = $_[0]; 771 $st[1] = $_[1]; 772 $st[2] = $_[2]; 773 $st[3] = $_[3]; 774 775 my @tw; 776 $tw[0] = $_[4]; 777 $tw[1] = $_[5]; 778 $tw[2] = $_[6]; 779 $tw[3] = $_[7]; 780 781 my $t0 = $_[8]; 782 my $last_eight = $_[9]; 783 my $is_128 = $_[10]; 784 785 # xor Tweak values 786 for (my $i = 0; $i < 4; $i++) { 787 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 788 } 789 790 # ARK 791 $code .= "vbroadcasti32x4 ($key1), $t0\n"; 792 for (my $i = 0; $i < 4; $i++) { 793 $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; 794 } 795 796 if (0 == $last_eight) { 797 $code .= <<___; 798 vpsrldq \$0xf, $tw[2], %zmm13 799 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 800 vpslldq \$0x1, $tw[2], %zmm15 801 vpxord %zmm14, %zmm15, %zmm15 802___ 803 } 804 805 # round 1 806 $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; 807 for (my $i = 0; $i < 4; $i++) { 808 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 809 } 810 811 # round 2 812 $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; 813 for (my $i = 0; $i < 4; $i++) { 814 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 815 } 816 817 # round 3 818 $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; 819 for (my $i = 0; $i < 4; $i++) { 820 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 821 } 822 823 if (0 == $last_eight) { 824 $code .= <<___; 825 vpsrldq \$0xf, $tw[3], %zmm13 826 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 827 vpslldq \$0x1, $tw[3], %zmm16 828 vpxord %zmm14, %zmm16, %zmm16 829___ 830 } 831 # round 4 832 $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; 833 for (my $i = 0; $i < 4; $i++) { 834 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 835 } 836 837 # round 5 838 $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; 839 for (my $i = 0; $i < 4; $i++) { 840 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 841 } 842 843 # round 6 844 $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; 845 for (my $i = 0; $i < 4; $i++) { 846 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 847 } 848 849 if (0 == $last_eight) { 850 $code .= <<___; 851 vpsrldq \$0xf, %zmm15, %zmm13 852 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 853 vpslldq \$0x1, %zmm15, %zmm17 854 vpxord %zmm14, %zmm17, %zmm17 855___ 856 } 857 # round 7 858 $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; 859 for (my $i = 0; $i < 4; $i++) { 860 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 861 } 862 863 # round 8 864 $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; 865 for (my $i = 0; $i < 4; $i++) { 866 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 867 } 868 869 # round 9 870 $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; 871 for (my $i = 0; $i < 4; $i++) { 872 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 873 } 874 875 if (0 == $last_eight) { 876 $code .= <<___; 877 vpsrldq \$0xf, %zmm16, %zmm13 878 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 879 vpslldq \$0x1, %zmm16, %zmm18 880 vpxord %zmm14, %zmm18, %zmm18 881___ 882 } 883 if ($is_128) { 884 # round 10 885 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 886 for (my $i = 0; $i < 4; $i++) { 887 $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; 888 } 889 } else { 890 # round 10 891 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 892 for (my $i = 0; $i < 4; $i++) { 893 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 894 } 895 # round 11 896 $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; 897 for (my $i = 0; $i < 4; $i++) { 898 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 899 } 900 # round 12 901 $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; 902 for (my $i = 0; $i < 4; $i++) { 903 $code .= "vaesenc $t0, 
$st[$i], $st[$i]\n"; 904 } 905 # round 13 906 $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; 907 for (my $i = 0; $i < 4; $i++) { 908 $code .= "vaesenc $t0, $st[$i], $st[$i]\n"; 909 } 910 # round 14 911 $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; 912 for (my $i = 0; $i < 4; $i++) { 913 $code .= "vaesenclast $t0, $st[$i], $st[$i]\n"; 914 } 915 } 916 917 # xor Tweak values 918 for (my $i = 0; $i < 4; $i++) { 919 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 920 } 921 922 $code .= <<___; 923 # load next Tweak values 924 vmovdqa32 %zmm15, $tw[0] 925 vmovdqa32 %zmm16, $tw[1] 926 vmovdqa32 %zmm17, $tw[2] 927 vmovdqa32 %zmm18, $tw[3] 928___ 929 } 930 931 # Decrypt 16 blocks in parallel 932 # generate next 8 tweak values 933 sub decrypt_by_16_zmm { 934 my @st; 935 $st[0] = $_[0]; 936 $st[1] = $_[1]; 937 $st[2] = $_[2]; 938 $st[3] = $_[3]; 939 940 my @tw; 941 $tw[0] = $_[4]; 942 $tw[1] = $_[5]; 943 $tw[2] = $_[6]; 944 $tw[3] = $_[7]; 945 946 my $t0 = $_[8]; 947 my $last_eight = $_[9]; 948 my $is_128 = $_[10]; 949 950 # xor Tweak values 951 for (my $i = 0; $i < 4; $i++) { 952 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 953 } 954 955 # ARK 956 $code .= "vbroadcasti32x4 ($key1), $t0\n"; 957 for (my $i = 0; $i < 4; $i++) { 958 $code .= "vpxorq $t0, $st[$i], $st[$i]\n"; 959 } 960 961 if (0 == $last_eight) { 962 $code .= <<___; 963 vpsrldq \$0xf, $tw[2], %zmm13 964 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 965 vpslldq \$0x1, $tw[2], %zmm15 966 vpxord %zmm14, %zmm15, %zmm15 967___ 968 } 969 970 # round 1 971 $code .= "vbroadcasti32x4 0x10($key1), $t0\n"; 972 for (my $i = 0; $i < 4; $i++) { 973 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 974 } 975 976 # round 2 977 $code .= "vbroadcasti32x4 0x20($key1), $t0\n"; 978 for (my $i = 0; $i < 4; $i++) { 979 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 980 } 981 982 # round 3 983 $code .= "vbroadcasti32x4 0x30($key1), $t0\n"; 984 for (my $i = 0; $i < 4; $i++) { 985 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 986 } 987 988 if (0 == $last_eight) { 989 $code .= <<___; 990 vpsrldq \$0xf, $tw[3], %zmm13 991 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 992 vpslldq \$0x1, $tw[3], %zmm16 993 vpxord %zmm14, %zmm16, %zmm16 994___ 995 } 996 # round 4 997 $code .= "vbroadcasti32x4 0x40($key1), $t0\n"; 998 for (my $i = 0; $i < 4; $i++) { 999 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1000 } 1001 1002 # round 5 1003 $code .= "vbroadcasti32x4 0x50($key1), $t0\n"; 1004 for (my $i = 0; $i < 4; $i++) { 1005 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1006 } 1007 1008 # round 6 1009 $code .= "vbroadcasti32x4 0x60($key1), $t0\n"; 1010 for (my $i = 0; $i < 4; $i++) { 1011 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1012 } 1013 1014 if (0 == $last_eight) { 1015 $code .= <<___; 1016 vpsrldq \$0xf, %zmm15, %zmm13 1017 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 1018 vpslldq \$0x1, %zmm15, %zmm17 1019 vpxord %zmm14, %zmm17, %zmm17 1020___ 1021 } 1022 # round 7 1023 $code .= "vbroadcasti32x4 0x70($key1), $t0\n"; 1024 for (my $i = 0; $i < 4; $i++) { 1025 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1026 } 1027 1028 # round 8 1029 $code .= "vbroadcasti32x4 0x80($key1), $t0\n"; 1030 for (my $i = 0; $i < 4; $i++) { 1031 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1032 } 1033 1034 # round 9 1035 $code .= "vbroadcasti32x4 0x90($key1), $t0\n"; 1036 for (my $i = 0; $i < 4; $i++) { 1037 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1038 } 1039 1040 if (0 == $last_eight) { 1041 $code .= <<___; 1042 vpsrldq \$0xf, %zmm16, %zmm13 1043 vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14 1044 vpslldq \$0x1, %zmm16, %zmm18 
1045 vpxord %zmm14, %zmm18, %zmm18 1046___ 1047 } 1048 if ($is_128) { 1049 # round 10 1050 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 1051 for (my $i = 0; $i < 4; $i++) { 1052 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 1053 } 1054 } else { 1055 # round 10 1056 $code .= "vbroadcasti32x4 0xa0($key1), $t0\n"; 1057 for (my $i = 0; $i < 4; $i++) { 1058 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1059 } 1060 1061 # round 11 1062 $code .= "vbroadcasti32x4 0xb0($key1), $t0\n"; 1063 for (my $i = 0; $i < 4; $i++) { 1064 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1065 } 1066 1067 # round 12 1068 $code .= "vbroadcasti32x4 0xc0($key1), $t0\n"; 1069 for (my $i = 0; $i < 4; $i++) { 1070 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1071 } 1072 1073 # round 13 1074 $code .= "vbroadcasti32x4 0xd0($key1), $t0\n"; 1075 for (my $i = 0; $i < 4; $i++) { 1076 $code .= "vaesdec $t0, $st[$i], $st[$i]\n"; 1077 } 1078 1079 # round 14 1080 $code .= "vbroadcasti32x4 0xe0($key1), $t0\n"; 1081 for (my $i = 0; $i < 4; $i++) { 1082 $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n"; 1083 } 1084 } 1085 1086 # xor Tweak values 1087 for (my $i = 0; $i < 4; $i++) { 1088 $code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n"; 1089 } 1090 1091 $code .= <<___; 1092 # load next Tweak values 1093 vmovdqa32 %zmm15, $tw[0] 1094 vmovdqa32 %zmm16, $tw[1] 1095 vmovdqa32 %zmm17, $tw[2] 1096 vmovdqa32 %zmm18, $tw[3] 1097___ 1098 } 1099 1100 $code .= ".text\n"; 1101 1102 { 1103 $code.=<<"___"; 1104 .extern OPENSSL_ia32cap_P 1105 .globl aesni_xts_avx512_eligible 1106 .type aesni_xts_avx512_eligible,\@abi-omnipotent 1107 .align 32 1108 aesni_xts_avx512_eligible: 1109 mov OPENSSL_ia32cap_P+8(%rip), %ecx 1110 xor %eax,%eax 1111 # 1<<31|1<<30|1<<17|1<<16 avx512vl + avx512bw + avx512dq + avx512f 1112 and \$0xc0030000, %ecx 1113 cmp \$0xc0030000, %ecx 1114 jne .L_done 1115 mov OPENSSL_ia32cap_P+12(%rip), %ecx 1116 # 1<<10|1<<9|1<<6 vaes + vpclmulqdq + vbmi2 1117 and \$0x640, %ecx 1118 cmp \$0x640, %ecx 1119 cmove %ecx,%eax 1120 .L_done: 1121 ret 1122 .size aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible 1123___ 1124 } 1125 1126 1127 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1128 # ;void aesni_xts_[128|256]_encrypt_avx512( 1129 # ; const uint8_t *in, // input data 1130 # ; uint8_t *out, // output data 1131 # ; size_t length, // sector size, in bytes 1132 # ; const AES_KEY *key1, // key used for "ECB" encryption 1133 # ; const AES_KEY *key2, // key used for tweaking 1134 # ; const uint8_t iv[16]) // initial tweak value, 16 bytes 1135 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1136 sub enc { 1137 my $is_128 = $_[0]; 1138 my $rndsuffix = &random_string(); 1139 1140 if ($is_128) { 1141 $code.=<<___; 1142 .globl aesni_xts_128_encrypt_avx512 1143 .hidden aesni_xts_128_encrypt_avx512 1144 .type aesni_xts_128_encrypt_avx512,\@function,6 1145 .align 32 1146 aesni_xts_128_encrypt_avx512: 1147 .cfi_startproc 1148 endbranch 1149___ 1150 } else { 1151 $code.=<<___; 1152 .globl aesni_xts_256_encrypt_avx512 1153 .hidden aesni_xts_256_encrypt_avx512 1154 .type aesni_xts_256_encrypt_avx512,\@function,6 1155 .align 32 1156 aesni_xts_256_encrypt_avx512: 1157 .cfi_startproc 1158 endbranch 1159___ 1160 } 1161 $code .= "push %rbp\n"; 1162 $code .= "mov $TW,%rbp\n"; 1163 $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; 1164 $code .= "and \$0xffffffffffffffc0,$TW\n"; 1165 $code .= "mov %rbx,$GP_STORAGE($TW)\n"; 1166 1167 if ($win64) { 1168 $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; 1169 $code .= "mov %rsi,$GP_STORAGE + 
8*2($TW)\n"; 1170 $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; 1171 $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; 1172 $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; 1173 $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; 1174 $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; 1175 $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; 1176 $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; 1177 $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; 1178 $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; 1179 $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; 1180 } 1181 1182 $code .= "mov \$0x87, $gf_poly_8b\n"; 1183 $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values 1184 1185 encrypt_tweak("%xmm1", $is_128); 1186 1187 if ($win64) { 1188 $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer 1189 $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer 1190 } 1191 1192 { 1193 $code.=<<___; 1194 1195 cmp \$0x80,$length 1196 jl .L_less_than_128_bytes_${rndsuffix} 1197 vpbroadcastq $gf_poly_8b,$ZPOLY 1198 cmp \$0x100,$length 1199 jge .L_start_by16_${rndsuffix} 1200 cmp \$0x80,$length 1201 jge .L_start_by8_${rndsuffix} 1202 1203 .L_do_n_blocks_${rndsuffix}: 1204 cmp \$0x0,$length 1205 je .L_ret_${rndsuffix} 1206 cmp \$0x70,$length 1207 jge .L_remaining_num_blocks_is_7_${rndsuffix} 1208 cmp \$0x60,$length 1209 jge .L_remaining_num_blocks_is_6_${rndsuffix} 1210 cmp \$0x50,$length 1211 jge .L_remaining_num_blocks_is_5_${rndsuffix} 1212 cmp \$0x40,$length 1213 jge .L_remaining_num_blocks_is_4_${rndsuffix} 1214 cmp \$0x30,$length 1215 jge .L_remaining_num_blocks_is_3_${rndsuffix} 1216 cmp \$0x20,$length 1217 jge .L_remaining_num_blocks_is_2_${rndsuffix} 1218 cmp \$0x10,$length 1219 jge .L_remaining_num_blocks_is_1_${rndsuffix} 1220 vmovdqa %xmm0,%xmm8 1221 vmovdqa %xmm9,%xmm0 1222 jmp .L_steal_cipher_${rndsuffix} 1223 1224 .L_remaining_num_blocks_is_7_${rndsuffix}: 1225 mov \$0x0000ffffffffffff,$tmp1 1226 kmovq $tmp1,%k1 1227 vmovdqu8 ($input),%zmm1 1228 vmovdqu8 0x40($input),%zmm2{%k1} 1229 add \$0x70,$input 1230___ 1231 } 1232 1233 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1234 1235 { 1236 $code .= <<___; 1237 vmovdqu8 %zmm1,($output) 1238 vmovdqu8 %zmm2,0x40($output){%k1} 1239 add \$0x70,$output 1240 vextracti32x4 \$0x2,%zmm2,%xmm8 1241 vextracti32x4 \$0x3,%zmm10,%xmm0 1242 and \$0xf,$length 1243 je .L_ret_${rndsuffix} 1244 jmp .L_steal_cipher_${rndsuffix} 1245 1246 .L_remaining_num_blocks_is_6_${rndsuffix}: 1247 vmovdqu8 ($input),%zmm1 1248 vmovdqu8 0x40($input),%ymm2 1249 add \$0x60,$input 1250___ 1251 } 1252 1253 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1254 1255 { 1256 $code .= <<___; 1257 vmovdqu8 %zmm1,($output) 1258 vmovdqu8 %ymm2,0x40($output) 1259 add \$0x60,$output 1260 vextracti32x4 \$0x1,%zmm2,%xmm8 1261 vextracti32x4 \$0x2,%zmm10,%xmm0 1262 and \$0xf,$length 1263 je .L_ret_${rndsuffix} 1264 jmp .L_steal_cipher_${rndsuffix} 1265 1266 .L_remaining_num_blocks_is_5_${rndsuffix}: 1267 vmovdqu8 ($input),%zmm1 1268 vmovdqu 0x40($input),%xmm2 1269 add \$0x50,$input 1270___ 1271 } 1272 1273 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1274 1275 { 1276 $code .= <<___; 1277 vmovdqu8 %zmm1,($output) 1278 vmovdqu %xmm2,0x40($output) 1279 add \$0x50,$output 1280 vmovdqa %xmm2,%xmm8 1281 vextracti32x4 \$0x1,%zmm10,%xmm0 1282 and \$0xf,$length 1283 je .L_ret_${rndsuffix} 1284 jmp .L_steal_cipher_${rndsuffix} 1285 1286 
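	# The 1-4 block remainders below load the data with a single
	# XMM/YMM/ZMM access (mask-assisted for 3 blocks); every path keeps
	# the last ciphertext block in xmm8 and the next tweak in xmm0 so a
	# trailing partial block can be handled by ciphertext stealing.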
.L_remaining_num_blocks_is_4_${rndsuffix}: 1287 vmovdqu8 ($input),%zmm1 1288 add \$0x40,$input 1289___ 1290 } 1291 1292 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1293 1294 { 1295 $code .= <<___; 1296 vmovdqu8 %zmm1,($output) 1297 add \$0x40,$output 1298 vextracti32x4 \$0x3,%zmm1,%xmm8 1299 vmovdqa64 %xmm10, %xmm0 1300 and \$0xf,$length 1301 je .L_ret_${rndsuffix} 1302 jmp .L_steal_cipher_${rndsuffix} 1303___ 1304 } 1305 1306 { 1307 $code .= <<___; 1308 .L_remaining_num_blocks_is_3_${rndsuffix}: 1309 mov \$-1, $tmp1 1310 shr \$0x10, $tmp1 1311 kmovq $tmp1, %k1 1312 vmovdqu8 ($input), %zmm1{%k1} 1313 add \$0x30, $input 1314___ 1315 } 1316 1317 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1318 1319 { 1320 $code .= <<___; 1321 vmovdqu8 %zmm1, ($output){%k1} 1322 add \$0x30, $output 1323 vextracti32x4 \$0x2, %zmm1, %xmm8 1324 vextracti32x4 \$0x3, %zmm9, %xmm0 1325 and \$0xf, $length 1326 je .L_ret_${rndsuffix} 1327 jmp .L_steal_cipher_${rndsuffix} 1328___ 1329 } 1330 1331 { 1332 $code .= <<___; 1333 .L_remaining_num_blocks_is_2_${rndsuffix}: 1334 vmovdqu8 ($input), %ymm1 1335 add \$0x20, $input 1336___ 1337 } 1338 1339 encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); 1340 1341 { 1342 $code .= <<___; 1343 vmovdqu %ymm1,($output) 1344 add \$0x20,$output 1345 vextracti32x4 \$0x1, %zmm1, %xmm8 1346 vextracti32x4 \$0x2,%zmm9,%xmm0 1347 and \$0xf,$length 1348 je .L_ret_${rndsuffix} 1349 jmp .L_steal_cipher_${rndsuffix} 1350___ 1351 } 1352 1353 { 1354 $code .= <<___; 1355 .L_remaining_num_blocks_is_1_${rndsuffix}: 1356 vmovdqu ($input),%xmm1 1357 add \$0x10,$input 1358___ 1359 } 1360 1361 encrypt_final("%xmm1", "%xmm9", $is_128); 1362 1363 { 1364 $code .= <<___; 1365 vmovdqu %xmm1,($output) 1366 add \$0x10,$output 1367 vmovdqa %xmm1,%xmm8 1368 vextracti32x4 \$0x1,%zmm9,%xmm0 1369 and \$0xf,$length 1370 je .L_ret_${rndsuffix} 1371 jmp .L_steal_cipher_${rndsuffix} 1372 1373 1374 .L_start_by16_${rndsuffix}: 1375 vbroadcasti32x4 ($TW),%zmm0 1376 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 1377 mov \$0xaa,$tmp1 1378 kmovq $tmp1,%k2 1379 vpshufb %zmm8,%zmm0,%zmm1 1380 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 1381 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 1382 vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 1383 vpxorq %zmm2,%zmm4,%zmm4{%k2} 1384 vpxord %zmm4,%zmm3,%zmm9 1385 vpsllvq const_dq7654(%rip),%zmm0,%zmm5 1386 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 1387 vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 1388 vpxorq %zmm6,%zmm5,%zmm5{%k2} 1389 vpxord %zmm5,%zmm7,%zmm10 1390 vpsrldq \$0xf,%zmm9,%zmm13 1391 vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14 1392 vpslldq \$0x1,%zmm9,%zmm11 1393 vpxord %zmm14,%zmm11,%zmm11 1394 vpsrldq \$0xf,%zmm10,%zmm15 1395 vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16 1396 vpslldq \$0x1,%zmm10,%zmm12 1397 vpxord %zmm16,%zmm12,%zmm12 1398 1399 .L_main_loop_run_16_${rndsuffix}: 1400 vmovdqu8 ($input),%zmm1 1401 vmovdqu8 0x40($input),%zmm2 1402 vmovdqu8 0x80($input),%zmm3 1403 vmovdqu8 0xc0($input),%zmm4 1404 add \$0x100,$input 1405___ 1406 } 1407 1408 encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9", 1409 "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128); 1410 1411 { 1412 $code .= <<___; 1413 vmovdqu8 %zmm1,($output) 1414 vmovdqu8 %zmm2,0x40($output) 1415 vmovdqu8 %zmm3,0x80($output) 1416 vmovdqu8 %zmm4,0xc0($output) 1417 add \$0x100,$output 1418 sub \$0x100,$length 1419 cmp \$0x100,$length 1420 jae .L_main_loop_run_16_${rndsuffix} 1421 cmp \$0x80,$length 1422 jae .L_main_loop_run_8_${rndsuffix} 1423 vextracti32x4 \$0x3,%zmm4,%xmm0 1424 jmp .L_do_n_blocks_${rndsuffix} 1425 1426 
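	# The by-8 start-up below derives the first 8 tweaks from the
	# encrypted IV saved on the stack: the tweak is multiplied by
	# x^0..x^7 in GF(2^128) with vpshufb/vpsllvq/vpsrlvq and a
	# vpclmulqdq reduction by the 0x87 polynomial kept in zmm25.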
.L_start_by8_${rndsuffix}: 1427 vbroadcasti32x4 ($TW),%zmm0 1428 vbroadcasti32x4 shufb_15_7(%rip),%zmm8 1429 mov \$0xaa,$tmp1 1430 kmovq $tmp1,%k2 1431 vpshufb %zmm8,%zmm0,%zmm1 1432 vpsllvq const_dq3210(%rip),%zmm0,%zmm4 1433 vpsrlvq const_dq5678(%rip),%zmm1,%zmm2 1434 vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3 1435 vpxorq %zmm2,%zmm4,%zmm4{%k2} 1436 vpxord %zmm4,%zmm3,%zmm9 1437 vpsllvq const_dq7654(%rip),%zmm0,%zmm5 1438 vpsrlvq const_dq1234(%rip),%zmm1,%zmm6 1439 vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7 1440 vpxorq %zmm6,%zmm5,%zmm5{%k2} 1441 vpxord %zmm5,%zmm7,%zmm10 1442 1443 .L_main_loop_run_8_${rndsuffix}: 1444 vmovdqu8 ($input),%zmm1 1445 vmovdqu8 0x40($input),%zmm2 1446 add \$0x80,$input 1447___ 1448 } 1449 1450 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128); 1451 1452 { 1453 $code .= <<___; 1454 vmovdqu8 %zmm1,($output) 1455 vmovdqu8 %zmm2,0x40($output) 1456 add \$0x80,$output 1457 sub \$0x80,$length 1458 cmp \$0x80,$length 1459 jae .L_main_loop_run_8_${rndsuffix} 1460 vextracti32x4 \$0x3,%zmm2,%xmm0 1461 jmp .L_do_n_blocks_${rndsuffix} 1462 1463 .L_steal_cipher_${rndsuffix}: 1464 vmovdqa %xmm8,%xmm2 1465 lea vpshufb_shf_table(%rip),$TEMPLOW 1466 vmovdqu ($TEMPLOW,$length,1),%xmm10 1467 vpshufb %xmm10,%xmm8,%xmm8 1468 vmovdqu -0x10($input,$length,1),%xmm3 1469 vmovdqu %xmm8,-0x10($output,$length,1) 1470 lea vpshufb_shf_table(%rip),$TEMPLOW 1471 add \$16, $TEMPLOW 1472 sub $length,$TEMPLOW 1473 vmovdqu ($TEMPLOW),%xmm10 1474 vpxor mask1(%rip),%xmm10,%xmm10 1475 vpshufb %xmm10,%xmm3,%xmm3 1476 vpblendvb %xmm10,%xmm2,%xmm3,%xmm3 1477 vpxor %xmm0,%xmm3,%xmm8 1478 vpxor ($key1),%xmm8,%xmm8 1479 vaesenc 0x10($key1),%xmm8,%xmm8 1480 vaesenc 0x20($key1),%xmm8,%xmm8 1481 vaesenc 0x30($key1),%xmm8,%xmm8 1482 vaesenc 0x40($key1),%xmm8,%xmm8 1483 vaesenc 0x50($key1),%xmm8,%xmm8 1484 vaesenc 0x60($key1),%xmm8,%xmm8 1485 vaesenc 0x70($key1),%xmm8,%xmm8 1486 vaesenc 0x80($key1),%xmm8,%xmm8 1487 vaesenc 0x90($key1),%xmm8,%xmm8 1488___ 1489 if ($is_128) { 1490 $code .= "vaesenclast 0xa0($key1),%xmm8,%xmm8\n"; 1491 } else { 1492 $code .= <<___ 1493 vaesenc 0xa0($key1),%xmm8,%xmm8 1494 vaesenc 0xb0($key1),%xmm8,%xmm8 1495 vaesenc 0xc0($key1),%xmm8,%xmm8 1496 vaesenc 0xd0($key1),%xmm8,%xmm8 1497 vaesenclast 0xe0($key1),%xmm8,%xmm8 1498___ 1499 } 1500 $code .= "vpxor %xmm0,%xmm8,%xmm8\n"; 1501 $code .= "vmovdqu %xmm8,-0x10($output)\n"; 1502 } 1503 1504 { 1505 $code .= <<___; 1506 .L_ret_${rndsuffix}: 1507 mov $GP_STORAGE($TW),%rbx 1508 xor $tmp1,$tmp1 1509 mov $tmp1,$GP_STORAGE($TW) 1510 # Zero-out the whole of `%zmm0`. 1511 vpxorq %zmm0,%zmm0,%zmm0 1512___ 1513 } 1514 1515 if ($win64) { 1516 $code .= <<___; 1517 mov $GP_STORAGE + 8*1($TW),%rdi 1518 mov $tmp1,$GP_STORAGE + 8*1($TW) 1519 mov $GP_STORAGE + 8*2($TW),%rsi 1520 mov $tmp1,$GP_STORAGE + 8*2($TW) 1521 1522 vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6 1523 vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7 1524 vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8 1525 vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9 1526 1527 # Zero the 64 bytes we just restored to the xmm registers. 1528 vmovdqa64 %zmm0,$XMM_STORAGE($TW) 1529 1530 vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10 1531 vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11 1532 vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12 1533 vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13 1534 1535 # And again. 
1536 vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW) 1537 1538 vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14 1539 vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15 1540 1541 # Last round is only 32 bytes (256-bits), so we use `%ymm` as the 1542 # source operand. 1543 vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW) 1544___ 1545 } 1546 1547 { 1548 $code .= <<___; 1549 mov %rbp,$TW 1550 pop %rbp 1551 vzeroupper 1552 ret 1553 1554 .L_less_than_128_bytes_${rndsuffix}: 1555 vpbroadcastq $gf_poly_8b, $ZPOLY 1556 cmp \$0x10,$length 1557 jb .L_ret_${rndsuffix} 1558 vbroadcasti32x4 ($TW), %zmm0 1559 vbroadcasti32x4 shufb_15_7(%rip), %zmm8 1560 movl \$0xaa, %r8d 1561 kmovq %r8, %k2 1562 mov $length,$tmp1 1563 and \$0x70,$tmp1 1564 cmp \$0x60,$tmp1 1565 je .L_num_blocks_is_6_${rndsuffix} 1566 cmp \$0x50,$tmp1 1567 je .L_num_blocks_is_5_${rndsuffix} 1568 cmp \$0x40,$tmp1 1569 je .L_num_blocks_is_4_${rndsuffix} 1570 cmp \$0x30,$tmp1 1571 je .L_num_blocks_is_3_${rndsuffix} 1572 cmp \$0x20,$tmp1 1573 je .L_num_blocks_is_2_${rndsuffix} 1574 cmp \$0x10,$tmp1 1575 je .L_num_blocks_is_1_${rndsuffix} 1576 1577 .L_num_blocks_is_7_${rndsuffix}: 1578 vpshufb %zmm8, %zmm0, %zmm1 1579 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1580 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1581 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1582 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1583 vpxord %zmm4, %zmm3, %zmm9 1584 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1585 vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 1586 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1587 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1588 vpxord %zmm5, %zmm7, %zmm10 1589 mov \$0x0000ffffffffffff, $tmp1 1590 kmovq $tmp1, %k1 1591 vmovdqu8 16*0($input), %zmm1 1592 vmovdqu8 16*4($input), %zmm2{%k1} 1593 1594 add \$0x70,$input 1595___ 1596 } 1597 1598 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1599 1600 { 1601 $code .= <<___; 1602 vmovdqu8 %zmm1, 16*0($output) 1603 vmovdqu8 %zmm2, 16*4($output){%k1} 1604 add \$0x70,$output 1605 vextracti32x4 \$0x2, %zmm2, %xmm8 1606 vextracti32x4 \$0x3, %zmm10, %xmm0 1607 and \$0xf,$length 1608 je .L_ret_${rndsuffix} 1609 jmp .L_steal_cipher_${rndsuffix} 1610___ 1611 } 1612 1613 { 1614 $code .= <<___; 1615 .L_num_blocks_is_6_${rndsuffix}: 1616 vpshufb %zmm8, %zmm0, %zmm1 1617 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1618 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1619 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1620 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1621 vpxord %zmm4, %zmm3, %zmm9 1622 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1623 vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 1624 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1625 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1626 vpxord %zmm5, %zmm7, %zmm10 1627 vmovdqu8 16*0($input), %zmm1 1628 vmovdqu8 16*4($input), %ymm2 1629 add \$96, $input 1630___ 1631 } 1632 1633 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1634 1635 { 1636 $code .= <<___; 1637 vmovdqu8 %zmm1, 16*0($output) 1638 vmovdqu8 %ymm2, 16*4($output) 1639 add \$96, $output 1640 1641 vextracti32x4 \$0x1, %ymm2, %xmm8 1642 vextracti32x4 \$0x2, %zmm10, %xmm0 1643 and \$0xf,$length 1644 je .L_ret_${rndsuffix} 1645 jmp .L_steal_cipher_${rndsuffix} 1646___ 1647 } 1648 1649 { 1650 $code .= <<___; 1651 .L_num_blocks_is_5_${rndsuffix}: 1652 vpshufb %zmm8, %zmm0, %zmm1 1653 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1654 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1655 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1656 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1657 vpxord %zmm4, %zmm3, %zmm9 1658 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1659 vpsrlvq 
const_dq1234(%rip), %zmm1, %zmm6 1660 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1661 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1662 vpxord %zmm5, %zmm7, %zmm10 1663 vmovdqu8 16*0($input), %zmm1 1664 vmovdqu8 16*4($input), %xmm2 1665 add \$80, $input 1666___ 1667 } 1668 1669 encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1670 1671 { 1672 $code .= <<___; 1673 vmovdqu8 %zmm1, 16*0($output) 1674 vmovdqu8 %xmm2, 16*4($output) 1675 add \$80, $output 1676 1677 vmovdqa %xmm2, %xmm8 1678 vextracti32x4 \$0x1, %zmm10, %xmm0 1679 and \$0xf,$length 1680 je .L_ret_${rndsuffix} 1681 jmp .L_steal_cipher_${rndsuffix} 1682___ 1683 } 1684 1685 { 1686 $code .= <<___; 1687 .L_num_blocks_is_4_${rndsuffix}: 1688 vpshufb %zmm8, %zmm0, %zmm1 1689 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1690 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1691 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1692 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1693 vpxord %zmm4, %zmm3, %zmm9 1694 vpsllvq const_dq7654(%rip), %zmm0, %zmm5 1695 vpsrlvq const_dq1234(%rip), %zmm1, %zmm6 1696 vpclmulqdq \$0x00, $ZPOLY, %zmm6, %zmm7 1697 vpxorq %zmm6, %zmm5, %zmm5{%k2} 1698 vpxord %zmm5, %zmm7, %zmm10 1699 vmovdqu8 16*0($input), %zmm1 1700 add \$64, $input 1701___ 1702 } 1703 1704 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1705 1706 { 1707 $code .= <<___; 1708 vmovdqu8 %zmm1, 16*0($output) 1709 add \$64, $output 1710 vextracti32x4 \$0x3, %zmm1, %xmm8 1711 vmovdqa %xmm10, %xmm0 1712 and \$0xf,$length 1713 je .L_ret_${rndsuffix} 1714 jmp .L_steal_cipher_${rndsuffix} 1715___ 1716 } 1717 1718 { 1719 $code .= <<___; 1720 .L_num_blocks_is_3_${rndsuffix}: 1721 vpshufb %zmm8, %zmm0, %zmm1 1722 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1723 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1724 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1725 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1726 vpxord %zmm4, %zmm3, %zmm9 1727 mov \$0x0000ffffffffffff, $tmp1 1728 kmovq $tmp1, %k1 1729 vmovdqu8 16*0($input), %zmm1{%k1} 1730 add \$48, $input 1731___ 1732 } 1733 1734 encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128); 1735 1736 { 1737 $code .= <<___; 1738 vmovdqu8 %zmm1, 16*0($output){%k1} 1739 add \$48, $output 1740 vextracti32x4 \$2, %zmm1, %xmm8 1741 vextracti32x4 \$3, %zmm9, %xmm0 1742 and \$0xf,$length 1743 je .L_ret_${rndsuffix} 1744 jmp .L_steal_cipher_${rndsuffix} 1745___ 1746 } 1747 1748 { 1749 $code .= <<___; 1750 .L_num_blocks_is_2_${rndsuffix}: 1751 vpshufb %zmm8, %zmm0, %zmm1 1752 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1753 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1754 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1755 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1756 vpxord %zmm4, %zmm3, %zmm9 1757 1758 vmovdqu8 16*0($input), %ymm1 1759 add \$32, $input 1760___ 1761 } 1762 1763 encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); 1764 1765 { 1766 $code .= <<___; 1767 vmovdqu8 %ymm1, 16*0($output) 1768 add \$32, $output 1769 1770 vextracti32x4 \$1, %ymm1, %xmm8 1771 vextracti32x4 \$2, %zmm9, %xmm0 1772 and \$0xf,$length 1773 je .L_ret_${rndsuffix} 1774 jmp .L_steal_cipher_${rndsuffix} 1775___ 1776 } 1777 1778 { 1779 $code .= <<___; 1780 .L_num_blocks_is_1_${rndsuffix}: 1781 vpshufb %zmm8, %zmm0, %zmm1 1782 vpsllvq const_dq3210(%rip), %zmm0, %zmm4 1783 vpsrlvq const_dq5678(%rip), %zmm1, %zmm2 1784 vpclmulqdq \$0x00, $ZPOLY, %zmm2, %zmm3 1785 vpxorq %zmm2, %zmm4, %zmm4{%k2} 1786 vpxord %zmm4, %zmm3, %zmm9 1787 1788 vmovdqu8 16*0($input), %xmm1 1789 add \$16, $input 1790___ 1791 } 1792 1793 encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128); 1794 1795 { 1796 $code .= <<___; 1797 
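	# Single-block epilogue: store the encrypted block, keep it in xmm8
	# and the second tweak in xmm0 for a possible ciphertext-stealing tail.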
vmovdqu8 %xmm1, 16*0($output) 1798 add \$16, $output 1799 1800 vmovdqa %xmm1, %xmm8 1801 vextracti32x4 \$1, %zmm9, %xmm0 1802 and \$0xf,$length 1803 je .L_ret_${rndsuffix} 1804 jmp .L_steal_cipher_${rndsuffix} 1805 .cfi_endproc 1806___ 1807 } 1808 } 1809 1810 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1811 # ;void aesni_xts_[128|256]_decrypt_avx512( 1812 # ; const uint8_t *in, // input data 1813 # ; uint8_t *out, // output data 1814 # ; size_t length, // sector size, in bytes 1815 # ; const AES_KEY *key1, // key used for "ECB" encryption, 16*2 bytes 1816 # ; const AES_KEY *key2, // key used for tweaking, 16*2 bytes 1817 # ; const uint8_t iv[16]) // initial tweak value, 16 bytes 1818 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1819 sub dec { 1820 my $is_128 = $_[0]; 1821 my $rndsuffix = &random_string(); 1822 1823 if ($is_128) { 1824 $code.=<<___; 1825 .globl aesni_xts_128_decrypt_avx512 1826 .hidden aesni_xts_128_decrypt_avx512 1827 .type aesni_xts_128_decrypt_avx512,\@function,6 1828 .align 32 1829 aesni_xts_128_decrypt_avx512: 1830 .cfi_startproc 1831 endbranch 1832___ 1833 } else { 1834 $code.=<<___; 1835 .globl aesni_xts_256_decrypt_avx512 1836 .hidden aesni_xts_256_decrypt_avx512 1837 .type aesni_xts_256_decrypt_avx512,\@function,6 1838 .align 32 1839 aesni_xts_256_decrypt_avx512: 1840 .cfi_startproc 1841 endbranch 1842___ 1843 } 1844 $code .= "push %rbp\n"; 1845 $code .= "mov $TW,%rbp\n"; 1846 $code .= "sub \$$VARIABLE_OFFSET,$TW\n"; 1847 $code .= "and \$0xffffffffffffffc0,$TW\n"; 1848 $code .= "mov %rbx,$GP_STORAGE($TW)\n"; 1849 1850 if ($win64) { 1851 $code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n"; 1852 $code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n"; 1853 $code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n"; 1854 $code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n"; 1855 $code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n"; 1856 $code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n"; 1857 $code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n"; 1858 $code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n"; 1859 $code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n"; 1860 $code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n"; 1861 $code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n"; 1862 $code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n"; 1863 } 1864 1865 $code .= "mov \$0x87, $gf_poly_8b\n"; 1866 $code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values 1867 1868 encrypt_tweak("%xmm1", $is_128); 1869 1870 if ($win64) { 1871 $code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer 1872 $code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer 1873 } 1874 1875 { 1876 $code.=<<___; 1877 1878 cmp \$0x80,$length 1879 jb .L_less_than_128_bytes_${rndsuffix} 1880 vpbroadcastq $gf_poly_8b,$ZPOLY 1881 cmp \$0x100,$length 1882 jge .L_start_by16_${rndsuffix} 1883 jmp .L_start_by8_${rndsuffix} 1884 1885 .L_do_n_blocks_${rndsuffix}: 1886 cmp \$0x0,$length 1887 je .L_ret_${rndsuffix} 1888 cmp \$0x70,$length 1889 jge .L_remaining_num_blocks_is_7_${rndsuffix} 1890 cmp \$0x60,$length 1891 jge .L_remaining_num_blocks_is_6_${rndsuffix} 1892 cmp \$0x50,$length 1893 jge .L_remaining_num_blocks_is_5_${rndsuffix} 1894 cmp \$0x40,$length 1895 jge .L_remaining_num_blocks_is_4_${rndsuffix} 1896 cmp \$0x30,$length 1897 jge .L_remaining_num_blocks_is_3_${rndsuffix} 1898 cmp \$0x20,$length 1899 jge .L_remaining_num_blocks_is_2_${rndsuffix} 1900 cmp \$0x10,$length 1901 jge .L_remaining_num_blocks_is_1_${rndsuffix} 1902 1903 # _remaining_num_blocks_is_0: 
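	# No full block remains but the length is not block-aligned: the last
	# full ciphertext block was saved in xmm5 by the main loop; it is
	# decrypted with the next tweak and the preceding tweak is then
	# rebuilt for the ciphertext-stealing step.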
1904 vmovdqu %xmm5, %xmm1 1905 # xmm5 contains last full block to decrypt with next teawk 1906___ 1907 } 1908 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 1909 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 1910 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); 1911 1912 { 1913 $code .= <<___; 1914 vmovdqu %xmm1, -0x10($output) 1915 vmovdqa %xmm1, %xmm8 1916 1917 # Calc previous tweak 1918 mov \$0x1,$tmp1 1919 kmovq $tmp1, %k1 1920 vpsllq \$0x3f,%xmm9,%xmm13 1921 vpsraq \$0x3f,%xmm13,%xmm14 1922 vpandq %xmm25,%xmm14,%xmm5 1923 vpxorq %xmm5,%xmm9,%xmm9{%k1} 1924 vpsrldq \$0x8,%xmm9,%xmm10 1925 .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0 1926 vpslldq \$0x8,%xmm13,%xmm13 1927 vpxorq %xmm13,%xmm0,%xmm0 1928 jmp .L_steal_cipher_${rndsuffix} 1929 1930 .L_remaining_num_blocks_is_7_${rndsuffix}: 1931 mov \$0xffffffffffffffff,$tmp1 1932 shr \$0x10,$tmp1 1933 kmovq $tmp1,%k1 1934 vmovdqu8 ($input),%zmm1 1935 vmovdqu8 0x40($input),%zmm2{%k1} 1936 add \$0x70,$input 1937 and \$0xf,$length 1938 je .L_done_7_remain_${rndsuffix} 1939 vextracti32x4 \$0x2,%zmm10,%xmm12 1940 vextracti32x4 \$0x3,%zmm10,%xmm13 1941 vinserti32x4 \$0x2,%xmm13,%zmm10,%zmm10 1942___ 1943 } 1944 1945 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1946 1947 { 1948 $code .= <<___; 1949 vmovdqu8 %zmm1, ($output) 1950 vmovdqu8 %zmm2, 0x40($output){%k1} 1951 add \$0x70, $output 1952 vextracti32x4 \$0x2,%zmm2,%xmm8 1953 vmovdqa %xmm12,%xmm0 1954 jmp .L_steal_cipher_${rndsuffix} 1955___ 1956 } 1957 1958 $code .= "\n.L_done_7_remain_${rndsuffix}:\n"; 1959 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1960 1961 { 1962 $code .= <<___; 1963 vmovdqu8 %zmm1, ($output) 1964 vmovdqu8 %zmm2, 0x40($output){%k1} 1965 jmp .L_ret_${rndsuffix} 1966 1967 .L_remaining_num_blocks_is_6_${rndsuffix}: 1968 vmovdqu8 ($input),%zmm1 1969 vmovdqu8 0x40($input),%ymm2 1970 add \$0x60,$input 1971 and \$0xf, $length 1972 je .L_done_6_remain_${rndsuffix} 1973 vextracti32x4 \$0x1,%zmm10,%xmm12 1974 vextracti32x4 \$0x2,%zmm10,%xmm13 1975 vinserti32x4 \$0x1,%xmm13,%zmm10,%zmm10 1976___ 1977 } 1978 1979 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1980 1981 { 1982 $code .= <<___; 1983 vmovdqu8 %zmm1, ($output) 1984 vmovdqu8 %ymm2, 0x40($output) 1985 add \$0x60,$output 1986 vextracti32x4 \$0x1,%zmm2,%xmm8 1987 vmovdqa %xmm12,%xmm0 1988 jmp .L_steal_cipher_${rndsuffix} 1989___ 1990 } 1991 1992 $code .= "\n.L_done_6_remain_${rndsuffix}:\n"; 1993 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 1994 1995 { 1996 $code .= <<___; 1997 vmovdqu8 %zmm1, ($output) 1998 vmovdqu8 %ymm2,0x40($output) 1999 jmp .L_ret_${rndsuffix} 2000 2001 .L_remaining_num_blocks_is_5_${rndsuffix}: 2002 vmovdqu8 ($input),%zmm1 2003 vmovdqu 0x40($input),%xmm2 2004 add \$0x50,$input 2005 and \$0xf,$length 2006 je .L_done_5_remain_${rndsuffix} 2007 vmovdqa %xmm10,%xmm12 2008 vextracti32x4 \$0x1,%zmm10,%xmm10 2009___ 2010 } 2011 2012 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2013 2014 { 2015 $code .= <<___; 2016 vmovdqu8 %zmm1, ($output) 2017 vmovdqu %xmm2, 0x40($output) 2018 add \$0x50, $output 2019 vmovdqa %xmm2,%xmm8 2020 vmovdqa %xmm12,%xmm0 2021 jmp .L_steal_cipher_${rndsuffix} 2022___ 2023 } 2024 2025 $code .= "\n.L_done_5_remain_${rndsuffix}:\n"; 2026 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2027 2028 { 2029 $code .= <<___; 2030 
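	# Exactly 5 blocks remained and there is no partial tail: store the
	# 4+1 decrypted blocks and return without ciphertext stealing.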
vmovdqu8 %zmm1, ($output) 2031 vmovdqu8 %xmm2, 0x40($output) 2032 jmp .L_ret_${rndsuffix} 2033 2034 .L_remaining_num_blocks_is_4_${rndsuffix}: 2035 vmovdqu8 ($input),%zmm1 2036 add \$0x40,$input 2037 and \$0xf, $length 2038 je .L_done_4_remain_${rndsuffix} 2039 vextracti32x4 \$0x3,%zmm9,%xmm12 2040 vinserti32x4 \$0x3,%xmm10,%zmm9,%zmm9 2041___ 2042 } 2043 2044 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2045 2046 { 2047 $code .= <<___; 2048 vmovdqu8 %zmm1,($output) 2049 add \$0x40,$output 2050 vextracti32x4 \$0x3,%zmm1,%xmm8 2051 vmovdqa %xmm12,%xmm0 2052 jmp .L_steal_cipher_${rndsuffix} 2053___ 2054 } 2055 2056 $code .= "\n.L_done_4_remain_${rndsuffix}:\n"; 2057 decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128); 2058 2059 { 2060 $code .= <<___; 2061 vmovdqu8 %zmm1, ($output) 2062 jmp .L_ret_${rndsuffix} 2063 2064 .L_remaining_num_blocks_is_3_${rndsuffix}: 2065 vmovdqu ($input),%xmm1 2066 vmovdqu 0x10($input),%xmm2 2067 vmovdqu 0x20($input),%xmm3 2068 add \$0x30,$input 2069 and \$0xf,$length 2070 je .L_done_3_remain_${rndsuffix} 2071 vextracti32x4 \$0x2,%zmm9,%xmm13 2072 vextracti32x4 \$0x1,%zmm9,%xmm10 2073 vextracti32x4 \$0x3,%zmm9,%xmm11 2074___ 2075 } 2076 2077 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2078 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2079 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2080 2081 { 2082 $code .= <<___; 2083 vmovdqu %xmm1,($output) 2084 vmovdqu %xmm2,0x10($output) 2085 vmovdqu %xmm3,0x20($output) 2086 add \$0x30,$output 2087 vmovdqa %xmm3,%xmm8 2088 vmovdqa %xmm13,%xmm0 2089 jmp .L_steal_cipher_${rndsuffix} 2090___ 2091 } 2092 $code .= "\n.L_done_3_remain_${rndsuffix}:\n"; 2093 $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; 2094 $code .= "vextracti32x4 \$0x2,%zmm9,%xmm11\n"; 2095 2096 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2097 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2098 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2099 2100 { 2101 $code .= <<___; 2102 vmovdqu %xmm1,($output) 2103 vmovdqu %xmm2,0x10($output) 2104 vmovdqu %xmm3,0x20($output) 2105 jmp .L_ret_${rndsuffix} 2106 2107 .L_remaining_num_blocks_is_2_${rndsuffix}: 2108 vmovdqu ($input),%xmm1 2109 vmovdqu 0x10($input),%xmm2 2110 add \$0x20,$input 2111 and \$0xf,$length 2112 je .L_done_2_remain_${rndsuffix} 2113 vextracti32x4 \$0x2,%zmm9,%xmm10 2114 vextracti32x4 \$0x1,%zmm9,%xmm12 2115___ 2116 } 2117 2118 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2119 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2120 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2121 2122 { 2123 $code .= <<___; 2124 vmovdqu %xmm1,($output) 2125 vmovdqu %xmm2,0x10($output) 2126 add \$0x20,$output 2127 vmovdqa %xmm2,%xmm8 2128 vmovdqa %xmm12,%xmm0 2129 jmp .L_steal_cipher_${rndsuffix} 2130___ 2131 } 2132 $code .= "\n.L_done_2_remain_${rndsuffix}:\n"; 2133 $code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n"; 2134 2135 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2136 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2137 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2138 2139 { 2140 $code .= <<___; 2141 vmovdqu %xmm1,($output) 2142 vmovdqu %xmm2,0x10($output) 2143 jmp .L_ret_${rndsuffix} 2144 2145 .L_remaining_num_blocks_is_1_${rndsuffix}: 2146 vmovdqu ($input),%xmm1 2147 add \$0x10,$input 2148 and \$0xf,$length 2149 je .L_done_1_remain_${rndsuffix} 2150 vextracti32x4 
    vextracti32x4 \$0x1,%zmm9,%xmm11
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    add \$0x10,$output
    vmovdqa %xmm1,%xmm8
    vmovdqa %xmm9,%xmm0
    jmp .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_1_remain_${rndsuffix}:\n";

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1, ($output)
    jmp .L_ret_${rndsuffix}

.L_start_by16_${rndsuffix}:
    vbroadcasti32x4 ($TW),%zmm0
    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
    mov \$0xaa,$tmp1
    kmovq $tmp1,%k2

    # Mult tweak by 2^{3, 2, 1, 0}
    vpshufb %zmm8,%zmm0,%zmm1
    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
    vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3
    vpxorq %zmm2,%zmm4,%zmm4{%k2}
    vpxord %zmm4,%zmm3,%zmm9

    # Mult tweak by 2^{7, 6, 5, 4}
    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
    vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7
    vpxorq %zmm6,%zmm5,%zmm5{%k2}
    vpxord %zmm5,%zmm7,%zmm10

    # Make next 8 tweak values by all x 2^8
    vpsrldq \$0xf,%zmm9,%zmm13
    vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14
    vpslldq \$0x1,%zmm9,%zmm11
    vpxord %zmm14,%zmm11,%zmm11

    vpsrldq \$0xf,%zmm10,%zmm15
    vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16
    vpslldq \$0x1,%zmm10,%zmm12
    vpxord %zmm16,%zmm12,%zmm12

.L_main_loop_run_16_${rndsuffix}:
    vmovdqu8 ($input),%zmm1
    vmovdqu8 0x40($input),%zmm2
    vmovdqu8 0x80($input),%zmm3
    vmovdqu8 0xc0($input),%zmm4
    vmovdqu8 0xf0($input),%xmm5
    add \$0x100,$input
___
  }

  decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
                    "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128);

  {
    $code .= <<___;
    vmovdqu8 %zmm1,($output)
    vmovdqu8 %zmm2,0x40($output)
    vmovdqu8 %zmm3,0x80($output)
    vmovdqu8 %zmm4,0xc0($output)
    add \$0x100,$output
    sub \$0x100,$length
    cmp \$0x100,$length
    jge .L_main_loop_run_16_${rndsuffix}

    cmp \$0x80,$length
    jge .L_main_loop_run_8_${rndsuffix}
    jmp .L_do_n_blocks_${rndsuffix}

.L_start_by8_${rndsuffix}:
    # Make first 7 tweak values
    vbroadcasti32x4 ($TW),%zmm0
    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
    mov \$0xaa,$tmp1
    kmovq $tmp1,%k2

    # Mult tweak by 2^{3, 2, 1, 0}
    vpshufb %zmm8,%zmm0,%zmm1
    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
    vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3
    vpxorq %zmm2,%zmm4,%zmm4{%k2}
    vpxord %zmm4,%zmm3,%zmm9

    # Mult tweak by 2^{7, 6, 5, 4}
    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
    vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7
    vpxorq %zmm6,%zmm5,%zmm5{%k2}
    vpxord %zmm5,%zmm7,%zmm10

.L_main_loop_run_8_${rndsuffix}:
    vmovdqu8 ($input),%zmm1
    vmovdqu8 0x40($input),%zmm2
    vmovdqu8 0x70($input),%xmm5
    add \$0x80,$input
___
  }

  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128);

  {
    $code .= <<___;
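    # Store the eight decrypted blocks, then loop again while at least
    # 128 bytes of input remain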
    vmovdqu8 %zmm1,($output)
    vmovdqu8 %zmm2,0x40($output)
    add \$0x80,$output
    sub \$0x80,$length
    cmp \$0x80,$length
    jge .L_main_loop_run_8_${rndsuffix}
    jmp .L_do_n_blocks_${rndsuffix}

.L_steal_cipher_${rndsuffix}:
    # Ciphertext stealing: xmm8 holds the last decrypted full block,
    # xmm0 holds the tweak for the final (stolen) block
    vmovdqa %xmm8,%xmm2

    # shift xmm8 to the left by 16-N_val bytes
    lea vpshufb_shf_table(%rip),$TEMPLOW
    vmovdqu ($TEMPLOW,$length,1),%xmm10
    vpshufb %xmm10,%xmm8,%xmm8

    vmovdqu -0x10($input,$length,1),%xmm3
    vmovdqu %xmm8,-0x10($output,$length,1)

    # shift xmm3 to the right by 16-N_val bytes
    lea vpshufb_shf_table(%rip), $TEMPLOW
    add \$16, $TEMPLOW
    sub $length,$TEMPLOW
    vmovdqu ($TEMPLOW),%xmm10
    vpxor mask1(%rip),%xmm10,%xmm10
    vpshufb %xmm10,%xmm3,%xmm3

    vpblendvb %xmm10,%xmm2,%xmm3,%xmm3

    # xor Tweak value
    vpxor %xmm0,%xmm3,%xmm8

    # decrypt last block with cipher stealing
    vpxor ($key1),%xmm8,%xmm8
    vaesdec 0x10($key1),%xmm8,%xmm8
    vaesdec 0x20($key1),%xmm8,%xmm8
    vaesdec 0x30($key1),%xmm8,%xmm8
    vaesdec 0x40($key1),%xmm8,%xmm8
    vaesdec 0x50($key1),%xmm8,%xmm8
    vaesdec 0x60($key1),%xmm8,%xmm8
    vaesdec 0x70($key1),%xmm8,%xmm8
    vaesdec 0x80($key1),%xmm8,%xmm8
    vaesdec 0x90($key1),%xmm8,%xmm8
___
    if ($is_128) {
      $code .= "vaesdeclast 0xa0($key1),%xmm8,%xmm8\n";
    } else {
      $code .= <<___;
    vaesdec 0xa0($key1),%xmm8,%xmm8
    vaesdec 0xb0($key1),%xmm8,%xmm8
    vaesdec 0xc0($key1),%xmm8,%xmm8
    vaesdec 0xd0($key1),%xmm8,%xmm8
    vaesdeclast 0xe0($key1),%xmm8,%xmm8
___
    }
    $code .= <<___;
    # xor Tweak value
    vpxor %xmm0,%xmm8,%xmm8

.L_done_${rndsuffix}:
    # store the last decrypted block
    vmovdqu %xmm8,-0x10($output)
___
  }

  {
    $code .= <<___;
.L_ret_${rndsuffix}:
    mov $GP_STORAGE($TW),%rbx
    xor $tmp1,$tmp1
    mov $tmp1,$GP_STORAGE($TW)
    # Zero-out the whole of `%zmm0`.
    vpxorq %zmm0,%zmm0,%zmm0
___
  }

  if ($win64) {
    $code .= <<___;
    mov $GP_STORAGE + 8*1($TW),%rdi
    mov $tmp1,$GP_STORAGE + 8*1($TW)
    mov $GP_STORAGE + 8*2($TW),%rsi
    mov $tmp1,$GP_STORAGE + 8*2($TW)

    vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
    vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
    vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
    vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9

    # Zero the 64 bytes we just restored to the xmm registers.
    vmovdqa64 %zmm0,$XMM_STORAGE($TW)

    vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
    vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
    vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
    vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13

    # And again.
    vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)

    vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
    vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15

    # Last round is only 32 bytes (256 bits), so we use `%ymm` as the
    # source operand.
    vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
___
  }

  {
    $code .= <<___;
    mov %rbp,$TW
    pop %rbp
    vzeroupper
    ret

.L_less_than_128_bytes_${rndsuffix}:
    cmp \$0x10,$length
    jb .L_ret_${rndsuffix}

    mov $length,$tmp1
    and \$0x70,$tmp1
    cmp \$0x60,$tmp1
    je .L_num_blocks_is_6_${rndsuffix}
    cmp \$0x50,$tmp1
    je .L_num_blocks_is_5_${rndsuffix}
    cmp \$0x40,$tmp1
    je .L_num_blocks_is_4_${rndsuffix}
    cmp \$0x30,$tmp1
    je .L_num_blocks_is_3_${rndsuffix}
    cmp \$0x20,$tmp1
    je .L_num_blocks_is_2_${rndsuffix}
    cmp \$0x10,$tmp1
    je .L_num_blocks_is_1_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 7);

  {
    $code .= <<___;
    add \$0x70,$input
    and \$0xf,$length
    je .L_done_7_${rndsuffix}

.L_steal_cipher_7_${rndsuffix}:
    xor $gf_poly_8b_temp, $gf_poly_8b_temp
    shl \$1, $TEMPLOW
    adc $TEMPHIGH, $TEMPHIGH
    cmovc $gf_poly_8b, $gf_poly_8b_temp
    xor $gf_poly_8b_temp, $TEMPLOW
    mov $TEMPLOW,0x10($TW)
    mov $TEMPHIGH,0x18($TW)
    vmovdqa64 %xmm15,%xmm16
    vmovdqa 0x10($TW),%xmm15
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    vmovdqu %xmm4,0x30($output)
    vmovdqu %xmm5,0x40($output)
    vmovdqu %xmm6,0x50($output)
    add \$0x70,$output
    vmovdqa64 %xmm16,%xmm0
    vmovdqa %xmm7,%xmm8
    jmp .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_7_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    vmovdqu %xmm4,0x30($output)
    vmovdqu %xmm5,0x40($output)
    vmovdqu %xmm6,0x50($output)
    add \$0x70,$output
    vmovdqa %xmm7,%xmm8
    jmp .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 6);

  {
    $code .= <<___;
    add \$0x60,$input
    and \$0xf,$length
    je .L_done_6_${rndsuffix}

.L_steal_cipher_6_${rndsuffix}:
    xor $gf_poly_8b_temp, $gf_poly_8b_temp
    shl \$1, $TEMPLOW
    adc $TEMPHIGH, $TEMPHIGH
    cmovc $gf_poly_8b, $gf_poly_8b_temp
    xor $gf_poly_8b_temp, $TEMPLOW
    mov $TEMPLOW,0x10($TW)
    mov $TEMPHIGH,0x18($TW)
    vmovdqa64 %xmm14,%xmm15
    vmovdqa 0x10($TW),%xmm14
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    vmovdqu %xmm4,0x30($output)
    vmovdqu %xmm5,0x40($output)
    add \$0x60,$output
    vmovdqa %xmm15,%xmm0
    vmovdqa %xmm6,%xmm8
    jmp .L_steal_cipher_${rndsuffix}
___
  }
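  # .L_done_6: the input length is an exact multiple of 16 bytes, so no
  # ciphertext stealing is needed; the sixth block is left in %xmm8 for the
  # common .L_done store.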
  $code .= "\n.L_done_6_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    vmovdqu %xmm4,0x30($output)
    vmovdqu %xmm5,0x40($output)
    add \$0x60,$output
    vmovdqa %xmm6,%xmm8
    jmp .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 5);

  {
    $code .= <<___;
    add \$0x50,$input
    and \$0xf,$length
    je .L_done_5_${rndsuffix}

.L_steal_cipher_5_${rndsuffix}:
    xor $gf_poly_8b_temp, $gf_poly_8b_temp
    shl \$1, $TEMPLOW
    adc $TEMPHIGH, $TEMPHIGH
    cmovc $gf_poly_8b, $gf_poly_8b_temp
    xor $gf_poly_8b_temp, $TEMPLOW
    mov $TEMPLOW,0x10($TW)
    mov $TEMPHIGH,0x18($TW)
    vmovdqa64 %xmm13,%xmm14
    vmovdqa 0x10($TW),%xmm13
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    vmovdqu %xmm4,0x30($output)
    add \$0x50,$output
    vmovdqa %xmm14,%xmm0
    vmovdqa %xmm5,%xmm8
    jmp .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_5_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    vmovdqu %xmm4,0x30($output)
    add \$0x50,$output
    vmovdqa %xmm5,%xmm8
    jmp .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 4);

  {
    $code .= <<___;
    add \$0x40,$input
    and \$0xf,$length
    je .L_done_4_${rndsuffix}

.L_steal_cipher_4_${rndsuffix}:
    xor $gf_poly_8b_temp, $gf_poly_8b_temp
    shl \$1, $TEMPLOW
    adc $TEMPHIGH, $TEMPHIGH
    cmovc $gf_poly_8b, $gf_poly_8b_temp
    xor $gf_poly_8b_temp, $TEMPLOW
    mov $TEMPLOW,0x10($TW)
    mov $TEMPHIGH,0x18($TW)
    vmovdqa64 %xmm12,%xmm13
    vmovdqa 0x10($TW),%xmm12
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128);

  {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    add \$0x40,$output
    vmovdqa %xmm13,%xmm0
    vmovdqa %xmm4,%xmm8
    jmp .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_4_${rndsuffix}:\n";
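  # .L_done_4: no partial tail, so no ciphertext stealing; the fourth block
  # is left in %xmm8 for the common .L_done store.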
"\n.L_done_4_${rndsuffix}:\n"; 2632 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2633 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2634 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128); 2635 2636 { 2637 $code .= <<___; 2638 vmovdqu %xmm1,($output) 2639 vmovdqu %xmm2,0x10($output) 2640 vmovdqu %xmm3,0x20($output) 2641 add \$0x40,$output 2642 vmovdqa %xmm4,%xmm8 2643 jmp .L_done_${rndsuffix} 2644___ 2645 } 2646 2647 $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n"; 2648 2649 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2650 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2651 "%xmm13", "%xmm14", "%xmm15", 3); 2652 2653 { 2654 $code .= <<___; 2655 add \$0x30,$input 2656 and \$0xf,$length 2657 je .L_done_3_${rndsuffix} 2658 2659 .L_steal_cipher_3_${rndsuffix}: 2660 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2661 shl \$1, $TEMPLOW 2662 adc $TEMPHIGH, $TEMPHIGH 2663 cmovc $gf_poly_8b, $gf_poly_8b_temp 2664 xor $gf_poly_8b_temp, $TEMPLOW 2665 mov $TEMPLOW,0x10($TW) 2666 mov $TEMPHIGH,0x18($TW) 2667 vmovdqa64 %xmm11,%xmm12 2668 vmovdqa 0x10($TW),%xmm11 2669___ 2670 } 2671 2672 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2673 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2674 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2675 2676 { 2677 $code .= <<___; 2678 vmovdqu %xmm1,($output) 2679 vmovdqu %xmm2,0x10($output) 2680 add \$0x30,$output 2681 vmovdqa %xmm12,%xmm0 2682 vmovdqa %xmm3,%xmm8 2683 jmp .L_steal_cipher_${rndsuffix} 2684___ 2685 } 2686 $code .= "\n.L_done_3_${rndsuffix}:\n"; 2687 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2688 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2689 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128); 2690 2691 { 2692 $code .= <<___; 2693 vmovdqu %xmm1,($output) 2694 vmovdqu %xmm2,0x10($output) 2695 add \$0x30,$output 2696 vmovdqa %xmm3,%xmm8 2697 jmp .L_done_${rndsuffix} 2698___ 2699 } 2700 2701 $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n"; 2702 2703 initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2704 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2705 "%xmm13", "%xmm14", "%xmm15", 2); 2706 2707 { 2708 $code .= <<___; 2709 add \$0x20,$input 2710 and \$0xf,$length 2711 je .L_done_2_${rndsuffix} 2712 2713 .L_steal_cipher_2_${rndsuffix}: 2714 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2715 shl \$1, $TEMPLOW 2716 adc $TEMPHIGH, $TEMPHIGH 2717 cmovc $gf_poly_8b, $gf_poly_8b_temp 2718 xor $gf_poly_8b_temp, $TEMPLOW 2719 mov $TEMPLOW,0x10($TW) 2720 mov $TEMPHIGH,0x18($TW) 2721 vmovdqa64 %xmm10,%xmm11 2722 vmovdqa 0x10($TW),%xmm10 2723___ 2724 } 2725 2726 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2727 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2728 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2729 2730 { 2731 $code .= <<___; 2732 vmovdqu %xmm1,($output) 2733 add \$0x20,$output 2734 vmovdqa %xmm11,%xmm0 2735 vmovdqa %xmm2,%xmm8 2736 jmp .L_steal_cipher_${rndsuffix} 2737___ 2738 } 2739 2740 $code .= "\n.L_done_2_${rndsuffix}:\n"; 2741 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2742 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2743 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128); 2744 2745 { 2746 $code .= <<___; 2747 vmovdqu %xmm1,($output) 2748 add \$0x20,$output 2749 vmovdqa %xmm2,%xmm8 2750 jmp .L_done_${rndsuffix} 2751___ 2752 } 2753 2754 $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n"; 2755 2756 
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2757 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2758 "%xmm13", "%xmm14", "%xmm15", 1); 2759 2760 { 2761 $code .= <<___; 2762 add \$0x10,$input 2763 and \$0xf,$length 2764 je .L_done_1_${rndsuffix} 2765 2766 .L_steal_cipher_1_${rndsuffix}: 2767 xor $gf_poly_8b_temp, $gf_poly_8b_temp 2768 shl \$1, $TEMPLOW 2769 adc $TEMPHIGH, $TEMPHIGH 2770 cmovc $gf_poly_8b, $gf_poly_8b_temp 2771 xor $gf_poly_8b_temp, $TEMPLOW 2772 mov $TEMPLOW,0x10($TW) 2773 mov $TEMPHIGH,0x18($TW) 2774 vmovdqa64 %xmm9,%xmm10 2775 vmovdqa 0x10($TW),%xmm9 2776___ 2777 } 2778 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2779 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2780 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); 2781 2782 { 2783 $code .= <<___; 2784 add \$0x10,$output 2785 vmovdqa %xmm10,%xmm0 2786 vmovdqa %xmm1,%xmm8 2787 jmp .L_steal_cipher_${rndsuffix} 2788___ 2789 } 2790 $code .= "\n.L_done_1_${rndsuffix}:\n"; 2791 decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", 2792 "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", 2793 "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128); 2794 2795 { 2796 $code .= <<___; 2797 add \$0x10,$output 2798 vmovdqa %xmm1,%xmm8 2799 jmp .L_done_${rndsuffix} 2800 .cfi_endproc 2801___ 2802 } 2803 2804 } 2805 2806 # The only difference between AES-XTS-128 and -256 is the number of rounds, 2807 # so we generate from the same perlasm base, extending to 14 rounds when 2808 # `$is_128' is 0. 2809 2810 enc(1); 2811 dec(1); 2812 2813 enc(0); 2814 dec(0); 2815 2816 $code .= <<___; 2817 .section .rodata 2818 .align 16 2819 2820 vpshufb_shf_table: 2821 .quad 0x8786858483828100, 0x8f8e8d8c8b8a8988 2822 .quad 0x0706050403020100, 0x000e0d0c0b0a0908 2823 2824 mask1: 2825 .quad 0x8080808080808080, 0x8080808080808080 2826 2827 const_dq3210: 2828 .quad 0, 0, 1, 1, 2, 2, 3, 3 2829 const_dq5678: 2830 .quad 8, 8, 7, 7, 6, 6, 5, 5 2831 const_dq7654: 2832 .quad 4, 4, 5, 5, 6, 6, 7, 7 2833 const_dq1234: 2834 .quad 4, 4, 3, 3, 2, 2, 1, 1 2835 2836 shufb_15_7: 2837 .byte 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff 2838 .byte 0xff, 0xff, 0xff, 0xff, 0xff 2839 2840.text 2841___ 2842 2843} else { 2844 $code .= <<___; 2845 .text 2846 .globl aesni_xts_128_encrypt_avx512 2847 .globl aesni_xts_128_decrypt_avx512 2848 2849 aesni_xts_128_encrypt_avx512: 2850 aesni_xts_128_decrypt_avx512: 2851 .byte 0x0f,0x0b # ud2 2852 ret 2853 2854 .globl aesni_xts_256_encrypt_avx512 2855 .globl aesni_xts_256_decrypt_avx512 2856 2857 aesni_xts_256_encrypt_avx512: 2858 aesni_xts_256_decrypt_avx512: 2859 .byte 0x0f,0x0b # ud2 2860 ret 2861 2862 .globl aesni_xts_avx512_eligible 2863 .type aesni_xts_avx512_eligible,\@abi-omnipotent 2864 aesni_xts_avx512_eligible: 2865 xor %eax,%eax 2866 ret 2867 .size aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible 2868 2869___ 2870} 2871 2872print $code; 2873 2874close STDOUT or die "error closing STDOUT: $!"; 2875