1#! /usr/bin/env perl
2# Copyright (C) 2023 Intel Corporation
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ)
10# from Intel(R) Intelligent Storage Acceleration Library Crypto Version
11# (https://github.com/intel/isa-l_crypto).
12#
######################################################################
# The main building block of the loop is code that encrypts/decrypts
# 8/16 blocks of data, stitched together with generation of the tweaks for
# the next 8/16 blocks, utilizing the VAES and VPCLMULQDQ instructions at
# the full width of the ZMM registers. The main loop is selected based on
# the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel and is selected
# when input length >= 256 bytes (16 blocks),
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel and is selected
# when 128 bytes <= input length < 256 bytes (8-15 blocks),
# input length < 128 bytes (fewer than 8 full blocks) is handled by
# do_n_blocks.
#
# Besides the AVX-512F base instructions, this implementation uses vpshrdq
# from the AVX-512-VBMI2 extension and vaesenc, vaesdec (VAES) and
# vpclmulqdq (VPCLMULQDQ) at ZMM width.
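#
# For reference, each tweak update below computes T <- T * alpha in
# GF(2^128), where alpha is x and the reduction polynomial is
# x^128 + x^7 + x^2 + x + 1 (low byte 0x87). Below is a minimal pure-Perl
# model of the scalar shl/adc/cmovc/xor sequence emitted throughout this
# file; it is illustrative only (assumes a 64-bit Perl) and is never
# invoked during code generation:
sub xts_mul_alpha_model {
  my ($lo, $hi) = @_;               # 128-bit tweak as two 64-bit halves
  my $carry = ($hi >> 63) & 1;      # bit shifted out of the high half
  $hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
  $lo = ($lo << 1) & 0xffffffffffffffff;
  $lo ^= 0x87 if $carry;            # fold the polynomial back in
  return ($lo, $hi);
}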
26$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
27$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
28
29$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
30$avx512vaes=0;
31
32$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
33( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
34( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
35die "can't locate x86_64-xlate.pl";
36
37if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
38        =~ /GNU assembler version ([0-9]+)\.([0-9]+)/) {
39    my $ver = $1 + $2/100.0; # 3.1->3.01, 3.10->3.10
40    $avx512vaes = ($ver >= 2.30);
41}
42
43if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
44       `nasm -v 2>&1` =~ /NASM version ([0-9]+)\.([0-9]+)(?:\.([0-9]+))?/) {
45    my $ver = $1 + $2/100.0 + $3/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
46    $avx512vaes = ($ver >= 2.1108);
47}
48
49if (!$avx512vaes && `$ENV{CC} -v 2>&1`
50    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
51    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
52    if ($1) {
        # Apple clang uses a different version numbering scheme; see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 corresponds to Apple clang 10.0.1
        $avx512vaes = ($ver>=10.0001);
57    } else {
58        $avx512vaes = ($ver>=7.0);
59    }
60}
61
62open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
63    or die "can't call $xlate: $!";
64*STDOUT=*OUT;
65
66#======================================================================
67
68if ($avx512vaes) {
69
  my $GP_STORAGE  = $win64 ? (16 * 18) : (16 * 8);  # save rbx (and rdi/rsi on win64)
  my $XMM_STORAGE = $win64 ? (16 * 8) : 0;          # save xmm6:xmm15 on win64
  my $VARIABLE_OFFSET = $win64 ? (16*8 + 16*10 + 8*3) :
                                 (16*8 + 8*1);
74
  # Right now, frame offsets >= 0x80 (128) are used for expanded keys.
  # All accesses to the frame must go through $TW; rsp is not to be
  # shadowed by any other name or used directly.
78  my $TW = "%rsp";
79  my $TEMPHIGH = "%rbx";
80  my $TEMPLOW = "%rax";
81  my $ZPOLY = "%zmm25";
82
83  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
84  # ;;; Function arguments abstraction
85  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
86  my ($key2, $key1, $tweak, $length, $input, $output);
87
88
  $input  = "%rdi";
  $output = "%rsi";
  $length = "%rdx";
  $key1   = "%rcx";
  $key2   = "%r8";
  $tweak  = "%r9";
95
  # Temporary registers; %r8 doubles as $key2 and is only reused as $tmp1
  # once the tweak has been encrypted and $key2 is dead.
  my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp);
  $tmp1             = "%r8";
  $gf_poly_8b       = "%r10";
  $gf_poly_8b_temp  = "%r11";
101
102  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
103  # ;;; Helper functions
104  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
105
  # Generates "random" local labels
  sub random_string() {
    my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
    my $length = 15;
    my $str;
    # index the full 63-character set, not just its first 33 entries
    map { $str .= $chars[rand(@chars)] } 1 .. $length;
    return $str;
  }
114
  # Seed the RNG so the labels are generated deterministically
  srand(12345);
117
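  # Encrypt the initial tweak with key2 (10 rounds for AES-128, 14 for
  # AES-256) and park the result in the stack frame at ($TW).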
118  sub encrypt_tweak {
119    my $state_tweak = $_[0];
120    my $is_128 = $_[1];
121
122    $code.=<<___;
123    vpxor	($key2), $state_tweak, $state_tweak
124    vaesenc	0x10($key2), $state_tweak, $state_tweak
125    vaesenc	0x20($key2), $state_tweak, $state_tweak
126    vaesenc	0x30($key2), $state_tweak, $state_tweak
127    vaesenc	0x40($key2), $state_tweak, $state_tweak
128    vaesenc	0x50($key2), $state_tweak, $state_tweak
129    vaesenc	0x60($key2), $state_tweak, $state_tweak
130    vaesenc	0x70($key2), $state_tweak, $state_tweak
131    vaesenc	0x80($key2), $state_tweak, $state_tweak
132    vaesenc	0x90($key2), $state_tweak, $state_tweak
133___
134
135    if ($is_128) {
136      $code .= "vaesenclast	0xa0($key2), $state_tweak, $state_tweak\n";
137    } else {
138      $code .= "vaesenc	0xa0($key2), $state_tweak, $state_tweak\n";
139      $code .= "vaesenc	0xb0($key2), $state_tweak, $state_tweak\n";
140      $code .= "vaesenc	0xc0($key2), $state_tweak, $state_tweak\n";
141      $code .= "vaesenc	0xd0($key2), $state_tweak, $state_tweak\n";
142      $code .= "vaesenclast	0xe0($key2), $state_tweak, $state_tweak\n";
143    }
144    $code .= "vmovdqa	$state_tweak, ($TW)\n";
145  }
146
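  # Encrypt one block: XOR with the tweak, run the full AES pass with
  # key1, then XOR with the tweak again (the xor-encrypt-xor construction).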
147  sub encrypt_final {
148    my $st = $_[0];
149    my $tw = $_[1];
150    my $is_128 = $_[2];
151
    # xor Tweak value
    $code .= "vpxor	$tw, $st, $st\n";
    $code .= "vpxor	($key1), $st, $st\n";
155
156    my $rounds = $is_128 ? 10 : 14;
157    for (my $i = 1; $i < $rounds; $i++) {
158      $code .= "vaesenc	16*$i($key1), $st, $st\n";
159    }
160
161    $code .=<<___;
162    vaesenclast 16*$rounds($key1), $st, $st
163    vpxor	$tw, $st, $st
164___
165  }
166
  # Decrypt initial blocks of AES.
  # 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted and
  # the next 8 Tweak values are generated.
170  sub decrypt_initial {
171    my @st;
172    $st[0] = $_[0];
173    $st[1] = $_[1];
174    $st[2] = $_[2];
175    $st[3] = $_[3];
176    $st[4] = $_[4];
177    $st[5] = $_[5];
178    $st[6] = $_[6];
179    $st[7] = $_[7];
180
181    my @tw;
182    $tw[0] = $_[8];
183    $tw[1] = $_[9];
184    $tw[2] = $_[10];
185    $tw[3] = $_[11];
186    $tw[4] = $_[12];
187    $tw[5] = $_[13];
188    $tw[6] = $_[14];
189    my $t0 = $_[15];
190    my $num_blocks = $_[16];
191    my $lt128 = $_[17];
192    my $is_128 = $_[18];
193
    # num_blocks blocks decrypted
    # num_blocks can be 1, 2, 3, 4, 5, 6, 7
196
197    #  xor Tweak value
198    for (my $i = 0; $i < $num_blocks; $i++) {
199      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
200    }
201
202    $code .= "vmovdqu  ($key1), $t0\n";
203
204    for (my $i = 0; $i < $num_blocks; $i++) {
205      $code .= "vpxor $t0, $st[$i], $st[$i]\n";
206    }
207
208    if (0 == $lt128) {
209      $code .= <<___;
210      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
211      shl     \$1, $TEMPLOW
212      adc     $TEMPHIGH, $TEMPHIGH
213___
214    }
215    # round 1
216    $code .= "vmovdqu 0x10($key1), $t0\n";
217
218    for (my $i = 0; $i < $num_blocks; $i++) {
219      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
220    }
221
    if (0 == $lt128) {
      $code .= <<___;
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TEMPLOW
      mov     $TEMPLOW, ($TW)     # next Tweak1 generated
      mov     $TEMPHIGH, 0x08($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
___
    }
231
232    # round 2
233    $code .= "vmovdqu 0x20($key1), $t0\n";
234
235    for (my $i = 0; $i < $num_blocks; $i++) {
236      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
237    }
238
239    if (0 == $lt128) {
240      $code .= <<___;
241      shl     \$1, $TEMPLOW
242      adc     $TEMPHIGH, $TEMPHIGH
243      cmovc   $gf_poly_8b, $gf_poly_8b_temp
244      xor     $gf_poly_8b_temp, $TEMPLOW
245      mov     $TEMPLOW, 0x10($TW) # next Tweak2 generated
246___
247    }
248
249    # round 3
250    $code .= "vmovdqu 0x30($key1), $t0\n";
251
252    for (my $i = 0; $i < $num_blocks; $i++) {
253      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
254    }
255
256    if (0 == $lt128) {
257      $code .= <<___;
258      mov     $TEMPHIGH, 0x18($TW)
259      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
260      shl     \$1, $TEMPLOW
261      adc     $TEMPHIGH, $TEMPHIGH
262      cmovc   $gf_poly_8b, $gf_poly_8b_temp
263___
264    }
265
266    # round 4
267    $code .= "vmovdqu 0x40($key1), $t0\n";
268
269    for (my $i = 0; $i < $num_blocks; $i++) {
270      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
271    }
272
    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $TEMPLOW
      mov     $TEMPLOW, 0x20($TW) # next Tweak3 generated
      mov     $TEMPHIGH, 0x28($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TEMPLOW
___
    }
282
283    # round 5
284    $code .= "vmovdqu 0x50($key1), $t0\n";
285
286    for (my $i = 0; $i < $num_blocks; $i++) {
287      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
288    }
289
    if (0 == $lt128) {
      $code .= <<___;
292      adc     $TEMPHIGH, $TEMPHIGH
293      cmovc   $gf_poly_8b, $gf_poly_8b_temp
294      xor     $gf_poly_8b_temp, $TEMPLOW
295      mov     $TEMPLOW, 0x30($TW) # next Tweak4 generated
296      mov     $TEMPHIGH, 0x38($TW)
297___
298    }
299
300    # round 6
301    $code .= "vmovdqu 0x60($key1), $t0\n";
302
303    for (my $i = 0; $i < $num_blocks; $i++) {
304      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
305    }
306
307    if (0 == $lt128) {
308      $code .= <<___;
309      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
310      shl     \$1, $TEMPLOW
311      adc     $TEMPHIGH, $TEMPHIGH
312      cmovc   $gf_poly_8b, $gf_poly_8b_temp
313      xor     $gf_poly_8b_temp, $TEMPLOW
314      mov     $TEMPLOW, 0x40($TW) # next Tweak5 generated
315      mov     $TEMPHIGH, 0x48($TW)
316___
317    }
318
319    # round 7
320    $code .= "vmovdqu 0x70($key1), $t0\n";
321
322    for (my $i = 0; $i < $num_blocks; $i++) {
323      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
324    }
325
326    if (0 == $lt128) {
327      $code .= <<___;
328      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
329      shl     \$1, $TEMPLOW
330      adc     $TEMPHIGH, $TEMPHIGH
331      cmovc   $gf_poly_8b, $gf_poly_8b_temp
332      xor     $gf_poly_8b_temp, $TEMPLOW
333      mov     $TEMPLOW, 0x50($TW) # next Tweak6 generated
334      mov     $TEMPHIGH, 0x58($TW)
335___
336    }
337
338    # round 8
339    $code .= "vmovdqu 0x80($key1), $t0\n";
340
341    for (my $i = 0; $i < $num_blocks; $i++) {
342      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
343    }
344
345    if (0 == $lt128) {
346      $code .= <<___;
347      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
348      shl     \$1, $TEMPLOW
349      adc     $TEMPHIGH, $TEMPHIGH
350      cmovc   $gf_poly_8b, $gf_poly_8b_temp
351      xor     $gf_poly_8b_temp, $TEMPLOW
352      mov     $TEMPLOW, 0x60($TW) # next Tweak7 generated
353      mov     $TEMPHIGH, 0x68($TW)
354___
355    }
356
357    # round 9
358    $code .= "vmovdqu 0x90($key1), $t0\n";
359
360    for (my $i = 0; $i < $num_blocks; $i++) {
361      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
362    }
363
364    if (0 == $lt128) {
365      $code .= <<___;
366      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
367      shl     \$1, $TEMPLOW
368      adc     $TEMPHIGH, $TEMPHIGH
369      cmovc   $gf_poly_8b, $gf_poly_8b_temp
370      xor     $gf_poly_8b_temp, $TEMPLOW
371      mov     $TEMPLOW, 0x70($TW) # next Tweak8 generated
372      mov     $TEMPHIGH, 0x78($TW)
373___
374    }
375
376    if ($is_128) {
377      # round 10
378      $code .= "vmovdqu 0xa0($key1), $t0\n";
379      for (my $i = 0; $i < $num_blocks; $i++) {
380        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
381      }
382    } else {
383      # round 10
384      $code .= "vmovdqu 0xa0($key1), $t0\n";
385      for (my $i = 0; $i < $num_blocks; $i++) {
386        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
387      }
388
389      # round 11
390      $code .= "vmovdqu 0xb0($key1), $t0\n";
391      for (my $i = 0; $i < $num_blocks; $i++) {
392        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
393      }
394
395      # round 12
396      $code .= "vmovdqu 0xc0($key1), $t0\n";
397      for (my $i = 0; $i < $num_blocks; $i++) {
398        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
399      }
400
401      # round 13
402      $code .= "vmovdqu 0xd0($key1), $t0\n";
403      for (my $i = 0; $i < $num_blocks; $i++) {
404        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
405      }
406
407      # round 14
408      $code .= "vmovdqu 0xe0($key1), $t0\n";
409      for (my $i = 0; $i < $num_blocks; $i++) {
410        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
411      }
412    }
413
414    # xor Tweak values
415    for (my $i = 0; $i < $num_blocks; $i++) {
416      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
417    }
418
    if (0 == $lt128) {
      # load next Tweak values
      $code .= <<___;
      vmovdqa  ($TW), $tw[0]
      vmovdqa  0x10($TW), $tw[1]
      vmovdqa  0x20($TW), $tw[2]
      vmovdqa  0x30($TW), $tw[3]
      vmovdqa  0x40($TW), $tw[4]
      vmovdqa  0x50($TW), $tw[5]
      vmovdqa  0x60($TW), $tw[6]
___
    }
431  }
432
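  # Load the first num_initial_blocks input blocks and, starting from the
  # tweak parked at ($TW), generate a tweak for each block on the scalar
  # side, spilling every tweak to the stack frame.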
433  sub initialize {
434    my @st;
435    $st[0] = $_[0];
436    $st[1] = $_[1];
437    $st[2] = $_[2];
438    $st[3] = $_[3];
439    $st[4] = $_[4];
440    $st[5] = $_[5];
441    $st[6] = $_[6];
442    $st[7] = $_[7];
443
444    my @tw;
445    $tw[0] = $_[8];
446    $tw[1] = $_[9];
447    $tw[2] = $_[10];
448    $tw[3] = $_[11];
449    $tw[4] = $_[12];
450    $tw[5] = $_[13];
451    $tw[6] = $_[14];
452    my $num_initial_blocks = $_[15];
453
454    $code .= <<___;
455    vmovdqa  0x0($TW), $tw[0]
456    mov      0x0($TW), $TEMPLOW
457    mov      0x08($TW), $TEMPHIGH
458    vmovdqu  0x0($input), $st[0]
459___
460
461    if ($num_initial_blocks >= 2) {
462      for (my $i = 1; $i < $num_initial_blocks; $i++) {
463        $code .= "xor      $gf_poly_8b_temp, $gf_poly_8b_temp\n";
464        $code .= "shl      \$1, $TEMPLOW\n";
465        $code .= "adc      $TEMPHIGH, $TEMPHIGH\n";
466        $code .= "cmovc    $gf_poly_8b, $gf_poly_8b_temp\n";
467        $code .= "xor      $gf_poly_8b_temp, $TEMPLOW\n";
468        my $offset = $i * 16;
469        $code .= "mov      $TEMPLOW, $offset($TW)\n";
470        $code .= "mov      $TEMPHIGH, $offset + 8($TW)\n";
471        $code .= "vmovdqa  $offset($TW), $tw[$i]\n";
472        $code .= "vmovdqu  $offset($input), $st[$i]\n";
473      }
474    }
475  }
476
477  # Encrypt 4 blocks in parallel
478  sub encrypt_by_four {
479    my $st1 = $_[0]; # state 1
480    my $tw1 = $_[1]; # tweak 1
481    my $tmp = $_[2];
482    my $is_128 = $_[3];
483
484    $code .= "vbroadcasti32x4 ($key1), $tmp\n";
485    $code .= "vpternlogq      \$0x96, $tmp, $tw1, $st1\n";
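    # imm8 0x96 is a three-way XOR, folding the tweak XOR and the AES
    # AddRoundKey (round-key 0) into a single vpternlogq.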
486
487    my $rounds = $is_128 ? 10 : 14;
488    for (my $i = 1; $i < $rounds; $i++) {
489      $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n";
490      $code .= "vaesenc  $tmp, $st1, $st1\n";
491    }
492
493    $code .= "vbroadcasti32x4 16*$rounds($key1), $tmp\n";
494    $code .= "vaesenclast  $tmp, $st1, $st1\n";
495
496    $code .= "vpxorq $tw1, $st1, $st1\n";
497  }
498
499  # Encrypt 8 blocks in parallel
500  # generate next 8 tweak values
501  sub encrypt_by_eight_zmm {
502    my $st1 = $_[0];
503    my $st2 = $_[1];
504    my $tw1 = $_[2];
505    my $tw2 = $_[3];
506    my $t0 = $_[4];
507    my $last_eight = $_[5];
508    my $is_128 = $_[6];
509
510    $code .= <<___;
511	vbroadcasti32x4 ($key1), $t0
512	vpternlogq    \$0x96, $t0, $tw1, $st1
513	vpternlogq    \$0x96, $t0, $tw2, $st2
514___
515
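    # Stitched between the AES rounds: advance each 128-bit lane's tweak
    # by 8 blocks, i.e. T <- T * alpha^8. vpslldq shifts every lane left
    # by one byte and the byte shifted out (extracted with vpsrldq) is
    # reduced back in via vpclmulqdq with $ZPOLY.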
516    if (0 == $last_eight) {
517      $code .= <<___;
518      vpsrldq		\$0xf, $tw1, %zmm13
519      vpclmulqdq	\$0x0, $ZPOLY, %zmm13, %zmm14
520      vpslldq		\$0x1, $tw1, %zmm15
521      vpxord		%zmm14, %zmm15, %zmm15
522___
523    }
524    # round 1
525    $code .= <<___;
526    vbroadcasti32x4 0x10($key1), $t0
527    vaesenc  $t0, $st1, $st1
528    vaesenc  $t0, $st2, $st2
529
530    # round 2
531    vbroadcasti32x4 0x20($key1), $t0
532    vaesenc  $t0, $st1, $st1
533    vaesenc  $t0, $st2, $st2
534
535    # round 3
536    vbroadcasti32x4 0x30($key1), $t0
537    vaesenc  $t0, $st1, $st1
538    vaesenc  $t0, $st2, $st2
539___
540
541    if (0 == $last_eight) {
542      $code .= <<___;
543      vpsrldq		\$0xf, $tw2, %zmm13
544      vpclmulqdq	\$0x0, $ZPOLY, %zmm13, %zmm14
545      vpslldq		\$0x1, $tw2, %zmm16
546      vpxord		%zmm14, %zmm16, %zmm16
547___
548    }
549
550    $code .= <<___;
551    # round 4
552    vbroadcasti32x4 0x40($key1), $t0
553    vaesenc  $t0, $st1, $st1
554    vaesenc  $t0, $st2, $st2
555
556    # round 5
557    vbroadcasti32x4 0x50($key1), $t0
558    vaesenc  $t0, $st1, $st1
559    vaesenc  $t0, $st2, $st2
560
561    # round 6
562    vbroadcasti32x4 0x60($key1), $t0
563    vaesenc  $t0, $st1, $st1
564    vaesenc  $t0, $st2, $st2
565
566    # round 7
567    vbroadcasti32x4 0x70($key1), $t0
568    vaesenc  $t0, $st1, $st1
569    vaesenc  $t0, $st2, $st2
570
571    # round 8
572    vbroadcasti32x4 0x80($key1), $t0
573    vaesenc  $t0, $st1, $st1
574    vaesenc  $t0, $st2, $st2
575
576    # round 9
577    vbroadcasti32x4 0x90($key1), $t0
578    vaesenc  $t0, $st1, $st1
579    vaesenc  $t0, $st2, $st2
580___
581
582    if ($is_128) {
583      $code .= <<___;
584      # round 10
585      vbroadcasti32x4 0xa0($key1), $t0
586      vaesenclast  $t0, $st1, $st1
587      vaesenclast  $t0, $st2, $st2
588___
589    } else {
590      $code .= <<___;
591      # round 10
592      vbroadcasti32x4 0xa0($key1), $t0
593      vaesenc  $t0, $st1, $st1
594      vaesenc  $t0, $st2, $st2
595
596      # round 11
597      vbroadcasti32x4 0xb0($key1), $t0
598      vaesenc  $t0, $st1, $st1
599      vaesenc  $t0, $st2, $st2
600
601      # round 12
602      vbroadcasti32x4 0xc0($key1), $t0
603      vaesenc  $t0, $st1, $st1
604      vaesenc  $t0, $st2, $st2
605
606      # round 13
607      vbroadcasti32x4 0xd0($key1), $t0
608      vaesenc  $t0, $st1, $st1
609      vaesenc  $t0, $st2, $st2
610
611      # round 14
612      vbroadcasti32x4 0xe0($key1), $t0
613      vaesenclast  $t0, $st1, $st1
614      vaesenclast  $t0, $st2, $st2
615___
616    }
617
618    # xor Tweak values
619    $code .= "vpxorq    $tw1, $st1, $st1\n";
620    $code .= "vpxorq    $tw2, $st2, $st2\n";
621
622    if (0 == $last_eight) {
623      # load next Tweak values
624      $code .= <<___;
625      vmovdqa32  %zmm15, $tw1
626      vmovdqa32  %zmm16, $tw2
627___
628    }
629  }
630
631  # Decrypt 8 blocks in parallel
632  # generate next 8 tweak values
633  sub decrypt_by_eight_zmm {
634    my $st1 = $_[0];
635    my $st2 = $_[1];
636    my $tw1 = $_[2];
637    my $tw2 = $_[3];
638    my $t0 = $_[4];
639    my $last_eight = $_[5];
640    my $is_128 = $_[6];
641
642    $code .= <<___;
643    # xor Tweak values
644    vpxorq    $tw1, $st1, $st1
645    vpxorq    $tw2, $st2, $st2
646
647    # ARK
648    vbroadcasti32x4 ($key1), $t0
649    vpxorq    $t0, $st1, $st1
650    vpxorq    $t0, $st2, $st2
651___
652
653    if (0 == $last_eight) {
654      $code .= <<___;
655      vpsrldq		\$0xf, $tw1, %zmm13
656      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
657      vpslldq		\$0x1, $tw1, %zmm15
658      vpxord		%zmm14, %zmm15, %zmm15
659___
660    }
661    # round 1
662    $code .= <<___;
663    vbroadcasti32x4 0x10($key1), $t0
664    vaesdec  $t0, $st1, $st1
665    vaesdec  $t0, $st2, $st2
666
667    # round 2
668    vbroadcasti32x4 0x20($key1), $t0
669    vaesdec  $t0, $st1, $st1
670    vaesdec  $t0, $st2, $st2
671
672    # round 3
673    vbroadcasti32x4 0x30($key1), $t0
674    vaesdec  $t0, $st1, $st1
675    vaesdec  $t0, $st2, $st2
676___
677
678    if (0 == $last_eight) {
679      $code .= <<___;
680      vpsrldq		\$0xf, $tw2, %zmm13
681      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
682      vpslldq		\$0x1, $tw2, %zmm16
683      vpxord		%zmm14, %zmm16, %zmm16
684___
685    }
686
687    $code .= <<___;
688    # round 4
689    vbroadcasti32x4 0x40($key1), $t0
690    vaesdec  $t0, $st1, $st1
691    vaesdec  $t0, $st2, $st2
692
693    # round 5
694    vbroadcasti32x4 0x50($key1), $t0
695    vaesdec  $t0, $st1, $st1
696    vaesdec  $t0, $st2, $st2
697
698    # round 6
699    vbroadcasti32x4 0x60($key1), $t0
700    vaesdec  $t0, $st1, $st1
701    vaesdec  $t0, $st2, $st2
702
703    # round 7
704    vbroadcasti32x4 0x70($key1), $t0
705    vaesdec  $t0, $st1, $st1
706    vaesdec  $t0, $st2, $st2
707
708    # round 8
709    vbroadcasti32x4 0x80($key1), $t0
710    vaesdec  $t0, $st1, $st1
711    vaesdec  $t0, $st2, $st2
712
713    # round 9
714    vbroadcasti32x4 0x90($key1), $t0
715    vaesdec  $t0, $st1, $st1
716    vaesdec  $t0, $st2, $st2
717
718___
719    if ($is_128) {
720      $code .= <<___;
721      # round 10
722      vbroadcasti32x4 0xa0($key1), $t0
723      vaesdeclast  $t0, $st1, $st1
724      vaesdeclast  $t0, $st2, $st2
725___
726    } else {
727      $code .= <<___;
728      # round 10
729      vbroadcasti32x4 0xa0($key1), $t0
730      vaesdec  $t0, $st1, $st1
731      vaesdec  $t0, $st2, $st2
732
733      # round 11
734      vbroadcasti32x4 0xb0($key1), $t0
735      vaesdec  $t0, $st1, $st1
736      vaesdec  $t0, $st2, $st2
737
738      # round 12
739      vbroadcasti32x4 0xc0($key1), $t0
740      vaesdec  $t0, $st1, $st1
741      vaesdec  $t0, $st2, $st2
742
743      # round 13
744      vbroadcasti32x4 0xd0($key1), $t0
745      vaesdec  $t0, $st1, $st1
746      vaesdec  $t0, $st2, $st2
747
748      # round 14
749      vbroadcasti32x4 0xe0($key1), $t0
750      vaesdeclast  $t0, $st1, $st1
751      vaesdeclast  $t0, $st2, $st2
752___
753    }
754
755    $code .= <<___;
756    # xor Tweak values
757    vpxorq    $tw1, $st1, $st1
758    vpxorq    $tw2, $st2, $st2
759
760    # load next Tweak values
761    vmovdqa32  %zmm15, $tw1
762    vmovdqa32  %zmm16, $tw2
763___
764  }
765
766  # Encrypt 16 blocks in parallel
767  # generate next 16 tweak values
768  sub encrypt_by_16_zmm {
769    my @st;
770    $st[0] = $_[0];
771    $st[1] = $_[1];
772    $st[2] = $_[2];
773    $st[3] = $_[3];
774
775    my @tw;
776    $tw[0] = $_[4];
777    $tw[1] = $_[5];
778    $tw[2] = $_[6];
779    $tw[3] = $_[7];
780
781    my $t0 = $_[8];
782    my $last_eight = $_[9];
783    my $is_128 = $_[10];
784
785    # xor Tweak values
786    for (my $i = 0; $i < 4; $i++) {
787      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
788    }
789
790    # ARK
791    $code .= "vbroadcasti32x4 ($key1), $t0\n";
792    for (my $i = 0; $i < 4; $i++) {
793      $code .= "vpxorq $t0, $st[$i], $st[$i]\n";
794    }
795
796    if (0 == $last_eight) {
797      $code .= <<___;
798      vpsrldq		\$0xf, $tw[2], %zmm13
799      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
800      vpslldq		\$0x1, $tw[2], %zmm15
801      vpxord		%zmm14, %zmm15, %zmm15
802___
803    }
804
805    # round 1
806    $code .= "vbroadcasti32x4 0x10($key1), $t0\n";
807    for (my $i = 0; $i < 4; $i++) {
808      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
809    }
810
811    # round 2
812    $code .= "vbroadcasti32x4 0x20($key1), $t0\n";
813    for (my $i = 0; $i < 4; $i++) {
814      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
815    }
816
817    # round 3
818    $code .= "vbroadcasti32x4 0x30($key1), $t0\n";
819    for (my $i = 0; $i < 4; $i++) {
820      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
821    }
822
823    if (0 == $last_eight) {
824      $code .= <<___;
825      vpsrldq		\$0xf, $tw[3], %zmm13
826      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
827      vpslldq		\$0x1, $tw[3], %zmm16
828      vpxord		%zmm14, %zmm16, %zmm16
829___
830    }
831    # round 4
832    $code .= "vbroadcasti32x4 0x40($key1), $t0\n";
833    for (my $i = 0; $i < 4; $i++) {
834      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
835    }
836
837    # round 5
838    $code .= "vbroadcasti32x4 0x50($key1), $t0\n";
839    for (my $i = 0; $i < 4; $i++) {
840      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
841    }
842
843    # round 6
844    $code .= "vbroadcasti32x4 0x60($key1), $t0\n";
845    for (my $i = 0; $i < 4; $i++) {
846      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
847    }
848
849    if (0 == $last_eight) {
850      $code .= <<___;
851      vpsrldq		\$0xf, %zmm15, %zmm13
852      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
853      vpslldq		\$0x1, %zmm15, %zmm17
854      vpxord		%zmm14, %zmm17, %zmm17
855___
856    }
857    # round 7
858    $code .= "vbroadcasti32x4 0x70($key1), $t0\n";
859    for (my $i = 0; $i < 4; $i++) {
860      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
861    }
862
863    # round 8
864    $code .= "vbroadcasti32x4 0x80($key1), $t0\n";
865    for (my $i = 0; $i < 4; $i++) {
866      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
867    }
868
869    # round 9
870    $code .= "vbroadcasti32x4 0x90($key1), $t0\n";
871    for (my $i = 0; $i < 4; $i++) {
872      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
873    }
874
875    if (0 == $last_eight) {
876      $code .= <<___;
877      vpsrldq		\$0xf, %zmm16, %zmm13
878      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
879      vpslldq		\$0x1, %zmm16, %zmm18
880      vpxord		%zmm14, %zmm18, %zmm18
881___
882    }
883    if ($is_128) {
884      # round 10
885      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
886      for (my $i = 0; $i < 4; $i++) {
887        $code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
888      }
889    } else {
890      # round 10
891      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
892      for (my $i = 0; $i < 4; $i++) {
893        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
894      }
895      # round 11
896      $code .= "vbroadcasti32x4 0xb0($key1), $t0\n";
897      for (my $i = 0; $i < 4; $i++) {
898        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
899      }
900      # round 12
901      $code .= "vbroadcasti32x4 0xc0($key1), $t0\n";
902      for (my $i = 0; $i < 4; $i++) {
903        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
904      }
905      # round 13
906      $code .= "vbroadcasti32x4 0xd0($key1), $t0\n";
907      for (my $i = 0; $i < 4; $i++) {
908        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
909      }
910      # round 14
911      $code .= "vbroadcasti32x4 0xe0($key1), $t0\n";
912      for (my $i = 0; $i < 4; $i++) {
913        $code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
914      }
915    }
916
917    # xor Tweak values
918    for (my $i = 0; $i < 4; $i++) {
919      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
920    }
921
922    $code .= <<___;
923    # load next Tweak values
924    vmovdqa32  %zmm15, $tw[0]
925    vmovdqa32  %zmm16, $tw[1]
926    vmovdqa32  %zmm17, $tw[2]
927    vmovdqa32  %zmm18, $tw[3]
928___
929  }
930
931  # Decrypt 16 blocks in parallel
  # generate next 16 tweak values
933  sub decrypt_by_16_zmm {
934    my @st;
935    $st[0] = $_[0];
936    $st[1] = $_[1];
937    $st[2] = $_[2];
938    $st[3] = $_[3];
939
940    my @tw;
941    $tw[0] = $_[4];
942    $tw[1] = $_[5];
943    $tw[2] = $_[6];
944    $tw[3] = $_[7];
945
946    my $t0 = $_[8];
947    my $last_eight = $_[9];
948    my $is_128 = $_[10];
949
950    # xor Tweak values
951    for (my $i = 0; $i < 4; $i++) {
952      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
953    }
954
955    # ARK
956    $code .= "vbroadcasti32x4 ($key1), $t0\n";
957    for (my $i = 0; $i < 4; $i++) {
958      $code .= "vpxorq $t0, $st[$i], $st[$i]\n";
959    }
960
961    if (0 == $last_eight) {
962      $code .= <<___;
963      vpsrldq		\$0xf, $tw[2], %zmm13
964      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
965      vpslldq		\$0x1, $tw[2], %zmm15
966      vpxord		%zmm14, %zmm15, %zmm15
967___
968    }
969
970    # round 1
971    $code .= "vbroadcasti32x4 0x10($key1), $t0\n";
972    for (my $i = 0; $i < 4; $i++) {
973      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
974    }
975
976    # round 2
977    $code .= "vbroadcasti32x4 0x20($key1), $t0\n";
978    for (my $i = 0; $i < 4; $i++) {
979      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
980    }
981
982    # round 3
983    $code .= "vbroadcasti32x4 0x30($key1), $t0\n";
984    for (my $i = 0; $i < 4; $i++) {
985      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
986    }
987
988    if (0 == $last_eight) {
989      $code .= <<___;
990      vpsrldq		\$0xf, $tw[3], %zmm13
991      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
992      vpslldq		\$0x1, $tw[3], %zmm16
993      vpxord		%zmm14, %zmm16, %zmm16
994___
995    }
996    # round 4
997    $code .= "vbroadcasti32x4 0x40($key1), $t0\n";
998    for (my $i = 0; $i < 4; $i++) {
999      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1000    }
1001
1002    # round 5
1003    $code .= "vbroadcasti32x4 0x50($key1), $t0\n";
1004    for (my $i = 0; $i < 4; $i++) {
1005      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1006    }
1007
1008    # round 6
1009    $code .= "vbroadcasti32x4 0x60($key1), $t0\n";
1010    for (my $i = 0; $i < 4; $i++) {
1011      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1012    }
1013
1014    if (0 == $last_eight) {
1015      $code .= <<___;
1016      vpsrldq		\$0xf, %zmm15, %zmm13
1017      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
1018      vpslldq		\$0x1, %zmm15, %zmm17
1019      vpxord		%zmm14, %zmm17, %zmm17
1020___
1021    }
1022    # round 7
1023    $code .= "vbroadcasti32x4 0x70($key1), $t0\n";
1024    for (my $i = 0; $i < 4; $i++) {
1025      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1026    }
1027
1028    # round 8
1029    $code .= "vbroadcasti32x4 0x80($key1), $t0\n";
1030    for (my $i = 0; $i < 4; $i++) {
1031      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1032    }
1033
1034    # round 9
1035    $code .= "vbroadcasti32x4 0x90($key1), $t0\n";
1036    for (my $i = 0; $i < 4; $i++) {
1037      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1038    }
1039
1040    if (0 == $last_eight) {
1041      $code .= <<___;
1042      vpsrldq		\$0xf, %zmm16, %zmm13
1043      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
1044      vpslldq		\$0x1, %zmm16, %zmm18
1045      vpxord		%zmm14, %zmm18, %zmm18
1046___
1047    }
1048    if ($is_128) {
1049      # round 10
1050      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
1051      for (my $i = 0; $i < 4; $i++) {
1052        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
1053      }
1054    } else {
1055      # round 10
1056      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
1057      for (my $i = 0; $i < 4; $i++) {
1058        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1059      }
1060
1061      # round 11
1062      $code .= "vbroadcasti32x4 0xb0($key1), $t0\n";
1063      for (my $i = 0; $i < 4; $i++) {
1064        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1065      }
1066
1067      # round 12
1068      $code .= "vbroadcasti32x4 0xc0($key1), $t0\n";
1069      for (my $i = 0; $i < 4; $i++) {
1070        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1071      }
1072
1073      # round 13
1074      $code .= "vbroadcasti32x4 0xd0($key1), $t0\n";
1075      for (my $i = 0; $i < 4; $i++) {
1076        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1077      }
1078
1079      # round 14
1080      $code .= "vbroadcasti32x4 0xe0($key1), $t0\n";
1081      for (my $i = 0; $i < 4; $i++) {
1082        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
1083      }
1084    }
1085
1086    # xor Tweak values
1087    for (my $i = 0; $i < 4; $i++) {
1088      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
1089    }
1090
1091    $code .= <<___;
1092    # load next Tweak values
1093    vmovdqa32  %zmm15, $tw[0]
1094    vmovdqa32  %zmm16, $tw[1]
1095    vmovdqa32  %zmm17, $tw[2]
1096    vmovdqa32  %zmm18, $tw[3]
1097___
1098  }
1099
1100  $code .= ".text\n";
1101
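  # Runtime capability check: returns non-zero in %eax only when AVX512F,
  # AVX512DQ, AVX512BW and AVX512VL plus VAES, VPCLMULQDQ and VBMI2 are
  # all reported by CPUID.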
1102  {
1103    $code.=<<"___";
1104    .extern	OPENSSL_ia32cap_P
1105    .globl	aesni_xts_avx512_eligible
1106    .type	aesni_xts_avx512_eligible,\@abi-omnipotent
1107    .align	32
1108    aesni_xts_avx512_eligible:
1109        mov	OPENSSL_ia32cap_P+8(%rip), %ecx
1110        xor	%eax,%eax
1111    	# 1<<31|1<<30|1<<17|1<<16 avx512vl + avx512bw + avx512dq + avx512f
1112        and	\$0xc0030000, %ecx
1113        cmp	\$0xc0030000, %ecx
1114        jne	.L_done
1115        mov	OPENSSL_ia32cap_P+12(%rip), %ecx
1116    	# 1<<10|1<<9|1<<6 vaes + vpclmulqdq + vbmi2
1117        and	\$0x640, %ecx
1118        cmp	\$0x640, %ecx
1119        cmove	%ecx,%eax
1120        .L_done:
1121        ret
1122    .size   aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible
1123___
1124  }
1125
1126
1127  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1128  # ;void aesni_xts_[128|256]_encrypt_avx512(
1129  # ;               const uint8_t *in,        // input data
1130  # ;               uint8_t *out,             // output data
1131  # ;               size_t length,            // sector size, in bytes
1132  # ;               const AES_KEY *key1,      // key used for "ECB" encryption
1133  # ;               const AES_KEY *key2,      // key used for tweaking
1134  # ;               const uint8_t iv[16])     // initial tweak value, 16 bytes
1135  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1136  sub enc {
1137    my $is_128 = $_[0];
1138    my $rndsuffix = &random_string();
1139
1140    if ($is_128) {
1141      $code.=<<___;
1142      .globl	aesni_xts_128_encrypt_avx512
1143      .hidden	aesni_xts_128_encrypt_avx512
1144      .type	aesni_xts_128_encrypt_avx512,\@function,6
1145      .align	32
1146      aesni_xts_128_encrypt_avx512:
1147      .cfi_startproc
1148      endbranch
1149___
1150    } else {
1151      $code.=<<___;
1152      .globl	aesni_xts_256_encrypt_avx512
1153      .hidden	aesni_xts_256_encrypt_avx512
1154      .type	aesni_xts_256_encrypt_avx512,\@function,6
1155      .align	32
1156      aesni_xts_256_encrypt_avx512:
1157      .cfi_startproc
1158      endbranch
1159___
1160    }
1161    $code .= "push 	 %rbp\n";
1162    $code .= "mov 	 $TW,%rbp\n";
1163    $code .= "sub 	 \$$VARIABLE_OFFSET,$TW\n";
1164    $code .= "and 	 \$0xffffffffffffffc0,$TW\n";
1165    $code .= "mov 	 %rbx,$GP_STORAGE($TW)\n";
1166
1167    if ($win64) {
1168      $code .= "mov 	 %rdi,$GP_STORAGE + 8*1($TW)\n";
1169      $code .= "mov 	 %rsi,$GP_STORAGE + 8*2($TW)\n";
1170      $code .= "vmovdqa      %xmm6, $XMM_STORAGE + 16*0($TW)\n";
1171      $code .= "vmovdqa      %xmm7, $XMM_STORAGE + 16*1($TW)\n";
1172      $code .= "vmovdqa      %xmm8, $XMM_STORAGE + 16*2($TW)\n";
1173      $code .= "vmovdqa      %xmm9, $XMM_STORAGE + 16*3($TW)\n";
1174      $code .= "vmovdqa      %xmm10, $XMM_STORAGE + 16*4($TW)\n";
1175      $code .= "vmovdqa      %xmm11, $XMM_STORAGE + 16*5($TW)\n";
1176      $code .= "vmovdqa      %xmm12, $XMM_STORAGE + 16*6($TW)\n";
1177      $code .= "vmovdqa      %xmm13, $XMM_STORAGE + 16*7($TW)\n";
1178      $code .= "vmovdqa      %xmm14, $XMM_STORAGE + 16*8($TW)\n";
1179      $code .= "vmovdqa      %xmm15, $XMM_STORAGE + 16*9($TW)\n";
1180    }
1181
1182    $code .= "mov 	 \$0x87, $gf_poly_8b\n";
    $code .= "vmovdqu 	 ($tweak),%xmm1\n";      # read the initial tweak value
1184
1185    encrypt_tweak("%xmm1", $is_128);
1186
1187    if ($win64) {
      $code .= "mov	 $input, 8 + 8*5(%rbp)\n";  # plaintext pointer
      $code .= "mov	 $output, 8 + 8*6(%rbp)\n"; # ciphertext pointer
1190    }
1191
1192    {
1193    $code.=<<___;
1194
1195    cmp 	 \$0x80,$length
1196    jl 	 .L_less_than_128_bytes_${rndsuffix}
1197    vpbroadcastq 	 $gf_poly_8b,$ZPOLY
1198    cmp 	 \$0x100,$length
1199    jge 	 .L_start_by16_${rndsuffix}
1200    cmp 	 \$0x80,$length
1201    jge 	 .L_start_by8_${rndsuffix}
1202
1203    .L_do_n_blocks_${rndsuffix}:
1204    cmp 	 \$0x0,$length
1205    je 	 .L_ret_${rndsuffix}
1206    cmp 	 \$0x70,$length
1207    jge 	 .L_remaining_num_blocks_is_7_${rndsuffix}
1208    cmp 	 \$0x60,$length
1209    jge 	 .L_remaining_num_blocks_is_6_${rndsuffix}
1210    cmp 	 \$0x50,$length
1211    jge 	 .L_remaining_num_blocks_is_5_${rndsuffix}
1212    cmp 	 \$0x40,$length
1213    jge 	 .L_remaining_num_blocks_is_4_${rndsuffix}
1214    cmp 	 \$0x30,$length
1215    jge 	 .L_remaining_num_blocks_is_3_${rndsuffix}
1216    cmp 	 \$0x20,$length
1217    jge 	 .L_remaining_num_blocks_is_2_${rndsuffix}
1218    cmp 	 \$0x10,$length
1219    jge 	 .L_remaining_num_blocks_is_1_${rndsuffix}
1220    vmovdqa 	 %xmm0,%xmm8
1221    vmovdqa 	 %xmm9,%xmm0
1222    jmp 	 .L_steal_cipher_${rndsuffix}
1223
1224    .L_remaining_num_blocks_is_7_${rndsuffix}:
1225    mov 	 \$0x0000ffffffffffff,$tmp1
1226    kmovq 	 $tmp1,%k1
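    # k1 keeps the low 48 byte-mask bits: 3 blocks of zmm2, 4+3=7 in total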
1227    vmovdqu8 	 ($input),%zmm1
1228    vmovdqu8 	 0x40($input),%zmm2{%k1}
1229    add 	 \$0x70,$input
1230___
1231    }
1232
1233    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1234
1235    {
1236    $code .= <<___;
1237    vmovdqu8 	 %zmm1,($output)
1238    vmovdqu8 	 %zmm2,0x40($output){%k1}
1239    add 	 \$0x70,$output
1240    vextracti32x4 	 \$0x2,%zmm2,%xmm8
1241    vextracti32x4 	 \$0x3,%zmm10,%xmm0
1242    and 	 \$0xf,$length
1243    je 	 .L_ret_${rndsuffix}
1244    jmp 	 .L_steal_cipher_${rndsuffix}
1245
1246    .L_remaining_num_blocks_is_6_${rndsuffix}:
1247    vmovdqu8 	 ($input),%zmm1
1248    vmovdqu8 	 0x40($input),%ymm2
1249    add 	 \$0x60,$input
1250___
1251    }
1252
1253    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1254
1255    {
1256    $code .= <<___;
1257    vmovdqu8 	 %zmm1,($output)
1258    vmovdqu8 	 %ymm2,0x40($output)
1259    add 	 \$0x60,$output
1260    vextracti32x4 	 \$0x1,%zmm2,%xmm8
1261    vextracti32x4 	 \$0x2,%zmm10,%xmm0
1262    and 	 \$0xf,$length
1263    je 	 .L_ret_${rndsuffix}
1264    jmp 	 .L_steal_cipher_${rndsuffix}
1265
1266    .L_remaining_num_blocks_is_5_${rndsuffix}:
1267    vmovdqu8 	 ($input),%zmm1
1268    vmovdqu 	 0x40($input),%xmm2
1269    add 	 \$0x50,$input
1270___
1271    }
1272
1273    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1274
1275    {
1276    $code .= <<___;
1277    vmovdqu8 	 %zmm1,($output)
1278    vmovdqu 	 %xmm2,0x40($output)
1279    add 	 \$0x50,$output
1280    vmovdqa 	 %xmm2,%xmm8
1281    vextracti32x4 	 \$0x1,%zmm10,%xmm0
1282    and 	 \$0xf,$length
1283    je 	 .L_ret_${rndsuffix}
1284    jmp 	 .L_steal_cipher_${rndsuffix}
1285
1286    .L_remaining_num_blocks_is_4_${rndsuffix}:
1287    vmovdqu8 	 ($input),%zmm1
1288    add 	 \$0x40,$input
1289___
1290    }
1291
1292    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1293
1294    {
1295    $code .= <<___;
1296    vmovdqu8	%zmm1,($output)
1297    add	\$0x40,$output
1298    vextracti32x4	\$0x3,%zmm1,%xmm8
1299    vmovdqa64	%xmm10, %xmm0
1300    and	\$0xf,$length
1301    je	.L_ret_${rndsuffix}
1302    jmp	.L_steal_cipher_${rndsuffix}
1303___
1304    }
1305
1306    {
1307    $code .= <<___;
1308    .L_remaining_num_blocks_is_3_${rndsuffix}:
1309    mov	\$-1, $tmp1
1310    shr	\$0x10, $tmp1
1311    kmovq	$tmp1, %k1
1312    vmovdqu8	($input), %zmm1{%k1}
1313    add	\$0x30, $input
1314___
1315    }
1316
1317    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1318
1319    {
1320    $code .= <<___;
1321    vmovdqu8	%zmm1, ($output){%k1}
1322    add	\$0x30, $output
1323    vextracti32x4	\$0x2, %zmm1, %xmm8
1324    vextracti32x4	\$0x3, %zmm9, %xmm0
1325    and	\$0xf, $length
1326    je	.L_ret_${rndsuffix}
1327    jmp	.L_steal_cipher_${rndsuffix}
1328___
1329    }
1330
1331    {
1332    $code .= <<___;
1333    .L_remaining_num_blocks_is_2_${rndsuffix}:
1334    vmovdqu8	($input), %ymm1
1335    add	\$0x20, $input
1336___
1337    }
1338
1339    encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128);
1340
1341    {
1342    $code .= <<___;
1343    vmovdqu 	 %ymm1,($output)
1344    add 	 \$0x20,$output
1345    vextracti32x4	\$0x1, %zmm1, %xmm8
1346    vextracti32x4	\$0x2,%zmm9,%xmm0
1347    and 	 \$0xf,$length
1348    je 	 .L_ret_${rndsuffix}
1349    jmp 	 .L_steal_cipher_${rndsuffix}
1350___
1351    }
1352
1353    {
1354    $code .= <<___;
1355    .L_remaining_num_blocks_is_1_${rndsuffix}:
1356    vmovdqu 	 ($input),%xmm1
1357    add 	 \$0x10,$input
1358___
1359    }
1360
1361    encrypt_final("%xmm1", "%xmm9", $is_128);
1362
1363    {
1364    $code .= <<___;
1365    vmovdqu 	 %xmm1,($output)
1366    add 	 \$0x10,$output
1367    vmovdqa 	 %xmm1,%xmm8
1368    vextracti32x4 	 \$0x1,%zmm9,%xmm0
1369    and 	 \$0xf,$length
1370    je 	 .L_ret_${rndsuffix}
1371    jmp 	 .L_steal_cipher_${rndsuffix}
1372
1373
1374    .L_start_by16_${rndsuffix}:
1375    vbroadcasti32x4 	 ($TW),%zmm0
1376    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
1377    mov 	 \$0xaa,$tmp1
1378    kmovq 	 $tmp1,%k2
1379    vpshufb 	 %zmm8,%zmm0,%zmm1
1380    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
1381    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
1382    vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
1383    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
1384    vpxord 	 %zmm4,%zmm3,%zmm9
1385    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
1386    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
1387    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
1388    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
1389    vpxord 	 %zmm5,%zmm7,%zmm10
1390    vpsrldq 	 \$0xf,%zmm9,%zmm13
1391    vpclmulqdq 	 \$0x0,%zmm25,%zmm13,%zmm14
1392    vpslldq 	 \$0x1,%zmm9,%zmm11
1393    vpxord 	 %zmm14,%zmm11,%zmm11
1394    vpsrldq 	 \$0xf,%zmm10,%zmm15
1395    vpclmulqdq 	 \$0x0,%zmm25,%zmm15,%zmm16
1396    vpslldq 	 \$0x1,%zmm10,%zmm12
1397    vpxord 	 %zmm16,%zmm12,%zmm12
1398
1399    .L_main_loop_run_16_${rndsuffix}:
1400    vmovdqu8 	 ($input),%zmm1
1401    vmovdqu8 	 0x40($input),%zmm2
1402    vmovdqu8 	 0x80($input),%zmm3
1403    vmovdqu8 	 0xc0($input),%zmm4
1404    add 	 \$0x100,$input
1405___
1406    }
1407
1408    encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
1409                      "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128);
1410
1411    {
1412    $code .= <<___;
1413    vmovdqu8 	 %zmm1,($output)
1414    vmovdqu8 	 %zmm2,0x40($output)
1415    vmovdqu8 	 %zmm3,0x80($output)
1416    vmovdqu8 	 %zmm4,0xc0($output)
1417    add 	 \$0x100,$output
1418    sub 	 \$0x100,$length
1419    cmp 	 \$0x100,$length
1420    jae 	 .L_main_loop_run_16_${rndsuffix}
1421    cmp 	 \$0x80,$length
1422    jae 	 .L_main_loop_run_8_${rndsuffix}
1423    vextracti32x4 	 \$0x3,%zmm4,%xmm0
1424    jmp 	 .L_do_n_blocks_${rndsuffix}
1425
1426    .L_start_by8_${rndsuffix}:
1427    vbroadcasti32x4 	 ($TW),%zmm0
1428    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
1429    mov 	 \$0xaa,$tmp1
1430    kmovq 	 $tmp1,%k2
1431    vpshufb 	 %zmm8,%zmm0,%zmm1
1432    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
1433    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
1434    vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
1435    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
1436    vpxord 	 %zmm4,%zmm3,%zmm9
1437    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
1438    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
1439    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
1440    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
1441    vpxord 	 %zmm5,%zmm7,%zmm10
1442
1443    .L_main_loop_run_8_${rndsuffix}:
1444    vmovdqu8 	 ($input),%zmm1
1445    vmovdqu8 	 0x40($input),%zmm2
1446    add 	 \$0x80,$input
1447___
1448    }
1449
1450    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128);
1451
1452    {
1453    $code .= <<___;
1454    vmovdqu8 	 %zmm1,($output)
1455    vmovdqu8 	 %zmm2,0x40($output)
1456    add 	 \$0x80,$output
1457    sub 	 \$0x80,$length
1458    cmp 	 \$0x80,$length
1459    jae 	 .L_main_loop_run_8_${rndsuffix}
1460    vextracti32x4 	 \$0x3,%zmm2,%xmm0
1461    jmp 	 .L_do_n_blocks_${rndsuffix}
1462
1463    .L_steal_cipher_${rndsuffix}:
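    # Ciphertext stealing: splice the partial tail block into the last full
    # ciphertext block (shuffle masks from vpshufb_shf_table), then encrypt
    # the merged block with the final tweak held in %xmm0.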
1464    vmovdqa	%xmm8,%xmm2
1465    lea	vpshufb_shf_table(%rip),$TEMPLOW
1466    vmovdqu	($TEMPLOW,$length,1),%xmm10
1467    vpshufb	%xmm10,%xmm8,%xmm8
1468    vmovdqu	-0x10($input,$length,1),%xmm3
1469    vmovdqu	%xmm8,-0x10($output,$length,1)
1470    lea	vpshufb_shf_table(%rip),$TEMPLOW
1471    add	\$16, $TEMPLOW
1472    sub	$length,$TEMPLOW
1473    vmovdqu	($TEMPLOW),%xmm10
1474    vpxor	mask1(%rip),%xmm10,%xmm10
1475    vpshufb	%xmm10,%xmm3,%xmm3
1476    vpblendvb	%xmm10,%xmm2,%xmm3,%xmm3
1477    vpxor	%xmm0,%xmm3,%xmm8
1478    vpxor	($key1),%xmm8,%xmm8
1479    vaesenc	0x10($key1),%xmm8,%xmm8
1480    vaesenc	0x20($key1),%xmm8,%xmm8
1481    vaesenc	0x30($key1),%xmm8,%xmm8
1482    vaesenc	0x40($key1),%xmm8,%xmm8
1483    vaesenc	0x50($key1),%xmm8,%xmm8
1484    vaesenc	0x60($key1),%xmm8,%xmm8
1485    vaesenc	0x70($key1),%xmm8,%xmm8
1486    vaesenc	0x80($key1),%xmm8,%xmm8
1487    vaesenc	0x90($key1),%xmm8,%xmm8
1488___
1489    if ($is_128) {
1490      $code .= "vaesenclast	0xa0($key1),%xmm8,%xmm8\n";
1491    } else {
      $code .= <<___;
1493      vaesenc	0xa0($key1),%xmm8,%xmm8
1494      vaesenc	0xb0($key1),%xmm8,%xmm8
1495      vaesenc	0xc0($key1),%xmm8,%xmm8
1496      vaesenc	0xd0($key1),%xmm8,%xmm8
1497      vaesenclast	0xe0($key1),%xmm8,%xmm8
1498___
1499    }
1500    $code .= "vpxor	%xmm0,%xmm8,%xmm8\n";
1501    $code .= "vmovdqu	%xmm8,-0x10($output)\n";
1502    }
1503
1504    {
1505    $code .= <<___;
1506    .L_ret_${rndsuffix}:
1507    mov 	 $GP_STORAGE($TW),%rbx
1508    xor    $tmp1,$tmp1
1509    mov    $tmp1,$GP_STORAGE($TW)
    # Zero out the whole of %zmm0.
1511    vpxorq %zmm0,%zmm0,%zmm0
1512___
1513    }
1514
1515    if ($win64) {
1516      $code .= <<___;
1517      mov $GP_STORAGE + 8*1($TW),%rdi
1518      mov $tmp1,$GP_STORAGE + 8*1($TW)
1519      mov $GP_STORAGE + 8*2($TW),%rsi
1520      mov $tmp1,$GP_STORAGE + 8*2($TW)
1521
1522      vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
1523      vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
1524      vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
1525      vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9
1526
1527      # Zero the 64 bytes we just restored to the xmm registers.
1528      vmovdqa64 %zmm0,$XMM_STORAGE($TW)
1529
1530      vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
1531      vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
1532      vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
1533      vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13
1534
1535      # And again.
1536      vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)
1537
1538      vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
1539      vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15
1540
1541      # Last round is only 32 bytes (256-bits), so we use `%ymm` as the
1542      # source operand.
1543      vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
1544___
1545    }
1546
1547    {
1548    $code .= <<___;
1549    mov %rbp,$TW
1550    pop %rbp
1551    vzeroupper
1552    ret
1553
1554    .L_less_than_128_bytes_${rndsuffix}:
1555    vpbroadcastq	$gf_poly_8b, $ZPOLY
1556    cmp 	 \$0x10,$length
1557    jb 	 .L_ret_${rndsuffix}
1558    vbroadcasti32x4	($TW), %zmm0
1559    vbroadcasti32x4	shufb_15_7(%rip), %zmm8
1560    movl    \$0xaa, %r8d
1561    kmovq	%r8, %k2
1562    mov	$length,$tmp1
1563    and	\$0x70,$tmp1
1564    cmp	\$0x60,$tmp1
1565    je	.L_num_blocks_is_6_${rndsuffix}
1566    cmp	\$0x50,$tmp1
1567    je	.L_num_blocks_is_5_${rndsuffix}
1568    cmp	\$0x40,$tmp1
1569    je	.L_num_blocks_is_4_${rndsuffix}
1570    cmp	\$0x30,$tmp1
1571    je	.L_num_blocks_is_3_${rndsuffix}
1572    cmp	\$0x20,$tmp1
1573    je	.L_num_blocks_is_2_${rndsuffix}
1574    cmp	\$0x10,$tmp1
1575    je	.L_num_blocks_is_1_${rndsuffix}
1576
1577    .L_num_blocks_is_7_${rndsuffix}:
1578    vpshufb	%zmm8, %zmm0, %zmm1
1579    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1580    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1581    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1582    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1583    vpxord	%zmm4, %zmm3, %zmm9
1584    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1585    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1586    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1587    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1588    vpxord	%zmm5, %zmm7, %zmm10
1589    mov	\$0x0000ffffffffffff, $tmp1
1590    kmovq	$tmp1, %k1
1591    vmovdqu8	16*0($input), %zmm1
1592    vmovdqu8	16*4($input), %zmm2{%k1}
1593
1594    add	\$0x70,$input
1595___
1596    }
1597
1598    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1599
1600    {
1601    $code .= <<___;
1602    vmovdqu8	%zmm1, 16*0($output)
1603    vmovdqu8	%zmm2, 16*4($output){%k1}
1604    add	\$0x70,$output
1605    vextracti32x4	\$0x2, %zmm2, %xmm8
1606    vextracti32x4	\$0x3, %zmm10, %xmm0
1607    and	\$0xf,$length
1608    je	.L_ret_${rndsuffix}
1609    jmp	.L_steal_cipher_${rndsuffix}
1610___
1611    }
1612
1613    {
1614    $code .= <<___;
1615    .L_num_blocks_is_6_${rndsuffix}:
1616    vpshufb	%zmm8, %zmm0, %zmm1
1617    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1618    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1619    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1620    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1621    vpxord	%zmm4, %zmm3, %zmm9
1622    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1623    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1624    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1625    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1626    vpxord	%zmm5, %zmm7, %zmm10
1627    vmovdqu8	16*0($input), %zmm1
1628    vmovdqu8	16*4($input), %ymm2
1629    add	\$96, $input
1630___
1631    }
1632
1633    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1634
1635    {
1636    $code .= <<___;
1637    vmovdqu8	%zmm1, 16*0($output)
1638    vmovdqu8	%ymm2, 16*4($output)
1639    add	\$96, $output
1640
1641    vextracti32x4	\$0x1, %ymm2, %xmm8
1642    vextracti32x4	\$0x2, %zmm10, %xmm0
1643    and	\$0xf,$length
1644    je	.L_ret_${rndsuffix}
1645    jmp	.L_steal_cipher_${rndsuffix}
1646___
1647    }
1648
1649    {
1650    $code .= <<___;
1651    .L_num_blocks_is_5_${rndsuffix}:
1652    vpshufb	%zmm8, %zmm0, %zmm1
1653    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1654    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1655    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1656    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1657    vpxord	%zmm4, %zmm3, %zmm9
1658    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1659    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1660    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1661    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1662    vpxord	%zmm5, %zmm7, %zmm10
1663    vmovdqu8	16*0($input), %zmm1
1664    vmovdqu8	16*4($input), %xmm2
1665    add	\$80, $input
1666___
1667    }
1668
1669    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1670
1671    {
1672    $code .= <<___;
1673    vmovdqu8	%zmm1, 16*0($output)
1674    vmovdqu8	%xmm2, 16*4($output)
1675    add	\$80, $output
1676
1677    vmovdqa	%xmm2, %xmm8
1678    vextracti32x4	\$0x1, %zmm10, %xmm0
1679    and	\$0xf,$length
1680    je	.L_ret_${rndsuffix}
1681    jmp	.L_steal_cipher_${rndsuffix}
1682___
1683    }
1684
1685    {
1686    $code .= <<___;
1687    .L_num_blocks_is_4_${rndsuffix}:
1688    vpshufb	%zmm8, %zmm0, %zmm1
1689    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1690    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1691    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1692    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1693    vpxord	%zmm4, %zmm3, %zmm9
1694    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1695    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1696    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1697    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1698    vpxord	%zmm5, %zmm7, %zmm10
1699    vmovdqu8	16*0($input), %zmm1
1700    add	\$64, $input
1701___
1702    }
1703
1704    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1705
1706    {
1707    $code .= <<___;
1708    vmovdqu8	%zmm1, 16*0($output)
1709    add	\$64, $output
1710    vextracti32x4	\$0x3, %zmm1, %xmm8
1711    vmovdqa	%xmm10, %xmm0
1712    and	\$0xf,$length
1713    je	.L_ret_${rndsuffix}
1714    jmp	.L_steal_cipher_${rndsuffix}
1715___
1716    }
1717
1718    {
1719    $code .= <<___;
1720    .L_num_blocks_is_3_${rndsuffix}:
1721    vpshufb	%zmm8, %zmm0, %zmm1
1722    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1723    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1724    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1725    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1726    vpxord	%zmm4, %zmm3, %zmm9
1727    mov	\$0x0000ffffffffffff, $tmp1
1728    kmovq	$tmp1, %k1
1729    vmovdqu8	16*0($input), %zmm1{%k1}
1730    add	\$48, $input
1731___
1732    }
1733
1734    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1735
1736    {
1737    $code .= <<___;
1738    vmovdqu8	%zmm1, 16*0($output){%k1}
1739    add	\$48, $output
1740    vextracti32x4	\$2, %zmm1, %xmm8
1741    vextracti32x4	\$3, %zmm9, %xmm0
1742    and	\$0xf,$length
1743    je	.L_ret_${rndsuffix}
1744    jmp	.L_steal_cipher_${rndsuffix}
1745___
1746    }
1747
1748    {
1749    $code .= <<___;
1750    .L_num_blocks_is_2_${rndsuffix}:
1751    vpshufb	%zmm8, %zmm0, %zmm1
1752    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1753    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1754    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1755    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1756    vpxord	%zmm4, %zmm3, %zmm9
1757
1758    vmovdqu8	16*0($input), %ymm1
1759    add	\$32, $input
1760___
1761    }
1762
1763    encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128);
1764
1765    {
1766    $code .= <<___;
1767    vmovdqu8	%ymm1, 16*0($output)
1768    add	\$32, $output
1769
1770    vextracti32x4	\$1, %ymm1, %xmm8
1771    vextracti32x4	\$2, %zmm9, %xmm0
1772    and	\$0xf,$length
1773    je	.L_ret_${rndsuffix}
1774    jmp	.L_steal_cipher_${rndsuffix}
1775___
1776    }
1777
1778    {
1779    $code .= <<___;
1780    .L_num_blocks_is_1_${rndsuffix}:
1781    vpshufb	%zmm8, %zmm0, %zmm1
1782    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1783    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1784    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1785    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1786    vpxord	%zmm4, %zmm3, %zmm9
1787
1788    vmovdqu8	16*0($input), %xmm1
1789    add	\$16, $input
1790___
1791    }
1792
1793    encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128);
1794
1795    {
1796    $code .= <<___;
1797    vmovdqu8	%xmm1, 16*0($output)
1798    add	\$16, $output
1799
1800    vmovdqa	%xmm1, %xmm8
1801    vextracti32x4	\$1, %zmm9, %xmm0
1802    and	\$0xf,$length
1803    je	.L_ret_${rndsuffix}
1804    jmp	.L_steal_cipher_${rndsuffix}
1805    .cfi_endproc
1806___
1807    }
1808  }
1809
1810  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1811  # ;void aesni_xts_[128|256]_decrypt_avx512(
1812  # ;               const uint8_t *in,        // input data
1813  # ;               uint8_t *out,             // output data
1814  # ;               size_t length,            // sector size, in bytes
  # ;               const AES_KEY *key1,      // key used for "ECB" decryption
  # ;               const AES_KEY *key2,      // key used for tweaking
  # ;               const uint8_t iv[16])     // initial tweak value, 16 bytes
1818  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  sub dec {
    my $is_128 = $_[0];
    my $rndsuffix = &random_string();

    if ($is_128) {
      $code.=<<___;
      .globl	aesni_xts_128_decrypt_avx512
      .hidden	aesni_xts_128_decrypt_avx512
      .type	aesni_xts_128_decrypt_avx512,\@function,6
      .align	32
      aesni_xts_128_decrypt_avx512:
      .cfi_startproc
      endbranch
___
    } else {
      $code.=<<___;
      .globl	aesni_xts_256_decrypt_avx512
      .hidden	aesni_xts_256_decrypt_avx512
      .type	aesni_xts_256_decrypt_avx512,\@function,6
      .align	32
      aesni_xts_256_decrypt_avx512:
      .cfi_startproc
      endbranch
___
    }
    $code .= "push 	 %rbp\n";
    $code .= "mov 	 $TW,%rbp\n";
    $code .= "sub 	 \$$VARIABLE_OFFSET,$TW\n";
    $code .= "and 	 \$0xffffffffffffffc0,$TW\n";
    $code .= "mov 	 %rbx,$GP_STORAGE($TW)\n";

    if ($win64) {
      $code .= "mov 	 %rdi,$GP_STORAGE + 8*1($TW)\n";
      $code .= "mov 	 %rsi,$GP_STORAGE + 8*2($TW)\n";
      $code .= "vmovdqa      %xmm6, $XMM_STORAGE + 16*0($TW)\n";
      $code .= "vmovdqa      %xmm7, $XMM_STORAGE + 16*1($TW)\n";
      $code .= "vmovdqa      %xmm8, $XMM_STORAGE + 16*2($TW)\n";
      $code .= "vmovdqa      %xmm9, $XMM_STORAGE + 16*3($TW)\n";
      $code .= "vmovdqa      %xmm10, $XMM_STORAGE + 16*4($TW)\n";
      $code .= "vmovdqa      %xmm11, $XMM_STORAGE + 16*5($TW)\n";
      $code .= "vmovdqa      %xmm12, $XMM_STORAGE + 16*6($TW)\n";
      $code .= "vmovdqa      %xmm13, $XMM_STORAGE + 16*7($TW)\n";
      $code .= "vmovdqa      %xmm14, $XMM_STORAGE + 16*8($TW)\n";
      $code .= "vmovdqa      %xmm15, $XMM_STORAGE + 16*9($TW)\n";
    }

    $code .= "mov 	 \$0x87, $gf_poly_8b\n";
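    # 0x87 encodes x^7 + x^2 + x + 1, the low byte of the XTS reduction
    # polynomial: multiplying a tweak by alpha is a 1-bit left shift
    # followed by a conditional xor with this constant.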
    $code .= "vmovdqu 	 ($tweak),%xmm1\n";      # read the initial tweak value

    encrypt_tweak("%xmm1", $is_128);

    if ($win64) {
      $code .= "mov	 $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer
      $code .= "mov        $output, 8 + 8*6(%rbp)\n"; # plaintext pointer
    }

    {
    $code.=<<___;

    cmp 	 \$0x80,$length
    jb 	 .L_less_than_128_bytes_${rndsuffix}
    vpbroadcastq 	 $gf_poly_8b,$ZPOLY
    cmp 	 \$0x100,$length
    jge 	 .L_start_by16_${rndsuffix}
    jmp 	 .L_start_by8_${rndsuffix}

    .L_do_n_blocks_${rndsuffix}:
    cmp 	 \$0x0,$length
    je 	 .L_ret_${rndsuffix}
    cmp 	 \$0x70,$length
    jge 	 .L_remaining_num_blocks_is_7_${rndsuffix}
    cmp 	 \$0x60,$length
    jge 	 .L_remaining_num_blocks_is_6_${rndsuffix}
    cmp 	 \$0x50,$length
    jge 	 .L_remaining_num_blocks_is_5_${rndsuffix}
    cmp 	 \$0x40,$length
    jge 	 .L_remaining_num_blocks_is_4_${rndsuffix}
    cmp 	 \$0x30,$length
    jge 	 .L_remaining_num_blocks_is_3_${rndsuffix}
    cmp 	 \$0x20,$length
    jge 	 .L_remaining_num_blocks_is_2_${rndsuffix}
    cmp 	 \$0x10,$length
    jge 	 .L_remaining_num_blocks_is_1_${rndsuffix}

    # _remaining_num_blocks_is_0:
    vmovdqu		%xmm5, %xmm1
    # xmm5 contains the last full block, to be decrypted again with the next tweak
___
    }
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu %xmm1, -0x10($output)
    vmovdqa %xmm1, %xmm8

    # Calc previous tweak
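    # (i.e. multiply the current tweak by alpha^-1: shift the 128-bit value
    # right by one bit and, if a bit fell off the bottom, xor the reduction
    # constant back into the top half. Decryption with ciphertext stealing
    # consumes the tweaks for the last two blocks in reverse order, hence
    # the step backwards.)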
    mov		\$0x1,$tmp1
    kmovq		$tmp1, %k1
    vpsllq	\$0x3f,%xmm9,%xmm13
    vpsraq	\$0x3f,%xmm13,%xmm14
    vpandq	%xmm25,%xmm14,%xmm5
    vpxorq        %xmm5,%xmm9,%xmm9{%k1}
    vpsrldq       \$0x8,%xmm9,%xmm10
    .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0
    vpslldq       \$0x8,%xmm13,%xmm13
    vpxorq        %xmm13,%xmm0,%xmm0
    jmp           .L_steal_cipher_${rndsuffix}

    .L_remaining_num_blocks_is_7_${rndsuffix}:
    mov 	 \$0xffffffffffffffff,$tmp1
    shr 	 \$0x10,$tmp1
    kmovq 	 $tmp1,%k1
    vmovdqu8 	 ($input),%zmm1
    vmovdqu8 	 0x40($input),%zmm2{%k1}
    add 	         \$0x70,$input
    and            \$0xf,$length
    je             .L_done_7_remain_${rndsuffix}
    vextracti32x4   \$0x2,%zmm10,%xmm12
    vextracti32x4   \$0x3,%zmm10,%xmm13
    vinserti32x4    \$0x2,%xmm13,%zmm10,%zmm10
___
    }

    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8 	 %zmm1, ($output)
    vmovdqu8 	 %zmm2, 0x40($output){%k1}
    add 	         \$0x70, $output
    vextracti32x4  \$0x2,%zmm2,%xmm8
    vmovdqa        %xmm12,%xmm0
    jmp            .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_7_remain_${rndsuffix}:\n";
    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8        %zmm1, ($output)
    vmovdqu8        %zmm2, 0x40($output){%k1}
    jmp     .L_ret_${rndsuffix}

    .L_remaining_num_blocks_is_6_${rndsuffix}:
    vmovdqu8 	 ($input),%zmm1
    vmovdqu8 	 0x40($input),%ymm2
    add 	         \$0x60,$input
    and            \$0xf, $length
    je             .L_done_6_remain_${rndsuffix}
    vextracti32x4   \$0x1,%zmm10,%xmm12
    vextracti32x4   \$0x2,%zmm10,%xmm13
    vinserti32x4    \$0x1,%xmm13,%zmm10,%zmm10
___
    }

    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8 	 %zmm1, ($output)
    vmovdqu8 	 %ymm2, 0x40($output)
    add 	         \$0x60,$output
    vextracti32x4  \$0x1,%zmm2,%xmm8
    vmovdqa        %xmm12,%xmm0
    jmp            .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_6_remain_${rndsuffix}:\n";
    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8        %zmm1, ($output)
    vmovdqu8        %ymm2,0x40($output)
    jmp             .L_ret_${rndsuffix}

    .L_remaining_num_blocks_is_5_${rndsuffix}:
    vmovdqu8 	 ($input),%zmm1
    vmovdqu 	 0x40($input),%xmm2
    add 	         \$0x50,$input
    and            \$0xf,$length
    je             .L_done_5_remain_${rndsuffix}
    vmovdqa        %xmm10,%xmm12
    vextracti32x4  \$0x1,%zmm10,%xmm10
___
    }

    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8         %zmm1, ($output)
    vmovdqu          %xmm2, 0x40($output)
    add              \$0x50, $output
    vmovdqa          %xmm2,%xmm8
    vmovdqa          %xmm12,%xmm0
    jmp              .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_5_remain_${rndsuffix}:\n";
    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8        %zmm1, ($output)
    vmovdqu8        %xmm2, 0x40($output)
    jmp             .L_ret_${rndsuffix}

    .L_remaining_num_blocks_is_4_${rndsuffix}:
    vmovdqu8 	 ($input),%zmm1
    add 	         \$0x40,$input
    and            \$0xf, $length
    je             .L_done_4_remain_${rndsuffix}
    vextracti32x4   \$0x3,%zmm9,%xmm12
    vinserti32x4    \$0x3,%xmm10,%zmm9,%zmm9
___
    }

    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8        %zmm1,($output)
    add             \$0x40,$output
    vextracti32x4   \$0x3,%zmm1,%xmm8
    vmovdqa         %xmm12,%xmm0
    jmp             .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_4_remain_${rndsuffix}:\n";
    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);

    {
    $code .= <<___;
    vmovdqu8        %zmm1, ($output)
    jmp             .L_ret_${rndsuffix}

    .L_remaining_num_blocks_is_3_${rndsuffix}:
    vmovdqu         ($input),%xmm1
    vmovdqu         0x10($input),%xmm2
    vmovdqu         0x20($input),%xmm3
    add             \$0x30,$input
    and             \$0xf,$length
    je              .L_done_3_remain_${rndsuffix}
    vextracti32x4   \$0x2,%zmm9,%xmm13
    vextracti32x4   \$0x1,%zmm9,%xmm10
    vextracti32x4   \$0x3,%zmm9,%xmm11
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    add 	         \$0x30,$output
    vmovdqa 	 %xmm3,%xmm8
    vmovdqa        %xmm13,%xmm0
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }
    $code .= "\n.L_done_3_remain_${rndsuffix}:\n";
    $code .= "vextracti32x4   \$0x1,%zmm9,%xmm10\n";
    $code .= "vextracti32x4   \$0x2,%zmm9,%xmm11\n";

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu %xmm1,($output)
    vmovdqu %xmm2,0x10($output)
    vmovdqu %xmm3,0x20($output)
    jmp     .L_ret_${rndsuffix}

    .L_remaining_num_blocks_is_2_${rndsuffix}:
    vmovdqu         ($input),%xmm1
    vmovdqu         0x10($input),%xmm2
    add             \$0x20,$input
    and             \$0xf,$length
    je              .L_done_2_remain_${rndsuffix}
    vextracti32x4   \$0x2,%zmm9,%xmm10
    vextracti32x4   \$0x1,%zmm9,%xmm12
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    add 	         \$0x20,$output
    vmovdqa 	 %xmm2,%xmm8
    vmovdqa 	 %xmm12,%xmm0
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }
    $code .= "\n.L_done_2_remain_${rndsuffix}:\n";
    $code .= "vextracti32x4   \$0x1,%zmm9,%xmm10\n";

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu   %xmm1,($output)
    vmovdqu   %xmm2,0x10($output)
    jmp       .L_ret_${rndsuffix}

    .L_remaining_num_blocks_is_1_${rndsuffix}:
    vmovdqu 	 ($input),%xmm1
    add 	         \$0x10,$input
    and            \$0xf,$length
    je             .L_done_1_remain_${rndsuffix}
    vextracti32x4  \$0x1,%zmm9,%xmm11
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    add 	         \$0x10,$output
    vmovdqa 	 %xmm1,%xmm8
    vmovdqa 	 %xmm9,%xmm0
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_1_remain_${rndsuffix}:\n";

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu   %xmm1, ($output)
    jmp       .L_ret_${rndsuffix}

    .L_start_by16_${rndsuffix}:
    vbroadcasti32x4 	 ($TW),%zmm0
    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
    mov 	 \$0xaa,$tmp1
    kmovq 	 $tmp1,%k2

    # Mult tweak by 2^{3, 2, 1, 0}
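    # (vpsllvq shifts each 128-bit lane left by k bits, k = 0..3; the
    # vpshufb/vpsrlvq pair recovers the bits that cross the 64-bit qword
    # boundaries, vpclmulqdq folds the bits shifted out of bit 127 back in
    # via the reduction polynomial, and the masked vpxorq (k2 = 0xaa)
    # merges the carries into the high qword of each lane.)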
    vpshufb 	 %zmm8,%zmm0,%zmm1
    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
    vpclmulqdq 	 \$0x0,$ZPOLY,%zmm2,%zmm3
    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
    vpxord 	 %zmm4,%zmm3,%zmm9

    # Mult tweak by 2^{7, 6, 5, 4}
    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
    vpxord 	 %zmm5,%zmm7,%zmm10

    # Make the next 8 tweak values by multiplying all of them by 2^8
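    # (vpslldq moves each 128-bit tweak left by one byte; the byte pushed
    # out of the top is captured by the vpsrldq/vpclmulqdq pair and
    # reduced back into the low end, i.e. T' = T * 2^8 in GF(2^128).)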
    vpsrldq 	 \$0xf,%zmm9,%zmm13
    vpclmulqdq 	 \$0x0,%zmm25,%zmm13,%zmm14
    vpslldq 	 \$0x1,%zmm9,%zmm11
    vpxord 	 %zmm14,%zmm11,%zmm11

    vpsrldq 	 \$0xf,%zmm10,%zmm15
    vpclmulqdq 	 \$0x0,%zmm25,%zmm15,%zmm16
    vpslldq 	 \$0x1,%zmm10,%zmm12
    vpxord 	 %zmm16,%zmm12,%zmm12

    .L_main_loop_run_16_${rndsuffix}:
    vmovdqu8 	 ($input),%zmm1
    vmovdqu8 	 0x40($input),%zmm2
    vmovdqu8 	 0x80($input),%zmm3
    vmovdqu8 	 0xc0($input),%zmm4
    vmovdqu8 	 0xf0($input),%xmm5
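    # keep a copy of the last 16-byte block in xmm5: if it ends up being
    # the final full block and a partial block follows, it is decrypted
    # again in _remaining_num_blocks_is_0 with the stealing tweak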
    add 	 \$0x100,$input
___
    }

    decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
                      "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128);

    {
    $code .= <<___;
    vmovdqu8 	 %zmm1,($output)
    vmovdqu8 	 %zmm2,0x40($output)
    vmovdqu8 	 %zmm3,0x80($output)
    vmovdqu8 	 %zmm4,0xc0($output)
    add 	 \$0x100,$output
    sub 	 \$0x100,$length
    cmp 	 \$0x100,$length
    jge 	 .L_main_loop_run_16_${rndsuffix}

    cmp 	 \$0x80,$length
    jge 	 .L_main_loop_run_8_${rndsuffix}
    jmp 	 .L_do_n_blocks_${rndsuffix}

    .L_start_by8_${rndsuffix}:
    # Make the first 8 tweak values (the initial tweak times 2^0..2^7)
    vbroadcasti32x4 	 ($TW),%zmm0
    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
    mov 	 \$0xaa,$tmp1
    kmovq 	 $tmp1,%k2

    # Mult tweak by 2^{3, 2, 1, 0}
    vpshufb 	 %zmm8,%zmm0,%zmm1
    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
    vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
    vpxord 	 %zmm4,%zmm3,%zmm9

    # Mult tweak by 2^{7, 6, 5, 4}
    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
    vpxord 	 %zmm5,%zmm7,%zmm10

    .L_main_loop_run_8_${rndsuffix}:
    vmovdqu8 	 ($input),%zmm1
    vmovdqu8 	 0x40($input),%zmm2
    vmovdqu8 	 0x70($input),%xmm5
    add 	         \$0x80,$input
___
    }

    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128);

    {
    $code .= <<___;
    vmovdqu8 	 %zmm1,($output)
    vmovdqu8 	 %zmm2,0x40($output)
    add 	 \$0x80,$output
    sub 	 \$0x80,$length
    cmp 	 \$0x80,$length
    jge 	 .L_main_loop_run_8_${rndsuffix}
    jmp 	 .L_do_n_blocks_${rndsuffix}

    .L_steal_cipher_${rndsuffix}:
    # Cipher stealing: xmm8 holds the last decrypted block, xmm0 the tweak
    # for the final merged block
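    # The first N_val bytes of xmm8 become the final partial plaintext;
    # its remaining bytes are merged with the stolen ciphertext tail and
    # the combined block is decrypted once more with the tweak in xmm0.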
    vmovdqa 	 %xmm8,%xmm2

    # shift xmm8 to the left by 16-N_val bytes
    lea vpshufb_shf_table(%rip),$TEMPLOW
    vmovdqu 	 ($TEMPLOW,$length,1),%xmm10
    vpshufb 	 %xmm10,%xmm8,%xmm8

    vmovdqu 	 -0x10($input,$length,1),%xmm3
    vmovdqu 	 %xmm8,-0x10($output,$length,1)

    # shift xmm3 to the right by 16-N_val bytes
    lea vpshufb_shf_table(%rip), $TEMPLOW
    add \$16, $TEMPLOW
    sub 	 $length,$TEMPLOW
    vmovdqu 	 ($TEMPLOW),%xmm10
    vpxor mask1(%rip),%xmm10,%xmm10
    vpshufb 	 %xmm10,%xmm3,%xmm3

    vpblendvb 	 %xmm10,%xmm2,%xmm3,%xmm3

    # xor Tweak value
    vpxor 	 %xmm0,%xmm3,%xmm8

    # decrypt last block with cipher stealing
    vpxor	($key1),%xmm8,%xmm8
    vaesdec	0x10($key1),%xmm8,%xmm8
    vaesdec	0x20($key1),%xmm8,%xmm8
    vaesdec	0x30($key1),%xmm8,%xmm8
    vaesdec	0x40($key1),%xmm8,%xmm8
    vaesdec	0x50($key1),%xmm8,%xmm8
    vaesdec	0x60($key1),%xmm8,%xmm8
    vaesdec	0x70($key1),%xmm8,%xmm8
    vaesdec	0x80($key1),%xmm8,%xmm8
    vaesdec	0x90($key1),%xmm8,%xmm8
___
    if ($is_128) {
      $code .= "vaesdeclast	0xa0($key1),%xmm8,%xmm8\n";
    } else {
      $code .= <<___;
      vaesdec	0xa0($key1),%xmm8,%xmm8
      vaesdec	0xb0($key1),%xmm8,%xmm8
      vaesdec	0xc0($key1),%xmm8,%xmm8
      vaesdec	0xd0($key1),%xmm8,%xmm8
      vaesdeclast	0xe0($key1),%xmm8,%xmm8
___
    }
    $code .= <<___;
    # xor Tweak value
    vpxor 	 %xmm0,%xmm8,%xmm8

    .L_done_${rndsuffix}:
    # store the last decrypted block
    vmovdqu 	 %xmm8,-0x10($output)
___
    }

    {
    $code .= <<___;
    .L_ret_${rndsuffix}:
    mov 	 $GP_STORAGE($TW),%rbx
    xor    $tmp1,$tmp1
    mov    $tmp1,$GP_STORAGE($TW)
    # Zero-out the whole of `%zmm0`.
    vpxorq %zmm0,%zmm0,%zmm0
___
    }

    if ($win64) {
      $code .= <<___;
      mov $GP_STORAGE + 8*1($TW),%rdi
      mov $tmp1,$GP_STORAGE + 8*1($TW)
      mov $GP_STORAGE + 8*2($TW),%rsi
      mov $tmp1,$GP_STORAGE + 8*2($TW)

      vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
      vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
      vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
      vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9

      # Zero the 64 bytes we just restored to the xmm registers.
      vmovdqa64 %zmm0,$XMM_STORAGE($TW)

      vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
      vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
      vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
      vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13

      # And again.
      vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)

      vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
      vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15

      # Last round is only 32 bytes (256-bits), so we use `%ymm` as the
      # source operand.
      vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
___
    }

    {
    $code .= <<___;
    mov %rbp,$TW
    pop %rbp
    vzeroupper
    ret

    .L_less_than_128_bytes_${rndsuffix}:
    cmp 	 \$0x10,$length
    jb 	 .L_ret_${rndsuffix}

    mov 	 $length,$tmp1
    and 	 \$0x70,$tmp1
    cmp 	 \$0x60,$tmp1
    je 	 .L_num_blocks_is_6_${rndsuffix}
    cmp 	 \$0x50,$tmp1
    je 	 .L_num_blocks_is_5_${rndsuffix}
    cmp 	 \$0x40,$tmp1
    je 	 .L_num_blocks_is_4_${rndsuffix}
    cmp 	 \$0x30,$tmp1
    je 	 .L_num_blocks_is_3_${rndsuffix}
    cmp 	 \$0x20,$tmp1
    je 	 .L_num_blocks_is_2_${rndsuffix}
    cmp 	 \$0x10,$tmp1
    je 	 .L_num_blocks_is_1_${rndsuffix}
___
    }
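    # length & 0x70 isolates the full-block count (length < 0x80 here);
    # none of the cases above matching means seven full blocks, so fall
    # through.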

    $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 7);

    {
    $code .= <<___;
    add    \$0x70,$input
    and    \$0xf,$length
    je      .L_done_7_${rndsuffix}

    .L_steal_cipher_7_${rndsuffix}:
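     # Scalar tweak * alpha: shift the 128-bit tweak (kept in the low/high
     # temp registers) left by one bit; on carry out of bit 127, cmovc
     # picks up 0x87 and the xor folds it back into the low qword.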
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm15,%xmm16
     vmovdqa     0x10($TW),%xmm15
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    vmovdqu 	 %xmm4,0x30($output)
    vmovdqu 	 %xmm5,0x40($output)
    vmovdqu 	 %xmm6,0x50($output)
    add 	         \$0x70,$output
    vmovdqa64 	 %xmm16,%xmm0
    vmovdqa 	 %xmm7,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_7_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    vmovdqu 	 %xmm4,0x30($output)
    vmovdqu 	 %xmm5,0x40($output)
    vmovdqu 	 %xmm6,0x50($output)
    add 	         \$0x70,$output
    vmovdqa 	 %xmm7,%xmm8
    jmp 	         .L_done_${rndsuffix}
___
    }

    $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 6);

    {
    $code .= <<___;
    add    \$0x60,$input
    and    \$0xf,$length
    je      .L_done_6_${rndsuffix}

    .L_steal_cipher_6_${rndsuffix}:
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm14,%xmm15
     vmovdqa     0x10($TW),%xmm14
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    vmovdqu 	 %xmm4,0x30($output)
    vmovdqu 	 %xmm5,0x40($output)
    add 	         \$0x60,$output
    vmovdqa 	 %xmm15,%xmm0
    vmovdqa 	 %xmm6,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }
    $code .= "\n.L_done_6_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    vmovdqu 	 %xmm4,0x30($output)
    vmovdqu 	 %xmm5,0x40($output)
    add 	         \$0x60,$output
    vmovdqa 	 %xmm6,%xmm8
    jmp 	         .L_done_${rndsuffix}
___
    }

    $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 5);

    {
    $code .= <<___;
    add    \$0x50,$input
    and    \$0xf,$length
    je      .L_done_5_${rndsuffix}

    .L_steal_cipher_5_${rndsuffix}:
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm13,%xmm14
     vmovdqa     0x10($TW),%xmm13
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    vmovdqu 	 %xmm4,0x30($output)
    add 	         \$0x50,$output
    vmovdqa 	 %xmm14,%xmm0
    vmovdqa 	 %xmm5,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_5_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    vmovdqu 	 %xmm4,0x30($output)
    add 	         \$0x50,$output
    vmovdqa 	 %xmm5,%xmm8
    jmp 	         .L_done_${rndsuffix}
___
    }

    $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";

    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 4);

    {
    $code .= <<___;
    add    \$0x40,$input
    and    \$0xf,$length
    je      .L_done_4_${rndsuffix}

    .L_steal_cipher_4_${rndsuffix}:
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm12,%xmm13
     vmovdqa     0x10($TW),%xmm12
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    add 	         \$0x40,$output
    vmovdqa 	 %xmm13,%xmm0
    vmovdqa 	 %xmm4,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_4_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    vmovdqu 	 %xmm3,0x20($output)
    add 	         \$0x40,$output
    vmovdqa 	 %xmm4,%xmm8
    jmp 	         .L_done_${rndsuffix}
___
    }

    $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n";

    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 3);

    {
    $code .= <<___;
    add    \$0x30,$input
    and    \$0xf,$length
    je      .L_done_3_${rndsuffix}

    .L_steal_cipher_3_${rndsuffix}:
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm11,%xmm12
     vmovdqa     0x10($TW),%xmm11
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    add 	         \$0x30,$output
    vmovdqa 	 %xmm12,%xmm0
    vmovdqa 	 %xmm3,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }
    $code .= "\n.L_done_3_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    vmovdqu 	 %xmm2,0x10($output)
    add 	         \$0x30,$output
    vmovdqa 	 %xmm3,%xmm8
    jmp 	         .L_done_${rndsuffix}
___
    }

    $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n";

    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 2);

    {
    $code .= <<___;
    add    \$0x20,$input
    and    \$0xf,$length
    je      .L_done_2_${rndsuffix}

    .L_steal_cipher_2_${rndsuffix}:
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm10,%xmm11
     vmovdqa     0x10($TW),%xmm10
___
    }

    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    add 	         \$0x20,$output
    vmovdqa 	 %xmm11,%xmm0
    vmovdqa 	 %xmm2,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }

    $code .= "\n.L_done_2_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);

    {
    $code .= <<___;
    vmovdqu 	 %xmm1,($output)
    add 	         \$0x20,$output
    vmovdqa 	 %xmm2,%xmm8
    jmp 	         .L_done_${rndsuffix}
___
    }

    $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n";

    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
               "%xmm13", "%xmm14", "%xmm15", 1);

    {
    $code .= <<___;
    add    \$0x10,$input
    and    \$0xf,$length
    je      .L_done_1_${rndsuffix}

    .L_steal_cipher_1_${rndsuffix}:
     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
     shl         \$1, $TEMPLOW
     adc         $TEMPHIGH, $TEMPHIGH
     cmovc       $gf_poly_8b, $gf_poly_8b_temp
     xor         $gf_poly_8b_temp, $TEMPLOW
     mov         $TEMPLOW,0x10($TW)
     mov         $TEMPHIGH,0x18($TW)
     vmovdqa64   %xmm9,%xmm10
     vmovdqa     0x10($TW),%xmm9
___
    }
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);

    {
    $code .= <<___;
    add 	         \$0x10,$output
    vmovdqa 	 %xmm10,%xmm0
    vmovdqa 	 %xmm1,%xmm8
    jmp 	         .L_steal_cipher_${rndsuffix}
___
    }
    $code .= "\n.L_done_1_${rndsuffix}:\n";
    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);

    {
    $code .= <<___;
    add 	         \$0x10,$output
    vmovdqa 	 %xmm1,%xmm8
    jmp 	         .L_done_${rndsuffix}
    .cfi_endproc
___
    }

  }

  # The only difference between AES-XTS-128 and -256 is the number of rounds,
  # so we generate from the same perlasm base, extending to 14 rounds when
  # `$is_128' is 0.

  enc(1);
  dec(1);

  enc(0);
  dec(0);

  $code .= <<___;
  .section .rodata
  .align 16

  vpshufb_shf_table:
    .quad 0x8786858483828100, 0x8f8e8d8c8b8a8988
    .quad 0x0706050403020100, 0x000e0d0c0b0a0908

  mask1:
    .quad 0x8080808080808080, 0x8080808080808080
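  # vpshufb_shf_table is indexed by the residual byte count N: the mask at
  # offset N shifts a block left by 16 - N bytes, and the mask at offset
  # 16 - N shifts right by the same amount (bytes with bit 7 set are
  # zeroed by vpshufb). mask1 flips that top bit so the same mask also
  # serves as the vpblendvb selector in the cipher-stealing path.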

  const_dq3210:
    .quad 0, 0, 1, 1, 2, 2, 3, 3
  const_dq5678:
    .quad 8, 8, 7, 7, 6, 6, 5, 5
  const_dq7654:
    .quad 4, 4, 5, 5, 6, 6, 7, 7
  const_dq1234:
    .quad 4, 4, 3, 3, 2, 2, 1, 1

  shufb_15_7:
    .byte  15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff
    .byte  0xff, 0xff, 0xff, 0xff, 0xff
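  # shufb_15_7 routes byte 15 of each 128-bit lane to byte 0 and byte 7 to
  # byte 8 (0xff entries zero everything else), lining up the top bits of
  # both qwords for the vpsrlvq step of the tweak multiplication.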

.text
___

} else {
    $code .= <<___;
    .text
    .globl  aesni_xts_128_encrypt_avx512
    .globl  aesni_xts_128_decrypt_avx512

    aesni_xts_128_encrypt_avx512:
    aesni_xts_128_decrypt_avx512:
    .byte   0x0f,0x0b    # ud2
    ret

    .globl  aesni_xts_256_encrypt_avx512
    .globl  aesni_xts_256_decrypt_avx512

    aesni_xts_256_encrypt_avx512:
    aesni_xts_256_decrypt_avx512:
    .byte   0x0f,0x0b    # ud2
    ret

    .globl  aesni_xts_avx512_eligible
    .type   aesni_xts_avx512_eligible,\@abi-omnipotent
    aesni_xts_avx512_eligible:
    xor	%eax,%eax
    ret
    .size   aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible

___
}

print $code;

close STDOUT or die "error closing STDOUT: $!";