// SPDX-License-Identifier: Apache-2.0
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
# function features so called "528B" variant utilizing additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's mystery [to me] why Core2 result is not same as for
#	Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter. ghash-x86.pl discusses that it makes lesser sense to
# increase aggregate factor. Then why increase here? Critical path
# consists of 3 independent pclmulqdq instructions, Karatsuba post-
# processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# issue rate for pclmulqdq is limited, it makes lesser sense to
# aggregate more multiplications than it takes to perform remaining
# non-multiplication operations. 2x is near-optimal coefficient for
# contemporary Intel CPUs (therefore modest improvement coefficient),
# but not for Bulldozer. Latter is because logical SIMD operations
# are twice as slow in comparison to Intel, so that critical path is
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)

# March 2013
#
# ... 8x aggregate factor AVX code path is using reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp.
we knew that 86# it performs in 0.41 cycles per byte on Haswell processor, in 87# 0.29 on Broadwell, and in 0.36 on Skylake. 88# 89# Knights Landing achieves 1.09 cpb. 90# 91# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest 92 93# Generated once from 94# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl 95# and modified for ICP. Modification are kept at a bare minimum to ease later 96# upstream merges. 97 98#if defined(__x86_64__) && defined(HAVE_AVX) && \ 99 defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) 100 101#define _ASM 102#include <sys/asm_linkage.h> 103 104.text 105 106/* Windows userland links with OpenSSL */ 107#if !defined (_WIN32) || defined (_KERNEL) 108ENTRY_ALIGN(gcm_gmult_clmul, 16) 109 110.cfi_startproc 111 ENDBR 112 113.L_gmult_clmul: 114 movdqu (%rdi),%xmm0 115 movdqa .Lbswap_mask(%rip),%xmm5 116 movdqu (%rsi),%xmm2 117 movdqu 32(%rsi),%xmm4 118.byte 102,15,56,0,197 119 movdqa %xmm0,%xmm1 120 pshufd $78,%xmm0,%xmm3 121 pxor %xmm0,%xmm3 122.byte 102,15,58,68,194,0 123.byte 102,15,58,68,202,17 124.byte 102,15,58,68,220,0 125 pxor %xmm0,%xmm3 126 pxor %xmm1,%xmm3 127 128 movdqa %xmm3,%xmm4 129 psrldq $8,%xmm3 130 pslldq $8,%xmm4 131 pxor %xmm3,%xmm1 132 pxor %xmm4,%xmm0 133 134 movdqa %xmm0,%xmm4 135 movdqa %xmm0,%xmm3 136 psllq $5,%xmm0 137 pxor %xmm0,%xmm3 138 psllq $1,%xmm0 139 pxor %xmm3,%xmm0 140 psllq $57,%xmm0 141 movdqa %xmm0,%xmm3 142 pslldq $8,%xmm0 143 psrldq $8,%xmm3 144 pxor %xmm4,%xmm0 145 pxor %xmm3,%xmm1 146 147 148 movdqa %xmm0,%xmm4 149 psrlq $1,%xmm0 150 pxor %xmm4,%xmm1 151 pxor %xmm0,%xmm4 152 psrlq $5,%xmm0 153 pxor %xmm4,%xmm0 154 psrlq $1,%xmm0 155 pxor %xmm1,%xmm0 156.byte 102,15,56,0,197 157 movdqu %xmm0,(%rdi) 158 RET 159.cfi_endproc 160SET_SIZE(gcm_gmult_clmul) 161#endif /* !_WIN32 || _KERNEL */ 162 163ENTRY_ALIGN(gcm_init_htab_avx, 32) 164.cfi_startproc 165 ENDBR 166 vzeroupper 167 168 vmovdqu (%rsi),%xmm2 169 // KCF/ICP stores H in network byte order with the hi 
qword first 170 // so we need to swap all bytes, not the 2 qwords. 171 vmovdqu .Lbswap_mask(%rip),%xmm4 172 vpshufb %xmm4,%xmm2,%xmm2 173 174 175 vpshufd $255,%xmm2,%xmm4 176 vpsrlq $63,%xmm2,%xmm3 177 vpsllq $1,%xmm2,%xmm2 178 vpxor %xmm5,%xmm5,%xmm5 179 vpcmpgtd %xmm4,%xmm5,%xmm5 180 vpslldq $8,%xmm3,%xmm3 181 vpor %xmm3,%xmm2,%xmm2 182 183 184 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 185 vpxor %xmm5,%xmm2,%xmm2 186 187 vpunpckhqdq %xmm2,%xmm2,%xmm6 188 vmovdqa %xmm2,%xmm0 189 vpxor %xmm2,%xmm6,%xmm6 190 movq $4,%r10 191 jmp .Linit_start_avx 192.balign 32 193.Linit_loop_avx: 194 vpalignr $8,%xmm3,%xmm4,%xmm5 195 vmovdqu %xmm5,-16(%rdi) 196 vpunpckhqdq %xmm0,%xmm0,%xmm3 197 vpxor %xmm0,%xmm3,%xmm3 198 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 199 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 200 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 201 vpxor %xmm0,%xmm1,%xmm4 202 vpxor %xmm4,%xmm3,%xmm3 203 204 vpslldq $8,%xmm3,%xmm4 205 vpsrldq $8,%xmm3,%xmm3 206 vpxor %xmm4,%xmm0,%xmm0 207 vpxor %xmm3,%xmm1,%xmm1 208 vpsllq $57,%xmm0,%xmm3 209 vpsllq $62,%xmm0,%xmm4 210 vpxor %xmm3,%xmm4,%xmm4 211 vpsllq $63,%xmm0,%xmm3 212 vpxor %xmm3,%xmm4,%xmm4 213 vpslldq $8,%xmm4,%xmm3 214 vpsrldq $8,%xmm4,%xmm4 215 vpxor %xmm3,%xmm0,%xmm0 216 vpxor %xmm4,%xmm1,%xmm1 217 218 vpsrlq $1,%xmm0,%xmm4 219 vpxor %xmm0,%xmm1,%xmm1 220 vpxor %xmm4,%xmm0,%xmm0 221 vpsrlq $5,%xmm4,%xmm4 222 vpxor %xmm4,%xmm0,%xmm0 223 vpsrlq $1,%xmm0,%xmm0 224 vpxor %xmm1,%xmm0,%xmm0 225.Linit_start_avx: 226 vmovdqa %xmm0,%xmm5 227 vpunpckhqdq %xmm0,%xmm0,%xmm3 228 vpxor %xmm0,%xmm3,%xmm3 229 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 230 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 231 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 232 vpxor %xmm0,%xmm1,%xmm4 233 vpxor %xmm4,%xmm3,%xmm3 234 235 vpslldq $8,%xmm3,%xmm4 236 vpsrldq $8,%xmm3,%xmm3 237 vpxor %xmm4,%xmm0,%xmm0 238 vpxor %xmm3,%xmm1,%xmm1 239 vpsllq $57,%xmm0,%xmm3 240 vpsllq $62,%xmm0,%xmm4 241 vpxor %xmm3,%xmm4,%xmm4 242 vpsllq $63,%xmm0,%xmm3 243 vpxor %xmm3,%xmm4,%xmm4 244 vpslldq $8,%xmm4,%xmm3 245 
vpsrldq $8,%xmm4,%xmm4 246 vpxor %xmm3,%xmm0,%xmm0 247 vpxor %xmm4,%xmm1,%xmm1 248 249 vpsrlq $1,%xmm0,%xmm4 250 vpxor %xmm0,%xmm1,%xmm1 251 vpxor %xmm4,%xmm0,%xmm0 252 vpsrlq $5,%xmm4,%xmm4 253 vpxor %xmm4,%xmm0,%xmm0 254 vpsrlq $1,%xmm0,%xmm0 255 vpxor %xmm1,%xmm0,%xmm0 256 vpshufd $78,%xmm5,%xmm3 257 vpshufd $78,%xmm0,%xmm4 258 vpxor %xmm5,%xmm3,%xmm3 259 vmovdqu %xmm5,0(%rdi) 260 vpxor %xmm0,%xmm4,%xmm4 261 vmovdqu %xmm0,16(%rdi) 262 leaq 48(%rdi),%rdi 263 subq $1,%r10 264 jnz .Linit_loop_avx 265 266 vpalignr $8,%xmm4,%xmm3,%xmm5 267 vmovdqu %xmm5,-16(%rdi) 268 269 vzeroupper 270 RET 271.cfi_endproc 272SET_SIZE(gcm_init_htab_avx) 273 274#if !defined (_WIN32) || defined (_KERNEL) 275ENTRY_ALIGN(gcm_gmult_avx, 32) 276.cfi_startproc 277 ENDBR 278 jmp .L_gmult_clmul 279.cfi_endproc 280SET_SIZE(gcm_gmult_avx) 281 282ENTRY_ALIGN(gcm_ghash_avx, 32) 283.cfi_startproc 284 ENDBR 285 vzeroupper 286 287 vmovdqu (%rdi),%xmm10 288 leaq .L0x1c2_polynomial(%rip),%r10 289 leaq 64(%rsi),%rsi 290 vmovdqu .Lbswap_mask(%rip),%xmm13 291 vpshufb %xmm13,%xmm10,%xmm10 292 cmpq $0x80,%rcx 293 jb .Lshort_avx 294 subq $0x80,%rcx 295 296 vmovdqu 112(%rdx),%xmm14 297 vmovdqu 0-64(%rsi),%xmm6 298 vpshufb %xmm13,%xmm14,%xmm14 299 vmovdqu 32-64(%rsi),%xmm7 300 301 vpunpckhqdq %xmm14,%xmm14,%xmm9 302 vmovdqu 96(%rdx),%xmm15 303 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 304 vpxor %xmm14,%xmm9,%xmm9 305 vpshufb %xmm13,%xmm15,%xmm15 306 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 307 vmovdqu 16-64(%rsi),%xmm6 308 vpunpckhqdq %xmm15,%xmm15,%xmm8 309 vmovdqu 80(%rdx),%xmm14 310 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 311 vpxor %xmm15,%xmm8,%xmm8 312 313 vpshufb %xmm13,%xmm14,%xmm14 314 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 315 vpunpckhqdq %xmm14,%xmm14,%xmm9 316 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 317 vmovdqu 48-64(%rsi),%xmm6 318 vpxor %xmm14,%xmm9,%xmm9 319 vmovdqu 64(%rdx),%xmm15 320 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 321 vmovdqu 80-64(%rsi),%xmm7 322 323 vpshufb %xmm13,%xmm15,%xmm15 324 vpxor %xmm0,%xmm3,%xmm3 325 
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 326 vpxor %xmm1,%xmm4,%xmm4 327 vpunpckhqdq %xmm15,%xmm15,%xmm8 328 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 329 vmovdqu 64-64(%rsi),%xmm6 330 vpxor %xmm2,%xmm5,%xmm5 331 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 332 vpxor %xmm15,%xmm8,%xmm8 333 334 vmovdqu 48(%rdx),%xmm14 335 vpxor %xmm3,%xmm0,%xmm0 336 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 337 vpxor %xmm4,%xmm1,%xmm1 338 vpshufb %xmm13,%xmm14,%xmm14 339 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 340 vmovdqu 96-64(%rsi),%xmm6 341 vpxor %xmm5,%xmm2,%xmm2 342 vpunpckhqdq %xmm14,%xmm14,%xmm9 343 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 344 vmovdqu 128-64(%rsi),%xmm7 345 vpxor %xmm14,%xmm9,%xmm9 346 347 vmovdqu 32(%rdx),%xmm15 348 vpxor %xmm0,%xmm3,%xmm3 349 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 350 vpxor %xmm1,%xmm4,%xmm4 351 vpshufb %xmm13,%xmm15,%xmm15 352 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 353 vmovdqu 112-64(%rsi),%xmm6 354 vpxor %xmm2,%xmm5,%xmm5 355 vpunpckhqdq %xmm15,%xmm15,%xmm8 356 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 357 vpxor %xmm15,%xmm8,%xmm8 358 359 vmovdqu 16(%rdx),%xmm14 360 vpxor %xmm3,%xmm0,%xmm0 361 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 362 vpxor %xmm4,%xmm1,%xmm1 363 vpshufb %xmm13,%xmm14,%xmm14 364 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 365 vmovdqu 144-64(%rsi),%xmm6 366 vpxor %xmm5,%xmm2,%xmm2 367 vpunpckhqdq %xmm14,%xmm14,%xmm9 368 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 369 vmovdqu 176-64(%rsi),%xmm7 370 vpxor %xmm14,%xmm9,%xmm9 371 372 vmovdqu (%rdx),%xmm15 373 vpxor %xmm0,%xmm3,%xmm3 374 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 375 vpxor %xmm1,%xmm4,%xmm4 376 vpshufb %xmm13,%xmm15,%xmm15 377 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 378 vmovdqu 160-64(%rsi),%xmm6 379 vpxor %xmm2,%xmm5,%xmm5 380 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 381 382 leaq 128(%rdx),%rdx 383 cmpq $0x80,%rcx 384 jb .Ltail_avx 385 386 vpxor %xmm10,%xmm15,%xmm15 387 subq $0x80,%rcx 388 jmp .Loop8x_avx 389 390.balign 32 391.Loop8x_avx: 392 vpunpckhqdq %xmm15,%xmm15,%xmm8 393 vmovdqu 112(%rdx),%xmm14 394 vpxor %xmm0,%xmm3,%xmm3 395 vpxor 
%xmm15,%xmm8,%xmm8 396 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 397 vpshufb %xmm13,%xmm14,%xmm14 398 vpxor %xmm1,%xmm4,%xmm4 399 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 400 vmovdqu 0-64(%rsi),%xmm6 401 vpunpckhqdq %xmm14,%xmm14,%xmm9 402 vpxor %xmm2,%xmm5,%xmm5 403 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 404 vmovdqu 32-64(%rsi),%xmm7 405 vpxor %xmm14,%xmm9,%xmm9 406 407 vmovdqu 96(%rdx),%xmm15 408 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 409 vpxor %xmm3,%xmm10,%xmm10 410 vpshufb %xmm13,%xmm15,%xmm15 411 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 412 vxorps %xmm4,%xmm11,%xmm11 413 vmovdqu 16-64(%rsi),%xmm6 414 vpunpckhqdq %xmm15,%xmm15,%xmm8 415 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 416 vpxor %xmm5,%xmm12,%xmm12 417 vxorps %xmm15,%xmm8,%xmm8 418 419 vmovdqu 80(%rdx),%xmm14 420 vpxor %xmm10,%xmm12,%xmm12 421 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 422 vpxor %xmm11,%xmm12,%xmm12 423 vpslldq $8,%xmm12,%xmm9 424 vpxor %xmm0,%xmm3,%xmm3 425 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 426 vpsrldq $8,%xmm12,%xmm12 427 vpxor %xmm9,%xmm10,%xmm10 428 vmovdqu 48-64(%rsi),%xmm6 429 vpshufb %xmm13,%xmm14,%xmm14 430 vxorps %xmm12,%xmm11,%xmm11 431 vpxor %xmm1,%xmm4,%xmm4 432 vpunpckhqdq %xmm14,%xmm14,%xmm9 433 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 434 vmovdqu 80-64(%rsi),%xmm7 435 vpxor %xmm14,%xmm9,%xmm9 436 vpxor %xmm2,%xmm5,%xmm5 437 438 vmovdqu 64(%rdx),%xmm15 439 vpalignr $8,%xmm10,%xmm10,%xmm12 440 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 441 vpshufb %xmm13,%xmm15,%xmm15 442 vpxor %xmm3,%xmm0,%xmm0 443 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 444 vmovdqu 64-64(%rsi),%xmm6 445 vpunpckhqdq %xmm15,%xmm15,%xmm8 446 vpxor %xmm4,%xmm1,%xmm1 447 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 448 vxorps %xmm15,%xmm8,%xmm8 449 vpxor %xmm5,%xmm2,%xmm2 450 451 vmovdqu 48(%rdx),%xmm14 452 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 453 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 454 vpshufb %xmm13,%xmm14,%xmm14 455 vpxor %xmm0,%xmm3,%xmm3 456 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 457 vmovdqu 96-64(%rsi),%xmm6 458 vpunpckhqdq %xmm14,%xmm14,%xmm9 459 vpxor 
%xmm1,%xmm4,%xmm4 460 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 461 vmovdqu 128-64(%rsi),%xmm7 462 vpxor %xmm14,%xmm9,%xmm9 463 vpxor %xmm2,%xmm5,%xmm5 464 465 vmovdqu 32(%rdx),%xmm15 466 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 467 vpshufb %xmm13,%xmm15,%xmm15 468 vpxor %xmm3,%xmm0,%xmm0 469 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 470 vmovdqu 112-64(%rsi),%xmm6 471 vpunpckhqdq %xmm15,%xmm15,%xmm8 472 vpxor %xmm4,%xmm1,%xmm1 473 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 474 vpxor %xmm15,%xmm8,%xmm8 475 vpxor %xmm5,%xmm2,%xmm2 476 vxorps %xmm12,%xmm10,%xmm10 477 478 vmovdqu 16(%rdx),%xmm14 479 vpalignr $8,%xmm10,%xmm10,%xmm12 480 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 481 vpshufb %xmm13,%xmm14,%xmm14 482 vpxor %xmm0,%xmm3,%xmm3 483 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 484 vmovdqu 144-64(%rsi),%xmm6 485 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 486 vxorps %xmm11,%xmm12,%xmm12 487 vpunpckhqdq %xmm14,%xmm14,%xmm9 488 vpxor %xmm1,%xmm4,%xmm4 489 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 490 vmovdqu 176-64(%rsi),%xmm7 491 vpxor %xmm14,%xmm9,%xmm9 492 vpxor %xmm2,%xmm5,%xmm5 493 494 vmovdqu (%rdx),%xmm15 495 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 496 vpshufb %xmm13,%xmm15,%xmm15 497 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 498 vmovdqu 160-64(%rsi),%xmm6 499 vpxor %xmm12,%xmm15,%xmm15 500 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 501 vpxor %xmm10,%xmm15,%xmm15 502 503 leaq 128(%rdx),%rdx 504 subq $0x80,%rcx 505 jnc .Loop8x_avx 506 507 addq $0x80,%rcx 508 jmp .Ltail_no_xor_avx 509 510.balign 32 511.Lshort_avx: 512 vmovdqu -16(%rdx,%rcx,1),%xmm14 513 leaq (%rdx,%rcx,1),%rdx 514 vmovdqu 0-64(%rsi),%xmm6 515 vmovdqu 32-64(%rsi),%xmm7 516 vpshufb %xmm13,%xmm14,%xmm15 517 518 vmovdqa %xmm0,%xmm3 519 vmovdqa %xmm1,%xmm4 520 vmovdqa %xmm2,%xmm5 521 subq $0x10,%rcx 522 jz .Ltail_avx 523 524 vpunpckhqdq %xmm15,%xmm15,%xmm8 525 vpxor %xmm0,%xmm3,%xmm3 526 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 527 vpxor %xmm15,%xmm8,%xmm8 528 vmovdqu -32(%rdx),%xmm14 529 vpxor %xmm1,%xmm4,%xmm4 530 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 531 vmovdqu 
16-64(%rsi),%xmm6 532 vpshufb %xmm13,%xmm14,%xmm15 533 vpxor %xmm2,%xmm5,%xmm5 534 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 535 vpsrldq $8,%xmm7,%xmm7 536 subq $0x10,%rcx 537 jz .Ltail_avx 538 539 vpunpckhqdq %xmm15,%xmm15,%xmm8 540 vpxor %xmm0,%xmm3,%xmm3 541 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 542 vpxor %xmm15,%xmm8,%xmm8 543 vmovdqu -48(%rdx),%xmm14 544 vpxor %xmm1,%xmm4,%xmm4 545 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 546 vmovdqu 48-64(%rsi),%xmm6 547 vpshufb %xmm13,%xmm14,%xmm15 548 vpxor %xmm2,%xmm5,%xmm5 549 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 550 vmovdqu 80-64(%rsi),%xmm7 551 subq $0x10,%rcx 552 jz .Ltail_avx 553 554 vpunpckhqdq %xmm15,%xmm15,%xmm8 555 vpxor %xmm0,%xmm3,%xmm3 556 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 557 vpxor %xmm15,%xmm8,%xmm8 558 vmovdqu -64(%rdx),%xmm14 559 vpxor %xmm1,%xmm4,%xmm4 560 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 561 vmovdqu 64-64(%rsi),%xmm6 562 vpshufb %xmm13,%xmm14,%xmm15 563 vpxor %xmm2,%xmm5,%xmm5 564 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 565 vpsrldq $8,%xmm7,%xmm7 566 subq $0x10,%rcx 567 jz .Ltail_avx 568 569 vpunpckhqdq %xmm15,%xmm15,%xmm8 570 vpxor %xmm0,%xmm3,%xmm3 571 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 572 vpxor %xmm15,%xmm8,%xmm8 573 vmovdqu -80(%rdx),%xmm14 574 vpxor %xmm1,%xmm4,%xmm4 575 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 576 vmovdqu 96-64(%rsi),%xmm6 577 vpshufb %xmm13,%xmm14,%xmm15 578 vpxor %xmm2,%xmm5,%xmm5 579 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 580 vmovdqu 128-64(%rsi),%xmm7 581 subq $0x10,%rcx 582 jz .Ltail_avx 583 584 vpunpckhqdq %xmm15,%xmm15,%xmm8 585 vpxor %xmm0,%xmm3,%xmm3 586 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 587 vpxor %xmm15,%xmm8,%xmm8 588 vmovdqu -96(%rdx),%xmm14 589 vpxor %xmm1,%xmm4,%xmm4 590 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 591 vmovdqu 112-64(%rsi),%xmm6 592 vpshufb %xmm13,%xmm14,%xmm15 593 vpxor %xmm2,%xmm5,%xmm5 594 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 595 vpsrldq $8,%xmm7,%xmm7 596 subq $0x10,%rcx 597 jz .Ltail_avx 598 599 vpunpckhqdq %xmm15,%xmm15,%xmm8 600 vpxor %xmm0,%xmm3,%xmm3 601 vpclmulqdq 
$0x00,%xmm6,%xmm15,%xmm0 602 vpxor %xmm15,%xmm8,%xmm8 603 vmovdqu -112(%rdx),%xmm14 604 vpxor %xmm1,%xmm4,%xmm4 605 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 606 vmovdqu 144-64(%rsi),%xmm6 607 vpshufb %xmm13,%xmm14,%xmm15 608 vpxor %xmm2,%xmm5,%xmm5 609 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 610 vmovq 184-64(%rsi),%xmm7 611 subq $0x10,%rcx 612 jmp .Ltail_avx 613 614.balign 32 615.Ltail_avx: 616 vpxor %xmm10,%xmm15,%xmm15 617.Ltail_no_xor_avx: 618 vpunpckhqdq %xmm15,%xmm15,%xmm8 619 vpxor %xmm0,%xmm3,%xmm3 620 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 621 vpxor %xmm15,%xmm8,%xmm8 622 vpxor %xmm1,%xmm4,%xmm4 623 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 624 vpxor %xmm2,%xmm5,%xmm5 625 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 626 627 vmovdqu (%r10),%xmm12 628 629 vpxor %xmm0,%xmm3,%xmm10 630 vpxor %xmm1,%xmm4,%xmm11 631 vpxor %xmm2,%xmm5,%xmm5 632 633 vpxor %xmm10,%xmm5,%xmm5 634 vpxor %xmm11,%xmm5,%xmm5 635 vpslldq $8,%xmm5,%xmm9 636 vpsrldq $8,%xmm5,%xmm5 637 vpxor %xmm9,%xmm10,%xmm10 638 vpxor %xmm5,%xmm11,%xmm11 639 640 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 641 vpalignr $8,%xmm10,%xmm10,%xmm10 642 vpxor %xmm9,%xmm10,%xmm10 643 644 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 645 vpalignr $8,%xmm10,%xmm10,%xmm10 646 vpxor %xmm11,%xmm10,%xmm10 647 vpxor %xmm9,%xmm10,%xmm10 648 649 cmpq $0,%rcx 650 jne .Lshort_avx 651 652 vpshufb %xmm13,%xmm10,%xmm10 653 vmovdqu %xmm10,(%rdi) 654 vzeroupper 655 RET 656.cfi_endproc 657SET_SIZE(gcm_ghash_avx) 658 659#endif /* !_WIN32 || _KERNEL */ 660 661SECTION_STATIC 662.balign 64 663.Lbswap_mask: 664.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 665.L0x1c2_polynomial: 666.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 667.L7_mask: 668.long 7,0,7,0 669.L7_mask_poly: 670.long 7,0,450,0 671.balign 64 672SET_OBJ(.Lrem_4bit) 673.Lrem_4bit: 674.long 0,0,0,471859200,0,943718400,0,610271232 675.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 676.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 677.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 
678SET_OBJ(.Lrem_8bit) 679.Lrem_8bit: 680.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E 681.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E 682.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E 683.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E 684.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E 685.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E 686.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E 687.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E 688.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE 689.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE 690.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE 691.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE 692.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E 693.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E 694.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE 695.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE 696.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E 697.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E 698.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E 699.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E 700.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E 701.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E 702.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E 703.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E 704.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE 705.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE 706.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE 707.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE 708.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E 709.value 
0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E 710.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE 711.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE 712 713.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 714.balign 64 715 716/* Mark the stack non-executable. */ 717#if defined(__linux__) && defined(__ELF__) 718.section .note.GNU-stack,"",%progbits 719#endif 720 721#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ 722