xref: /src/lib/libmd/amd64/sha1block.S (revision ec3242ed1906e77c9af2c54da636833a946c62b6)
/*-
 * Copyright (c) 2013 The Go Authors. All rights reserved.
 * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
 *
 * Adapted from Go's crypto/sha1/sha1block_amd64.s.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *   * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * SHA-1 block routine. See sha1c.c for C equivalent.
 *
 * There are 80 rounds of 4 types:
 *   - rounds 0-15 are type 1 and load data (round1 macro).
 *   - rounds 16-19 are type 1 and do not load data (round1x macro).
 *   - rounds 20-39 are type 2 and do not load data (round2 macro).
 *   - rounds 40-59 are type 3 and do not load data (round3 macro).
 *   - rounds 60-79 are type 4 and do not load data (round4 macro).
 *
 * Each round loads or shuffles the data, then computes a per-round
 * function of b, c, d, and then mixes the result into and rotates the
 * five registers a, b, c, d, e holding the intermediate results.
 *
 * The register rotation is implemented by rotating the arguments to
 * the round macros instead of by explicit move instructions.
 */
/*
 * load index
 *
 * Fetch 32-bit message word number \index from the input block at
 * (%rsi), byte-swap it, and store it in slot \index of the 16-word
 * schedule buffer at (%rsp).  The swapped word is also left in %r10d,
 * where the mix macro picks it up.
 *
 * Clobbers: %r10d.
 */
.macro	load		index
	mov		(\index)*4(%rsi), %r10d
	bswap		%r10d
	mov		%r10d, (\index)*4(%rsp)
.endm

/*
 * shuffle index
 *
 * Compute the message schedule word for round \index in place.  The
 * schedule lives in a 16-entry ring buffer at (%rsp) addressed modulo
 * 16, so slot (\index & 15) still holds w[index - 16] on entry:
 *
 *   w[i] = rol(w[i - 16] ^ w[i - 3] ^ w[i - 8] ^ w[i - 14], 1)
 *
 * The new word is written back to slot (\index & 15) and is also left
 * in %r10d, where the mix macro picks it up.
 *
 * Clobbers: %r10d, flags.
 */
.macro	shuffle		index
	mov		((\index   )&0xf)*4(%rsp), %r10d
	xor		((\index- 3)&0xf)*4(%rsp), %r10d
	xor		((\index- 8)&0xf)*4(%rsp), %r10d
	xor		((\index-14)&0xf)*4(%rsp), %r10d
	rol		$1, %r10d
	mov		%r10d, ((\index)&0xf)*4(%rsp)
.endm

/*
 * func1 a, b, c, d, e
 *
 * Round function for rounds 0-19:
 *   %r9d = ((c ^ d) & b) ^ d
 * i.e. each result bit is taken from c where b is set and from d where
 * b is clear.  \a and \e are accepted for a uniform argument list but
 * are not used.
 *
 * Clobbers: %r9d (result), flags.
 */
.macro	func1		a, b, c, d, e
	mov		\d, %r9d
	xor		\c, %r9d
	and		\b, %r9d
	xor		\d, %r9d
.endm

/*
 * func2 a, b, c, d, e
 *
 * Round function for rounds 20-39 (parity):
 *   %r9d = b ^ c ^ d
 * \a and \e are accepted for a uniform argument list but are not used.
 *
 * Clobbers: %r9d (result), flags.
 */
.macro	func2		a, b, c, d, e
	mov		\b, %r9d
	xor		\c, %r9d
	xor		\d, %r9d
.endm

/*
 * func3 a, b, c, d, e
 *
 * Round function for rounds 40-59 (bitwise majority of b, c, d):
 *   %r9d = (b & c) | ((b | c) & d)
 * \a and \e are accepted for a uniform argument list but are not used.
 *
 * Clobbers: %r8d (scratch), %r9d (result), flags.
 */
.macro	func3		a, b, c, d, e
	mov		\b, %r8d
	or		\c, %r8d
	and		\d, %r8d		// r8d = (b | c) & d
	mov		\b, %r9d
	and		\c, %r9d		// r9d = b & c
	or		%r8d, %r9d
.endm

/*
 * func4 a, b, c, d, e
 *
 * Round function for rounds 60-79.  It is the same parity function as
 * func2, so it simply expands to func2:
 *   %r9d = b ^ c ^ d
 *
 * Clobbers: %r9d (result), flags.
 */
.macro	func4		a, b, c, d, e
	func2		\a, \b, \c, \d, \e
.endm

/*
 * mix a, b, c, d, e, const
 *
 * Finish one round.  Expects the round function value in %r9d (from
 * one of the func macros) and the current message word in %r10d (from
 * load or shuffle):
 *
 *   b = rol(b, 30)
 *   e = e + %r9d + %r10d + const + rol(a, 5)
 *
 * The roles of the five registers are rotated by the caller passing
 * them in a different order to the next round.  \c and \d are accepted
 * for a uniform argument list but are not used.
 *
 * Clobbers: %r8d (scratch), flags.
 */
.macro	mix		a, b, c, d, e, const
	rol		$30, \b
	add		%r9d, \e
	mov		\a, %r8d
	rol		$5, %r8d
	lea		\const(\e, %r10d, 1), \e	// e += w + const
	add		%r8d, \e
.endm

/*
 * round1 a, b, c, d, e, index
 *
 * Rounds 0-15: read message word \index from the input block (load),
 * apply func1 and mix with the constant 0x5a827999.
 *
 * Clobbers: %r8d, %r9d, %r10d, flags.
 */
.macro	round1		a, b, c, d, e, index
	load		\index
	func1		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x5a827999
.endm

/*
 * round1x a, b, c, d, e, index
 *
 * Rounds 16-19: like round1, but the message word is derived from the
 * schedule buffer (shuffle) instead of being read from the input.
 *
 * Clobbers: %r8d, %r9d, %r10d, flags.
 */
.macro	round1x		a, b, c, d, e, index
	shuffle		\index
	func1		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x5a827999
.endm

/*
 * round2 a, b, c, d, e, index
 *
 * Rounds 20-39: shuffle the schedule, apply func2 and mix with the
 * constant 0x6ed9eba1.
 *
 * Clobbers: %r8d, %r9d, %r10d, flags.
 */
.macro	round2		a, b, c, d, e, index
	shuffle		\index
	func2		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x6ed9eba1
.endm

/*
 * round3 a, b, c, d, e, index
 *
 * Rounds 40-59: shuffle the schedule, apply func3 and mix with the
 * constant 0x8f1bbcdc.
 *
 * Clobbers: %r8d, %r9d, %r10d, flags.
 */
.macro	round3		a, b, c, d, e, index
	shuffle		\index
	func3		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x8f1bbcdc
.endm

/*
 * round4 a, b, c, d, e, index
 *
 * Rounds 60-79: shuffle the schedule, apply func4 and mix with the
 * constant 0xca62c1d6.
 *
 * Clobbers: %r8d, %r9d, %r10d, flags.
 */
.macro	round4		a, b, c, d, e, index
	shuffle		\index
	func4		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0xca62c1d6
.endm

/*
 * sha1block(SHA1_CTX, buf, len)
 *
 * Scalar SHA-1 compression function.  Folds every complete 64-byte
 * block of buf into the five 32-bit chaining words found at offsets
 * 0, 4, 8, 12 and 16 of the context.
 *
 * In:   %rdi = SHA1_CTX pointer
 *       %rsi = buf
 *       %rdx = len in bytes; it is rounded down to a multiple of 64,
 *              so a trailing partial block is ignored
 * Out:  the chaining words in the context are updated in place
 *
 * Register use inside the loop:
 *   %eax, %ebx, %ecx, %edx, %ebp	working variables a, b, c, d, e
 *   %r11d - %r15d			chaining value at start of block
 *   %r8d, %r9d, %r10d			scratch for the round macros
 *   %rsi, %rdi				current block, end of input
 *   0(%rsp) - 63(%rsp)			16-word message schedule ring
 *
 * All callee-saved registers that are used (%rbp, %rbx, %r12-%r15) are
 * saved on entry and restored before returning.
 */
ENTRY(_libmd_sha1block_scalar)
	push		%rbp
	push		%rbx
	push		%r12
	push		%r13
	push		%r14
	push		%r15
	push		%rdi			// rdi: SHA1_CTX, reloaded at .Lend
	sub		$64+8, %rsp		// 64 bytes for the message schedule
						// plus 8 bytes of padding
	// NOTE(review): the original comment calls the extra 8 bytes
	// "alignment".  Assuming the usual SysV entry state (rsp = 8 mod 16),
	// seven pushes and this 72-byte adjustment leave rsp at 8 mod 16, so
	// they do not provide 16-byte alignment.  Only 32-bit stack accesses
	// follow and nothing is called, so this looks harmless.

	mov		%rdi, %rbp
						// rsi: buf
	and		$~63, %rdx		// rdx: length in bytes, rounded
						// down to whole 64-byte blocks
	lea		(%rsi, %rdx, 1), %rdi	// rdi: end pointer
	mov		(%rbp), %eax		// c->h0
	mov		4(%rbp), %ebx		// c->h1
	mov		8(%rbp), %ecx		// c->h2
	mov		12(%rbp), %edx		// c->h3
	mov		16(%rbp), %ebp		// c->h4 (last use of rbp as pointer)

	cmp		%rsi, %rdi		// any data to process?
	je		.Lend

	// keep a copy of the chaining value to add back after 80 rounds
.Lloop:	mov		%eax, %r11d
	mov		%ebx, %r12d
	mov		%ecx, %r13d
	mov		%edx, %r14d
	mov		%ebp, %r15d

	round1		%eax, %ebx, %ecx, %edx, %ebp,  0
	round1		%ebp, %eax, %ebx, %ecx, %edx,  1
	round1		%edx, %ebp, %eax, %ebx, %ecx,  2
	round1		%ecx, %edx, %ebp, %eax, %ebx,  3
	round1		%ebx, %ecx, %edx, %ebp, %eax,  4

	round1		%eax, %ebx, %ecx, %edx, %ebp,  5
	round1		%ebp, %eax, %ebx, %ecx, %edx,  6
	round1		%edx, %ebp, %eax, %ebx, %ecx,  7
	round1		%ecx, %edx, %ebp, %eax, %ebx,  8
	round1		%ebx, %ecx, %edx, %ebp, %eax,  9

	round1		%eax, %ebx, %ecx, %edx, %ebp, 10
	round1		%ebp, %eax, %ebx, %ecx, %edx, 11
	round1		%edx, %ebp, %eax, %ebx, %ecx, 12
	round1		%ecx, %edx, %ebp, %eax, %ebx, 13
	round1		%ebx, %ecx, %edx, %ebp, %eax, 14

	round1		%eax, %ebx, %ecx, %edx, %ebp, 15
	round1x		%ebp, %eax, %ebx, %ecx, %edx, 16
	round1x		%edx, %ebp, %eax, %ebx, %ecx, 17
	round1x		%ecx, %edx, %ebp, %eax, %ebx, 18
	round1x		%ebx, %ecx, %edx, %ebp, %eax, 19

	round2		%eax, %ebx, %ecx, %edx, %ebp, 20
	round2		%ebp, %eax, %ebx, %ecx, %edx, 21
	round2		%edx, %ebp, %eax, %ebx, %ecx, 22
	round2		%ecx, %edx, %ebp, %eax, %ebx, 23
	round2		%ebx, %ecx, %edx, %ebp, %eax, 24

	round2		%eax, %ebx, %ecx, %edx, %ebp, 25
	round2		%ebp, %eax, %ebx, %ecx, %edx, 26
	round2		%edx, %ebp, %eax, %ebx, %ecx, 27
	round2		%ecx, %edx, %ebp, %eax, %ebx, 28
	round2		%ebx, %ecx, %edx, %ebp, %eax, 29

	round2		%eax, %ebx, %ecx, %edx, %ebp, 30
	round2		%ebp, %eax, %ebx, %ecx, %edx, 31
	round2		%edx, %ebp, %eax, %ebx, %ecx, 32
	round2		%ecx, %edx, %ebp, %eax, %ebx, 33
	round2		%ebx, %ecx, %edx, %ebp, %eax, 34

	round2		%eax, %ebx, %ecx, %edx, %ebp, 35
	round2		%ebp, %eax, %ebx, %ecx, %edx, 36
	round2		%edx, %ebp, %eax, %ebx, %ecx, 37
	round2		%ecx, %edx, %ebp, %eax, %ebx, 38
	round2		%ebx, %ecx, %edx, %ebp, %eax, 39

	round3		%eax, %ebx, %ecx, %edx, %ebp, 40
	round3		%ebp, %eax, %ebx, %ecx, %edx, 41
	round3		%edx, %ebp, %eax, %ebx, %ecx, 42
	round3		%ecx, %edx, %ebp, %eax, %ebx, 43
	round3		%ebx, %ecx, %edx, %ebp, %eax, 44

	round3		%eax, %ebx, %ecx, %edx, %ebp, 45
	round3		%ebp, %eax, %ebx, %ecx, %edx, 46
	round3		%edx, %ebp, %eax, %ebx, %ecx, 47
	round3		%ecx, %edx, %ebp, %eax, %ebx, 48
	round3		%ebx, %ecx, %edx, %ebp, %eax, 49

	round3		%eax, %ebx, %ecx, %edx, %ebp, 50
	round3		%ebp, %eax, %ebx, %ecx, %edx, 51
	round3		%edx, %ebp, %eax, %ebx, %ecx, 52
	round3		%ecx, %edx, %ebp, %eax, %ebx, 53
	round3		%ebx, %ecx, %edx, %ebp, %eax, 54

	round3		%eax, %ebx, %ecx, %edx, %ebp, 55
	round3		%ebp, %eax, %ebx, %ecx, %edx, 56
	round3		%edx, %ebp, %eax, %ebx, %ecx, 57
	round3		%ecx, %edx, %ebp, %eax, %ebx, 58
	round3		%ebx, %ecx, %edx, %ebp, %eax, 59

	round4		%eax, %ebx, %ecx, %edx, %ebp, 60
	round4		%ebp, %eax, %ebx, %ecx, %edx, 61
	round4		%edx, %ebp, %eax, %ebx, %ecx, 62
	round4		%ecx, %edx, %ebp, %eax, %ebx, 63
	round4		%ebx, %ecx, %edx, %ebp, %eax, 64

	round4		%eax, %ebx, %ecx, %edx, %ebp, 65
	round4		%ebp, %eax, %ebx, %ecx, %edx, 66
	round4		%edx, %ebp, %eax, %ebx, %ecx, 67
	round4		%ecx, %edx, %ebp, %eax, %ebx, 68
	round4		%ebx, %ecx, %edx, %ebp, %eax, 69

	round4		%eax, %ebx, %ecx, %edx, %ebp, 70
	round4		%ebp, %eax, %ebx, %ecx, %edx, 71
	round4		%edx, %ebp, %eax, %ebx, %ecx, 72
	round4		%ecx, %edx, %ebp, %eax, %ebx, 73
	round4		%ebx, %ecx, %edx, %ebp, %eax, 74

	round4		%eax, %ebx, %ecx, %edx, %ebp, 75
	round4		%ebp, %eax, %ebx, %ecx, %edx, 76
	round4		%edx, %ebp, %eax, %ebx, %ecx, 77
	round4		%ecx, %edx, %ebp, %eax, %ebx, 78
	round4		%ebx, %ecx, %edx, %ebp, %eax, 79

	// add the saved chaining value back in
	add		%r11d, %eax
	add		%r12d, %ebx
	add		%r13d, %ecx
	add		%r14d, %edx
	add		%r15d, %ebp

	add		$64, %rsi		// advance to the next block
	cmp		%rdi, %rsi
	jb		.Lloop

.Lend:	add		$64+8, %rsp
	pop		%rdi			// SHA1_CTX
	mov		%eax, (%rdi)
	mov		%ebx, 4(%rdi)
	mov		%ecx, 8(%rdi)
	mov		%edx, 12(%rdi)
	mov		%ebp, 16(%rdi)

	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbx
	pop		%rbp
	ret
END(_libmd_sha1block_scalar)

/*
 * This is the implementation using AVX2, BMI1 and BMI2. It is based on:
 * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
 * From http://software.intel.com/en-us/articles
 * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
 * This implementation is 2x unrolled, and interleaves vector instructions,
 * used to precompute W, with scalar computation of current round
 * for optimal scheduling.
 */

2978b4684afSRobert Clausecker	/* trivial helper macros */
2988b4684afSRobert Clausecker.macro	update_hash	a, tb, c, d, e
2998b4684afSRobert Clausecker	add		(%r9), \a
3008b4684afSRobert Clausecker	mov		\a, (%r9)
3018b4684afSRobert Clausecker	add		4(%r9), \tb
3028b4684afSRobert Clausecker	mov		\tb, 4(%r9)
3038b4684afSRobert Clausecker	add		8(%r9), \c
3048b4684afSRobert Clausecker	mov		\c, 8(%r9)
3058b4684afSRobert Clausecker	add		12(%r9), \d
3068b4684afSRobert Clausecker	mov		\d, 12(%r9)
3078b4684afSRobert Clausecker	add		16(%r9), \e
3088b4684afSRobert Clausecker	mov		\e, 16(%r9)
3098b4684afSRobert Clausecker.endm
3108b4684afSRobert Clausecker
3118b4684afSRobert Clausecker	/* help macros for recalc, which does precomputations */
3128b4684afSRobert Clausecker.macro	precalc0	offset
3138b4684afSRobert Clausecker	vmovdqu		\offset(%r10), %xmm0
3148b4684afSRobert Clausecker.endm
3158b4684afSRobert Clausecker
3168b4684afSRobert Clausecker.macro	precalc1	offset
3178b4684afSRobert Clausecker	vinserti128	$1, \offset(%r13), %ymm0, %ymm0
3188b4684afSRobert Clausecker.endm
3198b4684afSRobert Clausecker
3208b4684afSRobert Clausecker.macro	precalc2	yreg
3218b4684afSRobert Clausecker	vpshufb		%ymm10, %ymm0, \yreg
3228b4684afSRobert Clausecker.endm
3238b4684afSRobert Clausecker
3248b4684afSRobert Clausecker.macro	precalc4	yreg, k_offset
3258b4684afSRobert Clausecker	vpaddd		\k_offset(%r8), \yreg, %ymm0
3268b4684afSRobert Clausecker.endm
3278b4684afSRobert Clausecker
3288b4684afSRobert Clausecker.macro	precalc7	offset
3298b4684afSRobert Clausecker	vmovdqu		%ymm0, (\offset)*2(%r14)
3308b4684afSRobert Clausecker.endm
3318b4684afSRobert Clausecker
/*
 * Message scheduling pre-compute for rounds 0-15
 * r13      is a pointer to the even 64-byte block
 * r10      is a pointer to the odd 64-byte block
 * r14      is a pointer to the temp buffer
 * ymm0     is used as a temp register
 * yreg     is clobbered as part of the computation
 * offset   chooses a 16 byte chunk within a block
 * r8       is a pointer to the constants block
 * k_offset chooses K constants relevant to this round
 * ymm10    holds the swap mask
 *
 * The chunk from the block at r10 ends up in the low 128-bit lane and
 * the chunk from the block at r13 in the high lane.  On exit \yreg
 * holds the byte-shuffled message words and the 32 bytes at
 * 2*\offset(%r14) hold those words plus the constants at 0(%r8).
 */
.macro	precalc00_15	offset, yreg
	precalc0	\offset
	precalc1	\offset
	precalc2	\yreg
	precalc4	\yreg, 0
	precalc7	\offset
.endm

3528b4684afSRobert Clausecker	/* helper macros for precalc16_31 */
3538b4684afSRobert Clausecker.macro	precalc16	reg_sub16, reg_sub12, reg_sub4, reg
3548b4684afSRobert Clausecker	vpalignr	$8, \reg_sub16, \reg_sub12, \reg	// w[i - 14]
3558b4684afSRobert Clausecker	vpsrldq		$4, \reg_sub4, %ymm0			// w[i -  3]
3568b4684afSRobert Clausecker.endm
3578b4684afSRobert Clausecker
3588b4684afSRobert Clausecker.macro	precalc17	reg_sub16, reg_sub8, reg
3598b4684afSRobert Clausecker	vpxor		\reg_sub8, \reg, \reg
3608b4684afSRobert Clausecker	vpxor		\reg_sub16, %ymm0, %ymm0
3618b4684afSRobert Clausecker.endm
3628b4684afSRobert Clausecker
3638b4684afSRobert Clausecker.macro	precalc18	reg
3648b4684afSRobert Clausecker	vpxor		%ymm0, \reg, \reg
3658b4684afSRobert Clausecker	vpslldq		$12, \reg, %ymm9
3668b4684afSRobert Clausecker.endm
3678b4684afSRobert Clausecker
3688b4684afSRobert Clausecker.macro	precalc19	reg
3698b4684afSRobert Clausecker	vpslld		$1, \reg, %ymm0
3708b4684afSRobert Clausecker	vpsrld		$31, \reg, \reg
3718b4684afSRobert Clausecker	.endm
3728b4684afSRobert Clausecker
3738b4684afSRobert Clausecker.macro	precalc20	reg
3748b4684afSRobert Clausecker	vpor		\reg, %ymm0, %ymm0
3758b4684afSRobert Clausecker	vpslld		$2, %ymm9, \reg
3768b4684afSRobert Clausecker.endm
3778b4684afSRobert Clausecker
3788b4684afSRobert Clausecker.macro	precalc21	reg
3798b4684afSRobert Clausecker	vpsrld		$30, %ymm9, %ymm9
3808b4684afSRobert Clausecker	vpxor		\reg, %ymm0, %ymm0
3818b4684afSRobert Clausecker.endm
3828b4684afSRobert Clausecker
3838b4684afSRobert Clausecker.macro	precalc23	reg, k_offset, offset
3848b4684afSRobert Clausecker	vpxor		%ymm9, %ymm0, \reg
3858b4684afSRobert Clausecker	vpaddd		\k_offset(%r8), \reg, %ymm0
3868b4684afSRobert Clausecker	vmovdqu		%ymm0, (\offset)(%r14)
3878b4684afSRobert Clausecker.endm
3888b4684afSRobert Clausecker
/*
 * Message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction.
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency.
 * takes 5 ymm register arguments: REG receives the four new w[i]
 * values (per lane), the REG_SUB* inputs are only read
 * uses ymm0 and ymm9 as temp registers
 * As always, r8 is a pointer to constants block
 * and r14 is a pointer to temp buffer
 */
.macro	precalc16_31	reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
	precalc16	\reg_sub16, \reg_sub12, \reg_sub4, \reg
	precalc17	\reg_sub16, \reg_sub8, \reg
	precalc18	\reg
	precalc19	\reg
	precalc20	\reg
	precalc21	\reg
	precalc23	\reg, \k_offset, \offset
.endm

4118b4684afSRobert Clausecker	/* helper macros for precalc_32_79 */
4128b4684afSRobert Clausecker.macro	precalc32	reg_sub8, reg_sub4
4138b4684afSRobert Clausecker	vpalignr	$8, \reg_sub8, \reg_sub4, %ymm0
4148b4684afSRobert Clausecker.endm
4158b4684afSRobert Clausecker
4168b4684afSRobert Clausecker.macro	precalc33	reg_sub28, reg
4178b4684afSRobert Clausecker	vpxor		\reg_sub28, \reg, \reg
4188b4684afSRobert Clausecker.endm
4198b4684afSRobert Clausecker
4208b4684afSRobert Clausecker.macro	precalc34	reg_sub16
4218b4684afSRobert Clausecker	vpxor		\reg_sub16, %ymm0, %ymm0
4228b4684afSRobert Clausecker.endm
4238b4684afSRobert Clausecker
4248b4684afSRobert Clausecker.macro	precalc35	reg
4258b4684afSRobert Clausecker	vpxor		%ymm0, \reg, \reg
4268b4684afSRobert Clausecker.endm
4278b4684afSRobert Clausecker
4288b4684afSRobert Clausecker.macro	precalc36	reg
4298b4684afSRobert Clausecker	vpslld		$2, \reg, %ymm0
4308b4684afSRobert Clausecker.endm
4318b4684afSRobert Clausecker
4328b4684afSRobert Clausecker.macro	precalc37	reg
4338b4684afSRobert Clausecker	vpsrld		$30, \reg, \reg
4348b4684afSRobert Clausecker	vpor		\reg, %ymm0, \reg
4358b4684afSRobert Clausecker.endm
4368b4684afSRobert Clausecker
4378b4684afSRobert Clausecker.macro	precalc39	reg, k_offset, offset
4388b4684afSRobert Clausecker	vpaddd		\k_offset(%r8), \reg, %ymm0
4398b4684afSRobert Clausecker	vmovdqu		%ymm0, \offset(%r14)
4408b4684afSRobert Clausecker.endm
4418b4684afSRobert Clausecker
/*
 * Message scheduling pre-compute for rounds 32-79.
 *
 * Computes four schedule words per 128-bit lane at once as
 *   w[i] = rol(w[i-32] ^ w[i-28] ^ w[i-16] ^ w[i-6], 2)
 * None of the inputs lies within the four words being produced, so no
 * fix-up of the top word is needed here.
 *
 * reg       holds w[i-32 ..] on entry and the new w[i ..] on exit
 * reg_sub4, reg_sub8, reg_sub16, reg_sub28
 *           hold the words 4, 8, 16 and 28 rounds back; only read
 * k_offset  chooses the K constants at k_offset(%r8)
 * offset    byte offset into the temp buffer at r14 where K + w[i]
 *           is stored
 * ymm0      is used as a temp register
 */
.macro	precalc32_79	reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
	precalc32	\reg_sub8, \reg_sub4
	precalc33	\reg_sub28, \reg
	precalc34	\reg_sub16
	precalc35	\reg
	precalc36	\reg
	precalc37	\reg
	precalc39	\reg, \k_offset, \offset
.endm

/*
 * Precompute K + w[i] for all 80 rounds of the two blocks at r10 and
 * r13 and store the results to the temp buffer at r14.
 *
 * Twenty 32-byte vectors are written, at offsets 0x000 through 0x260,
 * so the buffer must provide 0x280 bytes.  Each vector covers four
 * consecutive rounds, with one block in each 128-bit lane.  The K
 * offset advances by 0x20 every five vectors, i.e. every 20 rounds.
 *
 * The eight registers ymm15, ymm14, ymm13, ymm12, ymm8, ymm7, ymm5 and
 * ymm3 are used in rotation to hold the most recent schedule words;
 * ymm0 and ymm9 are clobbered as temporaries.
 */
.macro	precalc
	precalc00_15	0x00, %ymm15
	precalc00_15	0x10, %ymm14
	precalc00_15	0x20, %ymm13
	precalc00_15	0x30, %ymm12
	precalc16_31	%ymm8,  %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
	precalc16_31	%ymm7,  %ymm8,  %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
	precalc16_31	%ymm5,  %ymm7,  %ymm8,  %ymm12, %ymm13, 0x20, 0x0c0
	precalc16_31	%ymm3,  %ymm5,  %ymm7,  %ymm8,  %ymm12, 0x20, 0x0e0
	precalc32_79	%ymm15, %ymm3,  %ymm5,  %ymm8,  %ymm14, 0x20, 0x100
	precalc32_79	%ymm14, %ymm15, %ymm3,  %ymm7,  %ymm13, 0x20, 0x120
	precalc32_79	%ymm13, %ymm14, %ymm15, %ymm5,  %ymm12, 0x40, 0x140
	precalc32_79	%ymm12, %ymm13, %ymm14, %ymm3,  %ymm8,  0x40, 0x160
	precalc32_79	%ymm8,  %ymm12, %ymm13, %ymm15, %ymm7,  0x40, 0x180
	precalc32_79	%ymm7,  %ymm8,  %ymm12, %ymm14, %ymm5,  0x40, 0x1a0
	precalc32_79	%ymm5,  %ymm7,  %ymm8,  %ymm13, %ymm3,  0x40, 0x1c0
	precalc32_79	%ymm3,  %ymm5,  %ymm7,  %ymm12, %ymm15, 0x60, 0x1e0
	precalc32_79	%ymm15, %ymm3,  %ymm5,  %ymm8,  %ymm14, 0x60, 0x200
	precalc32_79	%ymm14, %ymm15, %ymm3,  %ymm7,  %ymm13, 0x60, 0x220
	precalc32_79	%ymm13, %ymm14, %ymm15, %ymm5,  %ymm12, 0x60, 0x240
	precalc32_79	%ymm12, %ymm13, %ymm14, %ymm3,  %ymm8,  0x60, 0x260
.endm

/*
 * Macros calculating individual rounds have general form
 * calc_round_pre + precalc_round + calc_round_post
 * calc_round_{pre,post} macros follow
 */

/*
 * calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e
 *
 * First half of a round of the first type:
 *   reg_e += the precomputed K + w word at \offset(%r15)
 *   %ebp   = ~reg_a & reg_c	(one half of F for the next round)
 *   reg_e += reg_b		(reg_b holds F from the previous round)
 *   %r12d  = rol(reg_a, 5)	(rorx by 0x1b = 27 bits)
 *   reg_b  = ror(reg_a, 2)	(same as rol by 30, for the next round)
 *
 * %ebp and %r12d are consumed by the matching calc_f1_post.
 * Clobbers: %ebp, %r12d, flags.
 */
.macro	calc_f1_pre	offset, reg_a, reg_b, reg_c, reg_e
	add		\offset(%r15), \reg_e
	andn		\reg_c, \reg_a, %ebp
	add		\reg_b, \reg_e			// add F from the previous round
	rorx		$0x1b, \reg_a, %r12d
	rorx		$2, \reg_a, \reg_b		// for the next round
.endm

/*
 * Calculate F for the next round
 *
 * calc_f1_post reg_a, reg_b, reg_e
 *
 * Second half of a round of the first type.  Uses %ebp and %r12d as
 * left behind by the preceding calc_f1_pre:
 *   reg_a  = (reg_a & reg_b) ^ %ebp	(F for the next round)
 *   reg_e += %r12d			(the rotated a of this round)
 *
 * Clobbers: flags.
 */
.macro	calc_f1_post	reg_a, reg_b, reg_e
	and		\reg_b, \reg_a			// b & c
	xor		%ebp, \reg_a			// F1 = (b&c) ^ (~b&d)
	add		%r12d, \reg_e
.endm

/*
 * Registers are cyclically rotated:
 * edx -> eax -> edi -> esi -> ebx -> ecx
 *
 * calc0 - calc7: the first eight rounds.  Each round reads its
 * precomputed K + w word from the low 16 bytes of a 32-byte slot at
 * (%r15): offsets 0x0 - 0xc, then 0x20 - 0x2c.
 *
 * Between the two halves of a round, one step of the rounds 0-15
 * precompute is slotted in: load 0x80 bytes past r10 and r13 (calc0,
 * calc1), byte-shuffle into ymm15 (calc2), add the constants at 0(%r8)
 * (calc4), store to 0(%r14) (calc7).
 * NOTE(review): offset 0x80 presumably addresses the next pair of
 * blocks - confirm against the caller.
 */
.macro	calc0
	mov		%esi, %ebx			// precalculate first round
	rorx		$2, %esi, %esi
	andn		%eax, %ebx, %ebp		// ebp = ~b & d
	and		%edi, %ebx			// ebx = b & c
	xor		%ebp, %ebx			// ebx = F for this round
	calc_f1_pre	0x0, %ecx, %ebx, %edi, %edx
	precalc0	0x80
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro	calc1
	calc_f1_pre	0x4, %edx, %ecx, %esi, %eax
	precalc1	0x80
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro	calc2
	calc_f1_pre	0x8, %eax, %edx, %ebx, %edi
	precalc2	%ymm15
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro	calc3
	calc_f1_pre	0xc, %edi, %eax, %ecx, %esi
	calc_f1_post	%edi, %edx, %esi
.endm

.macro	calc4
	calc_f1_pre	0x20, %esi, %edi, %edx, %ebx
	precalc4	%ymm15, 0x0
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro	calc5
	calc_f1_pre	0x24, %ebx, %esi, %eax, %ecx
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro	calc6
	calc_f1_pre	0x28, %ecx, %ebx, %edi, %edx
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro	calc7
	calc_f1_pre	0x2c, %edx, %ecx, %esi, %eax
	precalc7	0x0
	calc_f1_post	%edx, %ebx, %eax
.endm

5518b4684afSRobert Clausecker.macro	calc8
5528b4684afSRobert Clausecker	calc_f1_pre	0x40, %eax, %edx, %ebx, %edi
5538b4684afSRobert Clausecker	precalc0	0x90
5548b4684afSRobert Clausecker	calc_f1_post	%eax, %ecx, %edi
5558b4684afSRobert Clausecker.endm
5568b4684afSRobert Clausecker
5578b4684afSRobert Clausecker.macro	calc9
5588b4684afSRobert Clausecker	calc_f1_pre	0x44, %edi, %eax, %ecx, %esi
5598b4684afSRobert Clausecker	precalc1	0x90
5608b4684afSRobert Clausecker	calc_f1_post	%edi, %edx, %esi
5618b4684afSRobert Clausecker.endm
5628b4684afSRobert Clausecker
5638b4684afSRobert Clausecker.macro	calc10
5648b4684afSRobert Clausecker	calc_f1_pre	0x48, %esi, %edi, %edx, %ebx
5658b4684afSRobert Clausecker	precalc2	%ymm14
5668b4684afSRobert Clausecker	calc_f1_post	%esi, %eax, %ebx
5678b4684afSRobert Clausecker.endm
5688b4684afSRobert Clausecker
/* calc11: f1-type step for slot offset 0x4c; nothing is interleaved. */
.macro	calc11
	calc_f1_pre	0x4c, %ebx, %esi, %eax, %ecx
	calc_f1_post	%ebx, %edi, %ecx
.endm
5738b4684afSRobert Clausecker
/*
 * calc12: f1-type step for slot offset 0x60, with precalc4 interleaved.
 * The second precalc4 argument is spelled 0x0 like the other precalc4
 * call sites (same value as the former plain 0).
 */
.macro	calc12
	calc_f1_pre	0x60, %ecx, %ebx, %edi, %edx
	precalc4	%ymm14, 0x0
	calc_f1_post	%ecx, %esi, %edx
.endm
5798b4684afSRobert Clausecker
/* calc13: f1-type step for slot offset 0x64; nothing is interleaved. */
.macro	calc13
	calc_f1_pre	0x64, %edx, %ecx, %esi, %eax
	calc_f1_post	%edx, %ebx, %eax
.endm
5848b4684afSRobert Clausecker
/* calc14: f1-type step for slot offset 0x68; nothing is interleaved. */
.macro	calc14
	calc_f1_pre	0x68, %eax, %edx, %ebx, %edi
	calc_f1_post	%eax, %ecx, %edi
.endm
5898b4684afSRobert Clausecker
/* calc15: f1-type step for slot offset 0x6c, with precalc7 interleaved. */
.macro	calc15
	calc_f1_pre	0x6c, %edi, %eax, %ecx, %esi
	precalc7	0x10
	calc_f1_post	%edi, %edx, %esi
.endm
5958b4684afSRobert Clausecker
/* calc16: f1-type step for slot offset 0x80, with precalc0 interleaved. */
.macro	calc16
	calc_f1_pre	0x80, %esi, %edi, %edx, %ebx
	precalc0	0xa0
	calc_f1_post	%esi, %eax, %ebx
.endm
6018b4684afSRobert Clausecker
/* calc17: f1-type step for slot offset 0x84, with precalc1 interleaved. */
.macro	calc17
	calc_f1_pre	0x84, %ebx, %esi, %eax, %ecx
	precalc1	0xa0
	calc_f1_post	%ebx, %edi, %ecx
.endm
6078b4684afSRobert Clausecker
/* calc18: f1-type step for slot offset 0x88, with precalc2 interleaved. */
.macro	calc18
	calc_f1_pre	0x88, %ecx, %ebx, %edi, %edx
	precalc2	%ymm13
	calc_f1_post	%ecx, %esi, %edx
.endm
6138b4684afSRobert Clausecker
/*
 * calc_f2_pre offset, reg_a, reg_b, reg_e
 *
 * First half of a step whose F value was computed by the preceding step
 * and arrives in reg_b:
 *
 *   reg_e += 32-bit word at offset(%r15)
 *   reg_e += reg_b
 *   %r12d  = reg_a rotated right by 27 (equivalently, left by 5)
 *   reg_b  = reg_a rotated right by 2
 *
 * reg_a itself is left untouched.  Clobbers %r12d and, through the adds,
 * the flags.  In this file it is always paired with calc_f2_post, which
 * folds %r12d into reg_e.
 */
.macro	calc_f2_pre	offset, reg_a, reg_b, reg_e
	add		\offset(%r15), \reg_e
	add		\reg_b, \reg_e			// add F from the previous round
	rorx		$0x1b, \reg_a, %r12d
	rorx		$2, \reg_a, \reg_b		// for next round
.endm
6208b4684afSRobert Clausecker
/*
 * calc_f2_post reg_a, reg_b, reg_c, reg_e
 *
 * Second half of an f2-type step:
 *
 *   reg_a  = reg_a ^ reg_b ^ reg_c	(the value the next step adds as F)
 *   reg_e += %r12d			(rotation saved by calc_f2_pre)
 *
 * Expects %r12d as left by calc_f2_pre.  The instruction order is kept
 * exactly as found.
 */
.macro	calc_f2_post	reg_a, reg_b, reg_c, reg_e
	xor		\reg_b, \reg_a
	add		%r12d, \reg_e
	xor		\reg_c, \reg_a
.endm
6268b4684afSRobert Clausecker
/*
 * calc19: %eax += 0x8c(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.
 */
.macro	calc19
	calc_f2_pre	0x8c, %edx, %ecx, %eax
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
6318b4684afSRobert Clausecker
/*
 * calc20: %edi += 0xa0(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc4 is interleaved.
 */
.macro	calc20
	calc_f2_pre	0xa0, %eax, %edx, %edi
	precalc4	%ymm13, 0x0
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
6378b4684afSRobert Clausecker
/*
 * calc21: %esi += 0xa4(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.
 */
.macro	calc21
	calc_f2_pre	0xa4, %edi, %eax, %esi
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
6428b4684afSRobert Clausecker
/*
 * calc22: %ebx += 0xa8(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.
 */
.macro	calc22
	calc_f2_pre	0xa8, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
6478b4684afSRobert Clausecker
/*
 * calc23: %ecx += 0xac(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc7 is interleaved.
 */
.macro	calc23
	calc_f2_pre	0xac, %ebx, %esi, %ecx
	precalc7	0x20
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
6538b4684afSRobert Clausecker
/*
 * calc24: %edx += 0xc0(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc0 is interleaved.
 */
.macro	calc24
	calc_f2_pre	0xc0, %ecx, %ebx, %edx
	precalc0	0xb0
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
6598b4684afSRobert Clausecker
/*
 * calc25: %eax += 0xc4(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc1 is interleaved.
 */
.macro	calc25
	calc_f2_pre	0xc4, %edx, %ecx, %eax
	precalc1	0xb0
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
6658b4684afSRobert Clausecker
/*
 * calc26: %edi += 0xc8(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc2 is interleaved.
 */
.macro	calc26
	calc_f2_pre	0xc8, %eax, %edx, %edi
	precalc2	%ymm12
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
6718b4684afSRobert Clausecker
/*
 * calc27: %esi += 0xcc(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.
 */
.macro	calc27
	calc_f2_pre	0xcc, %edi, %eax, %esi
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
6768b4684afSRobert Clausecker
/*
 * calc28: %ebx += 0xe0(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc4 is interleaved.
 */
.macro	calc28
	calc_f2_pre	0xe0, %esi, %edi, %ebx
	precalc4	%ymm12, 0x0
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
6828b4684afSRobert Clausecker
/*
 * calc29: %ecx += 0xe4(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.
 */
.macro	calc29
	calc_f2_pre	0xe4, %ebx, %esi, %ecx
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
6878b4684afSRobert Clausecker
/*
 * calc30: %edx += 0xe8(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.
 */
.macro	calc30
	calc_f2_pre	0xe8, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
6928b4684afSRobert Clausecker
/*
 * calc31: %eax += 0xec(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc7 is interleaved.
 */
.macro	calc31
	calc_f2_pre	0xec, %edx, %ecx, %eax
	precalc7	0x30
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
6988b4684afSRobert Clausecker
/*
 * calc32: %edi += 0x100(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc16 is interleaved.
 */
.macro	calc32
	calc_f2_pre	0x100, %eax, %edx, %edi
	precalc16	%ymm15, %ymm14, %ymm12, %ymm8
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
7048b4684afSRobert Clausecker
/*
 * calc33: %esi += 0x104(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc17 is interleaved.
 */
.macro	calc33
	calc_f2_pre	0x104, %edi, %eax, %esi
	precalc17	%ymm15, %ymm13, %ymm8
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
7108b4684afSRobert Clausecker
/*
 * calc34: %ebx += 0x108(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc18 is interleaved.
 */
.macro	calc34
	calc_f2_pre	0x108, %esi, %edi, %ebx
	precalc18	%ymm8
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
7168b4684afSRobert Clausecker
/*
 * calc35: %ecx += 0x10c(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc19 is interleaved.
 */
.macro	calc35
	calc_f2_pre	0x10c, %ebx, %esi, %ecx
	precalc19	%ymm8
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
7228b4684afSRobert Clausecker
/*
 * calc36: %edx += 0x120(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc20 is interleaved.
 */
.macro	calc36
	calc_f2_pre	0x120, %ecx, %ebx, %edx
	precalc20	%ymm8
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
7288b4684afSRobert Clausecker
/*
 * calc37: %eax += 0x124(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc21 is interleaved.
 */
.macro	calc37
	calc_f2_pre	0x124, %edx, %ecx, %eax
	precalc21	%ymm8
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
7348b4684afSRobert Clausecker
/*
 * calc38: %edi += 0x128(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.
 */
.macro	calc38
	calc_f2_pre	0x128, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
7398b4684afSRobert Clausecker
/*
 * calc_f3_pre offset, reg_e
 *
 * First half of an f3-type step: adds the 32-bit word at offset(%r15)
 * to reg_e.  Modifies only reg_e and the flags.
 */
.macro	calc_f3_pre	offset, reg_e
	add		\offset(%r15), \reg_e
.endm
7438b4684afSRobert Clausecker
/*
 * calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
 *
 * Second half of an f3-type step.  With all right-hand sides taken at
 * macro entry:
 *
 *   reg_e  += reg_tb + (reg_a rotated right by 27, i.e. left by 5)
 *   reg_tb  = reg_a rotated right by 2
 *   reg_a   = (reg_a & reg_b) | ((reg_a | reg_b) & reg_c)
 *
 * reg_tb carries in the F value produced by the preceding step; the new
 * reg_a is the bitwise majority of the three inputs and is what the
 * following step adds as its F.  Clobbers %ebp, %r12d and the flags.
 */
.macro	calc_f3_post	reg_a, reg_b, reg_c, reg_e, reg_tb
	add		\reg_tb, \reg_e		// add F from the previous round
	mov		\reg_b, %ebp
	or		\reg_a, %ebp
	rorx		$0x1b, \reg_a, %r12d
	rorx		$2, \reg_a, \reg_tb
	and		\reg_c, %ebp		// calculate F for the next round
	and		\reg_b, \reg_a
	or		%ebp, \reg_a
	add		%r12d, \reg_e
.endm
7558b4684afSRobert Clausecker
/*
 * calc39: %esi += 0x12c(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi = (%edi & %edx) | ((%edi | %edx) & %ecx).  precalc23 is interleaved.
 */
.macro	calc39
	calc_f3_pre	0x12c, %esi
	precalc23	%ymm8, 0x0, 0x80
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm
7618b4684afSRobert Clausecker
/*
 * calc40: %ebx += 0x140(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi = (%esi & %eax) | ((%esi | %eax) & %edx).  precalc16 is interleaved.
 */
.macro	calc40
	calc_f3_pre	0x140, %ebx
	precalc16	%ymm14, %ymm13, %ymm8, %ymm7
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm
7678b4684afSRobert Clausecker
/*
 * calc41: %ecx += 0x144(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx = (%ebx & %edi) | ((%ebx | %edi) & %eax).  precalc17 is interleaved.
 */
.macro	calc41
	calc_f3_pre	0x144, %ecx
	precalc17	%ymm14, %ymm12, %ymm7
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm
7738b4684afSRobert Clausecker
/*
 * calc42: %edx += 0x148(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx = (%ecx & %esi) | ((%ecx | %esi) & %edi).  precalc18 is interleaved.
 */
.macro	calc42
	calc_f3_pre	0x148, %edx
	precalc18	%ymm7
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm
7798b4684afSRobert Clausecker
/*
 * calc43: %eax += 0x14c(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx = (%edx & %ebx) | ((%edx | %ebx) & %esi).  precalc19 is interleaved.
 */
.macro	calc43
	calc_f3_pre	0x14c, %eax
	precalc19	%ymm7
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm
7858b4684afSRobert Clausecker
/*
 * calc44: %edi += 0x160(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax = (%eax & %ecx) | ((%eax | %ecx) & %ebx).  precalc20 is interleaved.
 */
.macro	calc44
	calc_f3_pre	0x160, %edi
	precalc20	%ymm7
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm
7918b4684afSRobert Clausecker
/*
 * calc45: %esi += 0x164(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi = (%edi & %edx) | ((%edi | %edx) & %ecx).  precalc21 is interleaved.
 */
.macro	calc45
	calc_f3_pre	0x164, %esi
	precalc21	%ymm7
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm
7978b4684afSRobert Clausecker
/*
 * calc46: %ebx += 0x168(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi = (%esi & %eax) | ((%esi | %eax) & %edx).
 */
.macro	calc46
	calc_f3_pre	0x168, %ebx
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm
8028b4684afSRobert Clausecker
/*
 * calc47: %ecx += 0x16c(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx = (%ebx & %edi) | ((%ebx | %edi) & %eax).
 *
 * The interleaved vector work is written out here rather than invoked
 * through a precalc macro: %ymm7 = %ymm9 ^ %ymm0, then
 * %ymm0 = %ymm7 + 0x20(%r8) (packed 32-bit adds), stored unaligned to
 * 0xa0(%r14).
 * NOTE(review): judging by the argument pattern at the precalc23 call
 * sites in calc39/calc55/calc63 this looks like
 * "precalc23 %ymm7, 0x20, 0xa0" expanded by hand -- confirm against the
 * precalc23 definition before folding it back.
 */
.macro	calc47
	calc_f3_pre	0x16c, %ecx
	vpxor		%ymm9, %ymm0, %ymm7
	vpaddd		0x20(%r8), %ymm7, %ymm0
	vmovdqu		%ymm0, 0xa0(%r14)
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm
8108b4684afSRobert Clausecker
/*
 * calc48: %edx += 0x180(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx = (%ecx & %esi) | ((%ecx | %esi) & %edi).  precalc16 is interleaved.
 */
.macro	calc48
	calc_f3_pre	0x180, %edx
	precalc16	%ymm13, %ymm12, %ymm7, %ymm5
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm
8168b4684afSRobert Clausecker
/*
 * calc49: %eax += 0x184(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx = (%edx & %ebx) | ((%edx | %ebx) & %esi).  precalc17 is interleaved.
 */
.macro	calc49
	calc_f3_pre	0x184, %eax
	precalc17	%ymm13, %ymm8, %ymm5
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm
8228b4684afSRobert Clausecker
/*
 * calc50: %edi += 0x188(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax = (%eax & %ecx) | ((%eax | %ecx) & %ebx).  precalc18 is interleaved.
 */
.macro	calc50
	calc_f3_pre	0x188, %edi
	precalc18	%ymm5
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm
8288b4684afSRobert Clausecker
/*
 * calc51: %esi += 0x18c(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi = (%edi & %edx) | ((%edi | %edx) & %ecx).  precalc19 is interleaved.
 */
.macro	calc51
	calc_f3_pre	0x18c, %esi
	precalc19	%ymm5
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm
8348b4684afSRobert Clausecker
/*
 * calc52: %ebx += 0x1a0(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi = (%esi & %eax) | ((%esi | %eax) & %edx).  precalc20 is interleaved.
 */
.macro	calc52
	calc_f3_pre	0x1a0, %ebx
	precalc20	%ymm5
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm
8408b4684afSRobert Clausecker
/*
 * calc53: %ecx += 0x1a4(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx = (%ebx & %edi) | ((%ebx | %edi) & %eax).  precalc21 is interleaved.
 */
.macro	calc53
	calc_f3_pre	0x1a4, %ecx
	precalc21	%ymm5
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm
8468b4684afSRobert Clausecker
/*
 * calc54: %edx += 0x1a8(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx = (%ecx & %esi) | ((%ecx | %esi) & %edi).
 */
.macro	calc54
	calc_f3_pre	0x1a8, %edx
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm
8518b4684afSRobert Clausecker
/*
 * calc55: %eax += 0x1ac(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx = (%edx & %ebx) | ((%edx | %ebx) & %esi).  precalc23 is interleaved.
 */
.macro	calc55
	calc_f3_pre	0x1ac, %eax
	precalc23	%ymm5, 0x20, 0xc0
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm
8578b4684afSRobert Clausecker
/*
 * calc56: %edi += 0x1c0(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax = (%eax & %ecx) | ((%eax | %ecx) & %ebx).  precalc16 is interleaved.
 */
.macro	calc56
	calc_f3_pre	0x1c0, %edi
	precalc16	%ymm12, %ymm8, %ymm5, %ymm3
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm
8638b4684afSRobert Clausecker
/*
 * calc57: %esi += 0x1c4(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi = (%edi & %edx) | ((%edi | %edx) & %ecx).  precalc17 is interleaved.
 */
.macro	calc57
	calc_f3_pre	0x1c4, %esi
	precalc17	%ymm12, %ymm7, %ymm3
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm
8698b4684afSRobert Clausecker
/*
 * calc58: %ebx += 0x1c8(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi = (%esi & %eax) | ((%esi | %eax) & %edx).  precalc18 is interleaved.
 */
.macro	calc58
	calc_f3_pre	0x1c8, %ebx
	precalc18	%ymm3
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm
8758b4684afSRobert Clausecker
/*
 * calc59: %ecx += 0x1cc(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc19 is interleaved.
 */
.macro	calc59
	calc_f2_pre	0x1cc, %ebx, %esi, %ecx
	precalc19	%ymm3
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
8818b4684afSRobert Clausecker
/*
 * calc60: %edx += 0x1e0(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc20 is interleaved.
 */
.macro	calc60
	calc_f2_pre	0x1e0, %ecx, %ebx, %edx
	precalc20	%ymm3
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
8878b4684afSRobert Clausecker
/*
 * calc61: %eax += 0x1e4(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc21 is interleaved.
 */
.macro	calc61
	calc_f2_pre	0x1e4, %edx, %ecx, %eax
	precalc21	%ymm3
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
8938b4684afSRobert Clausecker
/*
 * calc62: %edi += 0x1e8(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.
 */
.macro	calc62
	calc_f2_pre	0x1e8, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
8988b4684afSRobert Clausecker
/*
 * calc63: %esi += 0x1ec(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc23 is interleaved.
 */
.macro	calc63
	calc_f2_pre	0x1ec, %edi, %eax, %esi
	precalc23	%ymm3, 0x20, 0xe0
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
9048b4684afSRobert Clausecker
/*
 * calc64: %ebx += 0x200(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc32 is interleaved.
 */
.macro	calc64
	calc_f2_pre	0x200, %esi, %edi, %ebx
	precalc32	%ymm5, %ymm3
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
9108b4684afSRobert Clausecker
/*
 * calc65: %ecx += 0x204(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc33 is interleaved.
 */
.macro	calc65
	calc_f2_pre	0x204, %ebx, %esi, %ecx
	precalc33	%ymm14, %ymm15
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
9168b4684afSRobert Clausecker
/*
 * calc66: %edx += 0x208(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc34 is interleaved.
 */
.macro	calc66
	calc_f2_pre	0x208, %ecx, %ebx, %edx
	precalc34	%ymm8
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
9228b4684afSRobert Clausecker
/*
 * calc67: %eax += 0x20c(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc35 is interleaved.
 */
.macro	calc67
	calc_f2_pre	0x20c, %edx, %ecx, %eax
	precalc35	%ymm15
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
9288b4684afSRobert Clausecker
/*
 * calc68: %edi += 0x220(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc36 is interleaved.
 */
.macro	calc68
	calc_f2_pre	0x220, %eax, %edx, %edi
	precalc36	%ymm15
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
9348b4684afSRobert Clausecker
/*
 * calc69: %esi += 0x224(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc37 is interleaved.
 */
.macro	calc69
	calc_f2_pre	0x224, %edi, %eax, %esi
	precalc37	%ymm15
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
9408b4684afSRobert Clausecker
/*
 * calc70: %ebx += 0x228(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.
 */
.macro	calc70
	calc_f2_pre	0x228, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
9458b4684afSRobert Clausecker
/*
 * calc71: %ecx += 0x22c(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc39 is interleaved.
 */
.macro	calc71
	calc_f2_pre	0x22c, %ebx, %esi, %ecx
	precalc39	%ymm15, 0x20, 0x100
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
9518b4684afSRobert Clausecker
/*
 * calc72: %edx += 0x240(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc32 is interleaved.
 */
.macro	calc72
	calc_f2_pre	0x240, %ecx, %ebx, %edx
	precalc32	%ymm3, %ymm15
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
9578b4684afSRobert Clausecker
/*
 * calc73: %eax += 0x244(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc33 is interleaved.
 */
.macro	calc73
	calc_f2_pre	0x244, %edx, %ecx, %eax
	precalc33	%ymm13, %ymm14
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
9638b4684afSRobert Clausecker
/*
 * calc74: %edi += 0x248(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc34 is interleaved.
 */
.macro	calc74
	calc_f2_pre	0x248, %eax, %edx, %edi
	precalc34	%ymm7
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
9698b4684afSRobert Clausecker
/*
 * calc75: %esi += 0x24c(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc35 is interleaved.
 */
.macro	calc75
	calc_f2_pre	0x24c, %edi, %eax, %esi
	precalc35	%ymm14
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
9758b4684afSRobert Clausecker
/*
 * calc76: %ebx += 0x260(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc36 is interleaved.
 */
.macro	calc76
	calc_f2_pre	0x260, %esi, %edi, %ebx
	precalc36	%ymm14
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
9818b4684afSRobert Clausecker
/*
 * calc77: %ecx += 0x264(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc37 is interleaved.
 */
.macro	calc77
	calc_f2_pre	0x264, %ebx, %esi, %ecx
	precalc37	%ymm14
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
9878b4684afSRobert Clausecker
/*
 * calc78: %edx += 0x268(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.
 */
.macro	calc78
	calc_f2_pre	0x268, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
9928b4684afSRobert Clausecker
/*
 * calc79: %eax += 0x26c(%r15) + %ecx + rol(%edx, 5), with precalc39
 * interleaved.
 *
 * This step is written out by hand instead of going through
 * calc_f2_pre/calc_f2_post: compared with that pair it omits the
 * rotate-right-by-2 of the "a" register and the two xors that would
 * prepare an F value for a following step.  Clobbers %r12d.
 */
.macro	calc79
	add		0x26c(%r15), %eax
	add		%ecx, %eax
	rorx		$0x1b, %edx, %r12d
	precalc39	%ymm14, 0x20, 0x120
	add		%r12d, %eax
.endm
10008b4684afSRobert Clausecker
/*
 * Similar to calc0.
 *
 * No preceding step has left an F value behind, so it is computed here
 * first: %edx = (%ecx & %ebx) ^ (~%ecx & %esi), using %ebp as scratch,
 * while %ecx is rotated right by 2 in place.  An f1-type step for slot
 * offset 0x10 follows, with precalc32 interleaved.
 */
.macro	calc80
	mov		%ecx, %edx			// precalculate first round
	rorx		$2, %ecx, %ecx
	andn		%esi, %edx, %ebp
	and		%ebx, %edx
	xor		%ebp, %edx
	calc_f1_pre	0x10, %eax, %edx, %ebx, %edi
	precalc32	%ymm15, %ymm14
	calc_f1_post	%eax, %ecx, %edi
.endm
10148b4684afSRobert Clausecker
/* calc81: f1-type step for slot offset 0x14, with precalc33 interleaved. */
.macro	calc81
	calc_f1_pre	0x14, %edi, %eax, %ecx, %esi
	precalc33	%ymm12, %ymm13
	calc_f1_post	%edi, %edx, %esi
.endm
10208b4684afSRobert Clausecker
/* calc82: f1-type step for slot offset 0x18, with precalc34 interleaved. */
.macro	calc82
	calc_f1_pre	0x18, %esi, %edi, %edx, %ebx
	precalc34	%ymm5
	calc_f1_post	%esi, %eax, %ebx
.endm
10268b4684afSRobert Clausecker
/* calc83: f1-type step for slot offset 0x1c, with precalc35 interleaved. */
.macro	calc83
	calc_f1_pre	0x1c, %ebx, %esi, %eax, %ecx
	precalc35	%ymm13
	calc_f1_post	%ebx, %edi, %ecx
.endm
10328b4684afSRobert Clausecker
/* calc84: f1-type step for slot offset 0x30, with precalc36 interleaved. */
.macro	calc84
	calc_f1_pre	0x30, %ecx, %ebx, %edi, %edx
	precalc36	%ymm13
	calc_f1_post	%ecx, %esi, %edx
.endm
10388b4684afSRobert Clausecker
/* calc85: f1-type step for slot offset 0x34, with precalc37 interleaved. */
.macro	calc85
	calc_f1_pre	0x34, %edx, %ecx, %esi, %eax
	precalc37	%ymm13
	calc_f1_post	%edx, %ebx, %eax
.endm
10448b4684afSRobert Clausecker
/* calc86: f1-type step for slot offset 0x38; nothing is interleaved. */
.macro	calc86
	calc_f1_pre	0x38, %eax, %edx, %ebx, %edi
	calc_f1_post	%eax, %ecx, %edi
.endm
10498b4684afSRobert Clausecker
/* calc87: f1-type step for slot offset 0x3c, with precalc39 interleaved. */
.macro	calc87
	calc_f1_pre	0x3c, %edi, %eax, %ecx, %esi
	precalc39	%ymm13, 0x40, 0x140
	calc_f1_post	%edi, %edx, %esi
.endm
10558b4684afSRobert Clausecker
/* calc88: f1-type step for slot offset 0x50, with precalc32 interleaved. */
.macro	calc88
	calc_f1_pre	0x50, %esi, %edi, %edx, %ebx
	precalc32	%ymm14, %ymm13
	calc_f1_post	%esi, %eax, %ebx
.endm
10618b4684afSRobert Clausecker
/* calc89: f1-type step for slot offset 0x54, with precalc33 interleaved. */
.macro	calc89
	calc_f1_pre	0x54, %ebx, %esi, %eax, %ecx
	precalc33	%ymm8, %ymm12
	calc_f1_post	%ebx, %edi, %ecx
.endm
10678b4684afSRobert Clausecker
/* calc90: f1-type step for slot offset 0x58, with precalc34 interleaved. */
.macro	calc90
	calc_f1_pre	0x58, %ecx, %ebx, %edi, %edx
	precalc34	%ymm3
	calc_f1_post	%ecx, %esi, %edx
.endm
10738b4684afSRobert Clausecker
/* calc91: f1-type step for slot offset 0x5c, with precalc35 interleaved. */
.macro	calc91
	calc_f1_pre	0x5c, %edx, %ecx, %esi, %eax
	precalc35	%ymm12
	calc_f1_post	%edx, %ebx, %eax
.endm
10798b4684afSRobert Clausecker
/* calc92: f1-type step for slot offset 0x70, with precalc36 interleaved. */
.macro	calc92
	calc_f1_pre	0x70, %eax, %edx, %ebx, %edi
	precalc36	%ymm12
	calc_f1_post	%eax, %ecx, %edi
.endm
10858b4684afSRobert Clausecker
/* calc93: f1-type step for slot offset 0x74, with precalc37 interleaved. */
.macro	calc93
	calc_f1_pre	0x74, %edi, %eax, %ecx, %esi
	precalc37	%ymm12
	calc_f1_post	%edi, %edx, %esi
.endm
10918b4684afSRobert Clausecker
/* calc94: f1-type step for slot offset 0x78; nothing is interleaved. */
.macro	calc94
	calc_f1_pre	0x78, %esi, %edi, %edx, %ebx
	calc_f1_post	%esi, %eax, %ebx
.endm
10968b4684afSRobert Clausecker
/* calc95: f1-type step for slot offset 0x7c, with precalc39 interleaved. */
.macro	calc95
	calc_f1_pre	0x7c, %ebx, %esi, %eax, %ecx
	precalc39	%ymm12, 0x40, 0x160
	calc_f1_post	%ebx, %edi, %ecx
.endm
11028b4684afSRobert Clausecker
/* calc96: f1-type step for slot offset 0x90, with precalc32 interleaved. */
.macro	calc96
	calc_f1_pre	0x90, %ecx, %ebx, %edi, %edx
	precalc32	%ymm13, %ymm12
	calc_f1_post	%ecx, %esi, %edx
.endm
11088b4684afSRobert Clausecker
/* calc97: f1-type step for slot offset 0x94, with precalc33 interleaved. */
.macro	calc97
	calc_f1_pre	0x94, %edx, %ecx, %esi, %eax
	precalc33	%ymm7, %ymm8
	calc_f1_post	%edx, %ebx, %eax
.endm
11148b4684afSRobert Clausecker
/* calc98: f1-type step for slot offset 0x98, with precalc34 interleaved. */
.macro	calc98
	calc_f1_pre	0x98, %eax, %edx, %ebx, %edi
	precalc34	%ymm15
	calc_f1_post	%eax, %ecx, %edi
.endm
11208b4684afSRobert Clausecker
/*
 * calc99: %esi += 0x9c(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc35 is interleaved.
 */
.macro	calc99
	calc_f2_pre	0x9c, %edi, %eax, %esi
	precalc35	%ymm8
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
11268b4684afSRobert Clausecker
/*
 * calc100: %ebx += 0xb0(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc36 is interleaved.
 */
.macro	calc100
	calc_f2_pre	0xb0, %esi, %edi, %ebx
	precalc36	%ymm8
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
11328b4684afSRobert Clausecker
/*
 * calc101: %ecx += 0xb4(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc37 is interleaved.
 */
.macro	calc101
	calc_f2_pre	0xb4, %ebx, %esi, %ecx
	precalc37	%ymm8
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
11388b4684afSRobert Clausecker
/*
 * calc102: %edx += 0xb8(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.
 */
.macro	calc102
	calc_f2_pre	0xb8, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
11438b4684afSRobert Clausecker
/*
 * calc103: %eax += 0xbc(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc39 is interleaved.
 */
.macro	calc103
	calc_f2_pre	0xbc, %edx, %ecx, %eax
	precalc39	%ymm8, 0x40, 0x180
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
11498b4684afSRobert Clausecker
/*
 * calc104: %edi += 0xd0(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc32 is interleaved.
 */
.macro	calc104
	calc_f2_pre	0xd0, %eax, %edx, %edi
	precalc32	%ymm12, %ymm8
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
11558b4684afSRobert Clausecker
/*
 * calc105: %esi += 0xd4(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc33 is interleaved.
 */
.macro	calc105
	calc_f2_pre	0xd4, %edi, %eax, %esi
	precalc33	%ymm5, %ymm7
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
11618b4684afSRobert Clausecker
/*
 * calc106: %ebx += 0xd8(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc34 is interleaved.
 */
.macro	calc106
	calc_f2_pre	0xd8, %esi, %edi, %ebx
	precalc34	%ymm14
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
11678b4684afSRobert Clausecker
/*
 * calc107: %ecx += 0xdc(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc35 is interleaved.
 */
.macro	calc107
	calc_f2_pre	0xdc, %ebx, %esi, %ecx
	precalc35	%ymm7
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
11738b4684afSRobert Clausecker
/*
 * calc108: %edx += 0xf0(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc36 is interleaved.
 */
.macro	calc108
	calc_f2_pre	0xf0, %ecx, %ebx, %edx
	precalc36	%ymm7
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
11798b4684afSRobert Clausecker
/*
 * calc109: %eax += 0xf4(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc37 is interleaved.
 */
.macro	calc109
	calc_f2_pre	0xf4, %edx, %ecx, %eax
	precalc37	%ymm7
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
11858b4684afSRobert Clausecker
/*
 * calc110: %edi += 0xf8(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.
 */
.macro	calc110
	calc_f2_pre	0xf8, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
11908b4684afSRobert Clausecker
/*
 * calc111: %esi += 0xfc(%r15) + %eax + rol(%edi, 5); %eax = ror(%edi, 2);
 * %edi ^= %edx ^ %ecx.  precalc39 is interleaved.
 */
.macro	calc111
	calc_f2_pre	0xfc, %edi, %eax, %esi
	precalc39	%ymm7, 0x40, 0x1a0
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm
11968b4684afSRobert Clausecker
/*
 * calc112: %ebx += 0x110(%r15) + %edi + rol(%esi, 5); %edi = ror(%esi, 2);
 * %esi ^= %eax ^ %edx.  precalc32 is interleaved.
 */
.macro	calc112
	calc_f2_pre	0x110, %esi, %edi, %ebx
	precalc32	%ymm8, %ymm7
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
12028b4684afSRobert Clausecker
/*
 * calc113: %ecx += 0x114(%r15) + %esi + rol(%ebx, 5); %esi = ror(%ebx, 2);
 * %ebx ^= %edi ^ %eax.  precalc33 is interleaved.
 */
.macro	calc113
	calc_f2_pre	0x114, %ebx, %esi, %ecx
	precalc33	%ymm3, %ymm5
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
12088b4684afSRobert Clausecker
/*
 * calc114: %edx += 0x118(%r15) + %ebx + rol(%ecx, 5); %ebx = ror(%ecx, 2);
 * %ecx ^= %esi ^ %edi.  precalc34 is interleaved.
 */
.macro	calc114
	calc_f2_pre	0x118, %ecx, %ebx, %edx
	precalc34	%ymm13
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm
12148b4684afSRobert Clausecker
/*
 * calc115: %eax += 0x11c(%r15) + %ecx + rol(%edx, 5); %ecx = ror(%edx, 2);
 * %edx ^= %ebx ^ %esi.  precalc35 is interleaved.
 */
.macro	calc115
	calc_f2_pre	0x11c, %edx, %ecx, %eax
	precalc35	%ymm5
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
12208b4684afSRobert Clausecker
/*
 * calc116: %edi += 0x130(%r15) + %edx + rol(%eax, 5); %edx = ror(%eax, 2);
 * %eax ^= %ecx ^ %ebx.  precalc36 is interleaved.
 */
.macro	calc116
	calc_f2_pre	0x130, %eax, %edx, %edi
	precalc36	%ymm5
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
12268b4684afSRobert Clausecker
12278b4684afSRobert Clausecker.macro	calc117
12288b4684afSRobert Clausecker	calc_f2_pre	0x134, %edi, %eax, %esi
12298b4684afSRobert Clausecker	precalc37	%ymm5
12308b4684afSRobert Clausecker	calc_f2_post	%edi, %edx, %ecx, %esi
12318b4684afSRobert Clausecker.endm
12328b4684afSRobert Clausecker
12338b4684afSRobert Clausecker.macro	calc118
12348b4684afSRobert Clausecker	calc_f2_pre	0x138, %esi, %edi, %ebx
12358b4684afSRobert Clausecker	calc_f2_post	%esi, %eax, %edx, %ebx
12368b4684afSRobert Clausecker.endm
12378b4684afSRobert Clausecker
12388b4684afSRobert Clausecker.macro	calc119
12398b4684afSRobert Clausecker	calc_f3_pre	0x13c, %ecx
12408b4684afSRobert Clausecker	precalc39	%ymm5, 0x40, 0x1c0
12418b4684afSRobert Clausecker	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
12428b4684afSRobert Clausecker.endm
12438b4684afSRobert Clausecker
/*
 * Steps 120--127 of the 160-step unrolled AVX2 loop.
 *
 * Each macro is one scalar step (a calc_f3_pre/calc_f3_post pair) with at
 * most one stage of the vector precalc sequence (precalc32 .. precalc39)
 * placed between its two halves.  All of these helpers are defined
 * earlier in this file.  The leading operand of calc_f3_pre is a byte
 * offset, presumably into the area addressed by %r15 as in calc159.
 * This cycle of precalc stages works on %ymm3.
 */
.macro	calc120
	calc_f3_pre	0x150, %edx
	precalc32	%ymm7, %ymm5
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc121
	calc_f3_pre	0x154, %eax
	precalc33	%ymm15, %ymm3
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc122
	calc_f3_pre	0x158, %edi
	precalc34	%ymm12
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc123
	calc_f3_pre	0x15c, %esi
	precalc35	%ymm3
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc124
	calc_f3_pre	0x170, %ebx
	precalc36	%ymm3
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc125
	calc_f3_pre	0x174, %ecx
	precalc37	%ymm3
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc126
	calc_f3_pre	0x178, %edx
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx	// no precalc stage in this step
.endm

.macro	calc127
	calc_f3_pre	0x17c, %eax
	precalc39	%ymm3, 0x60, 0x1e0
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm
12908b4684afSRobert Clausecker
/*
 * Steps 128--135 of the 160-step unrolled AVX2 loop.
 *
 * Each macro is one scalar step (a calc_f3_pre/calc_f3_post pair) with at
 * most one stage of the vector precalc sequence (precalc32 .. precalc39)
 * placed between its two halves.  All of these helpers are defined
 * earlier in this file.  The leading operand of calc_f3_pre is a byte
 * offset, presumably into the area addressed by %r15 as in calc159.
 * This cycle of precalc stages works on %ymm15.
 */
.macro	calc128
	calc_f3_pre	0x190, %edi
	precalc32	%ymm5, %ymm3
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc129
	calc_f3_pre	0x194, %esi
	precalc33	%ymm14, %ymm15
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc130
	calc_f3_pre	0x198, %ebx
	precalc34	%ymm8
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc131
	calc_f3_pre	0x19c, %ecx
	precalc35	%ymm15
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc132
	calc_f3_pre	0x1b0, %edx
	precalc36	%ymm15
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc133
	calc_f3_pre	0x1b4, %eax
	precalc37	%ymm15
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc134
	calc_f3_pre	0x1b8, %edi
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx	// no precalc stage in this step
.endm

.macro	calc135
	calc_f3_pre	0x1bc, %esi
	precalc39	%ymm15, 0x60, 0x200
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm
13378b4684afSRobert Clausecker
/*
 * Steps 136--143 of the 160-step unrolled AVX2 loop.
 *
 * Each macro is one scalar step with at most one stage of the vector
 * precalc sequence (precalc32 .. precalc39) placed between its two
 * halves.  calc136--calc138 use the calc_f3 helper pair; from calc139 on
 * the calc_f2 pair is used again.  All of these helpers are defined
 * earlier in this file.  The leading operand of calc_f*_pre is a byte
 * offset, presumably into the area addressed by %r15 as in calc159.
 * This cycle of precalc stages works on %ymm14.
 */
.macro	calc136
	calc_f3_pre	0x1d0, %ebx
	precalc32	%ymm3, %ymm15
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc137
	calc_f3_pre	0x1d4, %ecx
	precalc33	%ymm13, %ymm14
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc138
	calc_f3_pre	0x1d8, %edx
	precalc34	%ymm7
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc139
	calc_f2_pre	0x1dc, %edx, %ecx, %eax	// back to the calc_f2 helpers
	precalc35	%ymm14
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc140
	calc_f2_pre	0x1f0, %eax, %edx, %edi
	precalc36	%ymm14
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc141
	calc_f2_pre	0x1f4, %edi, %eax, %esi
	precalc37	%ymm14
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc142
	calc_f2_pre	0x1f8, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx	// no precalc stage in this step
.endm

.macro	calc143
	calc_f2_pre	0x1fc, %ebx, %esi, %ecx
	precalc39	%ymm14, 0x60, 0x220
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm
13848b4684afSRobert Clausecker
/*
 * Steps 144--151 of the 160-step unrolled AVX2 loop.
 *
 * Each macro is one scalar step (a calc_f2_pre/calc_f2_post pair) with at
 * most one stage of the vector precalc sequence (precalc32 .. precalc39)
 * placed between its two halves.  All of these helpers are defined
 * earlier in this file.  The leading operand of calc_f2_pre is a byte
 * offset, presumably into the area addressed by %r15 as in calc159.
 * This cycle of precalc stages works on %ymm13.
 */
.macro	calc144
	calc_f2_pre	0x210, %ecx, %ebx, %edx
	precalc32	%ymm15, %ymm14
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc145
	calc_f2_pre	0x214, %edx, %ecx, %eax
	precalc33	%ymm12, %ymm13
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc146
	calc_f2_pre	0x218, %eax, %edx, %edi
	precalc34	%ymm5
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc147
	calc_f2_pre	0x21c, %edi, %eax, %esi
	precalc35	%ymm13
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc148
	calc_f2_pre	0x230, %esi, %edi, %ebx
	precalc36	%ymm13
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc149
	calc_f2_pre	0x234, %ebx, %esi, %ecx
	precalc37	%ymm13
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc150
	calc_f2_pre	0x238, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx	// no precalc stage in this step
.endm

.macro	calc151
	calc_f2_pre	0x23c, %edx, %ecx, %eax
	precalc39	%ymm13, 0x60, 0x240
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm
14318b4684afSRobert Clausecker
/*
 * Steps 152--159, the last eight of the 160-step unrolled AVX2 loop.
 *
 * calc152--calc158 are each one scalar step (a calc_f2_pre/calc_f2_post
 * pair) with at most one stage of the vector precalc sequence
 * (precalc32 .. precalc39) placed between its two halves; those helpers
 * are defined earlier in this file.  This cycle of precalc stages works
 * on %ymm12.  calc159 is written out by hand instead of using the
 * calc_f2 helpers.
 */
.macro	calc152
	calc_f2_pre	0x250, %eax, %edx, %edi
	precalc32	%ymm14, %ymm13
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc153
	calc_f2_pre	0x254, %edi, %eax, %esi
	precalc33	%ymm8, %ymm12
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc154
	calc_f2_pre	0x258, %esi, %edi, %ebx
	precalc34	%ymm3
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc155
	calc_f2_pre	0x25c, %ebx, %esi, %ecx
	precalc35	%ymm12
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc156
	calc_f2_pre	0x270, %ecx, %ebx, %edx
	precalc36	%ymm12
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc157
	calc_f2_pre	0x274, %edx, %ecx, %eax
	precalc37	%ymm12
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc158
	calc_f2_pre	0x278, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi	// no precalc stage in this step
.endm

	// Final step: %esi += 0x27c(%r15) + %eax + rol(%edi, 5).
.macro	calc159
	add		0x27c(%r15), %esi
	add		%eax, %esi
	rorx		$0x1b, %edi, %r12d	// rotate right by 27 == rotate left by 5
	precalc39	%ymm12, 0x60, 0x260
	add		%r12d, %esi
.endm
14808b4684afSRobert Clausecker
/*
 * sha1block(SHA1_CTX, buf, len) -- AVX2 implementation.
 *
 * In:	%rdi	hash state; five 32-bit words are read from offsets 0--16
 *	%rsi	input buffer
 *	%rdx	length in bytes, rounded down to a multiple of 64 here
 *
 * If the rounded length is zero the function returns at once and leaves
 * the state untouched.
 *
 * One pass of the unrolled loop runs calc0--calc79 followed by
 * update_hash, then (unless the end was signalled) calc80--calc159
 * followed by a second update_hash.
 *
 * Register roles:
 *	%r8	address of k_xmm_ar; also the value that marks "no more input"
 *	%r9	hash state pointer
 *	%r10	input pointer, advanced by 128 per pass; replaced by %r8
 *		once a block pointer reaches %r11
 *	%r11	buf + len + 64, the bound the block pointers are compared with
 *	%r13	second input pointer, starting at buf + 64 and advanced by
 *		128 per pass
 *	%r14,	two scratch areas inside the stack frame, exchanged after the
 *	%r15	initial precalc and after every complete pass
 *	%r12d	scratch
 *
 * NOTE(review): the precalc stages appear to run ahead of the block being
 * hashed and %r13 is not replaced by the sentinel when len == 64, so
 * input beyond buf + len is presumably read.  Confirm that every caller
 * leaves enough slack after the data it passes in.
 */
ENTRY(_libmd_sha1block_avx2)
	and		$~63, %rdx		// round length down to whole blocks
	jz		.Lavx2_nodata		// no complete block: nothing to hash

	push		%rbx
	push		%rbp
	push		%r12
	push		%r13
	push		%r14
	push		%r15
	sub		$1408+8, %rsp		// scratch frame; the extra 8 keeps %rsp 16-byte aligned

	lea		k_xmm_ar(%rip), %r8
	mov		%rdi, %r9
	mov		%rsi, %r10
	lea		64(%rsi), %r13
	lea		64(%rsi, %rdx), %r11
	cmp		%r11, %r13
	cmovae		%r8, %r13
	vmovdqu		bswap_shufb_ctl(%rip), %ymm10

	mov		(%r9), %ecx		// load the five state words
	mov		4(%r9), %esi
	mov		8(%r9), %edi
	mov		12(%r9), %eax
	mov		16(%r9), %edx
	mov		%rsp, %r14
	lea		2*4*80+32(%rsp), %r15
	precalc						// precalc WK for first 2 blocks
	xchg		%r14, %r15

	// this is unrolled
.Loop:	cmp		%r8, %r10			// %r10 is replaced by %r8 (set above)
							// as a signal of the last block
	jne		.Lbegin
	add		$1408+8, %rsp
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbp
	pop		%rbx
	vzeroupper
	ret

.Lbegin:
	calc0
	calc1
	calc2
	calc3
	calc4
	calc5
	calc6
	calc7
	calc8
	calc9
	calc10
	calc11
	calc12
	calc13
	calc14
	calc15
	calc16
	calc17
	calc18
	calc19
	calc20
	calc21
	calc22
	calc23
	calc24
	calc25
	calc26
	calc27
	calc28
	calc29
	calc30
	calc31
	calc32
	calc33
	calc34
	calc35
	calc36
	calc37
	calc38
	calc39
	calc40
	calc41
	calc42
	calc43
	calc44
	calc45
	calc46
	calc47
	calc48
	calc49
	calc50
	calc51
	calc52
	calc53
	calc54
	calc55
	calc56
	calc57
	calc58
	calc59

	add		$128, %r10		// advance the first pointer by two blocks
	cmp		%r11, %r10		// reached the bound?
	cmovae		%r8, %r10		// then signal the last iteration

	calc60
	calc61
	calc62
	calc63
	calc64
	calc65
	calc66
	calc67
	calc68
	calc69
	calc70
	calc71
	calc72
	calc73
	calc74
	calc75
	calc76
	calc77
	calc78
	calc79

	update_hash	%eax, %edx, %ebx, %esi, %edi
	cmp		%r8, %r10		// end signalled?
	je		.Loop			// then leave through the epilogue at .Loop
	mov		%edx, %ecx		// presumably where calc80 expects this value

	calc80
	calc81
	calc82
	calc83
	calc84
	calc85
	calc86
	calc87
	calc88
	calc89
	calc90
	calc91
	calc92
	calc93
	calc94
	calc95
	calc96
	calc97
	calc98
	calc99
	calc100
	calc101
	calc102
	calc103
	calc104
	calc105
	calc106
	calc107
	calc108
	calc109
	calc110
	calc111
	calc112
	calc113
	calc114
	calc115
	calc116
	calc117
	calc118
	calc119
	calc120
	calc121
	calc122
	calc123
	calc124
	calc125
	calc126
	calc127
	calc128
	calc129
	calc130
	calc131
	calc132
	calc133
	calc134
	calc135
	calc136
	calc137
	calc138
	calc139

	add		$128, %r13		// advance the second pointer by two blocks
	cmp		%r11, %r13		// reached the bound?
	cmovae		%r8, %r10		// then signal the end through %r10 as well

	calc140
	calc141
	calc142
	calc143
	calc144
	calc145
	calc146
	calc147
	calc148
	calc149
	calc150
	calc151
	calc152
	calc153
	calc154
	calc155
	calc156
	calc157
	calc158
	calc159

	update_hash	%esi, %edi, %edx, %ecx, %ebx
	mov		%esi, %r12d		// reset state for AVX2 reg permutation
	mov		%edi, %esi
	mov		%edx, %edi
	mov		%ebx, %edx
	mov		%ecx, %eax
	mov		%r12d, %ecx
	xchg		%r14, %r15		// swap the two scratch areas
	jmp		.Loop

.Lavx2_nodata:
	ret					// nothing was pushed and no AVX state was touched
END(_libmd_sha1block_avx2)
17138b4684afSRobert Clausecker
17148b4684afSRobert Clausecker	.section	.rodata
17158b4684afSRobert Clausecker	.balign		32
17168b4684afSRobert Clauseckerk_xmm_ar:
17178b4684afSRobert Clausecker	.fill		8, 4, 0x5a827999
17188b4684afSRobert Clausecker	.fill		8, 4, 0x6ed9eba1
17198b4684afSRobert Clausecker	.fill		8, 4, 0x8f1bbcdc
17208b4684afSRobert Clausecker	.fill		8, 4, 0xca62c1d6
17218b4684afSRobert Clausecker	.size		k_xmm_ar, .-k_xmm_ar
17228b4684afSRobert Clausecker
17238b4684afSRobert Clauseckerbswap_shufb_ctl:
17248b4684afSRobert Clausecker	.4byte		0x00010203
17258b4684afSRobert Clausecker	.4byte		0x04050607
17268b4684afSRobert Clausecker	.4byte		0x08090a0b
17278b4684afSRobert Clausecker	.4byte		0x0c0d0e0f
17288b4684afSRobert Clausecker	.4byte		0x00010203
17298b4684afSRobert Clausecker	.4byte		0x04050607
17308b4684afSRobert Clausecker	.4byte		0x08090a0b
17318b4684afSRobert Clausecker	.4byte		0x0c0d0e0f
17328b4684afSRobert Clausecker	.size		bswap_shufb_ctl, .-bswap_shufb_ctl
17338b4684afSRobert Clausecker
/*
 * SHA1 implementation using the Intel SHA extensions (SHANI).
 *
 * Implemented according to the Intel white paper
 *
 * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford,
 * G. Wolrich: "Intel SHA Extensions: new instruction supporting
 * the Secure Hash Algorithm on Intel® architecture processors",
 * July 2013.
 */

/*
 * sha1block(SHA1_CTX, buf, len)
 *
 * In:	%rdi	hash state (h0--h3 at offset 0, h4 at offset 16)
 *	%rsi	input buffer, advanced by 64 per block
 *	%rdx	length in bytes, rounded down to a multiple of 64 here
 *
 * Returns immediately, leaving the state untouched, if the rounded
 * length is zero.
 *
 * Register roles:
 *	%rcx		end pointer (buf + rounded length)
 *	%xmm0--%xmm3	message words of the current block
 *	%xmm4		byte-shuffle mask (shuf_mask)
 *	%xmm5, %xmm7	alternate as the E operand from one group of four
 *			rounds to the next
 *	%xmm6		ABCD
 *	%xmm8, %xmm9	ABCD and E as they were at the start of the block
 */
ENTRY(_libmd_sha1block_shani)
	and		$~63, %rdx		// round length to block-size multiple
	lea		(%rsi, %rdx, 1), %rcx	// end pointer
	test		%rdx, %rdx		// nothing to do?
	je		1f			// if so, terminate immediately

	movdqu		(%rdi), %xmm6		// h0, h1, h2, h3
	pxor		%xmm7, %xmm7
	pshufd		$0x1b, %xmm6, %xmm6	// h3, h2, h1, h0
	pinsrd		$3, 16(%rdi), %xmm7	// h4 in the highest word of xmm7
	movdqu		shuf_mask(%rip), %xmm4

	// main loop
0:	movdqa		%xmm6, %xmm8		// stash ABCD
	movdqa		%xmm7, %xmm9		// stash E

	// rounds 0--3
	movdqu		0*16(%rsi), %xmm0	// load first 16 bytes of the block
	pshufb		%xmm4, %xmm0		// and byte-swap
	paddd		%xmm0, %xmm7		// E += w[0]
	movdqa		%xmm6, %xmm5		// E' = A
	sha1rnds4	$0, %xmm7, %xmm6	// perform rounds 0--3

	// rounds 4--7
	movdqu		1*16(%rsi), %xmm1	// second 16 bytes of the block
	pshufb		%xmm4, %xmm1
	sha1nexte	%xmm1, %xmm5
	movdqa		%xmm6, %xmm7
	sha1rnds4	$0, %xmm5, %xmm6
	sha1msg1	%xmm1, %xmm0

	// rounds 8--11
	movdqu		2*16(%rsi), %xmm2	// third 16 bytes of the block
	pshufb		%xmm4, %xmm2
	sha1nexte	%xmm2, %xmm7
	movdqa		%xmm6, %xmm5
	sha1rnds4	$0, %xmm7, %xmm6
	sha1msg1	%xmm2, %xmm1
	pxor		%xmm2, %xmm0

	// Four rounds with constant selector \k, interleaved with the
	// message-schedule instructions for later rounds.  \e1 is the E
	// operand consumed here; \e0 receives a copy of ABCD taken before
	// the rounds are applied.
.macro	midround	msg3, msg0, msg1, msg2, e1, e0, k
	sha1nexte	\msg3, \e1
	movdqa		%xmm6, \e0
	sha1msg2	\msg3, \msg0
	sha1rnds4	$\k, \e1, %xmm6
	sha1msg1	\msg3, \msg2
	pxor		\msg3, \msg1
.endm

	movdqu		3*16(%rsi), %xmm3	// load last 16 bytes of the block
	pshufb		%xmm4, %xmm3

	add		$4*16, %rsi		// advance to the next block

	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0	// 12--15
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0	// 16--19
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 20--23
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1	// 24--27
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1	// 28--31
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1	// 32--35
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 36--39
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 40--43
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2	// 44--47
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2	// 48--51
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2	// 52--55
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 56--59
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3	// 60--63
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3	// 64--67

	// rounds 68--71
	sha1nexte	%xmm1, %xmm5
	movdqa		%xmm6, %xmm7
	sha1msg2	%xmm1, %xmm2
	sha1rnds4	$3, %xmm5, %xmm6
	pxor		%xmm1, %xmm3

	// rounds 72--75
	sha1nexte	%xmm2, %xmm7
	movdqa		%xmm6, %xmm5
	sha1msg2	%xmm2, %xmm3
	sha1rnds4	$3, %xmm7, %xmm6

	// rounds 76--79
	sha1nexte	%xmm3, %xmm5
	movdqa		%xmm6, %xmm7
	sha1rnds4	$3, %xmm5, %xmm6

	sha1nexte	%xmm9, %xmm7		// add saved E
	paddd		%xmm8, %xmm6		// add saved ABCD

	cmp		%rsi, %rcx		// end reached?
	jne		0b

	pshufd		$0x1b, %xmm6, %xmm6	// restore order of h0--h3
	movdqu		%xmm6, (%rdi)		// write h0--h3
	pextrd		$3, %xmm7, 16(%rdi)	// write h4
1:	ret
END(_libmd_sha1block_shani)
18438b4684afSRobert Clausecker
18448b4684afSRobert Clausecker	.section	.rodata
18458b4684afSRobert Clausecker	.balign		16
18468b4684afSRobert Clauseckershuf_mask:
18478b4684afSRobert Clausecker	.8byte		0x08090a0b0c0d0e0f
18488b4684afSRobert Clausecker	.8byte		0x0001020304050607
18498b4684afSRobert Clausecker	.size		shuf_mask, .-shuf_mask
18508b4684afSRobert Clausecker
18518b4684afSRobert Clausecker	.section .note.GNU-stack,"",%progbits
1852