/*-
 * Copyright (c) 2013 The Go Authors. All rights reserved.
 * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
 *
 * Adapted from Go's crypto/sha1/sha1block_amd64.s.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *   * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *   * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * SHA-1 block routine. See sha1c.c for C equivalent.
 *
 * There are 80 rounds of 4 types:
 * - rounds 0-15 are type 1 and load data (round1 macro).
 * - rounds 16-19 are type 1 and do not load data (round1x macro).
 * - rounds 20-39 are type 2 and do not load data (round2 macro).
 * - rounds 40-59 are type 3 and do not load data (round3 macro).
 * - rounds 60-79 are type 4 and do not load data (round4 macro).
 *
 * Each round loads or shuffles the data, then computes a per-round
 * function of b, c, d, and then mixes the result into and rotates the
 * five registers a, b, c, d, e holding the intermediate results.
*
 * The register rotation is implemented by rotating the arguments to
 * the round macros instead of by explicit move instructions.
 */

/*
 * load index
 * Fetch 32-bit word number index of the input block at %rsi, byte-swap
 * it and store it into slot index of the 16-word buffer at %rsp.
 * The swapped word is also left in %r10d, where mix expects it.
 * Clobbers: %r10d.
 */
.macro	load	index
	mov	(\index)*4(%rsi), %r10d
	bswap	%r10d
	mov	%r10d, (\index)*4(%rsp)
.endm

/*
 * shuffle index
 * Update the 16-word circular buffer at %rsp in place:
 *   slot[index & 15] = rol(slot[index & 15] ^ slot[(index - 3) & 15] ^
 *                          slot[(index - 8) & 15] ^ slot[(index - 14) & 15], 1)
 * The new word is also left in %r10d, where mix expects it.
 * Clobbers: %r10d, flags.
 */
.macro	shuffle	index
	mov	((\index   )&0xf)*4(%rsp), %r10d
	xor	((\index- 3)&0xf)*4(%rsp), %r10d
	xor	((\index- 8)&0xf)*4(%rsp), %r10d
	xor	((\index-14)&0xf)*4(%rsp), %r10d
	rol	$1, %r10d
	mov	%r10d, ((\index)&0xf)*4(%rsp)
.endm

/*
 * func1 .. func4: per-round functions of b, c, d.  The result is left
 * in %r9d for mix; arguments a and e are accepted but not used.
 *
 *   func1: %r9d = ((c ^ d) & b) ^ d
 *   func2: %r9d = b ^ c ^ d
 *   func3: %r9d = (b & c) | ((b | c) & d)	(also clobbers %r8d)
 *   func4: same as func2
 */
.macro	func1	a, b, c, d, e
	mov	\d, %r9d
	xor	\c, %r9d
	and	\b, %r9d
	xor	\d, %r9d
.endm

.macro	func2	a, b, c, d, e
	mov	\b, %r9d
	xor	\c, %r9d
	xor	\d, %r9d
.endm

.macro	func3	a, b, c, d, e
	mov	\b, %r8d
	or	\c, %r8d
	and	\d, %r8d
	mov	\b, %r9d
	and	\c, %r9d
	or	%r8d, %r9d
.endm

.macro	func4	a, b, c, d, e
	func2	\a, \b, \c, \d, \e
.endm

/*
 * mix a, b, c, d, e, const
 * Finish one round using the function value in %r9d (from funcN) and
 * the data word in %r10d (from load or shuffle):
 *   b = rol(b, 30)
 *   e = e + %r9d + %r10d + const + rol(a, 5)
 * Clobbers: %r8d, flags.
 */
.macro	mix	a, b, c, d, e, const
	rol	$30, \b
	add	%r9d, \e
	mov	\a, %r8d
	rol	$5, %r8d
	lea	\const(\e, %r10d, 1), \e	// e += data word + const
	add	%r8d, \e
.endm

/*
 * round1 .. round4: one complete round each.  round1 loads its data
 * word from the input block, all others derive it with shuffle.  Each
 * round type pairs one funcN with its own constant.
 */
.macro	round1	a, b, c, d, e, index
	load	\index
	func1	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x5a827999
.endm

.macro	round1x	a, b, c, d, e, index
	shuffle	\index
	func1	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x5a827999
.endm

.macro	round2	a, b, c, d, e, index
	shuffle	\index
	func2	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x6ed9eba1
.endm

.macro	round3	a, b, c, d, e, index
	shuffle	\index
	func3	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0x8f1bbcdc
.endm

.macro	round4	a, b, c, d, e, index
	shuffle	\index
	func4	\a, \b, \c, \d, \e
	mix	\a, \b, \c, \d, \e, 0xca62c1d6
.endm

	// sha1block(SHA1_CTX, buf, len)
/*
 * _libmd_sha1block_scalar
 *
 * In:	%rdi = SHA1_CTX; the five 32-bit state words h0..h4 at offsets
 *	       0, 4, 8, 12, 16 are read on entry and written back on exit
 *	%rsi = buf, the input data
 *	%rdx = len in bytes; it is masked with ~63, so only whole
 *	       64-byte blocks are processed and a tail is ignored
 *
 * Register use inside the loop:
 *	%eax, %ebx, %ecx, %edx, %ebp	working values a, b, c, d, e
 *	%r11d - %r15d			state at the start of the block
 *	%r8d - %r10d			scratch for the round macros
 *	%rsi / %rdi			current block / end pointer
 *	0 .. 63(%rsp)			16-word buffer of load and shuffle
 *
 * %rbp, %rbx and %r12 - %r15 are saved on entry and restored on exit.
 */
ENTRY(_libmd_sha1block_scalar)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rdi			// rdi: SHA1_CTX
	sub	$64+8, %rsp		// 64 bytes for round keys
					// plus alignment

	mov	%rdi, %rbp
					// rsi: buf
	and	$~63, %rdx		// rdx: length in bytes, rounded
					// down to whole 64-byte blocks
	lea	(%rsi, %rdx, 1), %rdi	// rdi: end pointer
	mov	(%rbp), %eax		// c->h0
	mov	4(%rbp), %ebx		// c->h1
	mov	8(%rbp), %ecx		// c->h2
	mov	12(%rbp), %edx		// c->h3
	mov	16(%rbp), %ebp		// c->h4

	cmp	%rsi, %rdi		// any data to process?
	je	.Lend

	// keep the incoming state so it can be added back after 80 rounds
.Lloop:	mov	%eax, %r11d
	mov	%ebx, %r12d
	mov	%ecx, %r13d
	mov	%edx, %r14d
	mov	%ebp, %r15d

	round1	%eax, %ebx, %ecx, %edx, %ebp, 0
	round1	%ebp, %eax, %ebx, %ecx, %edx, 1
	round1	%edx, %ebp, %eax, %ebx, %ecx, 2
	round1	%ecx, %edx, %ebp, %eax, %ebx, 3
	round1	%ebx, %ecx, %edx, %ebp, %eax, 4

	round1	%eax, %ebx, %ecx, %edx, %ebp, 5
	round1	%ebp, %eax, %ebx, %ecx, %edx, 6
	round1	%edx, %ebp, %eax, %ebx, %ecx, 7
	round1	%ecx, %edx, %ebp, %eax, %ebx, 8
	round1	%ebx, %ecx, %edx, %ebp, %eax, 9

	round1	%eax, %ebx, %ecx, %edx, %ebp, 10
	round1	%ebp, %eax, %ebx, %ecx, %edx, 11
	round1	%edx, %ebp, %eax, %ebx, %ecx, 12
	round1	%ecx, %edx, %ebp, %eax, %ebx, 13
	round1	%ebx, %ecx, %edx, %ebp, %eax, 14

	round1	%eax, %ebx, %ecx, %edx, %ebp, 15
	round1x	%ebp, %eax, %ebx, %ecx, %edx, 16
	round1x	%edx, %ebp, %eax, %ebx, %ecx, 17
	round1x	%ecx, %edx, %ebp, %eax, %ebx, 18
	round1x	%ebx, %ecx, %edx, %ebp, %eax, 19

	round2	%eax, %ebx, %ecx, %edx, %ebp, 20
	round2	%ebp, %eax, %ebx, %ecx, %edx, 21
	round2	%edx, %ebp, %eax, %ebx, %ecx, 22
	round2	%ecx, %edx, %ebp, %eax, %ebx, 23
	round2	%ebx, %ecx, %edx, %ebp, %eax, 24

	round2	%eax, %ebx, %ecx, %edx, %ebp, 25
	round2	%ebp, %eax, %ebx, %ecx, %edx, 26
	round2	%edx, %ebp, %eax, %ebx, %ecx, 27
	round2	%ecx, %edx, %ebp, %eax, %ebx, 28
	round2	%ebx, %ecx, %edx, %ebp, %eax, 29

	round2	%eax, %ebx, %ecx, %edx, %ebp, 30
	round2	%ebp, %eax, %ebx, %ecx, %edx, 31
	round2	%edx, %ebp, %eax, %ebx, %ecx, 32
	round2	%ecx, %edx, %ebp, %eax, %ebx, 33
	round2	%ebx, %ecx, %edx, %ebp, %eax, 34

	round2	%eax, %ebx, %ecx, %edx, %ebp, 35
	round2	%ebp, %eax, %ebx, %ecx, %edx, 36
	round2	%edx, %ebp, %eax, %ebx, %ecx, 37
	round2	%ecx, %edx, %ebp, %eax, %ebx, 38
	round2	%ebx, %ecx, %edx, %ebp, %eax, 39

	round3	%eax, %ebx, %ecx, %edx, %ebp, 40
	round3	%ebp, %eax, %ebx, %ecx, %edx, 41
	round3	%edx, %ebp, %eax, %ebx, %ecx, 42
	round3	%ecx, %edx, %ebp, %eax, %ebx, 43
	round3	%ebx, %ecx, %edx, %ebp, %eax, 44

	round3	%eax, %ebx, %ecx, %edx, %ebp, 45
	round3	%ebp, %eax, %ebx, %ecx, %edx, 46
	round3	%edx, %ebp, %eax, %ebx, %ecx, 47
	round3	%ecx, %edx, %ebp, %eax, %ebx, 48
	round3	%ebx, %ecx, %edx, %ebp, %eax, 49

	round3	%eax, %ebx, %ecx, %edx, %ebp, 50
	round3	%ebp, %eax, %ebx, %ecx, %edx, 51
	round3	%edx, %ebp, %eax, %ebx, %ecx, 52
	round3	%ecx, %edx, %ebp, %eax, %ebx, 53
	round3	%ebx, %ecx, %edx, %ebp, %eax, 54

	round3	%eax, %ebx, %ecx, %edx, %ebp, 55
	round3	%ebp, %eax, %ebx, %ecx, %edx, 56
	round3	%edx, %ebp, %eax, %ebx, %ecx, 57
	round3	%ecx, %edx, %ebp, %eax, %ebx, 58
	round3	%ebx, %ecx, %edx, %ebp, %eax, 59

	round4	%eax, %ebx, %ecx, %edx, %ebp, 60
	round4	%ebp, %eax, %ebx, %ecx, %edx, 61
	round4	%edx, %ebp, %eax, %ebx, %ecx, 62
	round4	%ecx, %edx, %ebp, %eax, %ebx, 63
	round4	%ebx, %ecx, %edx, %ebp, %eax, 64

	round4	%eax, %ebx, %ecx, %edx, %ebp, 65
	round4	%ebp, %eax, %ebx, %ecx, %edx, 66
	round4	%edx, %ebp, %eax, %ebx, %ecx, 67
	round4	%ecx, %edx, %ebp, %eax, %ebx, 68
	round4	%ebx, %ecx, %edx, %ebp, %eax, 69

	round4	%eax, %ebx, %ecx, %edx, %ebp, 70
	round4	%ebp, %eax, %ebx, %ecx, %edx, 71
	round4	%edx, %ebp, %eax, %ebx, %ecx, 72
	round4	%ecx, %edx, %ebp, %eax, %ebx, 73
	round4	%ebx, %ecx, %edx, %ebp, %eax, 74

	round4	%eax, %ebx, %ecx, %edx, %ebp, 75
	round4	%ebp, %eax, %ebx, %ecx, %edx, 76
	round4	%edx, %ebp, %eax, %ebx, %ecx, 77
	round4	%ecx, %edx, %ebp, %eax, %ebx, 78
	round4	%ebx, %ecx, %edx, %ebp, %eax, 79

	// add the state saved at the top of the loop to the round output
	add	%r11d, %eax
	add	%r12d, %ebx
	add	%r13d, %ecx
	add	%r14d, %edx
	add	%r15d, %ebp

	add	$64, %rsi		// advance to the next block
	cmp	%rdi, %rsi
	jb	.Lloop

.Lend:	add	$64+8, %rsp
	pop	%rdi			// SHA1_CTX
	mov	%eax, (%rdi)
	mov	%ebx, 4(%rdi)
	mov	%ecx, 8(%rdi)
	mov	%edx, 12(%rdi)
	mov	%ebp, 16(%rdi)

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
END(_libmd_sha1block_scalar)

/*
 * This is the implementation using AVX2, BMI1 and BMI2.
* It is based on:
 * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
 * From http://software.intel.com/en-us/articles
 * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
 * This implementation is 2x unrolled, and interleaves vector instructions,
 * used to precompute W, with scalar computation of current round
 * for optimal scheduling.
 */

	/* trivial helper macros */

/*
 * update_hash a, tb, c, d, e
 * Add the five 32-bit words at 0, 4, 8, 12, 16(%r9) to the argument
 * registers and store the sums back to the same locations.  The sums
 * also remain in the argument registers.
 */
.macro	update_hash	a, tb, c, d, e
	add	(%r9), \a
	mov	\a, (%r9)
	add	4(%r9), \tb
	mov	\tb, 4(%r9)
	add	8(%r9), \c
	mov	\c, 8(%r9)
	add	12(%r9), \d
	mov	\d, 12(%r9)
	add	16(%r9), \e
	mov	\e, 16(%r9)
.endm

	/* helper macros for precalc, which does precomputations */

/* load 16 bytes at offset(%r10) into %xmm0 */
.macro	precalc0	offset
	vmovdqu	\offset(%r10), %xmm0
.endm

/* insert 16 bytes at offset(%r13) into the upper half of %ymm0 */
.macro	precalc1	offset
	vinserti128	$1, \offset(%r13), %ymm0, %ymm0
.endm

/* yreg = bytes of %ymm0 shuffled by the mask in %ymm10 */
.macro	precalc2	yreg
	vpshufb	%ymm10, %ymm0, \yreg
.endm

/* %ymm0 = yreg + the eight 32-bit constants at k_offset(%r8) */
.macro	precalc4	yreg, k_offset
	vpaddd	\k_offset(%r8), \yreg, %ymm0
.endm

/* store %ymm0 to the temp buffer at (offset * 2)(%r14) */
.macro	precalc7	offset
	vmovdqu	%ymm0, (\offset)*2(%r14)
.endm

/*
 * Message scheduling pre-compute for rounds 0-15
 * r13 is a pointer to the even 64-byte block
 * r10 is a pointer to the odd 64-byte block
 * r14 is a pointer to the temp buffer
 * ymm0 (and its low half xmm0) is used as a temp register
 * yreg is clobbered as part of the computation
 * offset chooses a 16 byte chunk within a block
 * r8 is a pointer to the constants block
 * k_offset chooses K constants relevant to this round
 * ymm10 holds the swap mask
 */
.macro	precalc00_15	offset, yreg
	precalc0	\offset
	precalc1	\offset
	precalc2	\yreg
	precalc4	\yreg, 0
	precalc7	\offset
.endm

	/*
	 * helper macros for precalc16_31, applied in numeric order;
	 * they communicate through reg, %ymm0 and %ymm9
	 */
.macro	precalc16	reg_sub16, reg_sub12, reg_sub4, reg
	vpalignr	$8, \reg_sub16, \reg_sub12, \reg	// w[i - 14]
	vpsrldq	$4, \reg_sub4, %ymm0			// w[i - 3]
.endm

.macro	precalc17	reg_sub16, reg_sub8, reg
	vpxor	\reg_sub8, \reg, \reg
	vpxor	\reg_sub16, %ymm0, %ymm0
.endm

.macro	precalc18	reg
	vpxor	%ymm0, \reg, \reg
	vpslldq	$12, \reg, %ymm9
.endm

.macro	precalc19	reg
	vpslld	$1, \reg, %ymm0
	vpsrld	$31, \reg, \reg
.endm

.macro	precalc20	reg
	vpor	\reg, %ymm0, %ymm0
	vpslld	$2, %ymm9, \reg
.endm

.macro	precalc21	reg
	vpsrld	$30, %ymm9, %ymm9
	vpxor	\reg, %ymm0, %ymm0
.endm

/* final step: reg gets the result; reg + K is stored at offset(%r14) */
.macro	precalc23	reg, k_offset, offset
	vpxor	%ymm9, %ymm0, \reg
	vpaddd	\k_offset(%r8), \reg, %ymm0
	vmovdqu	%ymm0, (\offset)(%r14)
.endm

/*
 * Message scheduling pre-compute for rounds 16-31
 * calculating last 32 w[i] values in 8 XMM registers
 * pre-calculate K+w[i] values and store to mem
 * for later load by ALU add instruction.
 * "brute force" vectorization for rounds 16-31 only
 * due to w[i]->w[i-3] dependency.
* clobbers 5 input ymm registers REG_SUB*
 * uses ymm0 and ymm9 as temp registers
 * As always, r8 is a pointer to constants block
 * and r14 is a pointer to temp buffer
 *
 * NOTE(review): the helper macros invoked below write only reg, ymm0,
 * ymm9 and the temp buffer; the reg_sub* arguments are only read.
 */
.macro	precalc16_31	reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
	precalc16	\reg_sub16, \reg_sub12, \reg_sub4, \reg
	precalc17	\reg_sub16, \reg_sub8, \reg
	precalc18	\reg
	precalc19	\reg
	precalc20	\reg
	precalc21	\reg
	precalc23	\reg, \k_offset, \offset
.endm

	/*
	 * helper macros for precalc32_79, applied in numeric order;
	 * they communicate through reg and %ymm0
	 */
.macro	precalc32	reg_sub8, reg_sub4
	vpalignr	$8, \reg_sub8, \reg_sub4, %ymm0
.endm

.macro	precalc33	reg_sub28, reg
	vpxor	\reg_sub28, \reg, \reg
.endm

.macro	precalc34	reg_sub16
	vpxor	\reg_sub16, %ymm0, %ymm0
.endm

.macro	precalc35	reg
	vpxor	%ymm0, \reg, \reg
.endm

/* precalc36 and precalc37 together rotate each 32-bit lane left by 2 */
.macro	precalc36	reg
	vpslld	$2, \reg, %ymm0
.endm

.macro	precalc37	reg
	vpsrld	$30, \reg, \reg
	vpor	\reg, %ymm0, \reg
.endm

/* final step: reg + K is stored at offset(%r14); reg keeps the result */
.macro	precalc39	reg, k_offset, offset
	vpaddd	\k_offset(%r8), \reg, %ymm0
	vmovdqu	%ymm0, \offset(%r14)
.endm

/*
 * Message scheduling pre-compute for rounds 32-79.
 * reg is both an input and the output; reg_sub4, reg_sub8, reg_sub16
 * and reg_sub28 are only read.  Uses ymm0 as a temp register, r8 as
 * the pointer to the constants block and r14 as the pointer to the
 * temp buffer.
 */
.macro	precalc32_79	reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
	precalc32	\reg_sub8, \reg_sub4
	precalc33	\reg_sub28, \reg
	precalc34	\reg_sub16
	precalc35	\reg
	precalc36	\reg
	precalc37	\reg
	precalc39	\reg, \k_offset, \offset
.endm

/*
 * Complete pre-computation for one pair of blocks: fills the temp
 * buffer at r14 from offset 0x000 up to 0x27f in steps of 0x20 bytes.
 * k_offset selects one of four 0x20-byte groups in the constants block.
 */
.macro	precalc
	precalc00_15	0x00, %ymm15
	precalc00_15	0x10, %ymm14
	precalc00_15	0x20, %ymm13
	precalc00_15	0x30, %ymm12
	precalc16_31	%ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
	precalc16_31	%ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
	precalc16_31	%ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0
	precalc16_31	%ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0
	precalc32_79	%ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100
	precalc32_79	%ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120
	precalc32_79	%ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140
	precalc32_79	%ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160
	precalc32_79	%ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180
	precalc32_79	%ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0
	precalc32_79	%ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0
	precalc32_79	%ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0
	precalc32_79	%ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200
	precalc32_79	%ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220
	precalc32_79	%ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240
	precalc32_79	%ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260
.endm

/*
 * Macros calculating individual rounds have general form
 * calc_round_pre + precalc_round + calc_round_post
 * calc_round_{pre,post} macros follow
 */

/*
 * calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e
 *   reg_e += 32-bit word at offset(%r15)
 *   %ebp   = ~reg_a & reg_c
 *   reg_e += reg_b
 *   %r12d  = reg_a rotated right by 27 (that is, left by 5)
 *   reg_b  = reg_a rotated right by 2
 * NOTE(review): r15 presumably points at the K+w[i] values stored
 * through r14; confirm where r15 is set up.
 */
.macro	calc_f1_pre	offset, reg_a, reg_b, reg_c, reg_e
	add	\offset(%r15), \reg_e
	andn	\reg_c, \reg_a, %ebp
	add	\reg_b, \reg_e			// add F from the previous round
	rorx	$0x1b, \reg_a, %r12d
	rorx	$2, \reg_a, \reg_b		// for the next round
.endm

/*
 * Calculate F for the next round
 *   reg_a  = (reg_a & reg_b) ^ %ebp
 *   reg_e += %r12d
 */
.macro	calc_f1_post	reg_a, reg_b, reg_e
	and	\reg_b, \reg_a			// b & c
	xor	%ebp, \reg_a			// F1 = (b&c) ^ (~b&d)
	add	%r12d, \reg_e
.endm

/*
 * Registers
* are cyclically rotated:
 * edx -> eax -> edi -> esi -> ebx -> ecx
 *
 * calc0 .. calc5 are rounds of the calc_f1_pre/calc_f1_post form.
 * The first argument of calc_f1_pre is the offset of the word added
 * from the buffer at r15.  Some rounds carry one interleaved precalc
 * step between the pre and post halves.
 */

/*
 * calc0 additionally computes F for the very first round, since no
 * previous round has done so:
 *   %ebx = (%esi & %edi) ^ (~%esi & %eax),  %esi = %esi rotated right by 2
 */
.macro	calc0
	mov	%esi, %ebx			// precalculate first round
	rorx	$2, %esi, %esi
	andn	%eax, %ebx, %ebp
	and	%edi, %ebx
	xor	%ebp, %ebx
	calc_f1_pre	0x0, %ecx, %ebx, %edi, %edx
	precalc0	0x80
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro	calc1
	calc_f1_pre	0x4, %edx, %ecx, %esi, %eax
	precalc1	0x80
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro	calc2
	calc_f1_pre	0x8, %eax, %edx, %ebx, %edi
	precalc2	%ymm15
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro	calc3
	calc_f1_pre	0xc, %edi, %eax, %ecx, %esi
	calc_f1_post	%edi, %edx, %esi
.endm

.macro	calc4
	calc_f1_pre	0x20, %esi, %edi, %edx, %ebx
	precalc4	%ymm15, 0x0
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro	calc5
	calc_f1_pre	0x24, %ebx, %esi, %eax, %ecx
	calc_f1_post	%ebx, %edi, %ecx
.endm


/*
 * calc6 - calc18: remaining rounds built from the calc_f1_pre /
 * calc_f1_post pair.  Offsets cover the low 16 bytes of each 32-byte
 * row (0x28, 0x2c, 0x40, ...).  Some rounds invoke a precalc macro
 * between the two halves; the others have none.
 */
.macro calc6
	calc_f1_pre	0x28, %ecx, %ebx, %edi, %edx
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro calc7
	calc_f1_pre	0x2c, %edx, %ecx, %esi, %eax
	precalc7	0x0
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro calc8
	calc_f1_pre	0x40, %eax, %edx, %ebx, %edi
	precalc0	0x90
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro calc9
	calc_f1_pre	0x44, %edi, %eax, %ecx, %esi
	precalc1	0x90
	calc_f1_post	%edi, %edx, %esi
.endm

.macro calc10
	calc_f1_pre	0x48, %esi, %edi, %edx, %ebx
	precalc2	%ymm14
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro calc11
	calc_f1_pre	0x4c, %ebx, %esi, %eax, %ecx
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro calc12
	calc_f1_pre	0x60, %ecx, %ebx, %edi, %edx
	precalc4	%ymm14, 0
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro calc13
	calc_f1_pre	0x64, %edx, %ecx, %esi, %eax
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro calc14
	calc_f1_pre	0x68, %eax, %edx, %ebx, %edi
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro calc15
	calc_f1_pre	0x6c, %edi, %eax, %ecx, %esi
	precalc7	0x10
	calc_f1_post	%edi, %edx, %esi
.endm

.macro calc16
	calc_f1_pre	0x80, %esi, %edi, %edx, %ebx
	precalc0	0xa0
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro calc17
	calc_f1_pre	0x84, %ebx, %esi, %eax, %ecx
	precalc1	0xa0
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro calc18
	calc_f1_pre	0x88, %ecx, %ebx, %edi, %edx
	precalc2	%ymm13
	calc_f1_post	%ecx, %esi, %edx
.endm

/*
 * calc_f2_pre offset, reg_a, reg_b, reg_e
 *
 * First half of an XOR-type round:
 *	reg_e += \offset(%r15)
 *	reg_e += reg_b		(F carried over from the previous round)
 *	r12d   = reg_a ror 27	(same as rol 5)
 *	reg_b  = reg_a ror 2	(consumed by the next round)
 * Clobbers %r12d and the flags.
 * NOTE(review): \offset(%r15) is presumably the buffer of precomputed
 * schedule words plus round constant -- confirm against the precalc
 * macros.
 */
.macro calc_f2_pre offset, reg_a, reg_b, reg_e
	add	\offset(%r15), \reg_e
	add	\reg_b, \reg_e		// add F from the previous round
	rorx	$27, \reg_a, %r12d
	rorx	$2, \reg_a, \reg_b	// for next round
.endm

/*
 * calc_f2_post reg_a, reg_b, reg_c, reg_e
 *
 * Second half of an XOR-type round:
 *	reg_a ^= reg_b ^ reg_c
 *	reg_e += r12d		(the rotate left by calc_f2_pre)
 * The XOR result left in reg_a is apparently F for the next round, by
 * analogy with calc_f3_post.
 */
.macro calc_f2_post reg_a, reg_b, reg_c, reg_e
	xor	\reg_b, \reg_a
	add	%r12d, \reg_e
	xor	\reg_c, \reg_a
.endm

/*
 * calc19 - calc38: rounds built from the calc_f2_pre / calc_f2_post
 * pair.
 */
.macro calc19
	calc_f2_pre	0x8c, %edx, %ecx, %eax
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc20
	calc_f2_pre	0xa0, %eax, %edx, %edi
	precalc4	%ymm13, 0x0
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc21
	calc_f2_pre	0xa4, %edi, %eax, %esi
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc22
	calc_f2_pre	0xa8, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc23
	calc_f2_pre	0xac, %ebx, %esi, %ecx
	precalc7	0x20
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc24
	calc_f2_pre	0xc0, %ecx, %ebx, %edx
	precalc0	0xb0
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc25
	calc_f2_pre	0xc4, %edx, %ecx, %eax
	precalc1	0xb0
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc26
	calc_f2_pre	0xc8, %eax, %edx, %edi
	precalc2	%ymm12
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc27
	calc_f2_pre	0xcc, %edi, %eax, %esi
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc28
	calc_f2_pre	0xe0, %esi, %edi, %ebx
	precalc4	%ymm12, 0x0
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc29
	calc_f2_pre	0xe4, %ebx, %esi, %ecx
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc30
	calc_f2_pre	0xe8, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc31
	calc_f2_pre	0xec, %edx, %ecx, %eax
	precalc7	0x30
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc32
	calc_f2_pre	0x100, %eax, %edx, %edi
	precalc16	%ymm15, %ymm14, %ymm12, %ymm8
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc33
	calc_f2_pre	0x104, %edi, %eax, %esi
	precalc17	%ymm15, %ymm13, %ymm8
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc34
	calc_f2_pre	0x108, %esi, %edi, %ebx
	precalc18	%ymm8
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc35
	calc_f2_pre	0x10c, %ebx, %esi, %ecx
	precalc19	%ymm8
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc36
	calc_f2_pre	0x120, %ecx, %ebx, %edx
	precalc20	%ymm8
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc37
	calc_f2_pre	0x124, %edx, %ecx, %eax
	precalc21	%ymm8
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc38
	calc_f2_pre	0x128, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

/*
 * calc_f3_pre offset, reg_e
 *
 * First half of an and/or-type round: reg_e += \offset(%r15).
 */
.macro calc_f3_pre offset, reg_e
	add	\offset(%r15), \reg_e
.endm

/*
 * calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
 *
 * Second half of an and/or-type round:
 *	reg_e += reg_tb		(F carried over from the previous round)
 *	r12d   = reg_a ror 27	(same as rol 5)
 *	reg_tb = reg_a ror 2
 *	reg_a  = (reg_a & reg_b) | ((reg_a | reg_b) & reg_c)
 *	reg_e += r12d
 * Both rotates read reg_a before it is overwritten with the new F.
 * Clobbers %ebp, %r12d and the flags.
 */
.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb
	add	\reg_tb, \reg_e		// add F from the previous round
	mov	\reg_b, %ebp
	or	\reg_a, %ebp
	rorx	$27, \reg_a, %r12d
	rorx	$2, \reg_a, \reg_tb
	and	\reg_c, %ebp		// calculate F for the next round
	and	\reg_b, \reg_a
	or	%ebp, \reg_a
	add	%r12d, \reg_e
.endm

/*
 * calc39 - calc57: rounds built from the calc_f3_pre / calc_f3_post
 * pair.
 */
.macro calc39
	calc_f3_pre	0x12c, %esi
	precalc23	%ymm8, 0x0, 0x80
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro calc40
	calc_f3_pre	0x140, %ebx
	precalc16	%ymm14, %ymm13, %ymm8, %ymm7
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro calc41
	calc_f3_pre	0x144, %ecx
	precalc17	%ymm14, %ymm12, %ymm7
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc42
	calc_f3_pre	0x148, %edx
	precalc18	%ymm7
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc43
	calc_f3_pre	0x14c, %eax
	precalc19	%ymm7
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc44
	calc_f3_pre	0x160, %edi
	precalc20	%ymm7
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc45
	calc_f3_pre	0x164, %esi
	precalc21	%ymm7
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro calc46
	calc_f3_pre	0x168, %ebx
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

/*
 * calc47 spells out its vector step instead of invoking a precalc
 * macro: ymm7 = ymm0 ^ ymm9, ymm0 = ymm7 + 0x20(%r8), and the sum is
 * stored to 0xa0(%r14).
 */
.macro calc47
	calc_f3_pre	0x16c, %ecx
	vpxor	%ymm9, %ymm0, %ymm7
	vpaddd	0x20(%r8), %ymm7, %ymm0
	vmovdqu	%ymm0, 0xa0(%r14)
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc48
	calc_f3_pre	0x180, %edx
	precalc16	%ymm13, %ymm12, %ymm7, %ymm5
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc49
	calc_f3_pre	0x184, %eax
	precalc17	%ymm13, %ymm8, %ymm5
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc50
	calc_f3_pre	0x188, %edi
	precalc18	%ymm5
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc51
	calc_f3_pre	0x18c, %esi
	precalc19	%ymm5
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro calc52
	calc_f3_pre	0x1a0, %ebx
	precalc20	%ymm5
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro calc53
	calc_f3_pre	0x1a4, %ecx
	precalc21	%ymm5
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro calc54
	calc_f3_pre	0x1a8, %edx
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro calc55
	calc_f3_pre	0x1ac, %eax
	precalc23	%ymm5, 0x20, 0xc0
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro calc56
	calc_f3_pre	0x1c0, %edi
	precalc16	%ymm12, %ymm8, %ymm5, %ymm3
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro calc57
	calc_f3_pre	0x1c4, %esi
	precalc17	%ymm12, %ymm7, %ymm3
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm


/*
 * calc58 is the last round in this run that uses the calc_f3 helper
 * pair.
 */
.macro calc58
	calc_f3_pre	0x1c8, %ebx
	precalc18	%ymm3
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

/*
 * calc59 - calc64: from here on the rounds use the calc_f2_pre /
 * calc_f2_post pair again.
 */
.macro calc59
	calc_f2_pre	0x1cc, %ebx, %esi, %ecx
	precalc19	%ymm3
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc60
	calc_f2_pre	0x1e0, %ecx, %ebx, %edx
	precalc20	%ymm3
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc61
	calc_f2_pre	0x1e4, %edx, %ecx, %eax
	precalc21	%ymm3
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc62
	calc_f2_pre	0x1e8, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc63
	calc_f2_pre	0x1ec, %edi, %eax, %esi
	precalc23	%ymm3, 0x20, 0xe0
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc64
	calc_f2_pre	0x200, %esi, %edi, %ebx
	precalc32	%ymm5, %ymm3
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

/*
 * calc65 - calc78: rounds built from the calc_f2_pre / calc_f2_post
 * pair, with offsets 0x204 - 0x268.
 */
.macro calc65
	calc_f2_pre	0x204, %ebx, %esi, %ecx
	precalc33	%ymm14, %ymm15
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc66
	calc_f2_pre	0x208, %ecx, %ebx, %edx
	precalc34	%ymm8
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc67
	calc_f2_pre	0x20c, %edx, %ecx, %eax
	precalc35	%ymm15
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc68
	calc_f2_pre	0x220, %eax, %edx, %edi
	precalc36	%ymm15
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc69
	calc_f2_pre	0x224, %edi, %eax, %esi
	precalc37	%ymm15
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc70
	calc_f2_pre	0x228, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc71
	calc_f2_pre	0x22c, %ebx, %esi, %ecx
	precalc39	%ymm15, 0x20, 0x100
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc72
	calc_f2_pre	0x240, %ecx, %ebx, %edx
	precalc32	%ymm3, %ymm15
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc73
	calc_f2_pre	0x244, %edx, %ecx, %eax
	precalc33	%ymm13, %ymm14
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc74
	calc_f2_pre	0x248, %eax, %edx, %edi
	precalc34	%ymm7
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc75
	calc_f2_pre	0x24c, %edi, %eax, %esi
	precalc35	%ymm14
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc76
	calc_f2_pre	0x260, %esi, %edi, %ebx
	precalc36	%ymm14
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc77
	calc_f2_pre	0x264, %ebx, %esi, %ecx
	precalc37	%ymm14
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc78
	calc_f2_pre	0x268, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

/*
 * calc79 is written out by hand: it performs the two additions and
 * the rotate by 27 (rol 5) into %r12d, but neither the rotate by 2
 * nor the XOR step that the calc_f2 helpers would emit.
 */
.macro calc79
	add	0x26c(%r15), %eax
	add	%ecx, %eax
	rorx	$27, %edx, %r12d
	precalc39	%ymm14, 0x20, 0x120
	add	%r12d, %eax
.endm

/*
 * Similar to calc0: calc80 derives its own F value inline,
 *	edx = (ecx & ebx) ^ (~ecx & esi)
 * and rotates ecx right by 2, before the regular pre/precalc/post
 * sequence.  %ebp is scratch.  From calc80 on, the offsets cover the
 * high 16 bytes of each 32-byte row (0x10, 0x14, 0x18, 0x1c, 0x30,
 * ...).
 */
.macro calc80
	mov	%ecx, %edx		// precalculate first round
	rorx	$2, %ecx, %ecx
	andn	%esi, %edx, %ebp	// ebp = ~edx & esi
	and	%ebx, %edx
	xor	%ebp, %edx
	calc_f1_pre	0x10, %eax, %edx, %ebx, %edi
	precalc32	%ymm15, %ymm14
	calc_f1_post	%eax, %ecx, %edi
.endm

/*
 * calc81 - calc98: rounds built from the calc_f1_pre / calc_f1_post
 * pair.
 */
.macro calc81
	calc_f1_pre	0x14, %edi, %eax, %ecx, %esi
	precalc33	%ymm12, %ymm13
	calc_f1_post	%edi, %edx, %esi
.endm

.macro calc82
	calc_f1_pre	0x18, %esi, %edi, %edx, %ebx
	precalc34	%ymm5
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro calc83
	calc_f1_pre	0x1c, %ebx, %esi, %eax, %ecx
	precalc35	%ymm13
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro calc84
	calc_f1_pre	0x30, %ecx, %ebx, %edi, %edx
	precalc36	%ymm13
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro calc85
	calc_f1_pre	0x34, %edx, %ecx, %esi, %eax
	precalc37	%ymm13
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro calc86
	calc_f1_pre	0x38, %eax, %edx, %ebx, %edi
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro calc87
	calc_f1_pre	0x3c, %edi, %eax, %ecx, %esi
	precalc39	%ymm13, 0x40, 0x140
	calc_f1_post	%edi, %edx, %esi
.endm

.macro calc88
	calc_f1_pre	0x50, %esi, %edi, %edx, %ebx
	precalc32	%ymm14, %ymm13
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro calc89
	calc_f1_pre	0x54, %ebx, %esi, %eax, %ecx
	precalc33	%ymm8, %ymm12
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro calc90
	calc_f1_pre	0x58, %ecx, %ebx, %edi, %edx
	precalc34	%ymm3
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro calc91
	calc_f1_pre	0x5c, %edx, %ecx, %esi, %eax
	precalc35	%ymm12
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro calc92
	calc_f1_pre	0x70, %eax, %edx, %ebx, %edi
	precalc36	%ymm12
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro calc93
	calc_f1_pre	0x74, %edi, %eax, %ecx, %esi
	precalc37	%ymm12
	calc_f1_post	%edi, %edx, %esi
.endm

.macro calc94
	calc_f1_pre	0x78, %esi, %edi, %edx, %ebx
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro calc95
	calc_f1_pre	0x7c, %ebx, %esi, %eax, %ecx
	precalc39	%ymm12, 0x40, 0x160
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro calc96
	calc_f1_pre	0x90, %ecx, %ebx, %edi, %edx
	precalc32	%ymm13, %ymm12
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro calc97
	calc_f1_pre	0x94, %edx, %ecx, %esi, %eax
	precalc33	%ymm7, %ymm8
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro calc98
	calc_f1_pre	0x98, %eax, %edx, %ebx, %edi
	precalc34	%ymm15
	calc_f1_post	%eax, %ecx, %edi
.endm

/*
 * calc99 - calc104: rounds built from the calc_f2_pre / calc_f2_post
 * pair.
 */
.macro calc99
	calc_f2_pre	0x9c, %edi, %eax, %esi
	precalc35	%ymm8
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc100
	calc_f2_pre	0xb0, %esi, %edi, %ebx
	precalc36	%ymm8
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc101
	calc_f2_pre	0xb4, %ebx, %esi, %ecx
	precalc37	%ymm8
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc102
	calc_f2_pre	0xb8, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc103
	calc_f2_pre	0xbc, %edx, %ecx, %eax
	precalc39	%ymm8, 0x40, 0x180
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc104
	calc_f2_pre	0xd0, %eax, %edx, %edi
	precalc32	%ymm12, %ymm8
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

/*
 * calc105 - calc111: rounds built from the calc_f2_pre / calc_f2_post
 * pair, with offsets 0xd4 - 0xfc (high 16 bytes of each 32-byte row).
 */
.macro calc105
	calc_f2_pre	0xd4, %edi, %eax, %esi
	precalc33	%ymm5, %ymm7
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc106
	calc_f2_pre	0xd8, %esi, %edi, %ebx
	precalc34	%ymm14
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc107
	calc_f2_pre	0xdc, %ebx, %esi, %ecx
	precalc35	%ymm7
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc108
	calc_f2_pre	0xf0, %ecx, %ebx, %edx
	precalc36	%ymm7
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc109
	calc_f2_pre	0xf4, %edx, %ecx, %eax
	precalc37	%ymm7
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc110
	calc_f2_pre	0xf8, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc111
	calc_f2_pre	0xfc, %edi, %eax, %esi
	precalc39	%ymm7, 0x40, 0x1a0
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

/*
 * calc112 - calc118: rounds built from the calc_f2_pre / calc_f2_post
 * pair, with offsets 0x110 - 0x138 (high 16 bytes of each 32-byte
 * row).
 */
.macro calc112
	calc_f2_pre	0x110, %esi, %edi, %ebx
	precalc32	%ymm8, %ymm7
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro calc113
	calc_f2_pre	0x114, %ebx, %esi, %ecx
	precalc33	%ymm3, %ymm5
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro calc114
	calc_f2_pre	0x118, %ecx, %ebx, %edx
	precalc34	%ymm13
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro calc115
	calc_f2_pre	0x11c, %edx, %ecx, %eax
	precalc35	%ymm5
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro calc116
	calc_f2_pre	0x130, %eax, %edx, %edi
	precalc36	%ymm5
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro calc117
	calc_f2_pre	0x134, %edi, %eax, %esi
	precalc37	%ymm5
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro calc118
	calc_f2_pre	0x138, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
/*
 * calc119 .. calc159: the remaining steps of the unrolled AVX2 loop
 * (expanded, in numeric order, inside _libmd_sha1block_avx2).
 *
 * calc119 .. calc138 use the calc_f3_pre/calc_f3_post pair, calc139 ..
 * calc158 use calc_f2_pre/calc_f2_post again, and calc159 is written
 * out by hand.  As in the preceding steps, the leading hex argument
 * advances by 4 within a group of four steps and by 0x20 between
 * groups; calc159 shows it being used as a displacement off %r15.
 * Every step carries at most one precalcNN invocation; the steps whose
 * number is 6 modulo 8 carry none.
 *
 * The register arguments rotate from one step to the next, so neither
 * the order of the steps nor the order of the lines within a step may
 * be changed.
 */
.macro	calc119
	calc_f3_pre	0x13c, %ecx
	precalc39	%ymm5, 0x40, 0x1c0
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc120
	calc_f3_pre	0x150, %edx
	precalc32	%ymm7, %ymm5
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc121
	calc_f3_pre	0x154, %eax
	precalc33	%ymm15, %ymm3
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc122
	calc_f3_pre	0x158, %edi
	precalc34	%ymm12
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc123
	calc_f3_pre	0x15c, %esi
	precalc35	%ymm3
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc124
	calc_f3_pre	0x170, %ebx
	precalc36	%ymm3
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc125
	calc_f3_pre	0x174, %ecx
	precalc37	%ymm3
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc126
	calc_f3_pre	0x178, %edx
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc127
	calc_f3_pre	0x17c, %eax
	precalc39	%ymm3, 0x60, 0x1e0
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc128
	calc_f3_pre	0x190, %edi
	precalc32	%ymm5, %ymm3
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc129
	calc_f3_pre	0x194, %esi
	precalc33	%ymm14, %ymm15
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc130
	calc_f3_pre	0x198, %ebx
	precalc34	%ymm8
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc131
	calc_f3_pre	0x19c, %ecx
	precalc35	%ymm15
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc132
	calc_f3_pre	0x1b0, %edx
	precalc36	%ymm15
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc133
	calc_f3_pre	0x1b4, %eax
	precalc37	%ymm15
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc134
	calc_f3_pre	0x1b8, %edi
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc135
	calc_f3_pre	0x1bc, %esi
	precalc39	%ymm15, 0x60, 0x200
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc136
	calc_f3_pre	0x1d0, %ebx
	precalc32	%ymm3, %ymm15
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc137
	calc_f3_pre	0x1d4, %ecx
	precalc33	%ymm13, %ymm14
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc138
	calc_f3_pre	0x1d8, %edx
	precalc34	%ymm7
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

	// from here on the steps use the f2 helper pair again
.macro	calc139
	calc_f2_pre	0x1dc, %edx, %ecx, %eax
	precalc35	%ymm14
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc140
	calc_f2_pre	0x1f0, %eax, %edx, %edi
	precalc36	%ymm14
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc141
	calc_f2_pre	0x1f4, %edi, %eax, %esi
	precalc37	%ymm14
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc142
	calc_f2_pre	0x1f8, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc143
	calc_f2_pre	0x1fc, %ebx, %esi, %ecx
	precalc39	%ymm14, 0x60, 0x220
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc144
	calc_f2_pre	0x210, %ecx, %ebx, %edx
	precalc32	%ymm15, %ymm14
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc145
	calc_f2_pre	0x214, %edx, %ecx, %eax
	precalc33	%ymm12, %ymm13
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc146
	calc_f2_pre	0x218, %eax, %edx, %edi
	precalc34	%ymm5
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc147
	calc_f2_pre	0x21c, %edi, %eax, %esi
	precalc35	%ymm13
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc148
	calc_f2_pre	0x230, %esi, %edi, %ebx
	precalc36	%ymm13
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc149
	calc_f2_pre	0x234, %ebx, %esi, %ecx
	precalc37	%ymm13
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc150
	calc_f2_pre	0x238, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc151
	calc_f2_pre	0x23c, %edx, %ecx, %eax
	precalc39	%ymm13, 0x60, 0x240
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc152
	calc_f2_pre	0x250, %eax, %edx, %edi
	precalc32	%ymm14, %ymm13
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc153
	calc_f2_pre	0x254, %edi, %eax, %esi
	precalc33	%ymm8, %ymm12
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc154
	calc_f2_pre	0x258, %esi, %edi, %ebx
	precalc34	%ymm3
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc155
	calc_f2_pre	0x25c, %ebx, %esi, %ecx
	precalc35	%ymm12
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc156
	calc_f2_pre	0x270, %ecx, %ebx, %edx
	precalc36	%ymm12
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc157
	calc_f2_pre	0x274, %edx, %ecx, %eax
	precalc37	%ymm12
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc158
	calc_f2_pre	0x278, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

	/*
	 * Final step, open-coded instead of going through the pre/post
	 * helpers: only %esi is accumulated here.
	 */
.macro	calc159
	add		0x27c(%r15), %esi	// last dword of the displacement series
	add		%eax, %esi
	rorx		$0x1b, %edi, %r12d	// %r12d = %edi rotated left by 5
	precalc39	%ymm12, 0x60, 0x260
	add		%r12d, %esi
.endm
/*
 * sha1block(SHA1_CTX, buf, len) -- AVX2 variant.
 *
 * ABI: System V AMD64.  %rdi = SHA1_CTX, %rsi = buf, %rdx = len.
 * len is rounded down to a multiple of 64 before use.
 *
 * The unrolled loop below expands calc0..calc159 and folds the result
 * into the context twice per trip (once after calc79, once after
 * calc159), i.e. it is laid out to consume two 64-byte blocks per trip.
 *
 * Register roles established in the prologue:
 *	%r8	address of k_xmm_ar; also the sentinel that %r10 and
 *		%r13 are replaced with
 *	%r9	SHA1_CTX pointer
 *	%r10	input pointer, starts at buf, advanced by 128 per trip;
 *		replaced by %r8 once it (or %r13) reaches %r11
 *	%r13	input pointer, starts at buf + 64, advanced by 128 per
 *		trip; replaced by %r8 in the prologue if not below %r11
 *	%r11	buf + len + 64, the limit both pointers are compared to
 *	%r14,	%rsp and %rsp + 2*4*80+32, exchanged after the initial
 *	%r15	precalc and again at the end of every full trip
 *	%ymm10	bswap_shufb_ctl
 *
 * Stack: six callee-saved pushes plus a 1408+8 byte frame.  With the
 * return address and the pushes %rsp is 8 modulo 16; the extra 8 bytes
 * bring the frame back to 16-byte alignment.
 *
 * NOTE(review): the Go routine this was adapted from is documented to
 * read up to 192 bytes past the end of its input and relies on its
 * caller to hold back the tail.  There is no length guard in this
 * routine either, so the caller presumably has to do the same --
 * confirm against the dispatch code.
 */
ENTRY(_libmd_sha1block_avx2)
	push		%rbx
	push		%rbp
	push		%r12
	push		%r13
	push		%r14
	push		%r15
	sub		$1408+8, %rsp

	and		$~63, %rdx			// whole blocks only
	lea		k_xmm_ar(%rip), %r8
	mov		%rdi, %r9
	mov		%rsi, %r10
	lea		64(%rsi), %r13
	lea		64(%rsi, %rdx), %r11
	cmp		%r11, %r13
	cmovae		%r8, %r13			// %r13 at/after the limit: use %r8 instead
	vmovdqu		bswap_shufb_ctl(%rip), %ymm10

	// load the five 32-bit state words from the context
	mov		(%r9), %ecx
	mov		4(%r9), %esi
	mov		8(%r9), %edi
	mov		12(%r9), %eax
	mov		16(%r9), %edx
	mov		%rsp, %r14
	lea		2*4*80+32(%rsp), %r15
	precalc						// precalc WK for first 2 blocks
	xchg		%r14, %r15

	// this is unrolled
.Loop:	cmp		%r8, %r10			// %r10 == %r8 (the k_xmm_ar address
							// loaded above) signals that the
							// last block has been processed
	jne		.Lbegin
	add		$1408+8, %rsp
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbp
	pop		%rbx
	vzeroupper					// leave no dirty upper YMM state behind
	ret

.Lbegin:
	calc0
	calc1
	calc2
	calc3
	calc4
	calc5
	calc6
	calc7
	calc8
	calc9
	calc10
	calc11
	calc12
	calc13
	calc14
	calc15
	calc16
	calc17
	calc18
	calc19
	calc20
	calc21
	calc22
	calc23
	calc24
	calc25
	calc26
	calc27
	calc28
	calc29
	calc30
	calc31
	calc32
	calc33
	calc34
	calc35
	calc36
	calc37
	calc38
	calc39
	calc40
	calc41
	calc42
	calc43
	calc44
	calc45
	calc46
	calc47
	calc48
	calc49
	calc50
	calc51
	calc52
	calc53
	calc54
	calc55
	calc56
	calc57
	calc58
	calc59

	add		$128, %r10			// move to the next even-64-byte block
	cmp		%r11, %r10			// is the current block the last one?
	cmovae		%r8, %r10			// signal the last iteration smartly

	calc60
	calc61
	calc62
	calc63
	calc64
	calc65
	calc66
	calc67
	calc68
	calc69
	calc70
	calc71
	calc72
	calc73
	calc74
	calc75
	calc76
	calc77
	calc78
	calc79

	update_hash	%eax, %edx, %ebx, %esi, %edi
	cmp		%r8, %r10			// is the current block the last one?
	je		.Loop				// yes: .Loop takes the exit path
	mov		%edx, %ecx

	calc80
	calc81
	calc82
	calc83
	calc84
	calc85
	calc86
	calc87
	calc88
	calc89
	calc90
	calc91
	calc92
	calc93
	calc94
	calc95
	calc96
	calc97
	calc98
	calc99
	calc100
	calc101
	calc102
	calc103
	calc104
	calc105
	calc106
	calc107
	calc108
	calc109
	calc110
	calc111
	calc112
	calc113
	calc114
	calc115
	calc116
	calc117
	calc118
	calc119
	calc120
	calc121
	calc122
	calc123
	calc124
	calc125
	calc126
	calc127
	calc128
	calc129
	calc130
	calc131
	calc132
	calc133
	calc134
	calc135
	calc136
	calc137
	calc138
	calc139

	add		$128, %r13			// move to the next even-64-byte block
	cmp		%r11, %r13			// is the current block the last one?
	cmovae		%r8, %r10			// if so, flag termination through %r10

	calc140
	calc141
	calc142
	calc143
	calc144
	calc145
	calc146
	calc147
	calc148
	calc149
	calc150
	calc151
	calc152
	calc153
	calc154
	calc155
	calc156
	calc157
	calc158
	calc159

	update_hash	%esi, %edi, %edx, %ecx, %ebx
	mov		%esi, %r12d			// reset state for AVX2 reg permutation
	mov		%edi, %esi
	mov		%edx, %edi
	mov		%ebx, %edx
	mov		%ecx, %eax
	mov		%r12d, %ecx
	xchg		%r14, %r15
	jmp		.Loop
END(_libmd_sha1block_avx2)

	.section	.rodata
	.balign		32
	// four 32-bit constants, each repeated eight times (one 32-byte row each)
k_xmm_ar:
	.fill		8, 4, 0x5a827999
	.fill		8, 4, 0x6ed9eba1
	.fill		8, 4, 0x8f1bbcdc
	.fill		8, 4, 0xca62c1d6
	.size		k_xmm_ar, .-k_xmm_ar

	// byte-shuffle control loaded into %ymm10; the same 16-byte pattern twice
bswap_shufb_ctl:
	.4byte		0x00010203
	.4byte		0x04050607
	.4byte		0x08090a0b
	.4byte		0x0c0d0e0f
	.4byte		0x00010203
	.4byte		0x04050607
	.4byte		0x08090a0b
	.4byte		0x0c0d0e0f
	.size		bswap_shufb_ctl, .-bswap_shufb_ctl

	/*
	 * SHA1 implementation using the Intel SHA extensions (SHANI).
	 *
	 * Implemented according to the Intel white paper
	 *
	 * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford,
	 * G. Wolrich: "Intel SHA Extensions: New Instructions Supporting
	 * the Secure Hash Algorithm on Intel® Architecture Processors",
	 * July 2013.
	 */
/*
 * sha1block(SHA1_CTX, buf, len) -- SHA-NI variant.
 *
 * ABI:     System V AMD64.  %rdi = SHA1_CTX, %rsi = buf, %rdx = len.
 * Effect:  processes len rounded down to a multiple of 64 bytes from
 *          buf and updates the five state words at (%rdi)..16(%rdi).
 *          Returns immediately, touching nothing, if that is zero.
 * Clobbers: %rcx, %rdx, %rsi, %xmm0-%xmm9, flags.  Leaf routine; the
 *          stack is not used.
 *
 * Register roles inside the loop:
 *	%xmm0-%xmm3	the four 16-byte chunks of the current block, later
 *			reused as the rolling message schedule
 *	%xmm4		shuf_mask (byte-swap control)
 *	%xmm5, %xmm7	alternate as the E operand of sha1rnds4
 *	%xmm6		ABCD
 *	%xmm8, %xmm9	ABCD and E as they were on entry to the block
 */
ENTRY(_libmd_sha1block_shani)
	and		$~63, %rdx		// round length to block-size multiple
	lea		(%rsi, %rdx, 1), %rcx	// end pointer
	test		%rdx, %rdx		// nothing to do?
	je		1f			// if so, terminate immediately

	movdqu		(%rdi), %xmm6		// h0, h1, h2, h3
	pxor		%xmm7, %xmm7
	pshufd		$0x1b, %xmm6, %xmm6	// h3, h2, h1, h0
	pinsrd		$3, 16(%rdi), %xmm7	// h4 in the highest word of xmm7
	movdqu		shuf_mask(%rip), %xmm4

	// main loop, one 64-byte block per iteration
0:	movdqa		%xmm6, %xmm8		// stash ABCD
	movdqa		%xmm7, %xmm9		// stash E

	// rounds 0--3
	movdqu		0*16(%rsi), %xmm0	// load first 16-byte chunk of the block
	pshufb		%xmm4, %xmm0		// and byte-swap
	paddd		%xmm0, %xmm7		// E += w[0]
	movdqa		%xmm6, %xmm5		// E' = A
	sha1rnds4	$0, %xmm7, %xmm6	// perform rounds 0--3

	// rounds 4--7
	movdqu		1*16(%rsi), %xmm1	// second chunk
	pshufb		%xmm4, %xmm1
	sha1nexte	%xmm1, %xmm5
	movdqa		%xmm6, %xmm7
	sha1rnds4	$0, %xmm5, %xmm6
	sha1msg1	%xmm1, %xmm0

	// rounds 8--11
	movdqu		2*16(%rsi), %xmm2	// third chunk
	pshufb		%xmm4, %xmm2
	sha1nexte	%xmm2, %xmm7
	movdqa		%xmm6, %xmm5
	sha1rnds4	$0, %xmm7, %xmm6
	sha1msg1	%xmm2, %xmm1
	pxor		%xmm2, %xmm0

	/*
	 * One group of four rounds in the steady state.  \e1 is the E
	 * operand consumed by this group, \e0 receives the copy of ABCD
	 * that the following group's sha1nexte will use, \k selects the
	 * sha1rnds4 function.  The msg operands rotate from one use to
	 * the next.
	 */
.macro	midround	msg3, msg0, msg1, msg2, e1, e0, k
	sha1nexte	\msg3, \e1
	movdqa		%xmm6, \e0
	sha1msg2	\msg3, \msg0
	sha1rnds4	$\k, \e1, %xmm6
	sha1msg1	\msg3, \msg2
	pxor		\msg3, \msg1
.endm

	movdqu		3*16(%rsi), %xmm3	// load fourth (last) chunk of the block
	pshufb		%xmm4, %xmm3

	add		$4*16, %rsi		// whole block consumed; advance input

	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0	// 12--15
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0	// 16--19
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 20--23
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1	// 24--27
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1	// 28--31
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1	// 32--35
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 36--39
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 40--43
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2	// 44--47
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2	// 48--51
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2	// 52--55
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 56--59
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3	// 60--63
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3	// 64--67

	// rounds 68--71 (no sha1msg1 issued from here on)
	sha1nexte	%xmm1, %xmm5
	movdqa		%xmm6, %xmm7
	sha1msg2	%xmm1, %xmm2
	sha1rnds4	$3, %xmm5, %xmm6
	pxor		%xmm1, %xmm3

	// rounds 72--75
	sha1nexte	%xmm2, %xmm7
	movdqa		%xmm6, %xmm5
	sha1msg2	%xmm2, %xmm3
	sha1rnds4	$3, %xmm7, %xmm6

	// rounds 76--79
	sha1nexte	%xmm3, %xmm5
	movdqa		%xmm6, %xmm7
	sha1rnds4	$3, %xmm5, %xmm6

	sha1nexte	%xmm9, %xmm7		// add saved E
	paddd		%xmm8, %xmm6		// add saved ABCD

	cmp		%rsi, %rcx		// end reached?
	jne		0b

	pshufd		$0x1b, %xmm6, %xmm6	// restore order of h0--h3
	movdqu		%xmm6, (%rdi)		// write h0--h3
	pextrd		$3, %xmm7, 16(%rdi)	// write h4
1:	ret
END(_libmd_sha1block_shani)

	.section	.rodata
	.balign		16
	// pshufb control loaded into %xmm4: reverses all 16 bytes of a vector
shuf_mask:
	.8byte		0x08090a0b0c0d0e0f
	.8byte		0x0001020304050607
	.size		shuf_mask, .-shuf_mask

	.section .note.GNU-stack,"",%progbits