/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

// Per-pass accumulators.  Each register holds two 64-bit partial sums
// (called SUM_A and SUM_B below) that are only combined at .Ldone.
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3

// Sliding window of four consecutive 16-byte key strides.
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7

// Scratch registers.  T1-T7 are used by _nh_stride, T0 and T1 by the
// final reduction in nh_sse2.
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

// Arguments of nh_sse2(), in prototype order (the first four integer
// argument registers of the System V x86_64 calling convention).
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride - process one 16-byte message stride for all four passes
 *
 * k0, k1, k2:	registers holding the three key strides already loaded
 * k3:		register that receives the next key stride, loaded from
 *		offset(KEY)
 * offset:	byte offset applied to both MESSAGE and KEY
 *
 * For pass n (n = 0..3) the four 32-bit message words are added, with
 * 32-bit wraparound, to the four words of key stride kn.  Word 0 of the
 * result is then multiplied by word 2 and word 1 by word 3, and the two
 * 64-bit products are added to the two halves of PASSn_SUMS.
 *
 * Clobbers k0 and T1-T7.  k1 and k2 are preserved and k3 holds the newly
 * loaded key stride on exit, so callers rotate the register names by one
 * position on each successive invocation.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate.
	// pshufd $0x10 puts words 0 and 1 into the even dword lanes and
	// pshufd $0x32 puts words 2 and 3 there; pmuludq multiplies only the
	// even lanes, giving (word0 * word2, word1 * word3) as two qwords.
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Reads message_len bytes from message and message_len + 0x30 bytes from
 * key, and writes four 64-bit sums (32 bytes) to hash.  All loads and
 * stores are unaligned-safe (movdqu).
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides.  KEY is then advanced past
	// them so that offset(KEY) inside _nh_stride addresses the stride
	// three positions ahead of the current message stride.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Main loop: four strides (0x40 bytes) per iteration.  After four
	// rotations the key strides are back in K0, K1, K2 for the next
	// iteration.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// MESSAGE_LEN is negative here but still congruent to the original
	// length modulo 0x40, so masking recovers the number of bytes left
	// (0x00, 0x10, 0x20 or 0x30).  Handle up to three trailing strides.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0		// => (PASS0_SUM PASS1_SUM)
	paddq		PASS2_SUMS, T1		// => (PASS2_SUM PASS3_SUM)
	movdqu		T0, 0x00(HASH)
	movdqu		T1, 0x10(HASH)
	RET
SYM_FUNC_END(nh_sse2)