xref: /linux/lib/crypto/x86/nh-sse2.S (revision a229d83235c7627c490deb7dd4744a72567cea12)
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
 *
 * Copyright 2018 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

/*
 * Register roles.
 *
 * PASS0_SUMS-PASS3_SUMS: one accumulator per pass, each holding two 64-bit
 *			  partial sums that are only combined at the very end.
 * K0-K3:		  the four 16-byte key strides currently in use.
 * T0-T7:		  scratch (T0 is used only by the final reduction).
 */
#define		PASS0_SUMS	%xmm0
#define		PASS1_SUMS	%xmm1
#define		PASS2_SUMS	%xmm2
#define		PASS3_SUMS	%xmm3
#define		K0		%xmm4
#define		K1		%xmm5
#define		K2		%xmm6
#define		K3		%xmm7
#define		T0		%xmm8
#define		T1		%xmm9
#define		T2		%xmm10
#define		T3		%xmm11
#define		T4		%xmm12
#define		T5		%xmm13
#define		T6		%xmm14
#define		T7		%xmm15

/*
 * Function arguments, in the order of the nh_sse2() prototype
 * (key, message, message_len, hash).
 */
#define		KEY		%rdi
#define		MESSAGE		%rsi
#define		MESSAGE_LEN	%rdx
#define		HASH		%rcx

/*
 * _nh_stride - hash one 16-byte message stride into all four pass accumulators
 *
 * \k0, \k1, \k2: registers holding the three most recently loaded key strides
 * \k3:		  register that receives the next key stride, \offset(KEY)
 * \offset:	  byte offset of this stride from both MESSAGE and KEY
 *
 * For pass i (0-3) the four 32-bit message words are added, with 32-bit
 * wraparound, to the words of key stride \k<i>.  The sums are then multiplied
 * pairwise as (sum0 * sum2) and (sum1 * sum3), each 32x32 => 64 bits, and the
 * two products are added to the two 64-bit lanes of PASS<i>_SUMS.
 *
 * Clobbers T1-T7 and \k0.  \k1, \k2 and the freshly loaded \k3 are preserved,
 * which is why callers rotate the key register arguments by one position from
 * one stride to the next.
 */
.macro _nh_stride	k0, k1, k2, k3, offset

	// Load next message stride
	movdqu		\offset(MESSAGE), T1

	// Load next key stride
	movdqu		\offset(KEY), \k3

	// Add message words to key words
	movdqa		T1, T2
	movdqa		T1, T3
	paddd		T1, \k0    // reuse k0 to avoid a move
	paddd		\k1, T1
	paddd		\k2, T2
	paddd		\k3, T3

	// Multiply 32x32 => 64 and accumulate
	//
	// pshufd $0x10 places dwords 0 and 1 in the low half of the two 64-bit
	// lanes; pshufd $0x32 places dwords 2 and 3 there.  pmuludq multiplies
	// only those low halves, yielding (dword0 * dword2, dword1 * dword3).
	pshufd		$0x10, \k0, T4
	pshufd		$0x32, \k0, \k0
	pshufd		$0x10, T1, T5
	pshufd		$0x32, T1, T1
	pshufd		$0x10, T2, T6
	pshufd		$0x32, T2, T2
	pshufd		$0x10, T3, T7
	pshufd		$0x32, T3, T3
	pmuludq		T4, \k0
	pmuludq		T5, T1
	pmuludq		T6, T2
	pmuludq		T7, T3
	paddq		\k0, PASS0_SUMS
	paddq		T1, PASS1_SUMS
	paddq		T2, PASS2_SUMS
	paddq		T3, PASS3_SUMS
.endm

/*
 * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
 *		__le64 hash[NH_NUM_PASSES])
 *
 * It's guaranteed that message_len % 16 == 0.
 *
 * Runs four NH passes over the message in a single sweep and stores the four
 * 64-bit pass sums to hash[0..3] (32 bytes).  Pass i pairs message stride j
 * with key stride j + i, so message_len + 0x30 bytes of key are read in total.
 *
 * Clobbers KEY, MESSAGE, MESSAGE_LEN, %xmm0-%xmm15 and the flags.  The stack
 * is not used.
 */
SYM_FUNC_START(nh_sse2)

	// Preload the first three key strides; each _nh_stride loads one more.
	movdqu		0x00(KEY), K0
	movdqu		0x10(KEY), K1
	movdqu		0x20(KEY), K2
	add		$0x30, KEY
	pxor		PASS0_SUMS, PASS0_SUMS
	pxor		PASS1_SUMS, PASS1_SUMS
	pxor		PASS2_SUMS, PASS2_SUMS
	pxor		PASS3_SUMS, PASS3_SUMS

	// Bias MESSAGE_LEN by -0x40 so the sign of the result tells whether a
	// full 64-byte group is still available.
	sub		$0x40, MESSAGE_LEN
	jl		.Lloop4_done
.Lloop4:
	// Four strides per iteration.  The key registers rotate by one each
	// stride, so after four strides K0-K3 are back in their initial roles.
	_nh_stride	K0, K1, K2, K3, 0x00
	_nh_stride	K1, K2, K3, K0, 0x10
	_nh_stride	K2, K3, K0, K1, 0x20
	_nh_stride	K3, K0, K1, K2, 0x30
	add		$0x40, KEY
	add		$0x40, MESSAGE
	sub		$0x40, MESSAGE_LEN
	jge		.Lloop4

.Lloop4_done:
	// Undo the bias: the low six bits are the bytes left over (0x00, 0x10,
	// 0x20 or 0x30).  Handle up to three trailing strides without advancing
	// the pointers; the offsets select the stride instead.
	and		$0x3f, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K0, K1, K2, K3, 0x00

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K1, K2, K3, K0, 0x10

	sub		$0x10, MESSAGE_LEN
	jz		.Ldone
	_nh_stride	K2, K3, K0, K1, 0x20

.Ldone:
	// Sum the accumulators for each pass, then store the sums to 'hash'
	//
	// Each PASSn_SUMS holds two partial sums (A in the low lane, B in the
	// high lane).  Gather the A halves and the B halves of adjacent passes
	// into separate registers so one paddq finishes two passes at once.
	movdqa		PASS0_SUMS, T0
	movdqa		PASS2_SUMS, T1
	punpcklqdq	PASS1_SUMS, T0		// => (PASS0_SUM_A PASS1_SUM_A)
	punpcklqdq	PASS3_SUMS, T1		// => (PASS2_SUM_A PASS3_SUM_A)
	punpckhqdq	PASS1_SUMS, PASS0_SUMS	// => (PASS0_SUM_B PASS1_SUM_B)
	punpckhqdq	PASS3_SUMS, PASS2_SUMS	// => (PASS2_SUM_B PASS3_SUM_B)
	paddq		PASS0_SUMS, T0
	paddq		PASS2_SUMS, T1
	movdqu		T0, 0x00(HASH)		// hash[0], hash[1]
	movdqu		T1, 0x10(HASH)		// hash[2], hash[3]
	RET
SYM_FUNC_END(nh_sse2)
