/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/objtool.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
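/*
 * Overall flow: align the destination to 8 bytes (ordinary byte and
 * word stores plus one non-temporal dword store), copy 64 bytes at a
 * time with an unrolled movq/movnti loop, mop up any remaining full
 * quadwords, and finally handle the sub-8-byte tail.
 */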
SYM_FUNC_START(__copy_user_nocache)
	ANNOTATE_NOENDBR
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

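/*
 * Main unrolled loop: 64 bytes per iteration, done as four 8-byte
 * user loads followed by four non-temporal stores, twice over. The
 * numeric labels feed the exception table entries below.
 */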
	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if one of them fails we
 * can just fall back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes already stored to the
 * destination, so we need to take that into
 * account before falling back to the
 * non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

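/*
 * Fewer than 64 bytes remain (or a load in the unrolled loop
 * faulted): copy the remaining full quadwords one at a time.
 */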
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

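/*
 * At most 7 bytes remain. The 4-byte piece still gets a non-temporal
 * store; the 2-byte and 1-byte pieces use normal cached stores after
 * the sfence below.
 */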
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
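	/*
	 * No further non-temporal stores are issued past this point on
	 * any path, so a single sfence orders them all before the
	 * cached tail stores and the return.
	 */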
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on any of the final tail bytes, we
 * won't bother with any fixups. It's dead, Jim.
 * Note that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
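/*
 * Alignment is established in order: one byte, then one word, then
 * one dword, consuming at most 7 bytes until %rdi is 8-byte aligned;
 * if the count runs out first we just finish with the tail code.
 */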
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle of the copy
 */
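/*
 * Entering at .LdoneN means N bytes of the current iteration had
 * already been stored before the fault; each label falls through,
 * subtracting 8 per step, so %edx ends up as the number of bytes
 * still uncopied.
 */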
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

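/*
 * A load fault in the second half of the unrolled loop: the first
 * 32 bytes of this iteration were stored successfully, so step past
 * them and finish with the non-unrolled quadword loop.
 */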
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

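/*
 * The 8-byte user load at label 50 faulted: try one final 4-byte
 * non-temporal copy before giving up, and report whatever is still
 * uncopied.
 */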
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)