xref: /linux/arch/x86/lib/memcpy_64.S (revision e78f70bad29c5ae1e1076698b690b15794e9b81e)
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value, which is just the original destination
 * argument and which the compiler could/should handle much better anyway.
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

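	/*
	 * FSRM fast path: the return value is the original destination,
	 * so stash %rdi in %rax, then let 'rep movsb' copy %rcx (loaded
	 * from the %rdx count) bytes from %rsi to %rdi.
	 */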
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
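	/*
	 * Fallback for CPUs without FSRM: copy 32-byte blocks through
	 * integer registers, then dispatch on the remaining 0..31 bytes.
	 * %rax carries the return value (the original destination).
	 */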
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur: the CPU's
	 * memory disambiguation compares only the low address bits, so a
	 * load from the source can falsely appear to depend on an earlier
	 * store to the destination. Jump to the copy direction that
	 * avoids it.
	 */
	cmp  %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
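	/*
	 * The subq above biases the count down by one 32-byte block, so
	 * the subq/jae pair in the loop both counts down and checks
	 * whether another full block remains (mov and lea leave the flags
	 * untouched).  The addl $0x20 after the loop removes the bias,
	 * leaving the 0..31 byte tail in %edx.
	 */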
.Lcopy_forward_loop:
	subq $0x20,	%rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi),	%r8
	movq 1*8(%rsi),	%r9
	movq 2*8(%rsi),	%r10
	movq 3*8(%rsi),	%r11
	leaq 4*8(%rsi),	%rsi

	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	2*8(%rdi)
	movq %r11,	3*8(%rdi)
	leaq 4*8(%rdi),	%rdi
	jae  .Lcopy_forward_loop
	addl $0x20,	%edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx,	%rsi
	addq %rdx,	%rdi
	subq $0x20,	%rdx
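	/*
	 * Same bias trick as the forward loop: after the backward copy the
	 * addl $0x20 below restores the tail length, and %rsi/%rdi are
	 * rewound so the remaining 0..31 bytes sit at the head of both
	 * buffers for .Lhandle_tail.
	 */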
	/*
	 * At most 3 ALU operations execute in one cycle,
	 * so append NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20,	%rdx
	movq -1*8(%rsi),	%r8
	movq -2*8(%rsi),	%r9
	movq -3*8(%rsi),	%r10
	movq -4*8(%rsi),	%r11
	leaq -4*8(%rsi),	%rsi
	movq %r8,		-1*8(%rdi)
	movq %r9,		-2*8(%rdi)
	movq %r10,		-3*8(%rdi)
	movq %r11,		-4*8(%rdi)
	leaq -4*8(%rdi),	%rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate copy position to head.
	 */
	addl $0x20,	%edx
	subq %rdx,	%rsi
	subq %rdx,	%rdi
.Lhandle_tail:
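	/*
	 * Tail handling: each size class below copies a head window and a
	 * tail window of the remaining range.  The two windows may overlap,
	 * which covers every length in the class without branching on the
	 * exact byte count.
	 */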
	cmpl $16,	%edx
	jb   .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi),	%r9
	movq -2*8(%rsi, %rdx),	%r10
	movq -1*8(%rsi, %rdx),	%r11
	movq %r8,	0*8(%rdi)
	movq %r9,	1*8(%rdi)
	movq %r10,	-2*8(%rdi, %rdx)
	movq %r11,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8,	%edx
	jb   .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi),	%r8
	movq -1*8(%rsi, %rdx),	%r9
	movq %r8,	0*8(%rdi)
	movq %r9,	-1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4,	%edx
	jb   .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
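	/*
	 * The subl above does double duty: it set CF when the count was
	 * zero (taken by the jb) and ZF when it was one; movzbl does not
	 * modify the flags, so the jz below still tests that result.
	 */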
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)