/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro. The original code can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
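/*
 * Note: the ldrb1/ldrh1/ldr1/ldp1 and matching store macros (as well as
 * cpy1 below) are not defined here; they are expected to be provided by
 * the file that includes this copy template.
 */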
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

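	/* Preserve the original dest in dstin for the return value and use
	 * dst as the working destination pointer. */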
	mov	dst, dstin

#ifdef CONFIG_AS_HAS_MOPS
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
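	/* FEAT_MOPS path: cpy1 is expected to expand to the CPY*-style memory
	 * copy instructions and performs the whole copy in one go. */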
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif

	cmp	count, #16
	/* When the copy length is less than 16, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the leading data from src to dst in increasing address order.
	* This way, the risk of overwriting the source data is eliminated
	* when the distance between src and dst is less than 16. The memory
	* accesses here are aligned.
	*/
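	/* Test the low bits of tmp2 and copy 1, 2, 4 and then 8 bytes as
	 * needed, after which src is 16-byte aligned. */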
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
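	/* Fall through when bits 5:4 of count are 0x30 (copy 48 bytes);
	 * 0x20 enters at 1: (32 bytes) and 0x10 enters at 2: (16 bytes). */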
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	* Prefer to break one ldp/stp into several loads/stores that access
	* memory in increasing address order, rather than loading/storing 16
	* bytes from (src-16) to (dst-16) after rewinding src to an aligned
	* address, as the original cortex-strings memcpy does. With the
	* original scheme, memmove would have to satisfy the precondition
	* that the src address is at least 16 bytes above the dst address,
	* otherwise some source data would be overwritten when memmove calls
	* memcpy directly. To keep memmove simple and decouple it from
	* memcpy, the original scheme was dropped.
	*/
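	/* Copy the remaining 0-15 bytes: test count bit 3 down to bit 0 and
	 * copy 8, 4, 2 and 1 bytes accordingly. */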
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 here and then jump
	* to the tail.
	*/
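	/* Copy 64 bytes as four 16-byte ldp/stp pairs, with loads hoisted
	 * ahead of the matching stores. */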
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

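	/* count is negative here, but its low 6 bits still hold the number
	 * of bytes left to copy; hand any remainder to the tail code. */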
	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	* Critical loop.  Start at a new cache line boundary.  Assuming
	* 64 bytes per line this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Pre-load the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes.
	*/
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
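	/* Store the final 64 bytes loaded by the last loop iteration. */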
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

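	/* count is now in the range [-64, -1]; its low 6 bits give the
	 * number of bytes still to copy, handled by the tail code. */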
	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
