/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */


/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
dst	.req	x6

/* Data registers holding the 64 bytes in flight in the bulk-copy loops. */
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

	mov	dst, dstin

#ifdef CONFIG_AS_HAS_MOPS
	/* Use the FEAT_MOPS memory copy instructions when available. */
alternative_if_not ARM64_HAS_MOPS
	b	.Lno_mops
alternative_else_nop_endif
	cpy1	dst, src, count
	b	.Lexitfunc
.Lno_mops:
#endif

	cmp	count, #16
	/* When the length is less than 16 bytes, the accesses are not aligned. */
	b.lo	.Ltiny15

	neg	tmp2, src
	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	 * Copy the leading data from src to dst in increasing address
	 * order. This eliminates the risk of overwriting source data
	 * when the distance between src and dst is less than 16. The
	 * memory accesses here are aligned.
	 */
	tbz	tmp2, #0, 1f
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1
1:
	tbz	tmp2, #1, 2f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
2:
	tbz	tmp2, #2, 3f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8

.LSrcAligned:
	cmp	count, #64
	b.ge	.Lcpy_over64
	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */
.Ltail63:
	/*
	 * Copy up to 48 bytes of data. At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */
	ands	tmp1, count, #0x30
	b.eq	.Ltiny15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
1:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
2:
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
.Ltiny15:
	/*
	 * Prefer to break one ldp/stp into several loads/stores that
	 * access memory in increasing address order, rather than
	 * loading/storing 16 bytes from (src-16) to (dst-16) and winding
	 * src back to an aligned address, as the original cortex-strings
	 * memcpy does. If the original scheme were kept, memmove would
	 * have to guarantee that src is at least 16 bytes above dst,
	 * otherwise some source data would be overwritten when memmove
	 * calls memcpy directly. To keep memmove simple and decouple it
	 * from memcpy's internals, the original scheme was dropped.
	 */
	tbz	count, #3, 1f
	ldr1	tmp1, src, #8
	str1	tmp1, dst, #8
1:
	tbz	count, #2, 2f
	ldr1	tmp1w, src, #4
	str1	tmp1w, dst, #4
2:
	tbz	count, #1, 3f
	ldrh1	tmp1w, src, #2
	strh1	tmp1w, dst, #2
3:
	tbz	count, #0, .Lexitfunc
	ldrb1	tmp1w, src, #1
	strb1	tmp1w, dst, #1

	b	.Lexitfunc

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	 * Less than 128 bytes to copy, so handle 64 bytes here and then
	 * jump to the tail.
	 */
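	/*
	 * Four 16-byte load/store pairs cover the 64 bytes; the B and C
	 * loads are issued ahead of their stores so that load latency
	 * overlaps with the neighbouring stores.
	 */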
	ldp1	A_l, A_h, src, #16
	stp1	A_l, A_h, dst, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	ldp1	D_l, D_h, src, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
	b	.Lexitfunc

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
	/* Preload the first 64 bytes of data. */
	ldp1	A_l, A_h, src, #16
	ldp1	B_l, B_h, src, #16
	ldp1	C_l, C_h, src, #16
	ldp1	D_l, D_h, src, #16
1:
	/*
	 * Interleave loading the next 64-byte block with storing the
	 * previously loaded 64 bytes.
	 */
	stp1	A_l, A_h, dst, #16
	ldp1	A_l, A_h, src, #16
	stp1	B_l, B_h, dst, #16
	ldp1	B_l, B_h, src, #16
	stp1	C_l, C_h, dst, #16
	ldp1	C_l, C_h, src, #16
	stp1	D_l, D_h, dst, #16
	ldp1	D_l, D_h, src, #16
	subs	count, count, #64
	b.ge	1b
	stp1	A_l, A_h, dst, #16
	stp1	B_l, B_h, dst, #16
	stp1	C_l, C_h, dst, #16
	stp1	D_l, D_h, dst, #16

	tst	count, #0x3f
	b.ne	.Ltail63
.Lexitfunc:
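
/*
 * This file is a template: the file that #includes it provides the
 * function prologue/epilogue around the include and defines the
 * accessor macros used above (ldrb1/strb1, ldrh1/strh1, ldr1/str1,
 * ldp1/stp1 and, for the MOPS path, cpy1).
 *
 * As an illustrative sketch only (not the exact in-tree definitions),
 * a plain kernel-to-kernel copy could define the pair accessors as
 * post-indexed loads and stores:
 *
 *	.macro ldp1 reg1, reg2, ptr, val
 *	ldp	\reg1, \reg2, [\ptr], \val
 *	.endm
 *
 *	.macro stp1 reg1, reg2, ptr, val
 *	stp	\reg1, \reg2, [\ptr], \val
 *	.endm
 *
 * The uaccess copy routines instead wrap these accesses with
 * exception-table fixups so that faults on the user address can be
 * handled.
 */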