/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright 2002 Andi Kleen */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>

.section .noinstr.text, "ax"

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 *
 * The FSRM alternative should be done inline (avoiding the call and
 * the disgusting return handling), but that would require some help
 * from the compiler for better calling conventions.
 *
 * The 'rep movsb' itself is small enough to replace the call, but the
 * two register moves blow up the code. And one of them is "needed"
 * only for the return value that is the same as the source input,
 * which the compiler could/should do much better anyway.
 *
 * (An illustrative sketch of such an inline expansion is given in the
 * comment at the end of this file.)
 */
SYM_TYPED_FUNC_START(__memcpy)
	ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM

	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)

SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy)
SYM_PIC_ALIAS(memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_START_LOCAL(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
	addl $0x20, %edx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position at the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * At most 3 ALU operations execute in one cycle,
	 * so append NOPs within the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Calculate the copy position back at the head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes: copy the first 16 and the
	 * last 16 bytes; for counts below 32 the two ranges overlap and
	 * the overlapping bytes are simply written twice.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	RET
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	RET
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb .Lend
	/*
	 * Move data from 1 to 3 bytes.
	 */
	movzbl (%rsi), %ecx
	jz .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	RET
SYM_FUNC_END(memcpy_orig)
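
/*
 * Illustration only, not assembled: a minimal sketch of the inline FSRM
 * expansion discussed in the header comment above, assuming the compiler
 * could be taught to materialize the return value itself.  The three
 * instructions simply mirror the __memcpy entry point; nothing here is
 * generated by the current build.
 *
 *	movq	%rdi, %rax	# return value: the original destination
 *	movq	%rdx, %rcx	# 'rep movsb' takes its count in %rcx
 *	rep movsb		# FSRM makes this fast even for short copies
 *
 * Dropping the call would remove the return handling, but the two
 * register moves above are exactly the ones the header comment says
 * blow up the inlined code.
 */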