/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <linux/objtool.h>
#include <asm/asm.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(__copy_user_nocache)
	ANNOTATE_NOENDBR
	/* If the destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * The first set of user mode loads has been done
 * without any stores, so if any of them fail we
 * can just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads has been
 * done with 32 bytes already stored to the
 * destination, so we need to take that into
 * account before falling back to the
 * non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

/* Copy any remaining full quadwords with non-temporal stores */
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	/* Order any non-temporal stores before the cached tail and the return */
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	mov %edx,%eax
	RET

/*
 * If we fail on the last few bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 * of the unrolled loop.
 */
.Ldone56:	sub $8,%edx
.Ldone48:	sub $8,%edx
.Ldone40:	sub $8,%edx
.Ldone32:	sub $8,%edx
.Ldone24:	sub $8,%edx
.Ldone16:	sub $8,%edx
.Ldone8:	sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
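
For readers coming at this from the C side, below is a minimal sketch of how the register contract documented at the top of the file (rdi = destination, rsi = source, edx = count, rax = bytes left uncopied) is consumed by a caller. It is illustrative only: the wrapper name example_copy_from_user_nocache() and the C prototype are assumptions derived from this file's comments rather than the kernel's own declarations, and the real callers (for example __copy_from_user_inatomic_nocache() in arch/x86/include/asm/uaccess_64.h) additionally integrate KASAN and related instrumentation.

/*
 * Illustrative sketch only - not the kernel's actual wrapper.  It shows
 * how the "rax = uncopied bytes, 0 on success" result is checked, and
 * that the user accesses must be bracketed by stac()/clac() under SMAP.
 */
#include <linux/errno.h>
#include <asm/smap.h>

/* Prototype derived from the register contract above (assumption) */
unsigned long __copy_user_nocache(void *dst, const void __user *src, unsigned size);

static inline int example_copy_from_user_nocache(void *dst,
						 const void __user *src,
						 unsigned size)
{
	unsigned long uncopied;

	stac();			/* open the user-access window (SMAP) */
	uncopied = __copy_user_nocache(dst, src, size);
	clac();			/* close it again */

	/* A non-zero result is the number of trailing bytes left uncopied */
	return uncopied ? -EFAULT : 0;
}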