1 // SPDX-License-Identifier: GPL-2.0-only
2 /* 64-bit system call dispatch */
3
4 #include <linux/linkage.h>
5 #include <linux/sys.h>
6 #include <linux/cache.h>
7 #include <linux/syscalls.h>
8 #include <linux/entry-common.h>
9 #include <linux/nospec.h>
10 #include <asm/syscall.h>
11
12 #define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
13 #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __x64_##sym(const struct pt_regs *);
14 #include <asm/syscalls_64.h>
15 #ifdef CONFIG_X86_X32_ABI
16 #include <asm/syscalls_x32.h>
17 #endif
18 #undef __SYSCALL
19
20 #undef __SYSCALL_NORETURN
21 #define __SYSCALL_NORETURN __SYSCALL
22
23 /*
24 * The sys_call_table[] is no longer used for system calls, but
25 * kernel/trace/trace_syscalls.c still wants to know the system
26 * call address.
27 */
28 #define __SYSCALL(nr, sym) __x64_##sym,
29 const sys_call_ptr_t sys_call_table[] = {
30 #include <asm/syscalls_64.h>
31 };
32 #undef __SYSCALL
33
34 #define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
x64_sys_call(const struct pt_regs * regs,unsigned int nr)35 long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
36 {
37 switch (nr) {
38 #include <asm/syscalls_64.h>
39 default: return __x64_sys_ni_syscall(regs);
40 }
41 }
42
#ifdef CONFIG_X86_X32_ABI
/*
 * x32_sys_call - invoke the x32 ABI syscall handler selected by @nr
 * @regs: register state handed through to the handler
 * @nr:   syscall number to dispatch on; the caller in this file passes the
 *        number with __X32_SYSCALL_BIT already subtracted
 *
 * Only built when CONFIG_X86_X32_ABI is set. The case labels come from
 * <asm/syscalls_x32.h>, expanded with the __SYSCALL definition that is
 * still in effect from above. Returns the selected handler's return value;
 * any @nr without a case label is routed to __x64_sys_ni_syscall().
 */
long x32_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	#include <asm/syscalls_x32.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}
#endif
52
do_syscall_x64(struct pt_regs * regs,int nr)53 static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
54 {
55 /*
56 * Convert negative numbers to very high and thus out of range
57 * numbers for comparisons.
58 */
59 unsigned int unr = nr;
60
61 if (likely(unr < NR_syscalls)) {
62 unr = array_index_nospec(unr, NR_syscalls);
63 regs->ax = x64_sys_call(regs, unr);
64 return true;
65 }
66 return false;
67 }
68
do_syscall_x32(struct pt_regs * regs,int nr)69 static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
70 {
71 /*
72 * Adjust the starting offset of the table, and convert numbers
73 * < __X32_SYSCALL_BIT to very high and thus out of range
74 * numbers for comparisons.
75 */
76 unsigned int xnr = nr - __X32_SYSCALL_BIT;
77
78 if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
79 xnr = array_index_nospec(xnr, X32_NR_syscalls);
80 regs->ax = x32_sys_call(regs, xnr);
81 return true;
82 }
83 return false;
84 }
85
86 /* Returns true to return using SYSRET, or false to use IRET */
do_syscall_64(struct pt_regs * regs,int nr)87 __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
88 {
89 add_random_kstack_offset();
90 nr = syscall_enter_from_user_mode(regs, nr);
91
92 instrumentation_begin();
93
94 if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
95 /* Invalid system call, but still a system call. */
96 regs->ax = __x64_sys_ni_syscall(regs);
97 }
98
99 instrumentation_end();
100 syscall_exit_to_user_mode(regs);
101
102 /*
103 * Check that the register state is valid for using SYSRET to exit
104 * to userspace. Otherwise use the slower but fully capable IRET
105 * exit path.
106 */
107
108 /* XEN PV guests always use the IRET path */
109 if (cpu_feature_enabled(X86_FEATURE_XENPV))
110 return false;
111
112 /* SYSRET requires RCX == RIP and R11 == EFLAGS */
113 if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
114 return false;
115
116 /* CS and SS must match the values set in MSR_STAR */
117 if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
118 return false;
119
120 /*
121 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
122 * in kernel space. This essentially lets the user take over
123 * the kernel, since userspace controls RSP.
124 *
125 * TASK_SIZE_MAX covers all user-accessible addresses other than
126 * the deprecated vsyscall page.
127 */
128 if (unlikely(regs->ip >= TASK_SIZE_MAX))
129 return false;
130
131 /*
132 * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
133 * restoring TF results in a trap from userspace immediately after
134 * SYSRET.
135 */
136 if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
137 return false;
138
139 /* Use SYSRET to exit to userspace */
140 return true;
141 }
142