1 // SPDX-License-Identifier: GPL-2.0-only
2 /* 32-bit system call dispatch */
3 
4 #include <linux/linkage.h>
5 #include <linux/sys.h>
6 #include <linux/cache.h>
7 #include <linux/syscalls.h>
8 #include <linux/entry-common.h>
9 #include <linux/nospec.h>
10 #include <linux/uaccess.h>
11 #include <asm/apic.h>
12 #include <asm/traps.h>
13 #include <asm/cpufeature.h>
14 #include <asm/syscall.h>
15 
/*
 * Pick which symbol a __SYSCALL_WITH_COMPAT() entry forwards to
 * __SYSCALL(): the compat one when CONFIG_IA32_EMULATION is set,
 * the native one otherwise.
 */
#ifdef CONFIG_IA32_EMULATION
#define __SYSCALL_WITH_COMPAT(nr, native, compat)	__SYSCALL(nr, compat)
#else
#define __SYSCALL_WITH_COMPAT(nr, native, compat)	__SYSCALL(nr, native)
#endif

/*
 * Include <asm/syscalls_32.h> with __SYSCALL() and __SYSCALL_NORETURN()
 * expanding to extern declarations of the __ia32_-prefixed handlers, so
 * that the handlers can be referenced further down in this file.
 */
#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
#define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *);
#include <asm/syscalls_32.h>
#undef  __SYSCALL

/*
 * The __noreturn distinction is only needed for the declarations above.
 * From here on a __SYSCALL_NORETURN() entry expands like whatever
 * __SYSCALL() is defined to at the point of inclusion.
 */
#undef  __SYSCALL_NORETURN
#define __SYSCALL_NORETURN __SYSCALL
29 
/*
 * The sys_call_table[] is no longer used for system calls, but
 * kernel/trace/trace_syscalls.c still wants to know the system
 * call address.
 *
 * The table is only built for CONFIG_X86_32. It is produced by including
 * <asm/syscalls_32.h> with __SYSCALL() expanding to the address of the
 * __ia32_-prefixed handler followed by a comma, i.e. one array element
 * per expanded entry.
 */
#ifdef CONFIG_X86_32
#define __SYSCALL(nr, sym) __ia32_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_32.h>
};
#undef  __SYSCALL
#endif
42 
43 #define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs);
ia32_sys_call(const struct pt_regs * regs,unsigned int nr)44 long ia32_sys_call(const struct pt_regs *regs, unsigned int nr)
45 {
46 	switch (nr) {
47 	#include <asm/syscalls_32.h>
48 	default: return __ia32_sys_ni_syscall(regs);
49 	}
50 }
51 
syscall_32_enter(struct pt_regs * regs)52 static __always_inline int syscall_32_enter(struct pt_regs *regs)
53 {
54 	if (IS_ENABLED(CONFIG_IA32_EMULATION))
55 		current_thread_info()->status |= TS_COMPAT;
56 
57 	return (int)regs->orig_ax;
58 }
59 
#ifdef CONFIG_IA32_EMULATION
/*
 * Defaults to true unless CONFIG_IA32_EMULATION_DEFAULT_DISABLED is set.
 * Marked __ro_after_init; the only writer in this file is the early
 * parameter handler below.
 */
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);

/*
 * Handler for the "ia32_emulation" early parameter: parses @arg as a
 * boolean straight into __ia32_enabled and returns kstrtobool()'s result.
 */
static int __init ia32_emulation_override_cmdline(char *arg)
{
	return kstrtobool(arg, &__ia32_enabled);
}
early_param("ia32_emulation", ia32_emulation_override_cmdline);
#endif
69 
70 /*
71  * Invoke a 32-bit syscall.  Called with IRQs on in CT_STATE_KERNEL.
72  */
do_syscall_32_irqs_on(struct pt_regs * regs,int nr)73 static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
74 {
75 	/*
76 	 * Convert negative numbers to very high and thus out of range
77 	 * numbers for comparisons.
78 	 */
79 	unsigned int unr = nr;
80 
81 	if (likely(unr < IA32_NR_syscalls)) {
82 		unr = array_index_nospec(unr, IA32_NR_syscalls);
83 		regs->ax = ia32_sys_call(regs, unr);
84 	} else if (nr != -1) {
85 		regs->ax = __ia32_sys_ni_syscall(regs);
86 	}
87 }
88 
89 #ifdef CONFIG_IA32_EMULATION
int80_is_external(void)90 static __always_inline bool int80_is_external(void)
91 {
92 	const unsigned int offs = (0x80 / 32) * 0x10;
93 	const u32 bit = BIT(0x80 % 32);
94 
95 	/* The local APIC on XENPV guests is fake */
96 	if (cpu_feature_enabled(X86_FEATURE_XENPV))
97 		return false;
98 
99 	/*
100 	 * If vector 0x80 is set in the APIC ISR then this is an external
101 	 * interrupt. Either from broken hardware or injected by a VMM.
102 	 *
103 	 * Note: In guest mode this is only valid for secure guests where
104 	 * the secure module fully controls the vAPIC exposed to the guest.
105 	 */
106 	return apic_read(APIC_ISR + offs) & bit;
107 }
108 
/**
 * do_int80_emulation - 32-bit legacy syscall C entry from asm
 * @regs: syscall arguments in struct pt_regs on the stack.
 *
 * This entry point can be used by 32-bit and 64-bit programs to perform
 * 32-bit system calls.  Instances of INT $0x80 can be found inline in
 * various programs and libraries.  It is also used by the vDSO's
 * __kernel_vsyscall fallback for hardware that doesn't support a faster
 * entry method.  Restarted 32-bit system calls also fall back to INT
 * $0x80 regardless of what instruction was originally used to do the
 * system call.
 *
 * This is considered a slow path.  It is not used by most libc
 * implementations on modern hardware except during process startup.
 *
 * The arguments for the INT $0x80 based syscall are on stack in the
 * pt_regs structure:
 *   eax:				system call number
 *   ebx, ecx, edx, esi, edi, ebp:	arg1 - arg6
 *
 * The result of the system call is stored in regs->ax. The function
 * panics if it is entered from kernel mode or if the event is found to
 * be an external interrupt.
 */
__visible noinstr void do_int80_emulation(struct pt_regs *regs)
{
	int nr;

	/* Kernel does not use INT $0x80! */
	if (unlikely(!user_mode(regs))) {
		irqentry_enter(regs);
		instrumentation_begin();
		panic("Unexpected external interrupt 0x80\n");
	}

	/*
	 * Establish kernel context for instrumentation, including for
	 * int80_is_external() below which calls into the APIC driver.
	 * Identical for soft and external interrupts.
	 */
	enter_from_user_mode(regs);

	instrumentation_begin();
	add_random_kstack_offset();

	/* Validate that this is a soft interrupt to the extent possible */
	if (unlikely(int80_is_external()))
		panic("Unexpected external interrupt 0x80\n");

	/*
	 * The low level idtentry code pushed -1 into regs::orig_ax
	 * and regs::ax contains the syscall number.
	 *
	 * User tracing code (ptrace or signal handlers) might assume
	 * that the regs::orig_ax contains a 32-bit number on invoking
	 * a 32-bit syscall.
	 *
	 * Establish the syscall convention by saving the 32bit truncated
	 * syscall number in regs::orig_ax and by invalidating regs::ax.
	 */
	regs->orig_ax = regs->ax & GENMASK(31, 0);
	regs->ax = -ENOSYS;

	/* Picks up the number just stored in regs::orig_ax. */
	nr = syscall_32_enter(regs);

	local_irq_enable();
	/* Dispatch on whatever number the syscall entry work hands back. */
	nr = syscall_enter_from_user_mode_work(regs, nr);
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
177 
178 #ifdef CONFIG_X86_FRED
/*
 * A FRED-specific INT80 handler is warranted for the following reasons:
 *
 * 1) As INT instructions and hardware interrupts are separate event
 *    types, FRED does not preclude the use of vector 0x80 for external
 *    interrupts. As a result, the FRED setup code does not reserve
 *    vector 0x80 and calling int80_is_external() is not merely
 *    suboptimal but actively incorrect: it could cause a system call
 *    to be incorrectly ignored.
 *
 * 2) It is called only for handling vector 0x80 of event type
 *    EVENT_TYPE_SWINT and will never be called to handle any external
 *    interrupt (event type EVENT_TYPE_EXTINT).
 *
 * 3) FRED has separate entry flows depending on if the event came from
 *    user space or kernel space, and because the kernel does not use
 *    INT insns, the FRED kernel entry handler fred_entry_from_kernel()
 *    falls through to fred_bad_type() if the event type is
 *    EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling
 *    an INT insn, it can only be from a user level.
 *
 * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY, while FRED will
 *    likely take a different approach if it is ever needed: it
 *    probably belongs in either fred_intx()/fred_other() or
 *    asm_fred_entrypoint_user(), depending on whether this ought to be
 *    done for all entries from userspace or only for system calls.
 *
 * 5) INT $0x80 is the fast path for 32-bit system calls under FRED.
 */
DEFINE_FREDENTRY_RAW(int80_emulation)
{
	int nr;

	/*
	 * Unlike do_int80_emulation() there is neither a user_mode() check
	 * nor an int80_is_external() check here; see 1) - 3) above.
	 */
	enter_from_user_mode(regs);

	instrumentation_begin();
	add_random_kstack_offset();

	/*
	 * FRED pushed 0 into regs::orig_ax and regs::ax contains the
	 * syscall number.
	 *
	 * User tracing code (ptrace or signal handlers) might assume
	 * that the regs::orig_ax contains a 32-bit number on invoking
	 * a 32-bit syscall.
	 *
	 * Establish the syscall convention by saving the 32bit truncated
	 * syscall number in regs::orig_ax and by invalidating regs::ax.
	 */
	regs->orig_ax = regs->ax & GENMASK(31, 0);
	regs->ax = -ENOSYS;

	/* Picks up the number just stored in regs::orig_ax. */
	nr = syscall_32_enter(regs);

	local_irq_enable();
	/* Dispatch on whatever number the syscall entry work hands back. */
	nr = syscall_enter_from_user_mode_work(regs, nr);
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
241 #endif /* CONFIG_X86_FRED */
242 
243 #else /* CONFIG_IA32_EMULATION */
244 
/*
 * Handles int $0x80 on a 32bit kernel
 *
 * Built only when CONFIG_IA32_EMULATION is not set. The syscall number
 * is taken from regs->orig_ax by syscall_32_enter() and the result of
 * the system call is stored in regs->ax.
 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);

	add_random_kstack_offset();
	/*
	 * Subtlety here: if ptrace pokes something larger than 2^31-1 into
	 * orig_ax, the int return value truncates it. This matches
	 * the semantics of syscall_get_nr().
	 */
	nr = syscall_enter_from_user_mode(regs, nr);
	instrumentation_begin();

	/* Dispatch on whatever number the entry code handed back. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
264 #endif /* !CONFIG_IA32_EMULATION */
265 
/*
 * Common body of the fast 32-bit syscall entry, called from
 * do_fast_syscall_32().
 *
 * Loads the 32-bit value found at the user stack pointer into regs->bp
 * and then performs the system call, storing its result in regs->ax.
 *
 * Returns false if that user memory read failed; in that case regs->ax
 * is set to -EFAULT, no system call is invoked and the exit-to-user work
 * has already been run via irqentry_exit_to_user_mode(). Returns true
 * after the system call was dispatched and syscall_exit_to_user_mode()
 * has run.
 */
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
	int nr = syscall_32_enter(regs);
	int res;

	add_random_kstack_offset();
	/*
	 * This cannot use syscall_enter_from_user_mode() as it has to
	 * fetch EBP before invoking any of the syscall entry work
	 * functions.
	 */
	syscall_enter_from_user_mode_prepare(regs);

	instrumentation_begin();
	/* Fetch EBP from where the vDSO stashed it. */
	if (IS_ENABLED(CONFIG_X86_64)) {
		/*
		 * Micro-optimization: the pointer we're following is
		 * explicitly 32 bits, so it can't be out of range.
		 */
		res = __get_user(*(u32 *)&regs->bp,
			 (u32 __user __force *)(unsigned long)(u32)regs->sp);
	} else {
		res = get_user(*(u32 *)&regs->bp,
		       (u32 __user __force *)(unsigned long)(u32)regs->sp);
	}

	if (res) {
		/* User code screwed up. */
		regs->ax = -EFAULT;

		/* Bail out without running any syscall entry or exit work. */
		local_irq_disable();
		instrumentation_end();
		irqentry_exit_to_user_mode(regs);
		return false;
	}

	/* EBP is in place, so the deferred syscall entry work can run now. */
	nr = syscall_enter_from_user_mode_work(regs, nr);

	/* Now this is just like a normal syscall. */
	do_syscall_32_irqs_on(regs, nr);

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	return true;
}
312 
313 /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
do_fast_syscall_32(struct pt_regs * regs)314 __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
315 {
316 	/*
317 	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
318 	 * convention.  Adjust regs so it looks like we entered using int80.
319 	 */
320 	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
321 					vdso_image_32.sym_int80_landing_pad;
322 
323 	/*
324 	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
325 	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
326 	 * Fix it up.
327 	 */
328 	regs->ip = landing_pad;
329 
330 	/* Invoke the syscall. If it failed, keep it simple: use IRET. */
331 	if (!__do_fast_syscall_32(regs))
332 		return false;
333 
334 	/*
335 	 * Check that the register state is valid for using SYSRETL/SYSEXIT
336 	 * to exit to userspace.  Otherwise use the slower but fully capable
337 	 * IRET exit path.
338 	 */
339 
340 	/* XEN PV guests always use the IRET path */
341 	if (cpu_feature_enabled(X86_FEATURE_XENPV))
342 		return false;
343 
344 	/* EIP must point to the VDSO landing pad */
345 	if (unlikely(regs->ip != landing_pad))
346 		return false;
347 
348 	/* CS and SS must match the values set in MSR_STAR */
349 	if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
350 		return false;
351 
352 	/* If the TF, RF, or VM flags are set, use IRET */
353 	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
354 		return false;
355 
356 	/* Use SYSRETL/SYSEXIT to exit to userspace */
357 	return true;
358 }
359 
360 /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
do_SYSENTER_32(struct pt_regs * regs)361 __visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
362 {
363 	/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
364 	regs->sp = regs->bp;
365 
366 	/* SYSENTER clobbers EFLAGS.IF.  Assume it was set in usermode. */
367 	regs->flags |= X86_EFLAGS_IF;
368 
369 	return do_fast_syscall_32(regs);
370 }
371