xref: /linux/include/linux/entry-common.h (revision 15a1bccddccba6cab63fec1345fbd24102d9e0b8)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef __LINUX_ENTRYCOMMON_H
3 #define __LINUX_ENTRYCOMMON_H
4 
5 #include <linux/audit.h>
6 #include <linux/irq-entry-common.h>
7 #include <linux/livepatch.h>
8 #include <linux/ptrace.h>
9 #include <linux/resume_user_mode.h>
10 #include <linux/seccomp.h>
11 #include <linux/sched.h>
12 
13 #include <asm/entry-common.h>
14 #include <asm/syscall.h>
15 
/* Fallback for architectures without uprobe TIF support */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

/* Aggregate mask of all syscall work handled on the entry path */
#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |	\
				 ARCH_SYSCALL_WORK_ENTER)
/* Aggregate mask of all syscall work handled on the exit path */
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP	|	\
				 ARCH_SYSCALL_WORK_EXIT)
48 
/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 * @regs: Pointer to the register state at syscall entry
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
/* Generic fallback: forward directly to the core ptrace report helper */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif
67 
68 bool syscall_user_dispatch(struct pt_regs *regs);
69 long trace_syscall_enter(struct pt_regs *regs, long syscall);
70 void trace_syscall_exit(struct pt_regs *regs, long ret);
71 
72 static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
73 {
74 	if (unlikely(audit_context())) {
75 		unsigned long args[6];
76 
77 		syscall_get_arguments(current, regs, args);
78 		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
79 	}
80 }
81 
/*
 * Handle all SYSCALL_WORK_ENTER items indicated by @work. Returns the
 * (possibly modified) syscall number to invoke, or -1 when the syscall
 * has to be skipped.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch.  This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace; SYSCALL_EMU skips the syscall after reporting */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}
128 
129 /**
130  * syscall_enter_from_user_mode_work - Check and handle work before invoking
131  *				       a syscall
132  * @regs:	Pointer to currents pt_regs
133  * @syscall:	The syscall number
134  *
135  * Invoked from architecture specific syscall entry code with interrupts
136  * enabled after invoking enter_from_user_mode(), enabling interrupts and
137  * extra architecture specific work.
138  *
139  * Returns: The original or a modified syscall number
140  *
141  * If the returned syscall number is -1 then the syscall should be
142  * skipped. In this case the caller may invoke syscall_set_error() or
143  * syscall_set_return_value() first.  If neither of those are called and -1
144  * is returned, then the syscall will fail with ENOSYS.
145  *
146  * It handles the following work items:
147  *
148  *  1) syscall_work flag dependent invocations of
149  *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
150  *  2) Invocation of audit_syscall_entry()
151  */
152 static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
153 {
154 	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
155 
156 	if (work & SYSCALL_WORK_ENTER)
157 		syscall = syscall_trace_enter(regs, work);
158 
159 	return syscall;
160 }
161 
/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	/* Non-instrumentable state establishment has to happen first */
	enter_from_user_mode(regs);

	/* Interrupts stay disabled until state is established */
	instrumentation_begin();
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}
193 
194 /*
195  * If SYSCALL_EMU is set, then the only reason to report is when
196  * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP).  This syscall
197  * instruction has been already reported in syscall_enter_from_user_mode().
198  */
199 static __always_inline bool report_single_step(unsigned long work)
200 {
201 	if (work & SYSCALL_WORK_SYSCALL_EMU)
202 		return false;
203 
204 	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
205 }
206 
/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 * @regs: Pointer to the register state at syscall exit
 * @step: Indicates a single-step exit rather than a normal syscall exit
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
/* Generic fallback: forward directly to the core ptrace report helper */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif
226 
227 /**
228  * syscall_exit_work - Handle work before returning to user mode
229  * @regs:	Pointer to current pt_regs
230  * @work:	Current thread syscall work
231  *
232  * Do one-time syscall specific work.
233  */
234 static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
235 {
236 	bool step;
237 
238 	/*
239 	 * If the syscall was rolled back due to syscall user dispatching,
240 	 * then the tracers below are not invoked for the same reason as
241 	 * the entry side was not invoked in syscall_trace_enter(): The ABI
242 	 * of these syscalls is unknown.
243 	 */
244 	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
245 		if (unlikely(current->syscall_dispatch.on_dispatch)) {
246 			current->syscall_dispatch.on_dispatch = false;
247 			return;
248 		}
249 	}
250 
251 	audit_syscall_exit(regs);
252 
253 	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
254 		trace_syscall_exit(regs, syscall_get_return_value(current, regs));
255 
256 	step = report_single_step(work);
257 	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
258 		arch_ptrace_report_syscall_exit(regs, step);
259 }
260 
/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	/* Syscalls must return in kernel context tracking state */
	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* Catch syscalls which returned with interrupts disabled */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}
291 
/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *      - audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	local_irq_disable_exit_to_user();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Final non-instrumentable transition back to user mode */
	exit_to_user_mode();
}
332 
333 #endif
334