1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef __LINUX_ENTRYCOMMON_H
3 #define __LINUX_ENTRYCOMMON_H
4
5 #include <linux/audit.h>
6 #include <linux/irq-entry-common.h>
7 #include <linux/livepatch.h>
8 #include <linux/ptrace.h>
9 #include <linux/resume_user_mode.h>
10 #include <linux/seccomp.h>
11 #include <linux/sched.h>
12
13 #include <asm/entry-common.h>
14 #include <asm/syscall.h>
15
/* Fallback so generic code can test the flag on architectures without uprobes */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

/* Union of all flags that divert syscall entry into syscall_trace_enter() */
#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |	\
				 ARCH_SYSCALL_WORK_ENTER)
/* Union of all flags that divert syscall exit into syscall_exit_work() */
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
				 ARCH_SYSCALL_WORK_EXIT)
48
/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 * @regs:	Pointer to the register state at syscall entry
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back to
 * ptrace_report_syscall_entry().
 *
 * Returns: 0 to continue the syscall; non-zero causes syscall_trace_enter()
 * to skip the syscall and return -1L.
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
/* Generic fallback used when the architecture does not provide an override */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif
67
/* Out-of-line helpers invoked from the inline entry/exit code below */
bool syscall_user_dispatch(struct pt_regs *regs);
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);
71
syscall_enter_audit(struct pt_regs * regs,long syscall)72 static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
73 {
74 if (unlikely(audit_context())) {
75 unsigned long args[6];
76
77 syscall_get_arguments(current, regs, args);
78 audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
79 }
80 }
81
/*
 * syscall_trace_enter - Process pending SYSCALL_WORK on syscall entry
 * @regs:	Pointer to the register state at syscall entry
 * @work:	Snapshot of currents syscall_work flags
 *
 * Returns the (possibly rewritten) syscall number, or -1L when the
 * syscall is to be skipped (user dispatch, ptrace abort/EMU, seccomp).
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace. EMU means the syscall is emulated: always skip it. */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	/* The tracepoint may also rewrite the syscall number */
	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}
128
129 /**
130 * syscall_enter_from_user_mode_work - Check and handle work before invoking
131 * a syscall
132 * @regs: Pointer to currents pt_regs
133 * @syscall: The syscall number
134 *
135 * Invoked from architecture specific syscall entry code with interrupts
136 * enabled after invoking enter_from_user_mode(), enabling interrupts and
137 * extra architecture specific work.
138 *
139 * Returns: The original or a modified syscall number
140 *
141 * If the returned syscall number is -1 then the syscall should be
142 * skipped. In this case the caller may invoke syscall_set_error() or
143 * syscall_set_return_value() first. If neither of those are called and -1
144 * is returned, then the syscall will fail with ENOSYS.
145 *
146 * It handles the following work items:
147 *
148 * 1) syscall_work flag dependent invocations of
149 * ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
150 * 2) Invocation of audit_syscall_entry()
151 */
syscall_enter_from_user_mode_work(struct pt_regs * regs,long syscall)152 static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
153 {
154 unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
155
156 if (work & SYSCALL_WORK_ENTER)
157 syscall = syscall_trace_enter(regs, work);
158
159 return syscall;
160 }
161
/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to currents pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled. The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	/* Must happen before anything instrumentable, with interrupts disabled */
	enter_from_user_mode(regs);

	instrumentation_begin();
	/* The entry work runs with interrupts enabled */
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}
193
194 /*
195 * If SYSCALL_EMU is set, then the only reason to report is when
196 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
197 * instruction has been already reported in syscall_enter_from_user_mode().
198 */
report_single_step(unsigned long work)199 static __always_inline bool report_single_step(unsigned long work)
200 {
201 if (work & SYSCALL_WORK_SYSCALL_EMU)
202 return false;
203
204 return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
205 }
206
/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 * @regs:	Pointer to the register state at syscall exit
 * @step:	Indicates a single-step exit rather than a normal syscall exit
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back to
 * ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
/* Generic fallback used when the architecture does not provide an override */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif
226
227 /**
228 * syscall_exit_work - Handle work before returning to user mode
229 * @regs: Pointer to current pt_regs
230 * @work: Current thread syscall work
231 *
232 * Do one-time syscall specific work.
233 */
syscall_exit_work(struct pt_regs * regs,unsigned long work)234 static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
235 {
236 bool step;
237
238 /*
239 * If the syscall was rolled back due to syscall user dispatching,
240 * then the tracers below are not invoked for the same reason as
241 * the entry side was not invoked in syscall_trace_enter(): The ABI
242 * of these syscalls is unknown.
243 */
244 if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
245 if (unlikely(current->syscall_dispatch.on_dispatch)) {
246 current->syscall_dispatch.on_dispatch = false;
247 return;
248 }
249 }
250
251 audit_syscall_exit(regs);
252
253 if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
254 trace_syscall_exit(regs, syscall_get_return_value(current, regs));
255
256 step = report_single_step(work);
257 if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
258 arch_ptrace_report_syscall_exit(regs, step);
259 }
260
/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to currents pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	/* Syscall exit must happen in kernel context tracking state */
	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/*
	 * Syscalls are expected to return with interrupts enabled. With
	 * lock proving enabled, warn naming the offending syscall and
	 * repair the state so the exit path can proceed.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}
291
/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to currents pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3). This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	/* Step 1: one-time exit work, runs with interrupts enabled */
	syscall_exit_to_user_mode_work(regs);
	/* Step 2: TIF work loop and arch preparation, interrupts disabled */
	local_irq_disable();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Step 3: non-instrumentable final transition to user mode */
	exit_to_user_mode();
}
332
333 #endif
334