/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H

#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>

#include <asm/entry-common.h>
#include <asm/syscall.h>

/* Architectures without uprobe support do not provide _TIF_UPROBE */
#ifndef _TIF_UPROBE
# define _TIF_UPROBE			(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_enter_from_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_ENTER
# define ARCH_SYSCALL_WORK_ENTER	(0)
#endif

/*
 * SYSCALL_WORK flags handled in syscall_exit_to_user_mode()
 */
#ifndef ARCH_SYSCALL_WORK_EXIT
# define ARCH_SYSCALL_WORK_EXIT		(0)
#endif

/* Aggregate masks checked once on the syscall entry and exit fast paths */
#define SYSCALL_WORK_ENTER	(SYSCALL_WORK_SECCOMP |			\
				 SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_EMU |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |	\
				 ARCH_SYSCALL_WORK_ENTER)
#define SYSCALL_WORK_EXIT	(SYSCALL_WORK_SYSCALL_TRACEPOINT |	\
				 SYSCALL_WORK_SYSCALL_TRACE |		\
				 SYSCALL_WORK_SYSCALL_AUDIT |		\
				 SYSCALL_WORK_SYSCALL_USER_DISPATCH |	\
				 SYSCALL_WORK_SYSCALL_EXIT_TRAP |	\
				 ARCH_SYSCALL_WORK_EXIT)

/**
 * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
 * @regs:	Pointer to the register state at syscall entry
 *
 * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
 *
 * This allows architecture specific ptrace_report_syscall_entry()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_entry().
 */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);

#ifndef arch_ptrace_report_syscall_entry
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
	return ptrace_report_syscall_entry(regs);
}
#endif

/* Out of line helpers, implemented in the generic entry code */
bool syscall_user_dispatch(struct pt_regs *regs);
long trace_syscall_enter(struct pt_regs *regs, long syscall);
void trace_syscall_exit(struct pt_regs *regs, long ret);

/*
 * Report syscall entry to the audit subsystem, but only when an audit
 * context is attached to the current task.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	if (unlikely(audit_context())) {
		unsigned long args[6];

		syscall_get_arguments(current, regs, args);
		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
	}
}

/*
 * Slow path of syscall_enter_from_user_mode_work(): handle the pending
 * SYSCALL_WORK_ENTER items in @work. Returns the (possibly modified)
 * syscall number, or -1L if the syscall should be skipped.
 */
static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
{
	long syscall, ret = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (syscall_user_dispatch(regs))
			return -1L;
	}

	/*
	 * User space got a time slice extension granted and relinquishes
	 * the CPU. The work stops the slice timer to avoid an extra round
	 * through hrtimer_interrupt().
	 */
	if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
		rseq_syscall_enter_work(syscall_get_nr(current, regs));

	/* Handle ptrace: EMU means the syscall is emulated and skipped */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		ret = arch_ptrace_report_syscall_entry(regs);
		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes.
	 */
	if (work & SYSCALL_WORK_SECCOMP) {
		ret = __secure_computing();
		if (ret == -1L)
			return ret;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		syscall = trace_syscall_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return ret ? : syscall;
}

/**
 * syscall_enter_from_user_mode_work - Check and handle work before invoking
 *				       a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * enabled after invoking enter_from_user_mode(), enabling interrupts and
 * extra architecture specific work.
 *
 * Returns: The original or a modified syscall number
 *
 * If the returned syscall number is -1 then the syscall should be
 * skipped. In this case the caller may invoke syscall_set_error() or
 * syscall_set_return_value() first. If neither of those are called and -1
 * is returned, then the syscall will fail with ENOSYS.
 *
 * It handles the following work items:
 *
 *  1) syscall_work flag dependent invocations of
 *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
 *  2) Invocation of audit_syscall_entry()
 */
static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	/* Fast path: no entry work pending, return the syscall unmodified */
	if (work & SYSCALL_WORK_ENTER)
		syscall = syscall_trace_enter(regs, work);

	return syscall;
}

/**
 * syscall_enter_from_user_mode - Establish state and check and handle work
 *				  before invoking a syscall
 * @regs:	Pointer to current's pt_regs
 * @syscall:	The syscall number
 *
 * Invoked from architecture specific syscall entry code with interrupts
 * disabled.
 * The calling code has to be non-instrumentable. When the
 * function returns all state is correct, interrupts are enabled and the
 * subsequent functions can be instrumented.
 *
 * This is the combination of enter_from_user_mode() and
 * syscall_enter_from_user_mode_work() to be used when there is no
 * architecture specific work to be done between the two.
 *
 * Returns: The original or a modified syscall number. See
 * syscall_enter_from_user_mode_work() for further explanation.
 */
static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long ret;

	/* Establish kernel state (context tracking, lockdep); noinstr section */
	enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	ret = syscall_enter_from_user_mode_work(regs, syscall);
	instrumentation_end();

	return ret;
}

/*
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has already been reported in syscall_enter_from_user_mode().
 */
static __always_inline bool report_single_step(unsigned long work)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;

	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
}

/**
 * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
 * @regs:	Pointer to the register state at syscall exit
 * @step:	Indicates a single-step exit rather than a normal syscall exit
 *
 * This allows architecture specific ptrace_report_syscall_exit()
 * implementations. If not defined by the architecture this falls back
 * to ptrace_report_syscall_exit().
 */
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step);

#ifndef arch_ptrace_report_syscall_exit
static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
							    int step)
{
	ptrace_report_syscall_exit(regs, step);
}
#endif

/**
 * syscall_exit_work - Handle work before returning to user mode
 * @regs:	Pointer to current pt_regs
 * @work:	Current thread syscall work
 *
 * Do one-time syscall specific work.
 */
static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
		if (unlikely(current->syscall_dispatch.on_dispatch)) {
			current->syscall_dispatch.on_dispatch = false;
			return;
		}
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_syscall_exit(regs, syscall_get_return_value(current, regs));

	/* Report to ptrace on single-step or plain syscall-exit tracing */
	step = report_single_step(work);
	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
		arch_ptrace_report_syscall_exit(regs, step);
}

/**
 * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
 *
 * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
 */
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CT_STATE_KERNEL);

	/* With lock proving enabled, catch syscalls returning with IRQs off */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
			local_irq_enable();
	}

	rseq_debug_syscall_return(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, work);
}

/**
 * syscall_exit_to_user_mode - Handle work before returning to user mode
 * @regs:	Pointer to current's pt_regs
 *
 * Invoked with interrupts enabled and fully valid @regs. Returns with all
 * work handled, interrupts disabled such that the caller can immediately
 * switch to user mode. Called from architecture specific syscall and ret
 * from fork code.
 *
 * The call order is:
 *  1) One-time syscall exit work:
 *	- rseq syscall exit
 *	- audit
 *	- syscall tracing
 *	- ptrace (single stepping)
 *
 *  2) Preparatory work
 *	- Disable interrupts
 *	- Exit to user mode loop (common TIF handling). Invokes
 *	  arch_exit_to_user_mode_work() for architecture specific TIF work
 *	- Architecture specific one time work arch_exit_to_user_mode_prepare()
 *	- Address limit and lockdep checks
 *
 *  3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *     functionality in exit_to_user_mode().
 *
 * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
 * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
 * exit_to_user_mode() (3).
 * This function is preferred unless there is a
 * compelling architectural reason to invoke the functions separately.
 */
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);
	local_irq_disable_exit_to_user();
	syscall_exit_to_user_mode_prepare(regs);
	instrumentation_end();
	/* Final noinstr transition back to user mode */
	exit_to_user_mode();
}

#endif