xref: /linux/kernel/entry/common.c (revision c1fe867b5bf9c57ab7856486d342720e2b205eed)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/irq-entry-common.h>
4 #include <linux/resume_user_mode.h>
5 #include <linux/highmem.h>
6 #include <linux/jump_label.h>
7 #include <linux/kmsan.h>
8 #include <linux/livepatch.h>
9 #include <linux/tick.h>
10 
11 /* Workaround to allow gradual conversion of architecture code */
12 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
13 
/*
 * Work mask processed inside __exit_to_user_mode_loop(). With the generic
 * TIF bits, _TIF_RSEQ is masked out here — rseq work is handled after the
 * loop via rseq_exit_to_user_mode_restart() in exit_to_user_mode_loop().
 * Without them the full exit work mask is handled in the loop.
 */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
#define EXIT_TO_USER_MODE_WORK_LOOP	(EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
#else
#define EXIT_TO_USER_MODE_WORK_LOOP	(EXIT_TO_USER_MODE_WORK)
#endif
19 
/* TIF bits, which prevent a time slice extension. */
#ifdef CONFIG_PREEMPT_RT
/*
 * Since rseq slice ext has a direct correlation to the worst case
 * scheduling latency (schedule is delayed after all), only have it affect
 * LAZY reschedules on PREEMPT_RT for now.
 *
 * However, since this delay is only applicable to userspace, a value
 * for rseq_slice_extension_nsec that is strictly less than the worst case
 * kernel space preempt_disable() region, should mean the scheduling latency
 * is not affected, even for !LAZY.
 *
 * However, since this value depends on the hardware at hand, it cannot be
 * pre-determined in any sensible way. Hence punt on this problem for now.
 */
# define TIF_SLICE_EXT_SCHED	(_TIF_NEED_RESCHED_LAZY)
#else
/* Outside of RT both reschedule flavors may be deferred by an extension */
# define TIF_SLICE_EXT_SCHED	(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)
#endif
/* Any other pending exit work denies granting a slice extension */
#define TIF_SLICE_EXT_DENY	(EXIT_TO_USER_MODE_WORK & ~TIF_SLICE_EXT_SCHED)
40 
/*
 * __exit_to_user_mode_loop - Process pending TIF work before user space exit
 * @regs:	Pointer to pt_regs of the current task
 * @ti_work:	TIF work flags as read by the caller
 *
 * Entered with interrupts disabled. Each iteration enables interrupts,
 * handles the flagged work items, disables interrupts again and rereads
 * the flags, since the handlers may have raised new work.
 *
 * Returns the final work flag state for the caller.
 */
static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
							      unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed.
	 */
	while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {

		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
			/*
			 * Skip schedule() if a rseq time slice extension
			 * was granted; TIF_SLICE_EXT_DENY bits veto it.
			 */
			if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY))
				schedule();
		}

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			arch_do_signal_or_restart(regs);

		if (ti_work & _TIF_NOTIFY_RESUME)
			resume_user_mode_work(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption was
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();

		ti_work = read_thread_flags();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}
88 
89 /**
90  * exit_to_user_mode_loop - do any pending work before leaving to user space
91  * @regs:	Pointer to pt_regs on entry stack
92  * @ti_work:	TIF work flags as read by the caller
93  */
94 __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
95 						     unsigned long ti_work)
96 {
97 	for (;;) {
98 		ti_work = __exit_to_user_mode_loop(regs, ti_work);
99 
100 		if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
101 			return ti_work;
102 		ti_work = read_thread_flags();
103 	}
104 }
105 
/*
 * irqentry_enter - Establish state on interrupt entry
 * @regs:	Pointer to pt_regs of the interrupted context
 *
 * Returns an irqentry_state_t whose exit_rcu member tells
 * irqentry_exit() whether ct_irq_exit() has to be invoked, i.e. whether
 * this entry had to enter RCU/context-tracking itself.
 *
 * noinstr: no instrumentable code may run before the lockdep/RCU state
 * is established; instrumentation is enabled in the marked sections only.
 */
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
	irqentry_state_t ret = {
		.exit_rcu = false,
	};

	/* Entry from user space is handled by the user mode path entirely */
	if (user_mode(regs)) {
		irqentry_enter_from_user_mode(regs);
		return ret;
	}

	/*
	 * If this entry hit the idle task invoke ct_irq_enter() whether
	 * RCU is watching or not.
	 *
	 * Interrupts can nest when the first interrupt invokes softirq
	 * processing on return which enables interrupts.
	 *
	 * Scheduler ticks in the idle task can mark quiescent state and
	 * terminate a grace period, if and only if the timer interrupt is
	 * not nested into another interrupt.
	 *
	 * Checking for rcu_is_watching() here would prevent the nesting
	 * interrupt to invoke ct_irq_enter(). If that nested interrupt is
	 * the tick then rcu_flavor_sched_clock_irq() would wrongfully
	 * assume that it is the first interrupt and eventually claim
	 * quiescent state and end grace periods prematurely.
	 *
	 * Unconditionally invoke ct_irq_enter() so RCU state stays
	 * consistent.
	 *
	 * TINY_RCU does not support EQS, so let the compiler eliminate
	 * this part when enabled.
	 */
	if (!IS_ENABLED(CONFIG_TINY_RCU) &&
	    (is_idle_task(current) || arch_in_rcu_eqs())) {
		/*
		 * If RCU is not watching then the same careful
		 * sequence vs. lockdep and tracing is required
		 * as in irqentry_enter_from_user_mode().
		 */
		lockdep_hardirqs_off(CALLER_ADDR0);
		ct_irq_enter();
		instrumentation_begin();
		kmsan_unpoison_entry_regs(regs);
		trace_hardirqs_off_finish();
		instrumentation_end();

		/* Tell irqentry_exit() to invoke ct_irq_exit() */
		ret.exit_rcu = true;
		return ret;
	}

	/*
	 * If RCU is watching then RCU only wants to check whether it needs
	 * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
	 * already contains a warning when RCU is not watching, so no point
	 * in having another one here.
	 */
	lockdep_hardirqs_off(CALLER_ADDR0);
	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	rcu_irq_enter_check_tick();
	trace_hardirqs_off_finish();
	instrumentation_end();

	return ret;
}
173 
/**
 * arch_irqentry_exit_need_resched - Architecture specific need resched function
 *
 * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed.
 * Defaults return true.
 *
 * The main purpose is to permit arch to avoid preemption of a task from an IRQ.
 */
static inline bool arch_irqentry_exit_need_resched(void);

/*
 * The #ifndef only triggers when the architecture provides its override
 * as a macro of the same name; otherwise the default below is used.
 */
#ifndef arch_irqentry_exit_need_resched
static inline bool arch_irqentry_exit_need_resched(void) { return true; }
#endif
187 
188 void raw_irqentry_exit_cond_resched(void)
189 {
190 	if (!preempt_count()) {
191 		/* Sanity check RCU and thread stack */
192 		rcu_irq_exit_check_preempt();
193 		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
194 			WARN_ON_ONCE(!on_thread_stack());
195 		if (need_resched() && arch_irqentry_exit_need_resched())
196 			preempt_schedule_irq();
197 	}
198 }
/*
 * CONFIG_PREEMPT_DYNAMIC selects the preemption mode at boot. Two
 * mechanisms are supported, depending on architecture capabilities:
 */
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
/* Static call: retargeted at runtime between the raw function and a NOP */
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
/* Static key: branch patched to skip or take the cond_resched path */
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif
212 
/*
 * irqentry_exit - Undo irqentry_enter() and handle return work
 * @regs:	Pointer to pt_regs of the interrupted context
 * @state:	State returned by the matching irqentry_enter()
 *
 * Must be called with interrupts disabled. Depending on the interrupted
 * context this returns to user mode, re-enables tracing/lockdep state
 * for an interrupts-on kernel context (optionally preempting), or leaves
 * state alone for an interrupts-off kernel context. state.exit_rcu
 * mirrors whether irqentry_enter() invoked ct_irq_enter() itself.
 */
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
	lockdep_assert_irqs_disabled();

	/* Check whether this returns to user mode */
	if (user_mode(regs)) {
		irqentry_exit_to_user_mode(regs);
	} else if (!regs_irqs_disabled(regs)) {
		/*
		 * If RCU was not watching on entry this needs to be done
		 * carefully and needs the same ordering of lockdep/tracing
		 * and RCU as the return to user mode path.
		 */
		if (state.exit_rcu) {
			instrumentation_begin();
			hrtimer_rearm_deferred();
			/* Tell the tracer that IRET will enable interrupts */
			trace_hardirqs_on_prepare();
			lockdep_hardirqs_on_prepare();
			instrumentation_end();
			ct_irq_exit();
			lockdep_hardirqs_on(CALLER_ADDR0);
			return;
		}

		instrumentation_begin();
		if (IS_ENABLED(CONFIG_PREEMPTION))
			irqentry_exit_cond_resched();

		hrtimer_rearm_deferred();
		/* Covers both tracing and lockdep */
		trace_hardirqs_on();
		instrumentation_end();
	} else {
		/*
		 * IRQ flags state is correct already. Just tell RCU if it
		 * was not watching on entry.
		 */
		if (state.exit_rcu)
			ct_irq_exit();
	}
}
255 
/*
 * irqentry_nmi_enter - Establish state on NMI entry
 * @regs:	Pointer to pt_regs of the interrupted context
 *
 * Saves the lockdep hardirq state so irqentry_nmi_exit() can restore it,
 * then enters NMI context, context tracking and ftrace in the required
 * order. noinstr: no instrumentable code until the state is established.
 */
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
	irqentry_state_t irq_state;

	/* Record lockdep's view so the exit path can restore it */
	irq_state.lockdep = lockdep_hardirqs_enabled();

	__nmi_enter();
	lockdep_hardirqs_off(CALLER_ADDR0);
	lockdep_hardirq_enter();
	ct_nmi_enter();

	instrumentation_begin();
	kmsan_unpoison_entry_regs(regs);
	trace_hardirqs_off_finish();
	ftrace_nmi_enter();
	instrumentation_end();

	return irq_state;
}
275 
/*
 * irqentry_nmi_exit - Undo irqentry_nmi_enter()
 * @regs:	Pointer to pt_regs of the interrupted context
 * @irq_state:	State returned by the matching irqentry_nmi_enter()
 *
 * Unwinds in the reverse order of the enter path and restores the
 * lockdep hardirq state that was saved on entry.
 */
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
	instrumentation_begin();
	ftrace_nmi_exit();
	/* Prepare lockdep/tracing only if hardirqs were enabled on entry */
	if (irq_state.lockdep) {
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare();
	}
	instrumentation_end();

	ct_nmi_exit();
	lockdep_hardirq_exit();
	if (irq_state.lockdep)
		lockdep_hardirqs_on(CALLER_ADDR0);
	__nmi_exit();
}
292