1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Detect Hung Task
4 *
5 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
6 *
7 */
8
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>

#include <trace/events/sched.h>
27
/*
 * The number of tasks checked:
 */
static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;

/*
 * Total number of tasks detected as hung since boot:
 */
static unsigned long __read_mostly sysctl_hung_task_detect_count;

/*
 * Limit number of tasks checked in a batch.
 *
 * This value controls the preemptibility of khungtaskd since preemption
 * is disabled during the critical section. It also controls the size of
 * the RCU grace period. So it needs to be upper-bound.
 */
#define HUNG_TASK_LOCK_BREAK (HZ / 10)

/*
 * Zero means infinite timeout - no checking done:
 */
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);

/*
 * Zero (default value) means use sysctl_hung_task_timeout_secs:
 */
static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;

/*
 * Number of hung-task reports left before they are suppressed.
 * A negative value (sysctl allows -1) is never decremented, so it
 * effectively means "unlimited warnings".
 */
static int __read_mostly sysctl_hung_task_warnings = 10;

/* Set by the panic notifier; stops further scans once the system died. */
static int __read_mostly did_panic;
/* Per-scan flags set in check_hung_task() and consumed by its caller. */
static bool hung_task_show_lock;
static bool hung_task_call_panic;
static bool hung_task_show_all_bt;

/* The khungtaskd thread; woken early when a timeout sysctl is written. */
static struct task_struct *watchdog_task;

#ifdef CONFIG_SMP
/*
 * Should we dump all CPUs backtraces in a hung task event?
 * Defaults to 0, can be changed via sysctl.
 */
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
#else
#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */

/*
 * Should we panic (and reboot, if panic_timeout= is set) when a
 * hung task is detected:
 */
static unsigned int __read_mostly sysctl_hung_task_panic =
	IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
83
84 static int
hung_task_panic(struct notifier_block * this,unsigned long event,void * ptr)85 hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
86 {
87 did_panic = 1;
88
89 return NOTIFY_DONE;
90 }
91
/* Registered on panic_notifier_list so hung_task_panic() runs on panic. */
static struct notifier_block panic_block = {
	.notifier_call = hung_task_panic,
};
95
96
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
/*
 * If @task is blocked on a mutex, identify the apparent lock owner and
 * print its identity and stack trace alongside the hung-task report.
 *
 * Must run under rcu_read_lock() (asserted via RCU_LOCKDEP_WARN): the
 * task-list walk and the owner task's lifetime rely on it.
 */
static void debug_show_blocker(struct task_struct *task)
{
	struct task_struct *g, *t;
	unsigned long owner;
	struct mutex *lock;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");

	lock = READ_ONCE(task->blocker_mutex);
	if (!lock)
		return;

	owner = mutex_get_owner(lock);
	if (unlikely(!owner)) {
		pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
			task->comm, task->pid);
		return;
	}

	/*
	 * Ensure the owner information is correct: only report the owner
	 * if the raw owner value still matches a live task.
	 */
	for_each_process_thread(g, t) {
		if ((unsigned long)t == owner) {
			pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
				task->comm, task->pid, t->comm, t->pid);
			sched_show_task(t);
			return;
		}
	}
}
#else
/* Blocker tracking disabled: reports omit the mutex-owner information. */
static inline void debug_show_blocker(struct task_struct *task)
{
}
#endif
132
/*
 * Check a single task: it counts as hung when it has spent more than
 * @timeout seconds without a single context switch (voluntary or not).
 * On detection, print the report (rate-limited by hung_task_warnings)
 * and arm the show-lock/backtrace/panic flags consumed by the caller.
 */
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
	unsigned long switch_count = t->nvcsw + t->nivcsw;

	/*
	 * Ensure the task is not frozen.
	 * Also, skip vfork and any other user process that freezer should skip.
	 */
	if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
		return;

	/*
	 * When a freshly created task is scheduled once, changes its state to
	 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
	 * mustn't be checked.
	 */
	if (unlikely(!switch_count))
		return;

	/* The task switched since we last looked: restart its stall clock. */
	if (switch_count != t->last_switch_count) {
		t->last_switch_count = switch_count;
		t->last_switch_time = jiffies;
		return;
	}
	/* Still inside the allowed stall window - nothing to report yet. */
	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
		return;

	/*
	 * This counter tracks the total number of tasks detected as hung
	 * since boot.
	 */
	sysctl_hung_task_detect_count++;

	trace_sched_process_hang(t);

	if (sysctl_hung_task_panic) {
		console_verbose();
		hung_task_show_lock = true;
		hung_task_call_panic = true;
	}

	/*
	 * Ok, the task did not get scheduled for longer than the configured
	 * timeout, complain:
	 */
	if (sysctl_hung_task_warnings || hung_task_call_panic) {
		if (sysctl_hung_task_warnings > 0)
			sysctl_hung_task_warnings--;
		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
			t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
		pr_err(" %s %s %.*s\n",
			print_tainted(), init_utsname()->release,
			(int)strcspn(init_utsname()->version, " "),
			init_utsname()->version);
		if (t->flags & PF_POSTCOREDUMP)
			pr_err(" Blocked by coredump.\n");
		pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
			" disables this message.\n");
		sched_show_task(t);
		debug_show_blocker(t);
		hung_task_show_lock = true;

		if (sysctl_hung_task_all_cpu_backtrace)
			hung_task_show_all_bt = true;
		if (!sysctl_hung_task_warnings)
			pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
	}

	touch_nmi_watchdog();
}
203
/*
 * To avoid extending the RCU grace period for an unbounded amount of time,
 * periodically exit the critical section and enter a new one.
 *
 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
 * to exit the grace period. For classic RCU, a reschedule is required.
 *
 * Returns false when either task is no longer alive after re-entering the
 * read-side section, in which case the caller must abandon its iteration.
 */
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
	bool can_cont;

	/* Pin both iteration cursors so they can't be freed while unlocked. */
	get_task_struct(g);
	get_task_struct(t);
	rcu_read_unlock();
	cond_resched();
	rcu_read_lock();
	/* The walk may only continue from tasks still on the task list. */
	can_cont = pid_alive(g) && pid_alive(t);
	put_task_struct(t);
	put_task_struct(g);

	return can_cont;
}
226
/*
 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
 * longer than the configured timeout. If that happens, print out
 * a warning.
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	int max_count = sysctl_hung_task_check_count;
	unsigned long last_break = jiffies;
	struct task_struct *g, *t;

	/*
	 * If the system crashed already then all bets are off,
	 * do not report extra hung tasks:
	 */
	if (test_taint(TAINT_DIE) || did_panic)
		return;

	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {
		unsigned int state;

		/* Stop once the configured per-scan task budget is exhausted. */
		if (!max_count--)
			goto unlock;
		/*
		 * Periodically drop and re-take the RCU read lock so the
		 * grace period stays bounded (see HUNG_TASK_LOCK_BREAK).
		 */
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
			if (!rcu_lock_break(g, t))
				goto unlock;
			last_break = jiffies;
		}
		/*
		 * skip the TASK_KILLABLE tasks -- these can be killed
		 * skip the TASK_IDLE tasks -- those are genuinely idle
		 */
		state = READ_ONCE(t->__state);
		if ((state & TASK_UNINTERRUPTIBLE) &&
		    !(state & TASK_WAKEKILL) &&
		    !(state & TASK_NOLOAD))
			check_hung_task(t, timeout);
	}
 unlock:
	rcu_read_unlock();
	if (hung_task_show_lock)
		debug_show_all_locks();

	if (hung_task_show_all_bt) {
		hung_task_show_all_bt = false;
		trigger_all_cpu_backtrace();
	}

	if (hung_task_call_panic)
		panic("hung_task: blocked tasks");
}
280
hung_timeout_jiffies(unsigned long last_checked,unsigned long timeout)281 static long hung_timeout_jiffies(unsigned long last_checked,
282 unsigned long timeout)
283 {
284 /* timeout of 0 will disable the watchdog */
285 return timeout ? last_checked - jiffies + timeout * HZ :
286 MAX_SCHEDULE_TIMEOUT;
287 }
288
289 #ifdef CONFIG_SYSCTL
290 /*
291 * Process updating of timeout sysctl
292 */
proc_dohung_task_timeout_secs(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)293 static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
294 void *buffer,
295 size_t *lenp, loff_t *ppos)
296 {
297 int ret;
298
299 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
300
301 if (ret || !write)
302 goto out;
303
304 wake_up_process(watchdog_task);
305
306 out:
307 return ret;
308 }
309
/*
 * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
 * and hung_task_check_interval_secs
 */
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);

/* Tunables exposed under /proc/sys/kernel/. */
static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
	{
		.procname	= "hung_task_all_cpu_backtrace",
		.data		= &sysctl_hung_task_all_cpu_backtrace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif /* CONFIG_SMP */
	{
		.procname	= "hung_task_panic",
		.data		= &sysctl_hung_task_panic,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "hung_task_check_count",
		.data		= &sysctl_hung_task_check_count,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		/* Writes wake khungtaskd via proc_dohung_task_timeout_secs(). */
		.procname	= "hung_task_timeout_secs",
		.data		= &sysctl_hung_task_timeout_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_check_interval_secs",
		.data		= &sysctl_hung_task_check_interval_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		/* extra1 = -1 permits the "unlimited warnings" sentinel. */
		.procname	= "hung_task_warnings",
		.data		= &sysctl_hung_task_warnings,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_NEG_ONE,
	},
	{
		/* Read-only detection counter (0444). */
		.procname	= "hung_task_detect_count",
		.data		= &sysctl_hung_task_detect_count,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,
	},
};
376
/* Register the sysctl table above; called once from hung_task_init(). */
static void __init hung_task_sysctl_init(void)
{
	register_sysctl_init("kernel", hung_task_sysctls);
}
#else
/* Without CONFIG_SYSCTL there is nothing to register. */
#define hung_task_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
384
385
/* When set, khungtaskd consumes it (atomic_xchg) and skips one scan. */
static atomic_t reset_hung_task = ATOMIC_INIT(0);

/*
 * reset_hung_task_detector - request that the next due hung-task scan
 * be skipped (the check period still restarts).
 */
void reset_hung_task_detector(void)
{
	atomic_set(&reset_hung_task, 1);
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
393
394 static bool hung_detector_suspended;
395
hungtask_pm_notify(struct notifier_block * self,unsigned long action,void * hcpu)396 static int hungtask_pm_notify(struct notifier_block *self,
397 unsigned long action, void *hcpu)
398 {
399 switch (action) {
400 case PM_SUSPEND_PREPARE:
401 case PM_HIBERNATION_PREPARE:
402 case PM_RESTORE_PREPARE:
403 hung_detector_suspended = true;
404 break;
405 case PM_POST_SUSPEND:
406 case PM_POST_HIBERNATION:
407 case PM_POST_RESTORE:
408 hung_detector_suspended = false;
409 break;
410 default:
411 break;
412 }
413 return NOTIFY_OK;
414 }
415
/*
 * kthread which checks for tasks stuck in D state
 */
static int watchdog(void *dummy)
{
	unsigned long hung_last_checked = jiffies;

	set_user_nice(current, 0);

	for ( ; ; ) {
		unsigned long timeout = sysctl_hung_task_timeout_secs;
		unsigned long interval = sysctl_hung_task_check_interval_secs;
		long t;

		/* A zero interval means "scan once per timeout period". */
		if (interval == 0)
			interval = timeout;
		/* Scanning less often than the timeout would miss hangs. */
		interval = min_t(unsigned long, interval, timeout);
		t = hung_timeout_jiffies(hung_last_checked, interval);
		if (t <= 0) {
			/*
			 * A scan is due.  Skip it when a reset was requested
			 * or the detector is suspended for PM, but restart
			 * the period either way.
			 */
			if (!atomic_xchg(&reset_hung_task, 0) &&
			    !hung_detector_suspended)
				check_hung_uninterruptible_tasks(timeout);
			hung_last_checked = jiffies;
			continue;
		}
		/* Sleep until due; a sysctl write wakes us up early. */
		schedule_timeout_interruptible(t);
	}

	return 0;
}
446
hung_task_init(void)447 static int __init hung_task_init(void)
448 {
449 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
450
451 /* Disable hung task detector on suspend */
452 pm_notifier(hungtask_pm_notify, 0);
453
454 watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
455 hung_task_sysctl_init();
456
457 return 0;
458 }
459 subsys_initcall(hung_task_init);
460