1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* delayacct.c - per-task delay accounting
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 */
6
7 #include <linux/sched.h>
8 #include <linux/sched/task.h>
9 #include <linux/sched/cputime.h>
10 #include <linux/sched/clock.h>
11 #include <linux/slab.h>
12 #include <linux/taskstats.h>
13 #include <linux/sysctl.h>
14 #include <linux/delayacct.h>
15 #include <linux/module.h>
16
/*
 * UPDATE_DELAY(type) - fold one delay category of a task into a taskstats.
 *
 * Not a hygienic macro: it expands against three names that must exist at
 * the call site:
 *   d   - destination record whose type##_* fields are updated
 *   tsk - task whose ->delays holds the per-task figures
 *   tmp - scratch integer used for the overflow check
 *
 * The count and the total are accumulated into d. If the new total compares
 * less than the previous one (overflow), the total is stored as 0 instead.
 * The min, max and max-timestamp fields are overwritten with the task's
 * values, not merged.
 */
#define UPDATE_DELAY(type) \
do { \
	d->type##_count += tsk->delays->type##_count; \
	tmp = d->type##_delay_total + tsk->delays->type##_delay; \
	d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \
	d->type##_delay_min = tsk->delays->type##_delay_min; \
	d->type##_delay_max = tsk->delays->type##_delay_max; \
	d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \
	d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \
} while (0)
27
28 DEFINE_STATIC_KEY_FALSE(delayacct_key);
29 int delayacct_on __read_mostly; /* Delay accounting turned on/off */
30 struct kmem_cache *delayacct_cache;
31
/*
 * Switch delay accounting on or off.
 *
 * Both the static key and the delayacct_on integer are updated. When
 * enabling, the key is flipped first and the integer second; when
 * disabling, the integer is cleared first and the key is flipped last.
 */
static void set_delayacct(bool enabled)
{
	if (!enabled) {
		delayacct_on = 0;
		static_branch_disable(&delayacct_key);
		return;
	}

	static_branch_enable(&delayacct_key);
	delayacct_on = 1;
}
42
delayacct_setup_enable(char * str)43 static int __init delayacct_setup_enable(char *str)
44 {
45 delayacct_on = 1;
46 return 1;
47 }
48 __setup("delayacct", delayacct_setup_enable);
49
delayacct_init(void)50 void delayacct_init(void)
51 {
52 delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
53 delayacct_tsk_init(&init_task);
54 set_delayacct(delayacct_on);
55 }
56
57 #ifdef CONFIG_PROC_SYSCTL
/*
 * proc handler for the "task_delayacct" sysctl.
 *
 * The request is served from a stack copy of @table whose .data points at a
 * local snapshot of delayacct_on, so proc_dointvec_minmax() never writes the
 * global itself. After a write that parsed without error, the new value is
 * applied through set_delayacct().
 *
 * Returns -EPERM for a write without CAP_SYS_ADMIN; otherwise whatever
 * proc_dointvec_minmax() returned.
 */
static int sysctl_delayacct(const struct ctl_table *table, int write, void *buffer,
			    size_t *lenp, loff_t *ppos)
{
	int enabled = delayacct_on;
	struct ctl_table shadow;
	int ret;

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	shadow = *table;
	shadow.data = &enabled;

	ret = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
	if (ret >= 0 && write)
		set_delayacct(enabled);

	return ret;
}
77
78 static const struct ctl_table kern_delayacct_table[] = {
79 {
80 .procname = "task_delayacct",
81 .data = NULL,
82 .maxlen = sizeof(unsigned int),
83 .mode = 0644,
84 .proc_handler = sysctl_delayacct,
85 .extra1 = SYSCTL_ZERO,
86 .extra2 = SYSCTL_ONE,
87 },
88 };
89
kernel_delayacct_sysctls_init(void)90 static __init int kernel_delayacct_sysctls_init(void)
91 {
92 register_sysctl_init("kernel", kern_delayacct_table);
93 return 0;
94 }
95 late_initcall(kernel_delayacct_sysctls_init);
96 #endif
97
__delayacct_tsk_init(struct task_struct * tsk)98 void __delayacct_tsk_init(struct task_struct *tsk)
99 {
100 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
101 if (tsk->delays)
102 raw_spin_lock_init(&tsk->delays->lock);
103 }
104
105 /*
106 * Finish delay accounting for a statistic using its timestamps (@start),
107 * accumulator (@total) and @count
108 */
delayacct_end(raw_spinlock_t * lock,u64 * start,u64 * total,u32 * count,u64 * max,u64 * min,struct timespec64 * ts)109 static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count,
110 u64 *max, u64 *min, struct timespec64 *ts)
111 {
112 s64 ns = local_clock() - *start;
113 unsigned long flags;
114
115 if (ns > 0) {
116 raw_spin_lock_irqsave(lock, flags);
117 *total += ns;
118 (*count)++;
119 if (ns > *max) {
120 *max = ns;
121 ktime_get_real_ts64(ts);
122 }
123 if (*min == 0 || ns < *min)
124 *min = ns;
125 raw_spin_unlock_irqrestore(lock, flags);
126 }
127 }
128
__delayacct_blkio_start(void)129 void __delayacct_blkio_start(void)
130 {
131 current->delays->blkio_start = local_clock();
132 }
133
134 /*
135 * We cannot rely on the `current` macro, as we haven't yet switched back to
136 * the process being woken.
137 */
__delayacct_blkio_end(struct task_struct * p)138 void __delayacct_blkio_end(struct task_struct *p)
139 {
140 delayacct_end(&p->delays->lock,
141 &p->delays->blkio_start,
142 &p->delays->blkio_delay,
143 &p->delays->blkio_count,
144 &p->delays->blkio_delay_max,
145 &p->delays->blkio_delay_min,
146 &p->delays->blkio_delay_max_ts);
147 }
148
delayacct_add_tsk(struct taskstats * d,struct task_struct * tsk)149 int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
150 {
151 u64 utime, stime, stimescaled, utimescaled;
152 unsigned long long t2, t3;
153 unsigned long flags, t1;
154 s64 tmp;
155
156 task_cputime(tsk, &utime, &stime);
157 tmp = (s64)d->cpu_run_real_total;
158 tmp += utime + stime;
159 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
160
161 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
162 tmp = (s64)d->cpu_scaled_run_real_total;
163 tmp += utimescaled + stimescaled;
164 d->cpu_scaled_run_real_total =
165 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
166
167 /*
168 * No locking available for sched_info (and too expensive to add one)
169 * Mitigate by taking snapshot of values
170 */
171 t1 = tsk->sched_info.pcount;
172 t2 = tsk->sched_info.run_delay;
173 t3 = tsk->se.sum_exec_runtime;
174
175 d->cpu_count += t1;
176
177 d->cpu_delay_max = tsk->sched_info.max_run_delay;
178 d->cpu_delay_min = tsk->sched_info.min_run_delay;
179 d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec;
180 d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec;
181 tmp = (s64)d->cpu_delay_total + t2;
182 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
183 tmp = (s64)d->cpu_run_virtual_total + t3;
184
185 d->cpu_run_virtual_total =
186 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
187
188 if (!tsk->delays)
189 return 0;
190
191 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
192 raw_spin_lock_irqsave(&tsk->delays->lock, flags);
193 UPDATE_DELAY(blkio);
194 UPDATE_DELAY(swapin);
195 UPDATE_DELAY(freepages);
196 UPDATE_DELAY(thrashing);
197 UPDATE_DELAY(compact);
198 UPDATE_DELAY(wpcopy);
199 UPDATE_DELAY(irq);
200 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
201
202 return 0;
203 }
204
__delayacct_blkio_ticks(struct task_struct * tsk)205 __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
206 {
207 __u64 ret;
208 unsigned long flags;
209
210 raw_spin_lock_irqsave(&tsk->delays->lock, flags);
211 ret = nsec_to_clock_t(tsk->delays->blkio_delay);
212 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
213 return ret;
214 }
215
__delayacct_freepages_start(void)216 void __delayacct_freepages_start(void)
217 {
218 current->delays->freepages_start = local_clock();
219 }
220
__delayacct_freepages_end(void)221 void __delayacct_freepages_end(void)
222 {
223 delayacct_end(¤t->delays->lock,
224 ¤t->delays->freepages_start,
225 ¤t->delays->freepages_delay,
226 ¤t->delays->freepages_count,
227 ¤t->delays->freepages_delay_max,
228 ¤t->delays->freepages_delay_min,
229 ¤t->delays->freepages_delay_max_ts);
230 }
231
/*
 * Begin a thrashing delay interval for the current task.
 *
 * @in_thrashing: out-parameter; receives whether current->in_thrashing was
 *                already set on entry.
 *
 * If the flag was already set, nothing else happens. Otherwise the flag is
 * set and the start of the interval is stamped.
 */
void __delayacct_thrashing_start(bool *in_thrashing)
{
	bool already = !!current->in_thrashing;

	*in_thrashing = already;
	if (!already) {
		current->in_thrashing = 1;
		current->delays->thrashing_start = local_clock();
	}
}
241
/*
 * End a thrashing delay interval for the current task.
 *
 * @in_thrashing: the value reported by __delayacct_thrashing_start(). When
 *                it is true, the task was already flagged at start time and
 *                this call does nothing.
 *
 * Otherwise current->in_thrashing is cleared and the interval is recorded
 * in the task's thrashing total, count, max, min and max-timestamp fields.
 */
void __delayacct_thrashing_end(bool *in_thrashing)
{
	if (*in_thrashing)
		return;

	current->in_thrashing = 0;
	delayacct_end(&current->delays->lock,
		      &current->delays->thrashing_start,
		      &current->delays->thrashing_delay,
		      &current->delays->thrashing_count,
		      &current->delays->thrashing_delay_max,
		      &current->delays->thrashing_delay_min,
		      &current->delays->thrashing_delay_max_ts);
}
256
__delayacct_swapin_start(void)257 void __delayacct_swapin_start(void)
258 {
259 current->delays->swapin_start = local_clock();
260 }
261
__delayacct_swapin_end(void)262 void __delayacct_swapin_end(void)
263 {
264 delayacct_end(¤t->delays->lock,
265 ¤t->delays->swapin_start,
266 ¤t->delays->swapin_delay,
267 ¤t->delays->swapin_count,
268 ¤t->delays->swapin_delay_max,
269 ¤t->delays->swapin_delay_min,
270 ¤t->delays->swapin_delay_max_ts);
271 }
272
__delayacct_compact_start(void)273 void __delayacct_compact_start(void)
274 {
275 current->delays->compact_start = local_clock();
276 }
277
__delayacct_compact_end(void)278 void __delayacct_compact_end(void)
279 {
280 delayacct_end(¤t->delays->lock,
281 ¤t->delays->compact_start,
282 ¤t->delays->compact_delay,
283 ¤t->delays->compact_count,
284 ¤t->delays->compact_delay_max,
285 ¤t->delays->compact_delay_min,
286 ¤t->delays->compact_delay_max_ts);
287 }
288
__delayacct_wpcopy_start(void)289 void __delayacct_wpcopy_start(void)
290 {
291 current->delays->wpcopy_start = local_clock();
292 }
293
__delayacct_wpcopy_end(void)294 void __delayacct_wpcopy_end(void)
295 {
296 delayacct_end(¤t->delays->lock,
297 ¤t->delays->wpcopy_start,
298 ¤t->delays->wpcopy_delay,
299 ¤t->delays->wpcopy_count,
300 ¤t->delays->wpcopy_delay_max,
301 ¤t->delays->wpcopy_delay_min,
302 ¤t->delays->wpcopy_delay_max_ts);
303 }
304
/*
 * Account an IRQ delay of @delta against @task.
 *
 * Under the task's delay lock: the count is incremented and @delta is added
 * to the total on every call, including when @delta is 0. The maximum and
 * its real-time timestamp are replaced when @delta exceeds the current
 * maximum. The minimum is only updated for a non-zero @delta, with a stored
 * minimum of 0 treated as "not set yet".
 */
void __delayacct_irq(struct task_struct *task, u32 delta)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&task->delays->lock, flags);

	task->delays->irq_count++;
	task->delays->irq_delay += delta;

	if (delta > task->delays->irq_delay_max) {
		task->delays->irq_delay_max = delta;
		ktime_get_real_ts64(&task->delays->irq_delay_max_ts);
	}

	if (delta) {
		if (!task->delays->irq_delay_min ||
		    delta < task->delays->irq_delay_min)
			task->delays->irq_delay_min = delta;
	}

	raw_spin_unlock_irqrestore(&task->delays->lock, flags);
}
320
321