1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 The FreeBSD Foundation
5 *
6 * This software was developed by Edward Tomasz Napierala under sponsorship
7 * from the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/cdefs.h>
32 #include "opt_sched.h"
33
34 #include <sys/param.h>
35 #include <sys/buf.h>
36 #include <sys/systm.h>
37 #include <sys/eventhandler.h>
38 #include <sys/jail.h>
39 #include <sys/kernel.h>
40 #include <sys/kthread.h>
41 #include <sys/lock.h>
42 #include <sys/loginclass.h>
43 #include <sys/malloc.h>
44 #include <sys/mutex.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/resourcevar.h>
48 #include <sys/sbuf.h>
49 #include <sys/sched.h>
50 #include <sys/sdt.h>
51 #include <sys/smp.h>
52 #include <sys/sx.h>
53 #include <sys/sysctl.h>
54 #include <sys/sysproto.h>
55 #include <sys/umtxvar.h>
56 #include <machine/smp.h>
57
58 #ifdef RCTL
59 #include <sys/rctl.h>
60 #endif
61
62 FEATURE(racct, "Resource Accounting");
63
64 /*
65 * Do not block processes that have their %cpu usage <= pcpu_threshold.
66 */
67 static int pcpu_threshold = 1;
68 #ifdef RACCT_DEFAULT_TO_DISABLED
69 bool __read_frequently racct_enable = false;
70 #else
71 bool __read_frequently racct_enable = true;
72 #endif
73
74 SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
75 "Resource Accounting");
76 SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
77 0, "Enable RACCT/RCTL");
78 SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
79 0, "Processes with higher %cpu usage than this value can be throttled.");
80
81 struct mtx racct_lock;
82 MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
83
84 static uma_zone_t racct_zone;
85
86 static void racct_sub_racct(struct racct *dest, const struct racct *src);
87 static void racct_sub_cred_locked(struct ucred *cred, int resource,
88 uint64_t amount);
89 static void racct_add_cred_locked(struct ucred *cred, int resource,
90 uint64_t amount);
91 static int racct_set_locked(struct proc *p, int resource, uint64_t amount,
92 int force);
93 static void racct_updatepcpu_locked(struct proc *p);
94 static void racct_updatepcpu_racct_locked(struct racct *racct);
95 static void racct_updatepcpu_containers(void);
96 static void racct_settime_locked(struct proc *p, bool exit);
97 static void racct_zeropcpu_locked(struct proc *p);
98
99 SDT_PROVIDER_DEFINE(racct);
100 SDT_PROBE_DEFINE3(racct, , rusage, add,
101 "struct proc *", "int", "uint64_t");
102 SDT_PROBE_DEFINE3(racct, , rusage, add__failure,
103 "struct proc *", "int", "uint64_t");
104 SDT_PROBE_DEFINE3(racct, , rusage, add__buf,
105 "struct proc *", "const struct buf *", "int");
106 SDT_PROBE_DEFINE3(racct, , rusage, add__cred,
107 "struct ucred *", "int", "uint64_t");
108 SDT_PROBE_DEFINE3(racct, , rusage, add__force,
109 "struct proc *", "int", "uint64_t");
110 SDT_PROBE_DEFINE3(racct, , rusage, set,
111 "struct proc *", "int", "uint64_t");
112 SDT_PROBE_DEFINE3(racct, , rusage, set__failure,
113 "struct proc *", "int", "uint64_t");
114 SDT_PROBE_DEFINE3(racct, , rusage, set__force,
115 "struct proc *", "int", "uint64_t");
116 SDT_PROBE_DEFINE3(racct, , rusage, sub,
117 "struct proc *", "int", "uint64_t");
118 SDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
119 "struct ucred *", "int", "uint64_t");
120 SDT_PROBE_DEFINE1(racct, , racct, create,
121 "struct racct *");
122 SDT_PROBE_DEFINE1(racct, , racct, destroy,
123 "struct racct *");
124 SDT_PROBE_DEFINE2(racct, , racct, join,
125 "struct racct *", "struct racct *");
126 SDT_PROBE_DEFINE2(racct, , racct, join__failure,
127 "struct racct *", "struct racct *");
128 SDT_PROBE_DEFINE2(racct, , racct, leave,
129 "struct racct *", "struct racct *");
130
131 int racct_types[] = {
132 [RACCT_CPU] =
133 RACCT_IN_MILLIONS,
134 [RACCT_DATA] =
135 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
136 [RACCT_STACK] =
137 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
138 [RACCT_CORE] =
139 RACCT_DENIABLE,
140 [RACCT_RSS] =
141 RACCT_RECLAIMABLE,
142 [RACCT_MEMLOCK] =
143 RACCT_RECLAIMABLE | RACCT_DENIABLE,
144 [RACCT_NPROC] =
145 RACCT_RECLAIMABLE | RACCT_DENIABLE,
146 [RACCT_NOFILE] =
147 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
148 [RACCT_VMEM] =
149 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
150 [RACCT_NPTS] =
151 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
152 [RACCT_SWAP] =
153 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
154 [RACCT_NTHR] =
155 RACCT_RECLAIMABLE | RACCT_DENIABLE,
156 [RACCT_MSGQQUEUED] =
157 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
158 [RACCT_MSGQSIZE] =
159 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
160 [RACCT_NMSGQ] =
161 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
162 [RACCT_NSEM] =
163 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
164 [RACCT_NSEMOP] =
165 RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
166 [RACCT_NSHM] =
167 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
168 [RACCT_SHMSIZE] =
169 RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
170 [RACCT_WALLCLOCK] =
171 RACCT_IN_MILLIONS,
172 [RACCT_PCTCPU] =
173 RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
174 [RACCT_READBPS] =
175 RACCT_DECAYING,
176 [RACCT_WRITEBPS] =
177 RACCT_DECAYING,
178 [RACCT_READIOPS] =
179 RACCT_DECAYING,
180 [RACCT_WRITEIOPS] =
181 RACCT_DECAYING };
182
183 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
184
185 static void
racct_add_racct(struct racct * dest,const struct racct * src)186 racct_add_racct(struct racct *dest, const struct racct *src)
187 {
188 int i;
189
190 ASSERT_RACCT_ENABLED();
191 RACCT_LOCK_ASSERT();
192
193 /*
194 * Update resource usage in dest.
195 */
196 for (i = 0; i <= RACCT_MAX; i++) {
197 KASSERT(dest->r_resources[i] >= 0,
198 ("%s: resource %d propagation meltdown: dest < 0",
199 __func__, i));
200 KASSERT(src->r_resources[i] >= 0,
201 ("%s: resource %d propagation meltdown: src < 0",
202 __func__, i));
203 dest->r_resources[i] += src->r_resources[i];
204 }
205 }
206
207 static void
racct_sub_racct(struct racct * dest,const struct racct * src)208 racct_sub_racct(struct racct *dest, const struct racct *src)
209 {
210 int i;
211
212 ASSERT_RACCT_ENABLED();
213 RACCT_LOCK_ASSERT();
214
215 /*
216 * Update resource usage in dest.
217 */
218 for (i = 0; i <= RACCT_MAX; i++) {
219 if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
220 KASSERT(dest->r_resources[i] >= 0,
221 ("%s: resource %d propagation meltdown: dest < 0",
222 __func__, i));
223 KASSERT(src->r_resources[i] >= 0,
224 ("%s: resource %d propagation meltdown: src < 0",
225 __func__, i));
226 KASSERT(src->r_resources[i] <= dest->r_resources[i],
227 ("%s: resource %d propagation meltdown: src > dest",
228 __func__, i));
229 }
230 if (RACCT_CAN_DROP(i)) {
231 dest->r_resources[i] -= src->r_resources[i];
232 if (dest->r_resources[i] < 0)
233 dest->r_resources[i] = 0;
234 }
235 }
236 }
237
238 void
racct_create(struct racct ** racctp)239 racct_create(struct racct **racctp)
240 {
241
242 if (!racct_enable)
243 return;
244
245 SDT_PROBE1(racct, , racct, create, racctp);
246
247 KASSERT(*racctp == NULL, ("racct already allocated"));
248
249 *racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
250 }
251
252 static void
racct_destroy_locked(struct racct ** racctp)253 racct_destroy_locked(struct racct **racctp)
254 {
255 struct racct *racct;
256 int i;
257
258 ASSERT_RACCT_ENABLED();
259
260 SDT_PROBE1(racct, , racct, destroy, racctp);
261
262 RACCT_LOCK_ASSERT();
263 KASSERT(racctp != NULL, ("NULL racctp"));
264 KASSERT(*racctp != NULL, ("NULL racct"));
265
266 racct = *racctp;
267
268 for (i = 0; i <= RACCT_MAX; i++) {
269 if (RACCT_IS_SLOPPY(i))
270 continue;
271 if (!RACCT_IS_RECLAIMABLE(i))
272 continue;
273 KASSERT(racct->r_resources[i] == 0,
274 ("destroying non-empty racct: "
275 "%ju allocated for resource %d\n",
276 racct->r_resources[i], i));
277 }
278 uma_zfree(racct_zone, racct);
279 *racctp = NULL;
280 }
281
282 void
racct_destroy(struct racct ** racct)283 racct_destroy(struct racct **racct)
284 {
285
286 if (!racct_enable)
287 return;
288
289 RACCT_LOCK();
290 racct_destroy_locked(racct);
291 RACCT_UNLOCK();
292 }
293
294 /*
295 * Increase consumption of 'resource' by 'amount' for 'racct',
296 * but not its parents. Differently from other cases, 'amount' here
297 * may be less than zero.
298 */
299 static void
racct_adjust_resource(struct racct * racct,int resource,int64_t amount)300 racct_adjust_resource(struct racct *racct, int resource,
301 int64_t amount)
302 {
303
304 ASSERT_RACCT_ENABLED();
305 RACCT_LOCK_ASSERT();
306 KASSERT(racct != NULL, ("NULL racct"));
307
308 racct->r_resources[resource] += amount;
309 if (racct->r_resources[resource] < 0) {
310 KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
311 ("%s: resource %d usage < 0", __func__, resource));
312 racct->r_resources[resource] = 0;
313 }
314 }
315
316 static int
racct_add_locked(struct proc * p,int resource,uint64_t amount,int force)317 racct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
318 {
319 #ifdef RCTL
320 int error;
321 #endif
322
323 ASSERT_RACCT_ENABLED();
324
325 /*
326 * We need proc lock to dereference p->p_ucred.
327 */
328 PROC_LOCK_ASSERT(p, MA_OWNED);
329
330 #ifdef RCTL
331 error = rctl_enforce(p, resource, amount);
332 if (error && !force && RACCT_IS_DENIABLE(resource)) {
333 SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
334 return (error);
335 }
336 #endif
337 racct_adjust_resource(p->p_racct, resource, amount);
338 racct_add_cred_locked(p->p_ucred, resource, amount);
339
340 return (0);
341 }
342
343 /*
344 * Increase allocation of 'resource' by 'amount' for process 'p'.
345 * Return 0 if it's below limits, or errno, if it's not.
346 */
347 int
racct_add(struct proc * p,int resource,uint64_t amount)348 racct_add(struct proc *p, int resource, uint64_t amount)
349 {
350 int error;
351
352 if (!racct_enable)
353 return (0);
354
355 SDT_PROBE3(racct, , rusage, add, p, resource, amount);
356
357 RACCT_LOCK();
358 error = racct_add_locked(p, resource, amount, 0);
359 RACCT_UNLOCK();
360 return (error);
361 }
362
363 /*
364 * Increase allocation of 'resource' by 'amount' for process 'p'.
365 * Doesn't check for limits and never fails.
366 */
367 void
racct_add_force(struct proc * p,int resource,uint64_t amount)368 racct_add_force(struct proc *p, int resource, uint64_t amount)
369 {
370
371 if (!racct_enable)
372 return;
373
374 SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
375
376 RACCT_LOCK();
377 racct_add_locked(p, resource, amount, 1);
378 RACCT_UNLOCK();
379 }
380
381 static void
racct_add_cred_locked(struct ucred * cred,int resource,uint64_t amount)382 racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
383 {
384 struct prison *pr;
385
386 ASSERT_RACCT_ENABLED();
387
388 racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
389 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
390 racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
391 amount);
392 racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount);
393 }
394
395 /*
396 * Increase allocation of 'resource' by 'amount' for credential 'cred'.
397 * Doesn't check for limits and never fails.
398 */
399 void
racct_add_cred(struct ucred * cred,int resource,uint64_t amount)400 racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
401 {
402
403 if (!racct_enable)
404 return;
405
406 SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount);
407
408 RACCT_LOCK();
409 racct_add_cred_locked(cred, resource, amount);
410 RACCT_UNLOCK();
411 }
412
413 /*
414 * Account for disk IO resource consumption. Checks for limits,
415 * but never fails, due to disk limits being undeniable.
416 */
417 void
racct_add_buf(struct proc * p,const struct buf * bp,int is_write)418 racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
419 {
420
421 ASSERT_RACCT_ENABLED();
422 PROC_LOCK_ASSERT(p, MA_OWNED);
423
424 SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write);
425
426 RACCT_LOCK();
427 if (is_write) {
428 racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
429 racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
430 } else {
431 racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
432 racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
433 }
434 RACCT_UNLOCK();
435 }
436
437 static void
racct_settime_locked(struct proc * p,bool exit)438 racct_settime_locked(struct proc *p, bool exit)
439 {
440 struct thread *td;
441 struct timeval wallclock;
442 uint64_t runtime;
443
444 ASSERT_RACCT_ENABLED();
445 RACCT_LOCK_ASSERT();
446 PROC_LOCK_ASSERT(p, MA_OWNED);
447
448 if (exit) {
449 /*
450 * proc_reap() has already calculated rux
451 * and added crux to rux.
452 */
453 runtime = cputick2usec(p->p_rux.rux_runtime -
454 p->p_crux.rux_runtime);
455 } else {
456 PROC_STATLOCK(p);
457 FOREACH_THREAD_IN_PROC(p, td)
458 ruxagg(p, td);
459 PROC_STATUNLOCK(p);
460 runtime = cputick2usec(p->p_rux.rux_runtime);
461 }
462 microuptime(&wallclock);
463 timevalsub(&wallclock, &p->p_stats->p_start);
464
465 racct_set_locked(p, RACCT_CPU, runtime, 0);
466 racct_set_locked(p, RACCT_WALLCLOCK,
467 (uint64_t)wallclock.tv_sec * 1000000 +
468 wallclock.tv_usec, 0);
469 }
470
471 static int
racct_set_locked(struct proc * p,int resource,uint64_t amount,int force)472 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
473 {
474 int64_t old_amount, diff_proc, diff_cred;
475 #ifdef RCTL
476 int error;
477 #endif
478
479 ASSERT_RACCT_ENABLED();
480
481 /*
482 * We need proc lock to dereference p->p_ucred.
483 */
484 PROC_LOCK_ASSERT(p, MA_OWNED);
485
486 old_amount = p->p_racct->r_resources[resource];
487 /*
488 * The diffs may be negative.
489 */
490 diff_proc = amount - old_amount;
491 diff_cred = diff_proc;
492 #ifdef notyet
493 KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
494 ("%s: usage of non-droppable resource %d dropping", __func__,
495 resource));
496 #endif
497 #ifdef RCTL
498 if (diff_proc > 0) {
499 error = rctl_enforce(p, resource, diff_proc);
500 if (error && !force && RACCT_IS_DENIABLE(resource)) {
501 SDT_PROBE3(racct, , rusage, set__failure, p, resource,
502 amount);
503 return (error);
504 }
505 }
506 #endif
507 racct_adjust_resource(p->p_racct, resource, diff_proc);
508 if (diff_cred > 0)
509 racct_add_cred_locked(p->p_ucred, resource, diff_cred);
510 else if (diff_cred < 0)
511 racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
512
513 return (0);
514 }
515
/*
 * Variant of racct_set() for callers that do not hold the proc lock: the
 * lock of 'p' is taken around the call and released afterwards.
 *
 * Set allocation of 'resource' to 'amount' for process 'p'.
 * Return 0 if it's below limits, or errno, if it's not.
 *
 * Note that decreasing the allocation always returns 0,
 * even if it's above the limit.
 */
int
racct_set_unlocked(struct proc *p, int resource, uint64_t amount)
{
	int rv;

	ASSERT_RACCT_ENABLED();

	PROC_LOCK(p);
	rv = racct_set(p, resource, amount);
	PROC_UNLOCK(p);

	return (rv);
}
534
535 int
racct_set(struct proc * p,int resource,uint64_t amount)536 racct_set(struct proc *p, int resource, uint64_t amount)
537 {
538 int error;
539
540 if (!racct_enable)
541 return (0);
542
543 SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);
544
545 RACCT_LOCK();
546 error = racct_set_locked(p, resource, amount, 0);
547 RACCT_UNLOCK();
548 return (error);
549 }
550
551 void
racct_set_force(struct proc * p,int resource,uint64_t amount)552 racct_set_force(struct proc *p, int resource, uint64_t amount)
553 {
554
555 if (!racct_enable)
556 return;
557
558 SDT_PROBE3(racct, , rusage, set, p, resource, amount);
559
560 RACCT_LOCK();
561 racct_set_locked(p, resource, amount, 1);
562 RACCT_UNLOCK();
563 }
564
/*
 * Returns amount of 'resource' the process 'p' can keep allocated.
 * Allocating more than that would be denied, unless the resource
 * is marked undeniable.  Amount of already allocated resource does
 * not matter.
 *
 * The value comes from rctl_get_limit(); UINT64_MAX is returned when
 * accounting is disabled or RCTL is not compiled in.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{
#ifdef RCTL
	uint64_t limit;

	if (racct_enable) {
		RACCT_LOCK();
		limit = rctl_get_limit(p, resource);
		RACCT_UNLOCK();
	} else {
		limit = UINT64_MAX;
	}
	return (limit);
#else
	return (UINT64_MAX);
#endif
}
590
/*
 * Returns amount of 'resource' the process 'p' can keep allocated.
 * Allocating more than that would be denied, unless the resource
 * is marked undeniable.  Amount of already allocated resource does
 * matter.
 *
 * The value comes from rctl_get_available(); UINT64_MAX is returned when
 * accounting is disabled or RCTL is not compiled in.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{
#ifdef RCTL
	uint64_t room;

	if (racct_enable) {
		RACCT_LOCK();
		room = rctl_get_available(p, resource);
		RACCT_UNLOCK();
	} else {
		room = UINT64_MAX;
	}
	return (room);
#else
	return (UINT64_MAX);
#endif
}
616
/*
 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
 * utilization.  Adding more than that would lead to the process being
 * throttled.
 *
 * The result is signed: callers test it for "<= 0".  INT64_MAX is returned
 * when RCTL is not compiled in.
 */
static int64_t
racct_pcpu_available(struct proc *p)
{
#ifdef RCTL
	/*
	 * Keep the value in the function's own (signed) return type so that
	 * a negative result is not round-tripped through an unsigned
	 * temporary.
	 */
	int64_t available;

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	available = rctl_pcpu_available(p);
	RACCT_UNLOCK();

	return (available);
#else

	return (INT64_MAX);
#endif
}
640
641 /*
642 * Decrease allocation of 'resource' by 'amount' for process 'p'.
643 */
644 void
racct_sub(struct proc * p,int resource,uint64_t amount)645 racct_sub(struct proc *p, int resource, uint64_t amount)
646 {
647
648 if (!racct_enable)
649 return;
650
651 SDT_PROBE3(racct, , rusage, sub, p, resource, amount);
652
653 /*
654 * We need proc lock to dereference p->p_ucred.
655 */
656 PROC_LOCK_ASSERT(p, MA_OWNED);
657 KASSERT(RACCT_CAN_DROP(resource),
658 ("%s: called for non-droppable resource %d", __func__, resource));
659
660 RACCT_LOCK();
661 KASSERT(amount <= p->p_racct->r_resources[resource],
662 ("%s: freeing %ju of resource %d, which is more "
663 "than allocated %jd for %s (pid %d)", __func__, amount, resource,
664 (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
665
666 racct_adjust_resource(p->p_racct, resource, -amount);
667 racct_sub_cred_locked(p->p_ucred, resource, amount);
668 RACCT_UNLOCK();
669 }
670
671 static void
racct_sub_cred_locked(struct ucred * cred,int resource,uint64_t amount)672 racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
673 {
674 struct prison *pr;
675
676 ASSERT_RACCT_ENABLED();
677
678 racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
679 for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
680 racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
681 -amount);
682 racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount);
683 }
684
685 /*
686 * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
687 */
688 void
racct_sub_cred(struct ucred * cred,int resource,uint64_t amount)689 racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
690 {
691
692 if (!racct_enable)
693 return;
694
695 SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount);
696
697 #ifdef notyet
698 KASSERT(RACCT_CAN_DROP(resource),
699 ("%s: called for resource %d which can not drop", __func__,
700 resource));
701 #endif
702
703 RACCT_LOCK();
704 racct_sub_cred_locked(cred, resource, amount);
705 RACCT_UNLOCK();
706 }
707
708 /*
709 * Inherit resource usage information from the parent process.
710 */
711 int
racct_proc_fork(struct proc * parent,struct proc * child)712 racct_proc_fork(struct proc *parent, struct proc *child)
713 {
714 int i, error = 0;
715
716 if (!racct_enable)
717 return (0);
718
719 /*
720 * Create racct for the child process.
721 */
722 racct_create(&child->p_racct);
723
724 PROC_LOCK(parent);
725 PROC_LOCK(child);
726 RACCT_LOCK();
727
728 #ifdef RCTL
729 error = rctl_proc_fork(parent, child);
730 if (error != 0)
731 goto out;
732 #endif
733
734 child->p_throttled = 0;
735
736 /*
737 * Inherit resource usage.
738 */
739 for (i = 0; i <= RACCT_MAX; i++) {
740 if (parent->p_racct->r_resources[i] == 0 ||
741 !RACCT_IS_INHERITABLE(i))
742 continue;
743
744 error = racct_set_locked(child, i,
745 parent->p_racct->r_resources[i], 0);
746 if (error != 0)
747 goto out;
748 }
749
750 error = racct_add_locked(child, RACCT_NPROC, 1, 0);
751 error += racct_add_locked(child, RACCT_NTHR, 1, 0);
752
753 out:
754 RACCT_UNLOCK();
755 PROC_UNLOCK(child);
756 PROC_UNLOCK(parent);
757
758 if (error != 0)
759 racct_proc_exit(child);
760
761 return (error);
762 }
763
764 /*
765 * Called at the end of fork1(), to handle rules that require the process
766 * to be fully initialized.
767 */
768 void
racct_proc_fork_done(struct proc * child)769 racct_proc_fork_done(struct proc *child)
770 {
771
772 if (!racct_enable)
773 return;
774
775 #ifdef RCTL
776 PROC_LOCK(child);
777 RACCT_LOCK();
778 rctl_enforce(child, RACCT_NPROC, 0);
779 rctl_enforce(child, RACCT_NTHR, 0);
780 RACCT_UNLOCK();
781 PROC_UNLOCK(child);
782 #endif
783 }
784
785 void
racct_proc_exit(struct proc * p)786 racct_proc_exit(struct proc *p)
787 {
788 int i;
789
790 if (!racct_enable)
791 return;
792
793 PROC_LOCK(p);
794 RACCT_LOCK();
795
796 racct_settime_locked(p, true);
797 racct_zeropcpu_locked(p);
798
799 KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
800 ("process reaped with %ju allocated for RSS\n",
801 p->p_racct->r_resources[RACCT_RSS]));
802 for (i = 0; i <= RACCT_MAX; i++) {
803 if (p->p_racct->r_resources[i] == 0)
804 continue;
805 if (!RACCT_IS_RECLAIMABLE(i))
806 continue;
807 racct_set_locked(p, i, 0, 0);
808 }
809
810 #ifdef RCTL
811 rctl_racct_release(p->p_racct);
812 #endif
813 racct_destroy_locked(&p->p_racct);
814 RACCT_UNLOCK();
815 PROC_UNLOCK(p);
816 }
817
818 /*
819 * Called to signal credentials change, to move resource utilisation
820 * between raccts. Must be called with the proc lock held, in the same span as
821 * the credentials change itself (i.e., without the proc lock being unlocked
822 * between the two), but the order does not matter.
823 */
824 void
racct_proc_ucred_changed(struct proc * p,struct ucred * oldcred,struct ucred * newcred)825 racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
826 struct ucred *newcred)
827 {
828 struct uidinfo *olduip, *newuip;
829 struct loginclass *oldlc, *newlc;
830 struct prison *oldpr, *newpr, *pr;
831
832 if (!racct_enable)
833 return;
834
835 PROC_LOCK_ASSERT(p, MA_OWNED);
836
837 newuip = newcred->cr_ruidinfo;
838 olduip = oldcred->cr_ruidinfo;
839 newlc = newcred->cr_loginclass;
840 oldlc = oldcred->cr_loginclass;
841 newpr = newcred->cr_prison;
842 oldpr = oldcred->cr_prison;
843
844 RACCT_LOCK();
845 if (newuip != olduip) {
846 racct_sub_racct(olduip->ui_racct, p->p_racct);
847 racct_add_racct(newuip->ui_racct, p->p_racct);
848 }
849 if (newlc != oldlc) {
850 racct_sub_racct(oldlc->lc_racct, p->p_racct);
851 racct_add_racct(newlc->lc_racct, p->p_racct);
852 }
853 if (newpr != oldpr) {
854 for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
855 racct_sub_racct(pr->pr_prison_racct->prr_racct,
856 p->p_racct);
857 for (pr = newpr; pr != NULL; pr = pr->pr_parent)
858 racct_add_racct(pr->pr_prison_racct->prr_racct,
859 p->p_racct);
860 }
861 RACCT_UNLOCK();
862 }
863
864 void
racct_move(struct racct * dest,struct racct * src)865 racct_move(struct racct *dest, struct racct *src)
866 {
867
868 ASSERT_RACCT_ENABLED();
869
870 RACCT_LOCK();
871 racct_add_racct(dest, src);
872 racct_sub_racct(src, src);
873 dest->r_runtime = src->r_runtime;
874 dest->r_time = src->r_time;
875 src->r_runtime = 0;
876 timevalsub(&src->r_time, &src->r_time);
877 RACCT_UNLOCK();
878 }
879
880 static void
ast_racct(struct thread * td,int tda __unused)881 ast_racct(struct thread *td, int tda __unused)
882 {
883 struct proc *p;
884
885 ASSERT_RACCT_ENABLED();
886
887 p = td->td_proc;
888 if (p->p_throttled == 0)
889 return;
890
891 PROC_LOCK(p);
892 while (p->p_throttled != 0) {
893 msleep(p->p_racct, &p->p_mtx, 0, "racct",
894 p->p_throttled < 0 ? 0 : p->p_throttled);
895 if (p->p_throttled > 0)
896 p->p_throttled = 0;
897 }
898 PROC_UNLOCK(p);
899 }
900
901 /*
902 * Make the process sleep in userret() for 'timeout' ticks. Setting
903 * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
904 */
905 void
racct_proc_throttle(struct proc * p,int timeout)906 racct_proc_throttle(struct proc *p, int timeout)
907 {
908 struct thread *td;
909 #ifdef SMP
910 int cpuid;
911 #endif
912
913 KASSERT(timeout != 0, ("timeout %d", timeout));
914 ASSERT_RACCT_ENABLED();
915 PROC_LOCK_ASSERT(p, MA_OWNED);
916
917 /*
918 * Do not block kernel processes. Also do not block processes with
919 * low %cpu utilization to improve interactivity.
920 */
921 if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
922 return;
923
924 if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
925 return;
926
927 p->p_throttled = timeout;
928
929 FOREACH_THREAD_IN_PROC(p, td) {
930 thread_lock(td);
931 ast_sched_locked(td, TDA_RACCT);
932
933 switch (TD_GET_STATE(td)) {
934 case TDS_RUNQ:
935 /*
936 * If the thread is on the scheduler run-queue, we can
937 * not just remove it from there. So we set the flag
938 * TDA_SCHED for the thread, so that once it is
939 * running, it is taken off the cpu as soon as possible.
940 */
941 ast_sched_locked(td, TDA_SCHED);
942 break;
943 case TDS_RUNNING:
944 /*
945 * If the thread is running, we request a context
946 * switch for it by setting the TDA_SCHED flag.
947 */
948 ast_sched_locked(td, TDA_SCHED);
949 #ifdef SMP
950 cpuid = td->td_oncpu;
951 if ((cpuid != NOCPU) && (td != curthread))
952 ipi_cpu(cpuid, IPI_AST);
953 #endif
954 break;
955 default:
956 break;
957 }
958 thread_unlock(td);
959 }
960 }
961
962 static void
racct_proc_wakeup(struct proc * p)963 racct_proc_wakeup(struct proc *p)
964 {
965
966 ASSERT_RACCT_ENABLED();
967
968 PROC_LOCK_ASSERT(p, MA_OWNED);
969
970 if (p->p_throttled != 0) {
971 p->p_throttled = 0;
972 wakeup(p->p_racct);
973 }
974 }
975
/*
 * Per-container callback used by racct_decay(): with RCTL compiled in, it
 * runs rctl_throttle_decay() on the four disk I/O resources of 'racct'.
 * The two dummy arguments are unused.  The racct lock must be held.
 */
static void
racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
{
#ifdef RCTL
	static const int io_resources[] = {
		RACCT_READBPS,
		RACCT_WRITEBPS,
		RACCT_READIOPS,
		RACCT_WRITEIOPS
	};
	unsigned int idx;
#endif

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

#ifdef RCTL
	for (idx = 0; idx < sizeof(io_resources) / sizeof(io_resources[0]);
	    idx++)
		rctl_throttle_decay(racct, io_resources[idx]);
#endif
}
989
/*
 * "pre" hook handed to the container iterators by racct_decay(): takes the
 * racct lock.
 */
static void
racct_decay_pre(void)
{
	RACCT_LOCK();
}

/*
 * "post" hook handed to the container iterators by racct_decay(): drops
 * the racct lock taken by racct_decay_pre().
 */
static void
racct_decay_post(void)
{
	RACCT_UNLOCK();
}
1003
1004 static void
racct_decay(void)1005 racct_decay(void)
1006 {
1007
1008 ASSERT_RACCT_ENABLED();
1009
1010 ui_racct_foreach(racct_decay_callback, racct_decay_pre,
1011 racct_decay_post, NULL, NULL);
1012 loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
1013 racct_decay_post, NULL, NULL);
1014 prison_racct_foreach(racct_decay_callback, racct_decay_pre,
1015 racct_decay_post, NULL, NULL);
1016 }
1017
1018 static void
racct_updatepcpu_racct_locked(struct racct * racct)1019 racct_updatepcpu_racct_locked(struct racct *racct)
1020 {
1021 struct timeval diff;
1022 uint64_t elapsed;
1023 uint64_t runtime;
1024 uint64_t newpcpu;
1025 uint64_t oldpcpu;
1026
1027 ASSERT_RACCT_ENABLED();
1028 RACCT_LOCK_ASSERT();
1029
1030 /* Difference between now and previously-recorded time. */
1031 microuptime(&diff);
1032 timevalsub(&diff, &racct->r_time);
1033 elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec;
1034
1035 /* Difference between current and previously-recorded runtime. */
1036 runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime;
1037
1038 newpcpu = runtime * 100 * 1000000 / elapsed;
1039 oldpcpu = racct->r_resources[RACCT_PCTCPU];
1040 /*
1041 * This calculation is equivalent to
1042 * (1 - 0.3) * newpcpu + 0.3 * oldpcpu
1043 * where RACCT_DECAY_FACTOR = 0.3 * FSCALE.
1044 */
1045 racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) *
1046 newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;
1047 if (racct->r_resources[RACCT_PCTCPU] >
1048 100 * 1000000 * (uint64_t)mp_ncpus)
1049 racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 *
1050 (uint64_t)mp_ncpus;
1051
1052 /* Record current times. */
1053 racct->r_runtime = racct->r_resources[RACCT_CPU];
1054 timevaladd(&racct->r_time, &diff);
1055 }
1056
1057 static void
racct_zeropcpu_locked(struct proc * p)1058 racct_zeropcpu_locked(struct proc *p)
1059 {
1060 ASSERT_RACCT_ENABLED();
1061 PROC_LOCK_ASSERT(p, MA_OWNED);
1062
1063 p->p_racct->r_resources[RACCT_PCTCPU] = 0;
1064 }
1065
1066 static void
racct_updatepcpu_locked(struct proc * p)1067 racct_updatepcpu_locked(struct proc *p)
1068 {
1069 ASSERT_RACCT_ENABLED();
1070 PROC_LOCK_ASSERT(p, MA_OWNED);
1071
1072 racct_updatepcpu_racct_locked(p->p_racct);
1073 }
1074
/*
 * "pre" hook handed to the container iterators by
 * racct_updatepcpu_containers(): takes the racct lock.
 */
static void
racct_updatepcpu_pre(void)
{
	RACCT_LOCK();
}

/*
 * "post" hook handed to the container iterators by
 * racct_updatepcpu_containers(): drops the racct lock taken by
 * racct_updatepcpu_pre().
 */
static void
racct_updatepcpu_post(void)
{
	RACCT_UNLOCK();
}
1088
/*
 * Per-container callback used by racct_updatepcpu_containers(): recomputes
 * the %cpu value of 'racct'.  The two dummy arguments are unused.
 */
static void
racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1,
    void *dummy2)
{

	racct_updatepcpu_racct_locked(racct);
}
1094
1095 static void
racct_updatepcpu_containers(void)1096 racct_updatepcpu_containers(void)
1097 {
1098 ASSERT_RACCT_ENABLED();
1099
1100 ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
1101 racct_updatepcpu_post, NULL, NULL);
1102 loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
1103 racct_updatepcpu_post, NULL, NULL);
1104 prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
1105 racct_updatepcpu_post, NULL, NULL);
1106 }
1107
1108 static bool
racct_proc_to_skip(const struct proc * p)1109 racct_proc_to_skip(const struct proc *p)
1110 {
1111 PROC_LOCK_ASSERT(p, MA_OWNED);
1112 return (p->p_state != PRS_NORMAL || (p->p_flag & P_IDLEPROC) != 0);
1113 }
1114
1115 static void
racctd(void)1116 racctd(void)
1117 {
1118 struct proc *p;
1119
1120 ASSERT_RACCT_ENABLED();
1121
1122 for (;;) {
1123 racct_decay();
1124
1125 sx_slock(&allproc_lock);
1126
1127 FOREACH_PROC_IN_SYSTEM(p) {
1128 PROC_LOCK(p);
1129 if (racct_proc_to_skip(p)) {
1130 PROC_UNLOCK(p);
1131 continue;
1132 }
1133
1134 RACCT_LOCK();
1135 #ifdef RCTL
1136 rctl_throttle_decay(p->p_racct, RACCT_READBPS);
1137 rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
1138 rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
1139 rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
1140 #endif
1141 racct_settime_locked(p, false);
1142 racct_updatepcpu_locked(p);
1143 RACCT_UNLOCK();
1144 PROC_UNLOCK(p);
1145 }
1146
1147 /*
1148 * To ensure that processes are throttled in a fair way, we need
1149 * to iterate over all processes again and check the limits
1150 * for %cpu resource only after ucred racct containers have been
1151 * properly filled.
1152 */
1153 FOREACH_PROC_IN_SYSTEM(p) {
1154 PROC_LOCK(p);
1155 if (racct_proc_to_skip(p)) {
1156 PROC_UNLOCK(p);
1157 continue;
1158 }
1159
1160 if (racct_pcpu_available(p) <= 0) {
1161 if (p->p_racct->r_resources[RACCT_PCTCPU] >
1162 pcpu_threshold)
1163 racct_proc_throttle(p, -1);
1164 } else if (p->p_throttled == -1) {
1165 racct_proc_wakeup(p);
1166 }
1167 PROC_UNLOCK(p);
1168 }
1169 sx_sunlock(&allproc_lock);
1170
1171 racct_updatepcpu_containers();
1172 pause("-", hz);
1173 }
1174 }
1175
1176 static struct kproc_desc racctd_kp = {
1177 "racctd",
1178 racctd,
1179 NULL
1180 };
1181
1182 static void
racctd_init(void * dummy __unused)1183 racctd_init(void *dummy __unused)
1184 {
1185 if (!racct_enable)
1186 return;
1187
1188 kproc_start(&racctd_kp);
1189 }
1190 SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
1191
1192 static void
racct_init(void * dummy __unused)1193 racct_init(void *dummy __unused)
1194 {
1195 if (!racct_enable)
1196 return;
1197
1198 racct_zone = uma_zcreate("racct", sizeof(struct racct),
1199 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1200 ast_register(TDA_RACCT, ASTR_ASTF_REQUIRED, 0, ast_racct);
1201
1202 /*
1203 * XXX: Move this somewhere.
1204 */
1205 prison0.pr_prison_racct = prison_racct_find("0");
1206 }
1207 SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1208