xref: /src/sys/kern/kern_racct.c (revision 55ce12672b948c002431851af54843bfc7f50d6f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  *
6  * This software was developed by Edward Tomasz Napierala under sponsorship
7  * from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_sched.h"
33 
34 #include <sys/param.h>
35 #include <sys/buf.h>
36 #include <sys/systm.h>
37 #include <sys/eventhandler.h>
38 #include <sys/jail.h>
39 #include <sys/kernel.h>
40 #include <sys/kthread.h>
41 #include <sys/lock.h>
42 #include <sys/loginclass.h>
43 #include <sys/malloc.h>
44 #include <sys/mutex.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/resourcevar.h>
48 #include <sys/sbuf.h>
49 #include <sys/sched.h>
50 #include <sys/sdt.h>
51 #include <sys/smp.h>
52 #include <sys/sx.h>
53 #include <sys/sysctl.h>
54 #include <sys/sysproto.h>
55 #include <sys/umtxvar.h>
56 #include <machine/smp.h>
57 
58 #ifdef RCTL
59 #include <sys/rctl.h>
60 #endif
61 
62 FEATURE(racct, "Resource Accounting");
63 
64 /*
65  * Do not block processes that have their %cpu usage <= pcpu_threshold.
66  */
67 static int pcpu_threshold = 1;
68 #ifdef RACCT_DEFAULT_TO_DISABLED
69 bool __read_frequently racct_enable = false;
70 #else
71 bool __read_frequently racct_enable = true;
72 #endif
73 
74 SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
75     "Resource Accounting");
76 SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
77     0, "Enable RACCT/RCTL");
78 SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
79     0, "Processes with higher %cpu usage than this value can be throttled.");
80 
81 struct mtx racct_lock;
82 MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
83 
84 static uma_zone_t racct_zone;
85 
86 static void racct_sub_racct(struct racct *dest, const struct racct *src);
87 static void racct_sub_cred_locked(struct ucred *cred, int resource,
88 		uint64_t amount);
89 static void racct_add_cred_locked(struct ucred *cred, int resource,
90 		uint64_t amount);
91 static int racct_set_locked(struct proc *p, int resource, uint64_t amount,
92                 int force);
93 static void racct_updatepcpu_locked(struct proc *p);
94 static void racct_updatepcpu_racct_locked(struct racct *racct);
95 static void racct_updatepcpu_containers(void);
96 static void racct_settime_locked(struct proc *p, bool exit);
97 static void racct_zeropcpu_locked(struct proc *p);
98 
99 SDT_PROVIDER_DEFINE(racct);
100 SDT_PROBE_DEFINE3(racct, , rusage, add,
101     "struct proc *", "int", "uint64_t");
102 SDT_PROBE_DEFINE3(racct, , rusage, add__failure,
103     "struct proc *", "int", "uint64_t");
104 SDT_PROBE_DEFINE3(racct, , rusage, add__buf,
105     "struct proc *", "const struct buf *", "int");
106 SDT_PROBE_DEFINE3(racct, , rusage, add__cred,
107     "struct ucred *", "int", "uint64_t");
108 SDT_PROBE_DEFINE3(racct, , rusage, add__force,
109     "struct proc *", "int", "uint64_t");
110 SDT_PROBE_DEFINE3(racct, , rusage, set,
111     "struct proc *", "int", "uint64_t");
112 SDT_PROBE_DEFINE3(racct, , rusage, set__failure,
113     "struct proc *", "int", "uint64_t");
114 SDT_PROBE_DEFINE3(racct, , rusage, set__force,
115     "struct proc *", "int", "uint64_t");
116 SDT_PROBE_DEFINE3(racct, , rusage, sub,
117     "struct proc *", "int", "uint64_t");
118 SDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
119     "struct ucred *", "int", "uint64_t");
120 SDT_PROBE_DEFINE1(racct, , racct, create,
121     "struct racct *");
122 SDT_PROBE_DEFINE1(racct, , racct, destroy,
123     "struct racct *");
124 SDT_PROBE_DEFINE2(racct, , racct, join,
125     "struct racct *", "struct racct *");
126 SDT_PROBE_DEFINE2(racct, , racct, join__failure,
127     "struct racct *", "struct racct *");
128 SDT_PROBE_DEFINE2(racct, , racct, leave,
129     "struct racct *", "struct racct *");
130 
131 int racct_types[] = {
132 	[RACCT_CPU] =
133 		RACCT_IN_MILLIONS,
134 	[RACCT_DATA] =
135 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
136 	[RACCT_STACK] =
137 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
138 	[RACCT_CORE] =
139 		RACCT_DENIABLE,
140 	[RACCT_RSS] =
141 		RACCT_RECLAIMABLE,
142 	[RACCT_MEMLOCK] =
143 		RACCT_RECLAIMABLE | RACCT_DENIABLE,
144 	[RACCT_NPROC] =
145 		RACCT_RECLAIMABLE | RACCT_DENIABLE,
146 	[RACCT_NOFILE] =
147 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
148 	[RACCT_VMEM] =
149 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
150 	[RACCT_NPTS] =
151 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
152 	[RACCT_SWAP] =
153 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
154 	[RACCT_NTHR] =
155 		RACCT_RECLAIMABLE | RACCT_DENIABLE,
156 	[RACCT_MSGQQUEUED] =
157 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
158 	[RACCT_MSGQSIZE] =
159 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
160 	[RACCT_NMSGQ] =
161 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
162 	[RACCT_NSEM] =
163 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
164 	[RACCT_NSEMOP] =
165 		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
166 	[RACCT_NSHM] =
167 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
168 	[RACCT_SHMSIZE] =
169 		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
170 	[RACCT_WALLCLOCK] =
171 		RACCT_IN_MILLIONS,
172 	[RACCT_PCTCPU] =
173 		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
174 	[RACCT_READBPS] =
175 		RACCT_DECAYING,
176 	[RACCT_WRITEBPS] =
177 		RACCT_DECAYING,
178 	[RACCT_READIOPS] =
179 		RACCT_DECAYING,
180 	[RACCT_WRITEIOPS] =
181 		RACCT_DECAYING };
182 
183 static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
184 
185 static void
racct_add_racct(struct racct * dest,const struct racct * src)186 racct_add_racct(struct racct *dest, const struct racct *src)
187 {
188 	int i;
189 
190 	ASSERT_RACCT_ENABLED();
191 	RACCT_LOCK_ASSERT();
192 
193 	/*
194 	 * Update resource usage in dest.
195 	 */
196 	for (i = 0; i <= RACCT_MAX; i++) {
197 		KASSERT(dest->r_resources[i] >= 0,
198 		    ("%s: resource %d propagation meltdown: dest < 0",
199 		    __func__, i));
200 		KASSERT(src->r_resources[i] >= 0,
201 		    ("%s: resource %d propagation meltdown: src < 0",
202 		    __func__, i));
203 		dest->r_resources[i] += src->r_resources[i];
204 	}
205 }
206 
207 static void
racct_sub_racct(struct racct * dest,const struct racct * src)208 racct_sub_racct(struct racct *dest, const struct racct *src)
209 {
210 	int i;
211 
212 	ASSERT_RACCT_ENABLED();
213 	RACCT_LOCK_ASSERT();
214 
215 	/*
216 	 * Update resource usage in dest.
217 	 */
218 	for (i = 0; i <= RACCT_MAX; i++) {
219 		if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
220 			KASSERT(dest->r_resources[i] >= 0,
221 			    ("%s: resource %d propagation meltdown: dest < 0",
222 			    __func__, i));
223 			KASSERT(src->r_resources[i] >= 0,
224 			    ("%s: resource %d propagation meltdown: src < 0",
225 			    __func__, i));
226 			KASSERT(src->r_resources[i] <= dest->r_resources[i],
227 			    ("%s: resource %d propagation meltdown: src > dest",
228 			    __func__, i));
229 		}
230 		if (RACCT_CAN_DROP(i)) {
231 			dest->r_resources[i] -= src->r_resources[i];
232 			if (dest->r_resources[i] < 0)
233 				dest->r_resources[i] = 0;
234 		}
235 	}
236 }
237 
238 void
racct_create(struct racct ** racctp)239 racct_create(struct racct **racctp)
240 {
241 
242 	if (!racct_enable)
243 		return;
244 
245 	SDT_PROBE1(racct, , racct, create, racctp);
246 
247 	KASSERT(*racctp == NULL, ("racct already allocated"));
248 
249 	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
250 }
251 
252 static void
racct_destroy_locked(struct racct ** racctp)253 racct_destroy_locked(struct racct **racctp)
254 {
255 	struct racct *racct;
256 	int i;
257 
258 	ASSERT_RACCT_ENABLED();
259 
260 	SDT_PROBE1(racct, , racct, destroy, racctp);
261 
262 	RACCT_LOCK_ASSERT();
263 	KASSERT(racctp != NULL, ("NULL racctp"));
264 	KASSERT(*racctp != NULL, ("NULL racct"));
265 
266 	racct = *racctp;
267 
268 	for (i = 0; i <= RACCT_MAX; i++) {
269 		if (RACCT_IS_SLOPPY(i))
270 			continue;
271 		if (!RACCT_IS_RECLAIMABLE(i))
272 			continue;
273 		KASSERT(racct->r_resources[i] == 0,
274 		    ("destroying non-empty racct: "
275 		    "%ju allocated for resource %d\n",
276 		    racct->r_resources[i], i));
277 	}
278 	uma_zfree(racct_zone, racct);
279 	*racctp = NULL;
280 }
281 
282 void
racct_destroy(struct racct ** racct)283 racct_destroy(struct racct **racct)
284 {
285 
286 	if (!racct_enable)
287 		return;
288 
289 	RACCT_LOCK();
290 	racct_destroy_locked(racct);
291 	RACCT_UNLOCK();
292 }
293 
294 /*
295  * Increase consumption of 'resource' by 'amount' for 'racct',
296  * but not its parents.  Differently from other cases, 'amount' here
297  * may be less than zero.
298  */
299 static void
racct_adjust_resource(struct racct * racct,int resource,int64_t amount)300 racct_adjust_resource(struct racct *racct, int resource,
301     int64_t amount)
302 {
303 
304 	ASSERT_RACCT_ENABLED();
305 	RACCT_LOCK_ASSERT();
306 	KASSERT(racct != NULL, ("NULL racct"));
307 
308 	racct->r_resources[resource] += amount;
309 	if (racct->r_resources[resource] < 0) {
310 		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
311 		    ("%s: resource %d usage < 0", __func__, resource));
312 		racct->r_resources[resource] = 0;
313 	}
314 }
315 
316 static int
racct_add_locked(struct proc * p,int resource,uint64_t amount,int force)317 racct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
318 {
319 #ifdef RCTL
320 	int error;
321 #endif
322 
323 	ASSERT_RACCT_ENABLED();
324 
325 	/*
326 	 * We need proc lock to dereference p->p_ucred.
327 	 */
328 	PROC_LOCK_ASSERT(p, MA_OWNED);
329 
330 #ifdef RCTL
331 	error = rctl_enforce(p, resource, amount);
332 	if (error && !force && RACCT_IS_DENIABLE(resource)) {
333 		SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
334 		return (error);
335 	}
336 #endif
337 	racct_adjust_resource(p->p_racct, resource, amount);
338 	racct_add_cred_locked(p->p_ucred, resource, amount);
339 
340 	return (0);
341 }
342 
343 /*
344  * Increase allocation of 'resource' by 'amount' for process 'p'.
345  * Return 0 if it's below limits, or errno, if it's not.
346  */
347 int
racct_add(struct proc * p,int resource,uint64_t amount)348 racct_add(struct proc *p, int resource, uint64_t amount)
349 {
350 	int error;
351 
352 	if (!racct_enable)
353 		return (0);
354 
355 	SDT_PROBE3(racct, , rusage, add, p, resource, amount);
356 
357 	RACCT_LOCK();
358 	error = racct_add_locked(p, resource, amount, 0);
359 	RACCT_UNLOCK();
360 	return (error);
361 }
362 
363 /*
364  * Increase allocation of 'resource' by 'amount' for process 'p'.
365  * Doesn't check for limits and never fails.
366  */
367 void
racct_add_force(struct proc * p,int resource,uint64_t amount)368 racct_add_force(struct proc *p, int resource, uint64_t amount)
369 {
370 
371 	if (!racct_enable)
372 		return;
373 
374 	SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
375 
376 	RACCT_LOCK();
377 	racct_add_locked(p, resource, amount, 1);
378 	RACCT_UNLOCK();
379 }
380 
381 static void
racct_add_cred_locked(struct ucred * cred,int resource,uint64_t amount)382 racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
383 {
384 	struct prison *pr;
385 
386 	ASSERT_RACCT_ENABLED();
387 
388 	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
389 	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
390 		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
391 		    amount);
392 	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount);
393 }
394 
395 /*
396  * Increase allocation of 'resource' by 'amount' for credential 'cred'.
397  * Doesn't check for limits and never fails.
398  */
399 void
racct_add_cred(struct ucred * cred,int resource,uint64_t amount)400 racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
401 {
402 
403 	if (!racct_enable)
404 		return;
405 
406 	SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount);
407 
408 	RACCT_LOCK();
409 	racct_add_cred_locked(cred, resource, amount);
410 	RACCT_UNLOCK();
411 }
412 
413 /*
414  * Account for disk IO resource consumption.  Checks for limits,
415  * but never fails, due to disk limits being undeniable.
416  */
417 void
racct_add_buf(struct proc * p,const struct buf * bp,int is_write)418 racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
419 {
420 
421 	ASSERT_RACCT_ENABLED();
422 	PROC_LOCK_ASSERT(p, MA_OWNED);
423 
424 	SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write);
425 
426 	RACCT_LOCK();
427 	if (is_write) {
428 		racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
429 		racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
430 	} else {
431 		racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
432 		racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
433 	}
434 	RACCT_UNLOCK();
435 }
436 
437 static void
racct_settime_locked(struct proc * p,bool exit)438 racct_settime_locked(struct proc *p, bool exit)
439 {
440 	struct thread *td;
441 	struct timeval wallclock;
442 	uint64_t runtime;
443 
444 	ASSERT_RACCT_ENABLED();
445 	RACCT_LOCK_ASSERT();
446 	PROC_LOCK_ASSERT(p, MA_OWNED);
447 
448 	if (exit) {
449 		/*
450 		 * proc_reap() has already calculated rux
451 		 * and added crux to rux.
452 		 */
453 		runtime = cputick2usec(p->p_rux.rux_runtime -
454 		    p->p_crux.rux_runtime);
455 	} else {
456 		PROC_STATLOCK(p);
457 		FOREACH_THREAD_IN_PROC(p, td)
458 			ruxagg(p, td);
459 		PROC_STATUNLOCK(p);
460 		runtime = cputick2usec(p->p_rux.rux_runtime);
461 	}
462 	microuptime(&wallclock);
463 	timevalsub(&wallclock, &p->p_stats->p_start);
464 
465 	racct_set_locked(p, RACCT_CPU, runtime, 0);
466 	racct_set_locked(p, RACCT_WALLCLOCK,
467 	    (uint64_t)wallclock.tv_sec * 1000000 +
468 	    wallclock.tv_usec, 0);
469 }
470 
471 static int
racct_set_locked(struct proc * p,int resource,uint64_t amount,int force)472 racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
473 {
474 	int64_t old_amount, diff_proc, diff_cred;
475 #ifdef RCTL
476 	int error;
477 #endif
478 
479 	ASSERT_RACCT_ENABLED();
480 
481 	/*
482 	 * We need proc lock to dereference p->p_ucred.
483 	 */
484 	PROC_LOCK_ASSERT(p, MA_OWNED);
485 
486 	old_amount = p->p_racct->r_resources[resource];
487 	/*
488 	 * The diffs may be negative.
489 	 */
490 	diff_proc = amount - old_amount;
491 	diff_cred = diff_proc;
492 #ifdef notyet
493 	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
494 	    ("%s: usage of non-droppable resource %d dropping", __func__,
495 	     resource));
496 #endif
497 #ifdef RCTL
498 	if (diff_proc > 0) {
499 		error = rctl_enforce(p, resource, diff_proc);
500 		if (error && !force && RACCT_IS_DENIABLE(resource)) {
501 			SDT_PROBE3(racct, , rusage, set__failure, p, resource,
502 			    amount);
503 			return (error);
504 		}
505 	}
506 #endif
507 	racct_adjust_resource(p->p_racct, resource, diff_proc);
508 	if (diff_cred > 0)
509 		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
510 	else if (diff_cred < 0)
511 		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
512 
513 	return (0);
514 }
515 
516 /*
517  * Set allocation of 'resource' to 'amount' for process 'p'.
518  * Return 0 if it's below limits, or errno, if it's not.
519  *
520  * Note that decreasing the allocation always returns 0,
521  * even if it's above the limit.
522  */
523 int
racct_set_unlocked(struct proc * p,int resource,uint64_t amount)524 racct_set_unlocked(struct proc *p, int resource, uint64_t amount)
525 {
526 	int error;
527 
528 	ASSERT_RACCT_ENABLED();
529 	PROC_LOCK(p);
530 	error = racct_set(p, resource, amount);
531 	PROC_UNLOCK(p);
532 	return (error);
533 }
534 
535 int
racct_set(struct proc * p,int resource,uint64_t amount)536 racct_set(struct proc *p, int resource, uint64_t amount)
537 {
538 	int error;
539 
540 	if (!racct_enable)
541 		return (0);
542 
543 	SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);
544 
545 	RACCT_LOCK();
546 	error = racct_set_locked(p, resource, amount, 0);
547 	RACCT_UNLOCK();
548 	return (error);
549 }
550 
551 void
racct_set_force(struct proc * p,int resource,uint64_t amount)552 racct_set_force(struct proc *p, int resource, uint64_t amount)
553 {
554 
555 	if (!racct_enable)
556 		return;
557 
558 	SDT_PROBE3(racct, , rusage, set, p, resource, amount);
559 
560 	RACCT_LOCK();
561 	racct_set_locked(p, resource, amount, 1);
562 	RACCT_UNLOCK();
563 }
564 
565 /*
566  * Returns amount of 'resource' the process 'p' can keep allocated.
567  * Allocating more than that would be denied, unless the resource
568  * is marked undeniable.  Amount of already allocated resource does
569  * not matter.
570  */
571 uint64_t
racct_get_limit(struct proc * p,int resource)572 racct_get_limit(struct proc *p, int resource)
573 {
574 #ifdef RCTL
575 	uint64_t available;
576 
577 	if (!racct_enable)
578 		return (UINT64_MAX);
579 
580 	RACCT_LOCK();
581 	available = rctl_get_limit(p, resource);
582 	RACCT_UNLOCK();
583 
584 	return (available);
585 #else
586 
587 	return (UINT64_MAX);
588 #endif
589 }
590 
591 /*
592  * Returns amount of 'resource' the process 'p' can keep allocated.
593  * Allocating more than that would be denied, unless the resource
594  * is marked undeniable.  Amount of already allocated resource does
595  * matter.
596  */
597 uint64_t
racct_get_available(struct proc * p,int resource)598 racct_get_available(struct proc *p, int resource)
599 {
600 #ifdef RCTL
601 	uint64_t available;
602 
603 	if (!racct_enable)
604 		return (UINT64_MAX);
605 
606 	RACCT_LOCK();
607 	available = rctl_get_available(p, resource);
608 	RACCT_UNLOCK();
609 
610 	return (available);
611 #else
612 
613 	return (UINT64_MAX);
614 #endif
615 }
616 
617 /*
618  * Returns amount of the %cpu resource that process 'p' can add to its %cpu
619  * utilization.  Adding more than that would lead to the process being
620  * throttled.
621  */
622 static int64_t
racct_pcpu_available(struct proc * p)623 racct_pcpu_available(struct proc *p)
624 {
625 #ifdef RCTL
626 	uint64_t available;
627 
628 	ASSERT_RACCT_ENABLED();
629 
630 	RACCT_LOCK();
631 	available = rctl_pcpu_available(p);
632 	RACCT_UNLOCK();
633 
634 	return (available);
635 #else
636 
637 	return (INT64_MAX);
638 #endif
639 }
640 
641 /*
642  * Decrease allocation of 'resource' by 'amount' for process 'p'.
643  */
644 void
racct_sub(struct proc * p,int resource,uint64_t amount)645 racct_sub(struct proc *p, int resource, uint64_t amount)
646 {
647 
648 	if (!racct_enable)
649 		return;
650 
651 	SDT_PROBE3(racct, , rusage, sub, p, resource, amount);
652 
653 	/*
654 	 * We need proc lock to dereference p->p_ucred.
655 	 */
656 	PROC_LOCK_ASSERT(p, MA_OWNED);
657 	KASSERT(RACCT_CAN_DROP(resource),
658 	    ("%s: called for non-droppable resource %d", __func__, resource));
659 
660 	RACCT_LOCK();
661 	KASSERT(amount <= p->p_racct->r_resources[resource],
662 	    ("%s: freeing %ju of resource %d, which is more "
663 	     "than allocated %jd for %s (pid %d)", __func__, amount, resource,
664 	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
665 
666 	racct_adjust_resource(p->p_racct, resource, -amount);
667 	racct_sub_cred_locked(p->p_ucred, resource, amount);
668 	RACCT_UNLOCK();
669 }
670 
671 static void
racct_sub_cred_locked(struct ucred * cred,int resource,uint64_t amount)672 racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
673 {
674 	struct prison *pr;
675 
676 	ASSERT_RACCT_ENABLED();
677 
678 	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
679 	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
680 		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
681 		    -amount);
682 	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount);
683 }
684 
685 /*
686  * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
687  */
688 void
racct_sub_cred(struct ucred * cred,int resource,uint64_t amount)689 racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
690 {
691 
692 	if (!racct_enable)
693 		return;
694 
695 	SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount);
696 
697 #ifdef notyet
698 	KASSERT(RACCT_CAN_DROP(resource),
699 	    ("%s: called for resource %d which can not drop", __func__,
700 	     resource));
701 #endif
702 
703 	RACCT_LOCK();
704 	racct_sub_cred_locked(cred, resource, amount);
705 	RACCT_UNLOCK();
706 }
707 
708 /*
709  * Inherit resource usage information from the parent process.
710  */
711 int
racct_proc_fork(struct proc * parent,struct proc * child)712 racct_proc_fork(struct proc *parent, struct proc *child)
713 {
714 	int i, error = 0;
715 
716 	if (!racct_enable)
717 		return (0);
718 
719 	/*
720 	 * Create racct for the child process.
721 	 */
722 	racct_create(&child->p_racct);
723 
724 	PROC_LOCK(parent);
725 	PROC_LOCK(child);
726 	RACCT_LOCK();
727 
728 #ifdef RCTL
729 	error = rctl_proc_fork(parent, child);
730 	if (error != 0)
731 		goto out;
732 #endif
733 
734 	child->p_throttled = 0;
735 
736 	/*
737 	 * Inherit resource usage.
738 	 */
739 	for (i = 0; i <= RACCT_MAX; i++) {
740 		if (parent->p_racct->r_resources[i] == 0 ||
741 		    !RACCT_IS_INHERITABLE(i))
742 			continue;
743 
744 		error = racct_set_locked(child, i,
745 		    parent->p_racct->r_resources[i], 0);
746 		if (error != 0)
747 			goto out;
748 	}
749 
750 	error = racct_add_locked(child, RACCT_NPROC, 1, 0);
751 	error += racct_add_locked(child, RACCT_NTHR, 1, 0);
752 
753 out:
754 	RACCT_UNLOCK();
755 	PROC_UNLOCK(child);
756 	PROC_UNLOCK(parent);
757 
758 	if (error != 0)
759 		racct_proc_exit(child);
760 
761 	return (error);
762 }
763 
764 /*
765  * Called at the end of fork1(), to handle rules that require the process
766  * to be fully initialized.
767  */
768 void
racct_proc_fork_done(struct proc * child)769 racct_proc_fork_done(struct proc *child)
770 {
771 
772 	if (!racct_enable)
773 		return;
774 
775 #ifdef RCTL
776 	PROC_LOCK(child);
777 	RACCT_LOCK();
778 	rctl_enforce(child, RACCT_NPROC, 0);
779 	rctl_enforce(child, RACCT_NTHR, 0);
780 	RACCT_UNLOCK();
781 	PROC_UNLOCK(child);
782 #endif
783 }
784 
785 void
racct_proc_exit(struct proc * p)786 racct_proc_exit(struct proc *p)
787 {
788 	int i;
789 
790 	if (!racct_enable)
791 		return;
792 
793 	PROC_LOCK(p);
794 	RACCT_LOCK();
795 
796 	racct_settime_locked(p, true);
797 	racct_zeropcpu_locked(p);
798 
799 	KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
800 	    ("process reaped with %ju allocated for RSS\n",
801 	    p->p_racct->r_resources[RACCT_RSS]));
802 	for (i = 0; i <= RACCT_MAX; i++) {
803 		if (p->p_racct->r_resources[i] == 0)
804 			continue;
805 		if (!RACCT_IS_RECLAIMABLE(i))
806 			continue;
807 		racct_set_locked(p, i, 0, 0);
808 	}
809 
810 #ifdef RCTL
811 	rctl_racct_release(p->p_racct);
812 #endif
813 	racct_destroy_locked(&p->p_racct);
814 	RACCT_UNLOCK();
815 	PROC_UNLOCK(p);
816 }
817 
818 /*
819  * Called to signal credentials change, to move resource utilisation
820  * between raccts.  Must be called with the proc lock held, in the same span as
821  * the credentials change itself (i.e., without the proc lock being unlocked
822  * between the two), but the order does not matter.
823  */
824 void
racct_proc_ucred_changed(struct proc * p,struct ucred * oldcred,struct ucred * newcred)825 racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
826     struct ucred *newcred)
827 {
828 	struct uidinfo *olduip, *newuip;
829 	struct loginclass *oldlc, *newlc;
830 	struct prison *oldpr, *newpr, *pr;
831 
832 	if (!racct_enable)
833 		return;
834 
835 	PROC_LOCK_ASSERT(p, MA_OWNED);
836 
837 	newuip = newcred->cr_ruidinfo;
838 	olduip = oldcred->cr_ruidinfo;
839 	newlc = newcred->cr_loginclass;
840 	oldlc = oldcred->cr_loginclass;
841 	newpr = newcred->cr_prison;
842 	oldpr = oldcred->cr_prison;
843 
844 	RACCT_LOCK();
845 	if (newuip != olduip) {
846 		racct_sub_racct(olduip->ui_racct, p->p_racct);
847 		racct_add_racct(newuip->ui_racct, p->p_racct);
848 	}
849 	if (newlc != oldlc) {
850 		racct_sub_racct(oldlc->lc_racct, p->p_racct);
851 		racct_add_racct(newlc->lc_racct, p->p_racct);
852 	}
853 	if (newpr != oldpr) {
854 		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
855 			racct_sub_racct(pr->pr_prison_racct->prr_racct,
856 			    p->p_racct);
857 		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
858 			racct_add_racct(pr->pr_prison_racct->prr_racct,
859 			    p->p_racct);
860 	}
861 	RACCT_UNLOCK();
862 }
863 
864 void
racct_move(struct racct * dest,struct racct * src)865 racct_move(struct racct *dest, struct racct *src)
866 {
867 
868 	ASSERT_RACCT_ENABLED();
869 
870 	RACCT_LOCK();
871 	racct_add_racct(dest, src);
872 	racct_sub_racct(src, src);
873 	dest->r_runtime = src->r_runtime;
874 	dest->r_time = src->r_time;
875 	src->r_runtime = 0;
876 	timevalsub(&src->r_time, &src->r_time);
877 	RACCT_UNLOCK();
878 }
879 
880 static void
ast_racct(struct thread * td,int tda __unused)881 ast_racct(struct thread *td, int tda __unused)
882 {
883 	struct proc *p;
884 
885 	ASSERT_RACCT_ENABLED();
886 
887 	p = td->td_proc;
888 	if (p->p_throttled == 0)
889 		return;
890 
891 	PROC_LOCK(p);
892 	while (p->p_throttled != 0) {
893 		msleep(p->p_racct, &p->p_mtx, 0, "racct",
894 		    p->p_throttled < 0 ? 0 : p->p_throttled);
895 		if (p->p_throttled > 0)
896 			p->p_throttled = 0;
897 	}
898 	PROC_UNLOCK(p);
899 }
900 
901 /*
902  * Make the process sleep in userret() for 'timeout' ticks.  Setting
903  * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
904  */
905 void
racct_proc_throttle(struct proc * p,int timeout)906 racct_proc_throttle(struct proc *p, int timeout)
907 {
908 	struct thread *td;
909 #ifdef SMP
910 	int cpuid;
911 #endif
912 
913 	KASSERT(timeout != 0, ("timeout %d", timeout));
914 	ASSERT_RACCT_ENABLED();
915 	PROC_LOCK_ASSERT(p, MA_OWNED);
916 
917 	/*
918 	 * Do not block kernel processes.  Also do not block processes with
919 	 * low %cpu utilization to improve interactivity.
920 	 */
921 	if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
922 		return;
923 
924 	if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
925 		return;
926 
927 	p->p_throttled = timeout;
928 
929 	FOREACH_THREAD_IN_PROC(p, td) {
930 		thread_lock(td);
931 		ast_sched_locked(td, TDA_RACCT);
932 
933 		switch (TD_GET_STATE(td)) {
934 		case TDS_RUNQ:
935 			/*
936 			 * If the thread is on the scheduler run-queue, we can
937 			 * not just remove it from there.  So we set the flag
938 			 * TDA_SCHED for the thread, so that once it is
939 			 * running, it is taken off the cpu as soon as possible.
940 			 */
941 			ast_sched_locked(td, TDA_SCHED);
942 			break;
943 		case TDS_RUNNING:
944 			/*
945 			 * If the thread is running, we request a context
946 			 * switch for it by setting the TDA_SCHED flag.
947 			 */
948 			ast_sched_locked(td, TDA_SCHED);
949 #ifdef SMP
950 			cpuid = td->td_oncpu;
951 			if ((cpuid != NOCPU) && (td != curthread))
952 				ipi_cpu(cpuid, IPI_AST);
953 #endif
954 			break;
955 		default:
956 			break;
957 		}
958 		thread_unlock(td);
959 	}
960 }
961 
962 static void
racct_proc_wakeup(struct proc * p)963 racct_proc_wakeup(struct proc *p)
964 {
965 
966 	ASSERT_RACCT_ENABLED();
967 
968 	PROC_LOCK_ASSERT(p, MA_OWNED);
969 
970 	if (p->p_throttled != 0) {
971 		p->p_throttled = 0;
972 		wakeup(p->p_racct);
973 	}
974 }
975 
/*
 * Per-racct callback used by racct_decay(): with RCTL, run the throttle
 * decay for the four disk I/O resources of 'racct'.  Both dummy arguments
 * are unused.  Called with the racct lock held.
 */
static void
racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
{

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

#ifdef RCTL
	/* Bytes per second. */
	rctl_throttle_decay(racct, RACCT_READBPS);
	rctl_throttle_decay(racct, RACCT_WRITEBPS);
	/* Operations per second. */
	rctl_throttle_decay(racct, RACCT_READIOPS);
	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
#endif
}
989 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators by racct_decay():
 * takes the racct lock.
 */
static void
racct_decay_pre(void)
{

	RACCT_LOCK();
}
996 
/*
 * "Post" hook handed to the *_racct_foreach() iterators by racct_decay():
 * drops the racct lock taken by racct_decay_pre().
 */
static void
racct_decay_post(void)
{

	RACCT_UNLOCK();
}
1003 
1004 static void
racct_decay(void)1005 racct_decay(void)
1006 {
1007 
1008 	ASSERT_RACCT_ENABLED();
1009 
1010 	ui_racct_foreach(racct_decay_callback, racct_decay_pre,
1011 	    racct_decay_post, NULL, NULL);
1012 	loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
1013 	    racct_decay_post, NULL, NULL);
1014 	prison_racct_foreach(racct_decay_callback, racct_decay_pre,
1015 	    racct_decay_post, NULL, NULL);
1016 }
1017 
1018 static void
racct_updatepcpu_racct_locked(struct racct * racct)1019 racct_updatepcpu_racct_locked(struct racct *racct)
1020 {
1021 	struct timeval diff;
1022 	uint64_t elapsed;
1023 	uint64_t runtime;
1024 	uint64_t newpcpu;
1025 	uint64_t oldpcpu;
1026 
1027 	ASSERT_RACCT_ENABLED();
1028 	RACCT_LOCK_ASSERT();
1029 
1030 	/* Difference between now and previously-recorded time. */
1031 	microuptime(&diff);
1032 	timevalsub(&diff, &racct->r_time);
1033 	elapsed = (uint64_t)diff.tv_sec * 1000000 + diff.tv_usec;
1034 
1035 	/* Difference between current and previously-recorded runtime. */
1036 	runtime = racct->r_resources[RACCT_CPU] - racct->r_runtime;
1037 
1038 	newpcpu = runtime * 100 * 1000000 / elapsed;
1039 	oldpcpu = racct->r_resources[RACCT_PCTCPU];
1040 	/*
1041 	 * This calculation is equivalent to
1042 	 *    (1 - 0.3) * newpcpu + 0.3 * oldpcpu
1043 	 * where RACCT_DECAY_FACTOR = 0.3 * FSCALE.
1044 	 */
1045 	racct->r_resources[RACCT_PCTCPU] = ((FSCALE - RACCT_DECAY_FACTOR) *
1046 	    newpcpu + RACCT_DECAY_FACTOR * oldpcpu) / FSCALE;
1047 	if (racct->r_resources[RACCT_PCTCPU] >
1048 	    100 * 1000000 * (uint64_t)mp_ncpus)
1049 		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 *
1050 		    (uint64_t)mp_ncpus;
1051 
1052 	/* Record current times. */
1053 	racct->r_runtime = racct->r_resources[RACCT_CPU];
1054 	timevaladd(&racct->r_time, &diff);
1055 }
1056 
1057 static void
racct_zeropcpu_locked(struct proc * p)1058 racct_zeropcpu_locked(struct proc *p)
1059 {
1060 	ASSERT_RACCT_ENABLED();
1061 	PROC_LOCK_ASSERT(p, MA_OWNED);
1062 
1063 	p->p_racct->r_resources[RACCT_PCTCPU] = 0;
1064 }
1065 
1066 static void
racct_updatepcpu_locked(struct proc * p)1067 racct_updatepcpu_locked(struct proc *p)
1068 {
1069 	ASSERT_RACCT_ENABLED();
1070 	PROC_LOCK_ASSERT(p, MA_OWNED);
1071 
1072 	racct_updatepcpu_racct_locked(p->p_racct);
1073 }
1074 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators by
 * racct_updatepcpu_containers(): takes the racct lock.
 */
static void
racct_updatepcpu_pre(void)
{

	RACCT_LOCK();
}
1081 
/*
 * "Post" hook handed to the *_racct_foreach() iterators by
 * racct_updatepcpu_containers(): drops the racct lock taken by
 * racct_updatepcpu_pre().
 */
static void
racct_updatepcpu_post(void)
{

	RACCT_UNLOCK();
}
1088 
/*
 * Per-racct callback used by racct_updatepcpu_containers(): recompute
 * the %cpu usage of 'racct'.  Both dummy arguments are unused.
 */
static void
racct_updatepcpu_racct_callback(struct racct *racct, void *dummy1, void *dummy2)
{

	racct_updatepcpu_racct_locked(racct);
}
1094 
1095 static void
racct_updatepcpu_containers(void)1096 racct_updatepcpu_containers(void)
1097 {
1098 	ASSERT_RACCT_ENABLED();
1099 
1100 	ui_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
1101 	    racct_updatepcpu_post, NULL, NULL);
1102 	loginclass_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
1103 	    racct_updatepcpu_post, NULL, NULL);
1104 	prison_racct_foreach(racct_updatepcpu_racct_callback, racct_updatepcpu_pre,
1105 	    racct_updatepcpu_post, NULL, NULL);
1106 }
1107 
1108 static bool
racct_proc_to_skip(const struct proc * p)1109 racct_proc_to_skip(const struct proc *p)
1110 {
1111 	PROC_LOCK_ASSERT(p, MA_OWNED);
1112 	return (p->p_state != PRS_NORMAL || (p->p_flag & P_IDLEPROC) != 0);
1113 }
1114 
1115 static void
racctd(void)1116 racctd(void)
1117 {
1118 	struct proc *p;
1119 
1120 	ASSERT_RACCT_ENABLED();
1121 
1122 	for (;;) {
1123 		racct_decay();
1124 
1125 		sx_slock(&allproc_lock);
1126 
1127 		FOREACH_PROC_IN_SYSTEM(p) {
1128 			PROC_LOCK(p);
1129 			if (racct_proc_to_skip(p)) {
1130 				PROC_UNLOCK(p);
1131 				continue;
1132 			}
1133 
1134 			RACCT_LOCK();
1135 #ifdef RCTL
1136 			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
1137 			rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
1138 			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
1139 			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
1140 #endif
1141 			racct_settime_locked(p, false);
1142 			racct_updatepcpu_locked(p);
1143 			RACCT_UNLOCK();
1144 			PROC_UNLOCK(p);
1145 		}
1146 
1147 		/*
1148 		 * To ensure that processes are throttled in a fair way, we need
1149 		 * to iterate over all processes again and check the limits
1150 		 * for %cpu resource only after ucred racct containers have been
1151 		 * properly filled.
1152 		 */
1153 		FOREACH_PROC_IN_SYSTEM(p) {
1154 			PROC_LOCK(p);
1155 			if (racct_proc_to_skip(p)) {
1156 				PROC_UNLOCK(p);
1157 				continue;
1158 			}
1159 
1160 			if (racct_pcpu_available(p) <= 0) {
1161 				if (p->p_racct->r_resources[RACCT_PCTCPU] >
1162 				    pcpu_threshold)
1163 					racct_proc_throttle(p, -1);
1164 			} else if (p->p_throttled == -1) {
1165 				racct_proc_wakeup(p);
1166 			}
1167 			PROC_UNLOCK(p);
1168 		}
1169 		sx_sunlock(&allproc_lock);
1170 
1171 		racct_updatepcpu_containers();
1172 		pause("-", hz);
1173 	}
1174 }
1175 
1176 static struct kproc_desc racctd_kp = {
1177 	"racctd",
1178 	racctd,
1179 	NULL
1180 };
1181 
1182 static void
racctd_init(void * dummy __unused)1183 racctd_init(void *dummy __unused)
1184 {
1185 	if (!racct_enable)
1186 		return;
1187 
1188 	kproc_start(&racctd_kp);
1189 }
1190 SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
1191 
1192 static void
racct_init(void * dummy __unused)1193 racct_init(void *dummy __unused)
1194 {
1195 	if (!racct_enable)
1196 		return;
1197 
1198 	racct_zone = uma_zcreate("racct", sizeof(struct racct),
1199 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1200 	ast_register(TDA_RACCT, ASTR_ASTF_REQUIRED, 0, ast_racct);
1201 
1202 	/*
1203 	 * XXX: Move this somewhere.
1204 	 */
1205 	prison0.pr_prison_racct = prison_racct_find("0");
1206 }
1207 SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1208