// SPDX-License-Identifier: GPL-2.0-or-later

#include "cgroup-internal.h"
#include "cpuset-internal.h"

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
	struct work_struct work;
	struct cpuset *cs;
};
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *    fmeter_init() - initialize a frequency meter.
 *    fmeter_markevent() - called each time the event happens.
 *    fmeter_getrate() - returns the recent rate of such events.
 *    fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)	/* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */
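
/*
 * Worked example (editor's note, derived from the description above):
 * each elapsed second multiplies the stored value by FM_COEF/FM_SCALE
 * = 0.933, and 0.933^10 ~= 0.5, which is where the "half-life of 10
 * seconds" comes from.  At a steady rate of N events per second, cnt
 * accumulates to N * FM_SCALE between updates, so the value settles at
 * the fixed point of
 *
 *	val = (FM_COEF * val) / FM_SCALE
 *	      + ((FM_SCALE - FM_COEF) * N * FM_SCALE) / FM_SCALE
 *
 * i.e. val ~= N * 1000, matching the "stabilize to a value N*1000"
 * claim above (modulo integer truncation).
 */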

/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}
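
/*
 * Usage sketch (editor's illustration; the mount point and group name
 * are assumptions, and this only applies to the legacy "cpuset"
 * hierarchy):
 *
 *	# echo 1 > /sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled
 *	# cat /sys/fs/cgroup/cpuset/mygroup/cpuset.memory_pressure
 *
 * The value read back is the fmeter rate kept by the function above,
 * roughly 1000 times the recent number of direct-reclaim entries per
 * second by tasks attached to that cpuset.
 */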

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val > sched_domain_level_max + 1)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * update task's spread flags if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.  The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags need to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset1_update_task_spread_flags(cs, task);
	css_task_iter_end(&it);
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
	struct cpuset_remove_tasks_struct *s;

	s = container_of(work, struct cpuset_remove_tasks_struct, work);
	remove_tasks_in_empty_cpuset(s->cs);
	css_put(&s->cs->css);
	kfree(s);
}

void cpuset1_hotplug_update_tasks(struct cpuset *cs,
			struct cpumask *new_cpus, nodemask_t *new_mems,
			bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	cpuset_callback_lock_irq();
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	cpuset_callback_unlock_irq();

	/*
	 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		cpuset_update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		cpuset_update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation that will also call back into
	 * cpuset.  Execute it asynchronously using a workqueue.
	 */
	if (is_empty && cs->css.cgroup->nr_populated_csets &&
	    css_tryget_online(&cs->css)) {
		struct cpuset_remove_tasks_struct *s;

		s = kzalloc(sizeof(*s), GFP_KERNEL);
		if (WARN_ON_ONCE(!s)) {
			css_put(&cs->css);
			return;
		}

		s->cs = cs;
		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
		schedule_work(&s->work);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}
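
/*
 * Example (editor's illustration): a trial cpuset with cpus {0-3} and
 * mems {0} is a subset of a parent with cpus {0-7} and mems {0-1}, but
 * not if the trial sets cpu_exclusive or mem_exclusive while the parent
 * leaves them clear - the "<=" comparisons above compare the flags as
 * 0/1 values for exactly that case.
 */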

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                             behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	rcu_read_lock();
	spin_lock_irq(&css_set_lock);
	css = task_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
				       current->nsproxy->cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	rcu_read_unlock();

	if (retval == -E2BIG)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif	/* CONFIG_PROC_PID_CPUSET */
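
/*
 * Usage sketch (editor's illustration; "/mygroup" is a placeholder for
 * whatever cpuset the reading task is attached to):
 *
 *	$ cat /proc/self/cpuset
 *	/mygroup
 *
 * The root cpuset is reported as "/".
 */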

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpus_read_lock();
	cpuset_lock();
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
		retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		pr_info_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		pr_warn_once("cpuset.%s is deprecated\n", cft->name);
		retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	cpuset_unlock();
	cpus_read_unlock();
	return retval;
}

/*
 * for the common functions, 'private' gives the type of file
 */

struct cftype cpuset1_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
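
/*
 * Usage sketch (editor's illustration; the mount point is arbitrary):
 * on the legacy hierarchy the files above are exposed with a "cpuset."
 * prefix, e.g.
 *
 *	# mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset
 *	# ls /sys/fs/cgroup/cpuset/cpuset.*
 */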