1 // SPDX-License-Identifier: GPL-2.0-or-later 2 3 #include "cgroup-internal.h" 4 #include "cpuset-internal.h" 5 6 /* 7 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously 8 */ 9 struct cpuset_remove_tasks_struct { 10 struct work_struct work; 11 struct cpuset *cs; 12 }; 13 14 /* 15 * Frequency meter - How fast is some event occurring? 16 * 17 * These routines manage a digitally filtered, constant time based, 18 * event frequency meter. There are four routines: 19 * fmeter_init() - initialize a frequency meter. 20 * fmeter_markevent() - called each time the event happens. 21 * fmeter_getrate() - returns the recent rate of such events. 22 * fmeter_update() - internal routine used to update fmeter. 23 * 24 * A common data structure is passed to each of these routines, 25 * which is used to keep track of the state required to manage the 26 * frequency meter and its digital filter. 27 * 28 * The filter works on the number of events marked per unit time. 29 * The filter is single-pole low-pass recursive (IIR). The time unit 30 * is 1 second. Arithmetic is done using 32-bit integers scaled to 31 * simulate 3 decimal digits of precision (multiplied by 1000). 32 * 33 * With an FM_COEF of 933, and a time base of 1 second, the filter 34 * has a half-life of 10 seconds, meaning that if the events quit 35 * happening, then the rate returned from the fmeter_getrate() 36 * will be cut in half each 10 seconds, until it converges to zero. 37 * 38 * It is not worth doing a real infinitely recursive filter. If more 39 * than FM_MAXTICKS ticks have elapsed since the last filter event, 40 * just compute FM_MAXTICKS ticks worth, by which point the level 41 * will be stable. 42 * 43 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid 44 * arithmetic overflow in the fmeter_update() routine. 45 * 46 * Given the simple 32 bit integer arithmetic used, this meter works 47 * best for reporting rates between one per millisecond (msec) and 48 * one per 32 (approx) seconds. At constant rates faster than one 49 * per msec it maxes out at values just under 1,000,000. At constant 50 * rates between one per msec, and one per second it will stabilize 51 * to a value N*1000, where N is the rate of events per second. 52 * At constant rates between one per second and one per 32 seconds, 53 * it will be choppy, moving up on the seconds that have an event, 54 * and then decaying until the next event. At rates slower than 55 * about one in 32 seconds, it decays all the way back to zero between 56 * each event. 57 */ 58 59 #define FM_COEF 933 /* coefficient for half-life of 10 secs */ 60 #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ 61 #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ 62 #define FM_SCALE 1000 /* faux fixed point scale */ 63 64 /* Initialize a frequency meter */ 65 void fmeter_init(struct fmeter *fmp) 66 { 67 fmp->cnt = 0; 68 fmp->val = 0; 69 fmp->time = 0; 70 spin_lock_init(&fmp->lock); 71 } 72 73 /* Internal meter update - process cnt events and update value */ 74 static void fmeter_update(struct fmeter *fmp) 75 { 76 time64_t now; 77 u32 ticks; 78 79 now = ktime_get_seconds(); 80 ticks = now - fmp->time; 81 82 if (ticks == 0) 83 return; 84 85 ticks = min(FM_MAXTICKS, ticks); 86 while (ticks-- > 0) 87 fmp->val = (FM_COEF * fmp->val) / FM_SCALE; 88 fmp->time = now; 89 90 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; 91 fmp->cnt = 0; 92 } 93 94 /* Process any previous ticks, then bump cnt by one (times scale). */ 95 static void fmeter_markevent(struct fmeter *fmp) 96 { 97 spin_lock(&fmp->lock); 98 fmeter_update(fmp); 99 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); 100 spin_unlock(&fmp->lock); 101 } 102 103 /* Process any previous ticks, then return current value. */ 104 static int fmeter_getrate(struct fmeter *fmp) 105 { 106 int val; 107 108 spin_lock(&fmp->lock); 109 fmeter_update(fmp); 110 val = fmp->val; 111 spin_unlock(&fmp->lock); 112 return val; 113 } 114 115 /* 116 * Collection of memory_pressure is suppressed unless 117 * this flag is enabled by writing "1" to the special 118 * cpuset file 'memory_pressure_enabled' in the root cpuset. 119 */ 120 121 int cpuset_memory_pressure_enabled __read_mostly; 122 123 /* 124 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. 125 * 126 * Keep a running average of the rate of synchronous (direct) 127 * page reclaim efforts initiated by tasks in each cpuset. 128 * 129 * This represents the rate at which some task in the cpuset 130 * ran low on memory on all nodes it was allowed to use, and 131 * had to enter the kernels page reclaim code in an effort to 132 * create more free memory by tossing clean pages or swapping 133 * or writing dirty pages. 134 * 135 * Display to user space in the per-cpuset read-only file 136 * "memory_pressure". Value displayed is an integer 137 * representing the recent rate of entry into the synchronous 138 * (direct) page reclaim by any task attached to the cpuset. 139 */ 140 141 void __cpuset_memory_pressure_bump(void) 142 { 143 rcu_read_lock(); 144 fmeter_markevent(&task_cs(current)->fmeter); 145 rcu_read_unlock(); 146 } 147 148 static int update_relax_domain_level(struct cpuset *cs, s64 val) 149 { 150 #ifdef CONFIG_SMP 151 if (val < -1 || val > sched_domain_level_max + 1) 152 return -EINVAL; 153 #endif 154 155 if (val != cs->relax_domain_level) { 156 cs->relax_domain_level = val; 157 if (!cpumask_empty(cs->cpus_allowed) && 158 is_sched_load_balance(cs)) 159 rebuild_sched_domains_locked(); 160 } 161 162 return 0; 163 } 164 165 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, 166 s64 val) 167 { 168 struct cpuset *cs = css_cs(css); 169 cpuset_filetype_t type = cft->private; 170 int retval = -ENODEV; 171 172 cpus_read_lock(); 173 cpuset_lock(); 174 if (!is_cpuset_online(cs)) 175 goto out_unlock; 176 177 switch (type) { 178 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 179 pr_info_once("cpuset.%s is deprecated\n", cft->name); 180 retval = update_relax_domain_level(cs, val); 181 break; 182 default: 183 retval = -EINVAL; 184 break; 185 } 186 out_unlock: 187 cpuset_unlock(); 188 cpus_read_unlock(); 189 return retval; 190 } 191 192 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) 193 { 194 struct cpuset *cs = css_cs(css); 195 cpuset_filetype_t type = cft->private; 196 197 switch (type) { 198 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 199 return cs->relax_domain_level; 200 default: 201 BUG(); 202 } 203 204 /* Unreachable but makes gcc happy */ 205 return 0; 206 } 207 208 /* 209 * update task's spread flag if cpuset's page/slab spread flag is set 210 * 211 * Call with callback_lock or cpuset_mutex held. The check can be skipped 212 * if on default hierarchy. 213 */ 214 void cpuset1_update_task_spread_flags(struct cpuset *cs, 215 struct task_struct *tsk) 216 { 217 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) 218 return; 219 220 if (is_spread_page(cs)) 221 task_set_spread_page(tsk); 222 else 223 task_clear_spread_page(tsk); 224 225 if (is_spread_slab(cs)) 226 task_set_spread_slab(tsk); 227 else 228 task_clear_spread_slab(tsk); 229 } 230 231 /** 232 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. 233 * @cs: the cpuset in which each task's spread flags needs to be changed 234 * 235 * Iterate through each task of @cs updating its spread flags. As this 236 * function is called with cpuset_mutex held, cpuset membership stays 237 * stable. 238 */ 239 void cpuset1_update_tasks_flags(struct cpuset *cs) 240 { 241 struct css_task_iter it; 242 struct task_struct *task; 243 244 css_task_iter_start(&cs->css, 0, &it); 245 while ((task = css_task_iter_next(&it))) 246 cpuset1_update_task_spread_flags(cs, task); 247 css_task_iter_end(&it); 248 } 249 250 /* 251 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 252 * or memory nodes, we need to walk over the cpuset hierarchy, 253 * removing that CPU or node from all cpusets. If this removes the 254 * last CPU or node from a cpuset, then move the tasks in the empty 255 * cpuset to its next-highest non-empty parent. 256 */ 257 static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 258 { 259 struct cpuset *parent; 260 261 /* 262 * Find its next-highest non-empty parent, (top cpuset 263 * has online cpus, so can't be empty). 264 */ 265 parent = parent_cs(cs); 266 while (cpumask_empty(parent->cpus_allowed) || 267 nodes_empty(parent->mems_allowed)) 268 parent = parent_cs(parent); 269 270 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 271 pr_err("cpuset: failed to transfer tasks out of empty cpuset "); 272 pr_cont_cgroup_name(cs->css.cgroup); 273 pr_cont("\n"); 274 } 275 } 276 277 static void cpuset_migrate_tasks_workfn(struct work_struct *work) 278 { 279 struct cpuset_remove_tasks_struct *s; 280 281 s = container_of(work, struct cpuset_remove_tasks_struct, work); 282 remove_tasks_in_empty_cpuset(s->cs); 283 css_put(&s->cs->css); 284 kfree(s); 285 } 286 287 void cpuset1_hotplug_update_tasks(struct cpuset *cs, 288 struct cpumask *new_cpus, nodemask_t *new_mems, 289 bool cpus_updated, bool mems_updated) 290 { 291 bool is_empty; 292 293 cpuset_callback_lock_irq(); 294 cpumask_copy(cs->cpus_allowed, new_cpus); 295 cpumask_copy(cs->effective_cpus, new_cpus); 296 cs->mems_allowed = *new_mems; 297 cs->effective_mems = *new_mems; 298 cpuset_callback_unlock_irq(); 299 300 /* 301 * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, 302 * as the tasks will be migrated to an ancestor. 303 */ 304 if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) 305 cpuset_update_tasks_cpumask(cs, new_cpus); 306 if (mems_updated && !nodes_empty(cs->mems_allowed)) 307 cpuset_update_tasks_nodemask(cs); 308 309 is_empty = cpumask_empty(cs->cpus_allowed) || 310 nodes_empty(cs->mems_allowed); 311 312 /* 313 * Move tasks to the nearest ancestor with execution resources, 314 * This is full cgroup operation which will also call back into 315 * cpuset. Execute it asynchronously using workqueue. 316 */ 317 if (is_empty && cs->css.cgroup->nr_populated_csets && 318 css_tryget_online(&cs->css)) { 319 struct cpuset_remove_tasks_struct *s; 320 321 s = kzalloc(sizeof(*s), GFP_KERNEL); 322 if (WARN_ON_ONCE(!s)) { 323 css_put(&cs->css); 324 return; 325 } 326 327 s->cs = cs; 328 INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); 329 schedule_work(&s->work); 330 } 331 } 332 333 /* 334 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? 335 * 336 * One cpuset is a subset of another if all its allowed CPUs and 337 * Memory Nodes are a subset of the other, and its exclusive flags 338 * are only set if the other's are set. Call holding cpuset_mutex. 339 */ 340 341 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 342 { 343 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && 344 nodes_subset(p->mems_allowed, q->mems_allowed) && 345 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 346 is_mem_exclusive(p) <= is_mem_exclusive(q); 347 } 348 349 /* 350 * cpuset1_validate_change() - Validate conditions specific to legacy (v1) 351 * behavior. 352 */ 353 int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) 354 { 355 struct cgroup_subsys_state *css; 356 struct cpuset *c, *par; 357 int ret; 358 359 WARN_ON_ONCE(!rcu_read_lock_held()); 360 361 /* Each of our child cpusets must be a subset of us */ 362 ret = -EBUSY; 363 cpuset_for_each_child(c, css, cur) 364 if (!is_cpuset_subset(c, trial)) 365 goto out; 366 367 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 368 ret = -EACCES; 369 par = parent_cs(cur); 370 if (par && !is_cpuset_subset(trial, par)) 371 goto out; 372 373 ret = 0; 374 out: 375 return ret; 376 } 377 378 #ifdef CONFIG_PROC_PID_CPUSET 379 /* 380 * proc_cpuset_show() 381 * - Print tasks cpuset path into seq_file. 382 * - Used for /proc/<pid>/cpuset. 383 */ 384 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, 385 struct pid *pid, struct task_struct *tsk) 386 { 387 char *buf; 388 struct cgroup_subsys_state *css; 389 int retval; 390 391 retval = -ENOMEM; 392 buf = kmalloc(PATH_MAX, GFP_KERNEL); 393 if (!buf) 394 goto out; 395 396 rcu_read_lock(); 397 spin_lock_irq(&css_set_lock); 398 css = task_css(tsk, cpuset_cgrp_id); 399 retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, 400 current->nsproxy->cgroup_ns); 401 spin_unlock_irq(&css_set_lock); 402 rcu_read_unlock(); 403 404 if (retval == -E2BIG) 405 retval = -ENAMETOOLONG; 406 if (retval < 0) 407 goto out_free; 408 seq_puts(m, buf); 409 seq_putc(m, '\n'); 410 retval = 0; 411 out_free: 412 kfree(buf); 413 out: 414 return retval; 415 } 416 #endif /* CONFIG_PROC_PID_CPUSET */ 417 418 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) 419 { 420 struct cpuset *cs = css_cs(css); 421 cpuset_filetype_t type = cft->private; 422 423 switch (type) { 424 case FILE_CPU_EXCLUSIVE: 425 return is_cpu_exclusive(cs); 426 case FILE_MEM_EXCLUSIVE: 427 return is_mem_exclusive(cs); 428 case FILE_MEM_HARDWALL: 429 return is_mem_hardwall(cs); 430 case FILE_SCHED_LOAD_BALANCE: 431 return is_sched_load_balance(cs); 432 case FILE_MEMORY_MIGRATE: 433 return is_memory_migrate(cs); 434 case FILE_MEMORY_PRESSURE_ENABLED: 435 return cpuset_memory_pressure_enabled; 436 case FILE_MEMORY_PRESSURE: 437 return fmeter_getrate(&cs->fmeter); 438 case FILE_SPREAD_PAGE: 439 return is_spread_page(cs); 440 case FILE_SPREAD_SLAB: 441 return is_spread_slab(cs); 442 default: 443 BUG(); 444 } 445 446 /* Unreachable but makes gcc happy */ 447 return 0; 448 } 449 450 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, 451 u64 val) 452 { 453 struct cpuset *cs = css_cs(css); 454 cpuset_filetype_t type = cft->private; 455 int retval = 0; 456 457 cpus_read_lock(); 458 cpuset_lock(); 459 if (!is_cpuset_online(cs)) { 460 retval = -ENODEV; 461 goto out_unlock; 462 } 463 464 switch (type) { 465 case FILE_CPU_EXCLUSIVE: 466 retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); 467 break; 468 case FILE_MEM_EXCLUSIVE: 469 pr_info_once("cpuset.%s is deprecated\n", cft->name); 470 retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); 471 break; 472 case FILE_MEM_HARDWALL: 473 pr_info_once("cpuset.%s is deprecated\n", cft->name); 474 retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); 475 break; 476 case FILE_SCHED_LOAD_BALANCE: 477 pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); 478 retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); 479 break; 480 case FILE_MEMORY_MIGRATE: 481 pr_info_once("cpuset.%s is deprecated\n", cft->name); 482 retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); 483 break; 484 case FILE_MEMORY_PRESSURE_ENABLED: 485 pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); 486 cpuset_memory_pressure_enabled = !!val; 487 break; 488 case FILE_SPREAD_PAGE: 489 pr_info_once("cpuset.%s is deprecated\n", cft->name); 490 retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); 491 break; 492 case FILE_SPREAD_SLAB: 493 pr_warn_once("cpuset.%s is deprecated\n", cft->name); 494 retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); 495 break; 496 default: 497 retval = -EINVAL; 498 break; 499 } 500 out_unlock: 501 cpuset_unlock(); 502 cpus_read_unlock(); 503 return retval; 504 } 505 506 /* 507 * for the common functions, 'private' gives the type of file 508 */ 509 510 struct cftype cpuset1_files[] = { 511 { 512 .name = "cpus", 513 .seq_show = cpuset_common_seq_show, 514 .write = cpuset_write_resmask, 515 .max_write_len = (100U + 6 * NR_CPUS), 516 .private = FILE_CPULIST, 517 }, 518 519 { 520 .name = "mems", 521 .seq_show = cpuset_common_seq_show, 522 .write = cpuset_write_resmask, 523 .max_write_len = (100U + 6 * MAX_NUMNODES), 524 .private = FILE_MEMLIST, 525 }, 526 527 { 528 .name = "effective_cpus", 529 .seq_show = cpuset_common_seq_show, 530 .private = FILE_EFFECTIVE_CPULIST, 531 }, 532 533 { 534 .name = "effective_mems", 535 .seq_show = cpuset_common_seq_show, 536 .private = FILE_EFFECTIVE_MEMLIST, 537 }, 538 539 { 540 .name = "cpu_exclusive", 541 .read_u64 = cpuset_read_u64, 542 .write_u64 = cpuset_write_u64, 543 .private = FILE_CPU_EXCLUSIVE, 544 }, 545 546 { 547 .name = "mem_exclusive", 548 .read_u64 = cpuset_read_u64, 549 .write_u64 = cpuset_write_u64, 550 .private = FILE_MEM_EXCLUSIVE, 551 }, 552 553 { 554 .name = "mem_hardwall", 555 .read_u64 = cpuset_read_u64, 556 .write_u64 = cpuset_write_u64, 557 .private = FILE_MEM_HARDWALL, 558 }, 559 560 { 561 .name = "sched_load_balance", 562 .read_u64 = cpuset_read_u64, 563 .write_u64 = cpuset_write_u64, 564 .private = FILE_SCHED_LOAD_BALANCE, 565 }, 566 567 { 568 .name = "sched_relax_domain_level", 569 .read_s64 = cpuset_read_s64, 570 .write_s64 = cpuset_write_s64, 571 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, 572 }, 573 574 { 575 .name = "memory_migrate", 576 .read_u64 = cpuset_read_u64, 577 .write_u64 = cpuset_write_u64, 578 .private = FILE_MEMORY_MIGRATE, 579 }, 580 581 { 582 .name = "memory_pressure", 583 .read_u64 = cpuset_read_u64, 584 .private = FILE_MEMORY_PRESSURE, 585 }, 586 587 { 588 .name = "memory_spread_page", 589 .read_u64 = cpuset_read_u64, 590 .write_u64 = cpuset_write_u64, 591 .private = FILE_SPREAD_PAGE, 592 }, 593 594 { 595 /* obsolete, may be removed in the future */ 596 .name = "memory_spread_slab", 597 .read_u64 = cpuset_read_u64, 598 .write_u64 = cpuset_write_u64, 599 .private = FILE_SPREAD_SLAB, 600 }, 601 602 { 603 .name = "memory_pressure_enabled", 604 .flags = CFTYPE_ONLY_ON_ROOT, 605 .read_u64 = cpuset_read_u64, 606 .write_u64 = cpuset_write_u64, 607 .private = FILE_MEMORY_PRESSURE_ENABLED, 608 }, 609 610 { } /* terminate */ 611 }; 612