// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
#include <linux/swap_cgroup.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/file.h>
#include <linux/seq_buf.h>

#include "internal.h"
#include "swap.h"
#include "memcontrol-v1.h"

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

/*
 * Per-node soft limit tree.  Entries are ordered by usage_in_excess, with
 * larger (or equal) values to the right.
 */
struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	/* Cached rightmost node, i.e. the entry with the largest excess */
	struct rb_node *rb_rightmost;
	/* Taken around every insertion into / removal from rb_root */
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

/*
 * Pack a (type, attribute) pair into one integer: type in the upper
 * 16 bits, attribute in the lower 16 bits.
 */
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Insert @mz into @mctz keyed by @new_usage_in_excess.
 *
 * Does nothing if @mz is already on the tree.  A zero excess is recorded
 * in mz->usage_in_excess but the node is not linked.  The callers in this
 * file invoke this with mctz->lock held.
 */
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			/* Went left at least once: cannot be the new maximum */
			rightmost = false;
		} else {
			p = &(*p)->rb_right;
		}
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

/*
 * Unlink @mz from @mctz if it is on the tree, keeping the cached
 * rightmost pointer valid.  The callers in this file hold mctz->lock.
 */
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	/* The predecessor becomes the new maximum when the maximum leaves */
	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

/* Locked wrapper around __mem_cgroup_remove_exceeded(). */
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

/*
 * Return by how many pages @memcg's memory counter exceeds its soft
 * limit, or 0 if it does not exceed it.
 */
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

/*
 * Re-position @memcg and all of its ancestors in node @nid's soft limit
 * tree according to their current excess.  With the multi-gen LRU enabled
 * the tree is not used; lru_gen_soft_reclaim() is called instead when
 * @memcg is over its soft limit.
 */
static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	if (lru_gen_enabled()) {
		if (soft_limit_excess(memcg))
			lru_gen_soft_reclaim(memcg, nid);
		return;
	}

	mctz = soft_limit_tree.rb_tree_per_node[nid];
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = memcg->nodeinfo[nid];
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

/* Remove @memcg's per-node entries from every node's soft limit tree. */
void memcg1_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = memcg->nodeinfo[nid];
		mctz = soft_limit_tree.rb_tree_per_node[nid];
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

/*
 * Detach and return the entry with the largest excess, or NULL if the
 * tree has none.  Entries that no longer exceed their soft limit, or whose
 * css reference cannot be taken, are dropped from the tree and skipped.
 * On success a css reference is held on the returned entry's memcg.
 * The callers in this file hold mctz->lock.
 */
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

/* Locked wrapper around __mem_cgroup_largest_soft_limit_node(). */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Reclaim from the hierarchy below @root_memcg on node @pgdat.
 *
 * Walks the hierarchy with mem_cgroup_iter(), shrinking each victim with
 * mem_cgroup_shrink_node().  The number of pages scanned is accumulated
 * into *@total_scanned.  Returns the sum of the mem_cgroup_shrink_node()
 * return values.
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   pg_data_t *pgdat,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.pgdat = pgdat,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			/* One full pass over the hierarchy has completed */
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is neither so large that we
				 * reclaim too much, nor so small that we keep
				 * coming back to reclaim from this cgroup
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
					pgdat, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

/*
 * Soft limit reclaim entry point for node @pgdat.
 *
 * Returns 0 without doing anything when the multi-gen LRU is enabled,
 * when @order is greater than 0, or when the node's soft limit tree is
 * missing or empty.  Otherwise repeatedly picks the memcg with the largest
 * excess and reclaims from it until something was reclaimed or the loop
 * gives up.  Returns the number of pages reclaimed; pages scanned are
 * accumulated into *@total_scanned.
 */
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_node *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_node *mctz;
	unsigned long excess;

	if (lru_gen_enabled())
		return 0;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

	/*
	 * Do not even bother to check the largest node if the root
	 * is empty. Do it lockless to prevent lock bouncing. Races
	 * are acceptable as soft limit is best effort anyway.
	 */
	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
		return 0;

	/*
	 * This loop can run a while, especially if mem_cgroups continuously
	 * keep exceeding their soft limit and putting the system under
	 * pressure
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
						    gfp_mask, total_scanned);
		nr_reclaimed += reclaimed;
		spin_lock_irq(&mctz->lock);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		/* Drop the reference taken when mz was picked from the tree */
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	/* A successor was picked but never processed: release its reference */
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/* Charge moving is deprecated; the read side always reports 0. */
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 0;
}

#ifdef CONFIG_MMU
/*
 * Warn once that charge moving is deprecated.  Only a value of 0 is
 * accepted; anything else yields -EINVAL.
 */
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
				 struct cftype *cft, u64 val)
{
	pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
		     "Please report your usecase to linux-mm@kvack.org if you "
		     "depend on this functionality.\n");

	if (val != 0)
		return -EINVAL;
	return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
				 struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif

/*
 * Signal the eventfd of every threshold of @memcg that was crossed since
 * the last call, then update current_threshold.  @swap selects the
 * memory+swap thresholds instead of the memory thresholds.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to threshold just below or equal to usage.
	 * If it's not true, a threshold was crossed after last
	 * call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

/*
 * Run the threshold check for @memcg and each of its ancestors, for
 * memory and, when do_memsw_account() is true, for memory+swap as well.
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

/* Cgroup1: threshold notifications & softlimit tree updates */

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_NTARGETS,
};

struct memcg1_events_percpu {
	/* Running count of page events on this CPU */
	unsigned long nr_page_events;
	/* Value of nr_page_events at which each target fires next */
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

/*
 * Account one PGPGIN (@nr_pages > 0) or PGPGOUT (otherwise) event and add
 * the absolute value of @nr_pages to this CPU's page event counter.
 */
static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

/*
 * Return true, and arm the next firing point, when this CPU's page event
 * counter has passed the stored target for @target; false otherwise.
 */
static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
				enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
	next = __this_cpu_read(memcg->events_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->events_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
554 * 555 */ 556 static void memcg1_check_events(struct mem_cgroup *memcg, int nid) 557 { 558 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 559 return; 560 561 /* threshold event is triggered in finer grain than soft limit */ 562 if (unlikely(memcg1_event_ratelimit(memcg, 563 MEM_CGROUP_TARGET_THRESH))) { 564 bool do_softlimit; 565 566 do_softlimit = memcg1_event_ratelimit(memcg, 567 MEM_CGROUP_TARGET_SOFTLIMIT); 568 mem_cgroup_threshold(memcg); 569 if (unlikely(do_softlimit)) 570 memcg1_update_tree(memcg, nid); 571 } 572 } 573 574 void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg) 575 { 576 unsigned long flags; 577 578 local_irq_save(flags); 579 memcg1_charge_statistics(memcg, folio_nr_pages(folio)); 580 memcg1_check_events(memcg, folio_nid(folio)); 581 local_irq_restore(flags); 582 } 583 584 /** 585 * memcg1_swapout - transfer a memsw charge to swap 586 * @folio: folio whose memsw charge to transfer 587 * @entry: swap entry to move the charge to 588 * 589 * Transfer the memsw charge of @folio to @entry. 590 */ 591 void memcg1_swapout(struct folio *folio, swp_entry_t entry) 592 { 593 struct mem_cgroup *memcg, *swap_memcg; 594 unsigned int nr_entries; 595 596 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); 597 VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); 598 599 if (mem_cgroup_disabled()) 600 return; 601 602 if (!do_memsw_account()) 603 return; 604 605 memcg = folio_memcg(folio); 606 607 VM_WARN_ON_ONCE_FOLIO(!memcg, folio); 608 if (!memcg) 609 return; 610 611 /* 612 * In case the memcg owning these pages has been offlined and doesn't 613 * have an ID allocated to it anymore, charge the closest online 614 * ancestor for the swap instead and transfer the memory+swap charge. 
615 */ 616 swap_memcg = mem_cgroup_id_get_online(memcg); 617 nr_entries = folio_nr_pages(folio); 618 /* Get references for the tail pages, too */ 619 if (nr_entries > 1) 620 mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); 621 mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); 622 623 swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); 624 625 folio_unqueue_deferred_split(folio); 626 folio->memcg_data = 0; 627 628 if (!mem_cgroup_is_root(memcg)) 629 page_counter_uncharge(&memcg->memory, nr_entries); 630 631 if (memcg != swap_memcg) { 632 if (!mem_cgroup_is_root(swap_memcg)) 633 page_counter_charge(&swap_memcg->memsw, nr_entries); 634 page_counter_uncharge(&memcg->memsw, nr_entries); 635 } 636 637 /* 638 * Interrupts should be disabled here because the caller holds the 639 * i_pages lock which is taken with interrupts-off. It is 640 * important here to have the interrupts disabled because it is the 641 * only synchronisation we have for updating the per-CPU variables. 642 */ 643 preempt_disable_nested(); 644 VM_WARN_ON_IRQS_ENABLED(); 645 memcg1_charge_statistics(memcg, -folio_nr_pages(folio)); 646 preempt_enable_nested(); 647 memcg1_check_events(memcg, folio_nid(folio)); 648 649 css_put(&memcg->css); 650 } 651 652 /* 653 * memcg1_swapin - uncharge swap slot 654 * @entry: the first swap entry for which the pages are charged 655 * @nr_pages: number of pages which will be uncharged 656 * 657 * Call this function after successfully adding the charged page to swapcache. 658 * 659 * Note: This function assumes the page for which swap slot is being uncharged 660 * is order 0 page. 661 */ 662 void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages) 663 { 664 /* 665 * Cgroup1's unified memory+swap counter has been charged with the 666 * new swapcache page, finish the transfer by uncharging the swap 667 * slot. The swap slot would also get uncharged when it dies, but 668 * it can stick around indefinitely and we'd count the page twice 669 * the entire time. 
670 * 671 * Cgroup2 has separate resource counters for memory and swap, 672 * so this is a non-issue here. Memory and swap charge lifetimes 673 * correspond 1:1 to page and swap slot lifetimes: we charge the 674 * page to memory here, and uncharge swap when the slot is freed. 675 */ 676 if (do_memsw_account()) { 677 /* 678 * The swap entry might not get freed for a long time, 679 * let's not wait for it. The page already received a 680 * memory+swap charge, drop the swap entry duplicate. 681 */ 682 mem_cgroup_uncharge_swap(entry, nr_pages); 683 } 684 } 685 686 void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 687 unsigned long nr_memory, int nid) 688 { 689 unsigned long flags; 690 691 local_irq_save(flags); 692 __count_memcg_events(memcg, PGPGOUT, pgpgout); 693 __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); 694 memcg1_check_events(memcg, nid); 695 local_irq_restore(flags); 696 } 697 698 static int compare_thresholds(const void *a, const void *b) 699 { 700 const struct mem_cgroup_threshold *_a = a; 701 const struct mem_cgroup_threshold *_b = b; 702 703 if (_a->threshold > _b->threshold) 704 return 1; 705 706 if (_a->threshold < _b->threshold) 707 return -1; 708 709 return 0; 710 } 711 712 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 713 { 714 struct mem_cgroup_eventfd_list *ev; 715 716 spin_lock(&memcg_oom_lock); 717 718 list_for_each_entry(ev, &memcg->oom_notify, list) 719 eventfd_signal(ev->eventfd); 720 721 spin_unlock(&memcg_oom_lock); 722 return 0; 723 } 724 725 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 726 { 727 struct mem_cgroup *iter; 728 729 for_each_mem_cgroup_tree(iter, memcg) 730 mem_cgroup_oom_notify_cb(iter); 731 } 732 733 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 734 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 735 { 736 struct mem_cgroup_thresholds *thresholds; 737 struct mem_cgroup_threshold_ary *new; 738 unsigned long 
threshold; 739 unsigned long usage; 740 int i, size, ret; 741 742 ret = page_counter_memparse(args, "-1", &threshold); 743 if (ret) 744 return ret; 745 746 mutex_lock(&memcg->thresholds_lock); 747 748 if (type == _MEM) { 749 thresholds = &memcg->thresholds; 750 usage = mem_cgroup_usage(memcg, false); 751 } else if (type == _MEMSWAP) { 752 thresholds = &memcg->memsw_thresholds; 753 usage = mem_cgroup_usage(memcg, true); 754 } else 755 BUG(); 756 757 /* Check if a threshold crossed before adding a new one */ 758 if (thresholds->primary) 759 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 760 761 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 762 763 /* Allocate memory for new array of thresholds */ 764 new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); 765 if (!new) { 766 ret = -ENOMEM; 767 goto unlock; 768 } 769 new->size = size; 770 771 /* Copy thresholds (if any) to new array */ 772 if (thresholds->primary) 773 memcpy(new->entries, thresholds->primary->entries, 774 flex_array_size(new, entries, size - 1)); 775 776 /* Add new threshold */ 777 new->entries[size - 1].eventfd = eventfd; 778 new->entries[size - 1].threshold = threshold; 779 780 /* Sort thresholds. Registering of new threshold isn't time-critical */ 781 sort(new->entries, size, sizeof(*new->entries), 782 compare_thresholds, NULL); 783 784 /* Find current threshold */ 785 new->current_threshold = -1; 786 for (i = 0; i < size; i++) { 787 if (new->entries[i].threshold <= usage) { 788 /* 789 * new->current_threshold will not be used until 790 * rcu_assign_pointer(), so it's safe to increment 791 * it here. 
792 */ 793 ++new->current_threshold; 794 } else 795 break; 796 } 797 798 /* Free old spare buffer and save old primary buffer as spare */ 799 kfree(thresholds->spare); 800 thresholds->spare = thresholds->primary; 801 802 rcu_assign_pointer(thresholds->primary, new); 803 804 /* To be sure that nobody uses thresholds */ 805 synchronize_rcu(); 806 807 unlock: 808 mutex_unlock(&memcg->thresholds_lock); 809 810 return ret; 811 } 812 813 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 814 struct eventfd_ctx *eventfd, const char *args) 815 { 816 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 817 } 818 819 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 820 struct eventfd_ctx *eventfd, const char *args) 821 { 822 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 823 } 824 825 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 826 struct eventfd_ctx *eventfd, enum res_type type) 827 { 828 struct mem_cgroup_thresholds *thresholds; 829 struct mem_cgroup_threshold_ary *new; 830 unsigned long usage; 831 int i, j, size, entries; 832 833 mutex_lock(&memcg->thresholds_lock); 834 835 if (type == _MEM) { 836 thresholds = &memcg->thresholds; 837 usage = mem_cgroup_usage(memcg, false); 838 } else if (type == _MEMSWAP) { 839 thresholds = &memcg->memsw_thresholds; 840 usage = mem_cgroup_usage(memcg, true); 841 } else 842 BUG(); 843 844 if (!thresholds->primary) 845 goto unlock; 846 847 /* Check if a threshold crossed before removing */ 848 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 849 850 /* Calculate new number of threshold */ 851 size = entries = 0; 852 for (i = 0; i < thresholds->primary->size; i++) { 853 if (thresholds->primary->entries[i].eventfd != eventfd) 854 size++; 855 else 856 entries++; 857 } 858 859 new = thresholds->spare; 860 861 /* If no items related to eventfd have been cleared, nothing to do */ 862 if (!entries) 863 goto unlock; 864 865 /* Set 
thresholds array to NULL if we don't have thresholds */ 866 if (!size) { 867 kfree(new); 868 new = NULL; 869 goto swap_buffers; 870 } 871 872 new->size = size; 873 874 /* Copy thresholds and find current threshold */ 875 new->current_threshold = -1; 876 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 877 if (thresholds->primary->entries[i].eventfd == eventfd) 878 continue; 879 880 new->entries[j] = thresholds->primary->entries[i]; 881 if (new->entries[j].threshold <= usage) { 882 /* 883 * new->current_threshold will not be used 884 * until rcu_assign_pointer(), so it's safe to increment 885 * it here. 886 */ 887 ++new->current_threshold; 888 } 889 j++; 890 } 891 892 swap_buffers: 893 /* Swap primary and spare array */ 894 thresholds->spare = thresholds->primary; 895 896 rcu_assign_pointer(thresholds->primary, new); 897 898 /* To be sure that nobody uses thresholds */ 899 synchronize_rcu(); 900 901 /* If all events are unregistered, free the spare array */ 902 if (!new) { 903 kfree(thresholds->spare); 904 thresholds->spare = NULL; 905 } 906 unlock: 907 mutex_unlock(&memcg->thresholds_lock); 908 } 909 910 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 911 struct eventfd_ctx *eventfd) 912 { 913 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 914 } 915 916 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 917 struct eventfd_ctx *eventfd) 918 { 919 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 920 } 921 922 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 923 struct eventfd_ctx *eventfd, const char *args) 924 { 925 struct mem_cgroup_eventfd_list *event; 926 927 event = kmalloc(sizeof(*event), GFP_KERNEL); 928 if (!event) 929 return -ENOMEM; 930 931 spin_lock(&memcg_oom_lock); 932 933 event->eventfd = eventfd; 934 list_add(&event->list, &memcg->oom_notify); 935 936 /* already in OOM ? 
*/ 937 if (memcg->under_oom) 938 eventfd_signal(eventfd); 939 spin_unlock(&memcg_oom_lock); 940 941 return 0; 942 } 943 944 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 945 struct eventfd_ctx *eventfd) 946 { 947 struct mem_cgroup_eventfd_list *ev, *tmp; 948 949 spin_lock(&memcg_oom_lock); 950 951 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 952 if (ev->eventfd == eventfd) { 953 list_del(&ev->list); 954 kfree(ev); 955 } 956 } 957 958 spin_unlock(&memcg_oom_lock); 959 } 960 961 /* 962 * DO NOT USE IN NEW FILES. 963 * 964 * "cgroup.event_control" implementation. 965 * 966 * This is way over-engineered. It tries to support fully configurable 967 * events for each user. Such level of flexibility is completely 968 * unnecessary especially in the light of the planned unified hierarchy. 969 * 970 * Please deprecate this and replace with something simpler if at all 971 * possible. 972 */ 973 974 /* 975 * Unregister event and free resources. 976 * 977 * Gets called from workqueue. 978 */ 979 static void memcg_event_remove(struct work_struct *work) 980 { 981 struct mem_cgroup_event *event = 982 container_of(work, struct mem_cgroup_event, remove); 983 struct mem_cgroup *memcg = event->memcg; 984 985 remove_wait_queue(event->wqh, &event->wait); 986 987 event->unregister_event(memcg, event->eventfd); 988 989 /* Notify userspace the event is going away. */ 990 eventfd_signal(event->eventfd); 991 992 eventfd_ctx_put(event->eventfd); 993 kfree(event); 994 css_put(&memcg->css); 995 } 996 997 /* 998 * Gets called on EPOLLHUP on eventfd when user closes it. 999 * 1000 * Called with wqh->lock held and interrupts disabled. 
 *
 * Only EPOLLHUP is acted upon.  Always returns 0.
 */
static int memcg_event_wake(wait_queue_entry_t *wait, unsigned int mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	__poll_t flags = key_to_poll(key);

	if (flags & EPOLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but memcg_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}

	return 0;
}

/*
 * poll_table queueing callback: remember the wait queue head the eventfd
 * is polled on and add the event's wait entry to it.
 */
static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}

/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
1052 */ 1053 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 1054 char *buf, size_t nbytes, loff_t off) 1055 { 1056 struct cgroup_subsys_state *css = of_css(of); 1057 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1058 struct mem_cgroup_event *event; 1059 struct cgroup_subsys_state *cfile_css; 1060 unsigned int efd, cfd; 1061 struct dentry *cdentry; 1062 const char *name; 1063 char *endp; 1064 int ret; 1065 1066 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 1067 return -EOPNOTSUPP; 1068 1069 buf = strstrip(buf); 1070 1071 efd = simple_strtoul(buf, &endp, 10); 1072 if (*endp != ' ') 1073 return -EINVAL; 1074 buf = endp + 1; 1075 1076 cfd = simple_strtoul(buf, &endp, 10); 1077 if (*endp == '\0') 1078 buf = endp; 1079 else if (*endp == ' ') 1080 buf = endp + 1; 1081 else 1082 return -EINVAL; 1083 1084 CLASS(fd, efile)(efd); 1085 if (fd_empty(efile)) 1086 return -EBADF; 1087 1088 CLASS(fd, cfile)(cfd); 1089 1090 event = kzalloc(sizeof(*event), GFP_KERNEL); 1091 if (!event) 1092 return -ENOMEM; 1093 1094 event->memcg = memcg; 1095 INIT_LIST_HEAD(&event->list); 1096 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 1097 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 1098 INIT_WORK(&event->remove, memcg_event_remove); 1099 1100 event->eventfd = eventfd_ctx_fileget(fd_file(efile)); 1101 if (IS_ERR(event->eventfd)) { 1102 ret = PTR_ERR(event->eventfd); 1103 goto out_kfree; 1104 } 1105 1106 if (fd_empty(cfile)) { 1107 ret = -EBADF; 1108 goto out_put_eventfd; 1109 } 1110 1111 /* the process need read permission on control file */ 1112 /* AV: shouldn't we check that it's been opened for read instead? */ 1113 ret = file_permission(fd_file(cfile), MAY_READ); 1114 if (ret < 0) 1115 goto out_put_eventfd; 1116 1117 /* 1118 * The control file must be a regular cgroup1 file. As a regular cgroup 1119 * file can't be renamed, it's safe to access its name afterwards. 
1120 */ 1121 cdentry = fd_file(cfile)->f_path.dentry; 1122 if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { 1123 ret = -EINVAL; 1124 goto out_put_eventfd; 1125 } 1126 1127 /* 1128 * Determine the event callbacks and set them in @event. This used 1129 * to be done via struct cftype but cgroup core no longer knows 1130 * about these events. The following is crude but the whole thing 1131 * is for compatibility anyway. 1132 * 1133 * DO NOT ADD NEW FILES. 1134 */ 1135 name = cdentry->d_name.name; 1136 1137 if (!strcmp(name, "memory.usage_in_bytes")) { 1138 event->register_event = mem_cgroup_usage_register_event; 1139 event->unregister_event = mem_cgroup_usage_unregister_event; 1140 } else if (!strcmp(name, "memory.oom_control")) { 1141 pr_warn_once("oom_control is deprecated and will be removed. " 1142 "Please report your usecase to linux-mm-@kvack.org" 1143 " if you depend on this functionality.\n"); 1144 event->register_event = mem_cgroup_oom_register_event; 1145 event->unregister_event = mem_cgroup_oom_unregister_event; 1146 } else if (!strcmp(name, "memory.pressure_level")) { 1147 pr_warn_once("pressure_level is deprecated and will be removed. " 1148 "Please report your usecase to linux-mm-@kvack.org " 1149 "if you depend on this functionality.\n"); 1150 event->register_event = vmpressure_register_event; 1151 event->unregister_event = vmpressure_unregister_event; 1152 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 1153 event->register_event = memsw_cgroup_usage_register_event; 1154 event->unregister_event = memsw_cgroup_usage_unregister_event; 1155 } else { 1156 ret = -EINVAL; 1157 goto out_put_eventfd; 1158 } 1159 1160 /* 1161 * Verify @cfile should belong to @css. Also, remaining events are 1162 * automatically removed on cgroup destruction but the removal is 1163 * asynchronous, so take an extra ref on @css. 
1164 */ 1165 cfile_css = css_tryget_online_from_dir(cdentry->d_parent, 1166 &memory_cgrp_subsys); 1167 ret = -EINVAL; 1168 if (IS_ERR(cfile_css)) 1169 goto out_put_eventfd; 1170 if (cfile_css != css) 1171 goto out_put_css; 1172 1173 ret = event->register_event(memcg, event->eventfd, buf); 1174 if (ret) 1175 goto out_put_css; 1176 1177 vfs_poll(fd_file(efile), &event->pt); 1178 1179 spin_lock_irq(&memcg->event_list_lock); 1180 list_add(&event->list, &memcg->event_list); 1181 spin_unlock_irq(&memcg->event_list_lock); 1182 return nbytes; 1183 1184 out_put_css: 1185 css_put(cfile_css); 1186 out_put_eventfd: 1187 eventfd_ctx_put(event->eventfd); 1188 out_kfree: 1189 kfree(event); 1190 return ret; 1191 } 1192 1193 void memcg1_memcg_init(struct mem_cgroup *memcg) 1194 { 1195 INIT_LIST_HEAD(&memcg->oom_notify); 1196 mutex_init(&memcg->thresholds_lock); 1197 INIT_LIST_HEAD(&memcg->event_list); 1198 spin_lock_init(&memcg->event_list_lock); 1199 } 1200 1201 void memcg1_css_offline(struct mem_cgroup *memcg) 1202 { 1203 struct mem_cgroup_event *event, *tmp; 1204 1205 /* 1206 * Unregister events and notify userspace. 1207 * Notify userspace about cgroup removing only after rmdir of cgroup 1208 * directory to avoid race between userspace and kernelspace. 1209 */ 1210 spin_lock_irq(&memcg->event_list_lock); 1211 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 1212 list_del_init(&event->list); 1213 schedule_work(&event->remove); 1214 } 1215 spin_unlock_irq(&memcg->event_list_lock); 1216 } 1217 1218 /* 1219 * Check OOM-Killer is already running under our hierarchy. 1220 * If someone is running, return false. 1221 */ 1222 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1223 { 1224 struct mem_cgroup *iter, *failed = NULL; 1225 1226 spin_lock(&memcg_oom_lock); 1227 1228 for_each_mem_cgroup_tree(iter, memcg) { 1229 if (iter->oom_lock) { 1230 /* 1231 * this subtree of our hierarchy is already locked 1232 * so we cannot give a lock. 
1233 */ 1234 failed = iter; 1235 mem_cgroup_iter_break(memcg, iter); 1236 break; 1237 } 1238 iter->oom_lock = true; 1239 } 1240 1241 if (failed) { 1242 /* 1243 * OK, we failed to lock the whole subtree so we have 1244 * to clean up what we set up to the failing subtree 1245 */ 1246 for_each_mem_cgroup_tree(iter, memcg) { 1247 if (iter == failed) { 1248 mem_cgroup_iter_break(memcg, iter); 1249 break; 1250 } 1251 iter->oom_lock = false; 1252 } 1253 } else 1254 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1255 1256 spin_unlock(&memcg_oom_lock); 1257 1258 return !failed; 1259 } 1260 1261 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1262 { 1263 struct mem_cgroup *iter; 1264 1265 spin_lock(&memcg_oom_lock); 1266 mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); 1267 for_each_mem_cgroup_tree(iter, memcg) 1268 iter->oom_lock = false; 1269 spin_unlock(&memcg_oom_lock); 1270 } 1271 1272 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1273 { 1274 struct mem_cgroup *iter; 1275 1276 spin_lock(&memcg_oom_lock); 1277 for_each_mem_cgroup_tree(iter, memcg) 1278 iter->under_oom++; 1279 spin_unlock(&memcg_oom_lock); 1280 } 1281 1282 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1283 { 1284 struct mem_cgroup *iter; 1285 1286 /* 1287 * Be careful about under_oom underflows because a child memcg 1288 * could have been added after mem_cgroup_mark_under_oom. 
1289 */ 1290 spin_lock(&memcg_oom_lock); 1291 for_each_mem_cgroup_tree(iter, memcg) 1292 if (iter->under_oom > 0) 1293 iter->under_oom--; 1294 spin_unlock(&memcg_oom_lock); 1295 } 1296 1297 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1298 1299 struct oom_wait_info { 1300 struct mem_cgroup *memcg; 1301 wait_queue_entry_t wait; 1302 }; 1303 1304 static int memcg_oom_wake_function(wait_queue_entry_t *wait, 1305 unsigned int mode, int sync, void *arg) 1306 { 1307 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 1308 struct mem_cgroup *oom_wait_memcg; 1309 struct oom_wait_info *oom_wait_info; 1310 1311 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1312 oom_wait_memcg = oom_wait_info->memcg; 1313 1314 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && 1315 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) 1316 return 0; 1317 return autoremove_wake_function(wait, mode, sync, arg); 1318 } 1319 1320 void memcg1_oom_recover(struct mem_cgroup *memcg) 1321 { 1322 /* 1323 * For the following lockless ->under_oom test, the only required 1324 * guarantee is that it must see the state asserted by an OOM when 1325 * this function is called as a result of userland actions 1326 * triggered by the notification of the OOM. This is trivially 1327 * achieved by invoking mem_cgroup_mark_under_oom() before 1328 * triggering notification. 1329 */ 1330 if (memcg && memcg->under_oom) 1331 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 1332 } 1333 1334 /** 1335 * mem_cgroup_oom_synchronize - complete memcg OOM handling 1336 * @handle: actually kill/wait or just clean up the OOM state 1337 * 1338 * This has to be called at the end of a page fault if the memcg OOM 1339 * handler was enabled. 1340 * 1341 * Memcg supports userspace OOM handling where failed allocations must 1342 * sleep on a waitqueue until the userspace task resolves the 1343 * situation. 
Sleeping directly in the charge context with all kinds 1344 * of locks held is not a good idea, instead we remember an OOM state 1345 * in the task and mem_cgroup_oom_synchronize() has to be called at 1346 * the end of the page fault to complete the OOM handling. 1347 * 1348 * Returns %true if an ongoing memcg OOM situation was detected and 1349 * completed, %false otherwise. 1350 */ 1351 bool mem_cgroup_oom_synchronize(bool handle) 1352 { 1353 struct mem_cgroup *memcg = current->memcg_in_oom; 1354 struct oom_wait_info owait; 1355 bool locked; 1356 1357 /* OOM is global, do not handle */ 1358 if (!memcg) 1359 return false; 1360 1361 if (!handle) 1362 goto cleanup; 1363 1364 owait.memcg = memcg; 1365 owait.wait.flags = 0; 1366 owait.wait.func = memcg_oom_wake_function; 1367 owait.wait.private = current; 1368 INIT_LIST_HEAD(&owait.wait.entry); 1369 1370 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1371 mem_cgroup_mark_under_oom(memcg); 1372 1373 locked = mem_cgroup_oom_trylock(memcg); 1374 1375 if (locked) 1376 mem_cgroup_oom_notify(memcg); 1377 1378 schedule(); 1379 mem_cgroup_unmark_under_oom(memcg); 1380 finish_wait(&memcg_oom_waitq, &owait.wait); 1381 1382 if (locked) 1383 mem_cgroup_oom_unlock(memcg); 1384 cleanup: 1385 current->memcg_in_oom = NULL; 1386 css_put(&memcg->css); 1387 return true; 1388 } 1389 1390 1391 bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) 1392 { 1393 /* 1394 * We are in the middle of the charge context here, so we 1395 * don't want to block when potentially sitting on a callstack 1396 * that holds all kinds of filesystem and mm locks. 1397 * 1398 * cgroup1 allows disabling the OOM killer and waiting for outside 1399 * handling until the charge can succeed; remember the context and put 1400 * the task to sleep at the end of the page fault when all locks are 1401 * released. 
1402 * 1403 * On the other hand, in-kernel OOM killer allows for an async victim 1404 * memory reclaim (oom_reaper) and that means that we are not solely 1405 * relying on the oom victim to make a forward progress and we can 1406 * invoke the oom killer here. 1407 * 1408 * Please note that mem_cgroup_out_of_memory might fail to find a 1409 * victim and then we have to bail out from the charge path. 1410 */ 1411 if (READ_ONCE(memcg->oom_kill_disable)) { 1412 if (current->in_user_fault) { 1413 css_get(&memcg->css); 1414 current->memcg_in_oom = memcg; 1415 } 1416 return false; 1417 } 1418 1419 mem_cgroup_mark_under_oom(memcg); 1420 1421 *locked = mem_cgroup_oom_trylock(memcg); 1422 1423 if (*locked) 1424 mem_cgroup_oom_notify(memcg); 1425 1426 mem_cgroup_unmark_under_oom(memcg); 1427 1428 return true; 1429 } 1430 1431 void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) 1432 { 1433 if (locked) 1434 mem_cgroup_oom_unlock(memcg); 1435 } 1436 1437 static DEFINE_MUTEX(memcg_max_mutex); 1438 1439 static int mem_cgroup_resize_max(struct mem_cgroup *memcg, 1440 unsigned long max, bool memsw) 1441 { 1442 bool enlarge = false; 1443 bool drained = false; 1444 int ret; 1445 bool limits_invariant; 1446 struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; 1447 1448 do { 1449 if (signal_pending(current)) { 1450 ret = -EINTR; 1451 break; 1452 } 1453 1454 mutex_lock(&memcg_max_mutex); 1455 /* 1456 * Make sure that the new limit (memsw or memory limit) doesn't 1457 * break our basic invariant rule memory.max <= memsw.max. 1458 */ 1459 limits_invariant = memsw ? 
max >= READ_ONCE(memcg->memory.max) : 1460 max <= memcg->memsw.max; 1461 if (!limits_invariant) { 1462 mutex_unlock(&memcg_max_mutex); 1463 ret = -EINVAL; 1464 break; 1465 } 1466 if (max > counter->max) 1467 enlarge = true; 1468 ret = page_counter_set_max(counter, max); 1469 mutex_unlock(&memcg_max_mutex); 1470 1471 if (!ret) 1472 break; 1473 1474 if (!drained) { 1475 drain_all_stock(memcg); 1476 drained = true; 1477 continue; 1478 } 1479 1480 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 1481 memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) { 1482 ret = -EBUSY; 1483 break; 1484 } 1485 } while (true); 1486 1487 if (!ret && enlarge) 1488 memcg1_oom_recover(memcg); 1489 1490 return ret; 1491 } 1492 1493 /* 1494 * Reclaims as many pages from the given memcg as possible. 1495 * 1496 * Caller is responsible for holding css reference for memcg. 1497 */ 1498 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 1499 { 1500 int nr_retries = MAX_RECLAIM_RETRIES; 1501 1502 /* we call try-to-free pages for make this cgroup empty */ 1503 lru_add_drain_all(); 1504 1505 drain_all_stock(memcg); 1506 1507 /* try to free all pages in this cgroup */ 1508 while (nr_retries && page_counter_read(&memcg->memory)) { 1509 if (signal_pending(current)) 1510 return -EINTR; 1511 1512 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, 1513 MEMCG_RECLAIM_MAY_SWAP, NULL)) 1514 nr_retries--; 1515 } 1516 1517 return 0; 1518 } 1519 1520 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 1521 char *buf, size_t nbytes, 1522 loff_t off) 1523 { 1524 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1525 1526 if (mem_cgroup_is_root(memcg)) 1527 return -EINVAL; 1528 return mem_cgroup_force_empty(memcg) ?: nbytes; 1529 } 1530 1531 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 1532 struct cftype *cft) 1533 { 1534 return 1; 1535 } 1536 1537 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 1538 struct cftype *cft, 
u64 val) 1539 { 1540 if (val == 1) 1541 return 0; 1542 1543 pr_warn_once("Non-hierarchical mode is deprecated. " 1544 "Please report your usecase to linux-mm@kvack.org if you " 1545 "depend on this functionality.\n"); 1546 1547 return -EINVAL; 1548 } 1549 1550 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 1551 struct cftype *cft) 1552 { 1553 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1554 struct page_counter *counter; 1555 1556 switch (MEMFILE_TYPE(cft->private)) { 1557 case _MEM: 1558 counter = &memcg->memory; 1559 break; 1560 case _MEMSWAP: 1561 counter = &memcg->memsw; 1562 break; 1563 case _KMEM: 1564 counter = &memcg->kmem; 1565 break; 1566 case _TCP: 1567 counter = &memcg->tcpmem; 1568 break; 1569 default: 1570 BUG(); 1571 } 1572 1573 switch (MEMFILE_ATTR(cft->private)) { 1574 case RES_USAGE: 1575 if (counter == &memcg->memory) 1576 return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; 1577 if (counter == &memcg->memsw) 1578 return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; 1579 return (u64)page_counter_read(counter) * PAGE_SIZE; 1580 case RES_LIMIT: 1581 return (u64)counter->max * PAGE_SIZE; 1582 case RES_MAX_USAGE: 1583 return (u64)counter->watermark * PAGE_SIZE; 1584 case RES_FAILCNT: 1585 return counter->failcnt; 1586 case RES_SOFT_LIMIT: 1587 return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE; 1588 default: 1589 BUG(); 1590 } 1591 } 1592 1593 /* 1594 * This function doesn't do anything useful. Its only job is to provide a read 1595 * handler for a file so that cgroup_file_mode() will add read permissions. 
1596 */ 1597 static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m, 1598 __always_unused void *v) 1599 { 1600 return -EINVAL; 1601 } 1602 1603 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) 1604 { 1605 int ret; 1606 1607 mutex_lock(&memcg_max_mutex); 1608 1609 ret = page_counter_set_max(&memcg->tcpmem, max); 1610 if (ret) 1611 goto out; 1612 1613 if (!memcg->tcpmem_active) { 1614 /* 1615 * The active flag needs to be written after the static_key 1616 * update. This is what guarantees that the socket activation 1617 * function is the last one to run. See mem_cgroup_sk_alloc() 1618 * for details, and note that we don't mark any socket as 1619 * belonging to this memcg until that flag is up. 1620 * 1621 * We need to do this, because static_keys will span multiple 1622 * sites, but we can't control their order. If we mark a socket 1623 * as accounted, but the accounting functions are not patched in 1624 * yet, we'll lose accounting. 1625 * 1626 * We never race with the readers in mem_cgroup_sk_alloc(), 1627 * because when this value change, the code to process it is not 1628 * patched in yet. 1629 */ 1630 static_branch_inc(&memcg_sockets_enabled_key); 1631 memcg->tcpmem_active = true; 1632 } 1633 out: 1634 mutex_unlock(&memcg_max_mutex); 1635 return ret; 1636 } 1637 1638 /* 1639 * The user of this function is... 1640 * RES_LIMIT. 
1641 */ 1642 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 1643 char *buf, size_t nbytes, loff_t off) 1644 { 1645 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1646 unsigned long nr_pages; 1647 int ret; 1648 1649 buf = strstrip(buf); 1650 ret = page_counter_memparse(buf, "-1", &nr_pages); 1651 if (ret) 1652 return ret; 1653 1654 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1655 case RES_LIMIT: 1656 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 1657 ret = -EINVAL; 1658 break; 1659 } 1660 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1661 case _MEM: 1662 ret = mem_cgroup_resize_max(memcg, nr_pages, false); 1663 break; 1664 case _MEMSWAP: 1665 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 1666 break; 1667 case _KMEM: 1668 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " 1669 "Writing any value to this file has no effect. " 1670 "Please report your usecase to linux-mm@kvack.org if you " 1671 "depend on this functionality.\n"); 1672 ret = 0; 1673 break; 1674 case _TCP: 1675 pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. " 1676 "Please report your usecase to linux-mm@kvack.org if you " 1677 "depend on this functionality.\n"); 1678 ret = memcg_update_tcp_max(memcg, nr_pages); 1679 break; 1680 } 1681 break; 1682 case RES_SOFT_LIMIT: 1683 if (IS_ENABLED(CONFIG_PREEMPT_RT)) { 1684 ret = -EOPNOTSUPP; 1685 } else { 1686 pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. 
" 1687 "Please report your usecase to linux-mm@kvack.org if you " 1688 "depend on this functionality.\n"); 1689 WRITE_ONCE(memcg->soft_limit, nr_pages); 1690 ret = 0; 1691 } 1692 break; 1693 } 1694 return ret ?: nbytes; 1695 } 1696 1697 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 1698 size_t nbytes, loff_t off) 1699 { 1700 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 1701 struct page_counter *counter; 1702 1703 switch (MEMFILE_TYPE(of_cft(of)->private)) { 1704 case _MEM: 1705 counter = &memcg->memory; 1706 break; 1707 case _MEMSWAP: 1708 counter = &memcg->memsw; 1709 break; 1710 case _KMEM: 1711 counter = &memcg->kmem; 1712 break; 1713 case _TCP: 1714 counter = &memcg->tcpmem; 1715 break; 1716 default: 1717 BUG(); 1718 } 1719 1720 switch (MEMFILE_ATTR(of_cft(of)->private)) { 1721 case RES_MAX_USAGE: 1722 page_counter_reset_watermark(counter); 1723 break; 1724 case RES_FAILCNT: 1725 counter->failcnt = 0; 1726 break; 1727 default: 1728 BUG(); 1729 } 1730 1731 return nbytes; 1732 } 1733 1734 #ifdef CONFIG_NUMA 1735 1736 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) 1737 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) 1738 #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) 1739 1740 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 1741 int nid, unsigned int lru_mask, bool tree) 1742 { 1743 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); 1744 unsigned long nr = 0; 1745 enum lru_list lru; 1746 1747 VM_BUG_ON((unsigned int)nid >= nr_node_ids); 1748 1749 for_each_lru(lru) { 1750 if (!(BIT(lru) & lru_mask)) 1751 continue; 1752 if (tree) 1753 nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); 1754 else 1755 nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); 1756 } 1757 return nr; 1758 } 1759 1760 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 1761 unsigned int lru_mask, 1762 bool tree) 1763 { 1764 unsigned long nr = 0; 
1765 enum lru_list lru; 1766 1767 for_each_lru(lru) { 1768 if (!(BIT(lru) & lru_mask)) 1769 continue; 1770 if (tree) 1771 nr += memcg_page_state(memcg, NR_LRU_BASE + lru); 1772 else 1773 nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); 1774 } 1775 return nr; 1776 } 1777 1778 static int memcg_numa_stat_show(struct seq_file *m, void *v) 1779 { 1780 struct numa_stat { 1781 const char *name; 1782 unsigned int lru_mask; 1783 }; 1784 1785 static const struct numa_stat stats[] = { 1786 { "total", LRU_ALL }, 1787 { "file", LRU_ALL_FILE }, 1788 { "anon", LRU_ALL_ANON }, 1789 { "unevictable", BIT(LRU_UNEVICTABLE) }, 1790 }; 1791 const struct numa_stat *stat; 1792 int nid; 1793 struct mem_cgroup *memcg = mem_cgroup_from_seq(m); 1794 1795 mem_cgroup_flush_stats(memcg); 1796 1797 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 1798 seq_printf(m, "%s=%lu", stat->name, 1799 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1800 false)); 1801 for_each_node_state(nid, N_MEMORY) 1802 seq_printf(m, " N%d=%lu", nid, 1803 mem_cgroup_node_nr_lru_pages(memcg, nid, 1804 stat->lru_mask, false)); 1805 seq_putc(m, '\n'); 1806 } 1807 1808 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 1809 1810 seq_printf(m, "hierarchical_%s=%lu", stat->name, 1811 mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, 1812 true)); 1813 for_each_node_state(nid, N_MEMORY) 1814 seq_printf(m, " N%d=%lu", nid, 1815 mem_cgroup_node_nr_lru_pages(memcg, nid, 1816 stat->lru_mask, true)); 1817 seq_putc(m, '\n'); 1818 } 1819 1820 return 0; 1821 } 1822 #endif /* CONFIG_NUMA */ 1823 1824 static const unsigned int memcg1_stats[] = { 1825 NR_FILE_PAGES, 1826 NR_ANON_MAPPED, 1827 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1828 NR_ANON_THPS, 1829 #endif 1830 NR_SHMEM, 1831 NR_FILE_MAPPED, 1832 NR_FILE_DIRTY, 1833 NR_WRITEBACK, 1834 WORKINGSET_REFAULT_ANON, 1835 WORKINGSET_REFAULT_FILE, 1836 #ifdef CONFIG_SWAP 1837 MEMCG_SWAP, 1838 NR_SWAPCACHE, 1839 #endif 1840 }; 1841 1842 static const char *const 
memcg1_stat_names[] = { 1843 "cache", 1844 "rss", 1845 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1846 "rss_huge", 1847 #endif 1848 "shmem", 1849 "mapped_file", 1850 "dirty", 1851 "writeback", 1852 "workingset_refault_anon", 1853 "workingset_refault_file", 1854 #ifdef CONFIG_SWAP 1855 "swap", 1856 "swapcached", 1857 #endif 1858 }; 1859 1860 /* Universal VM events cgroup1 shows, original sort order */ 1861 static const unsigned int memcg1_events[] = { 1862 PGPGIN, 1863 PGPGOUT, 1864 PGFAULT, 1865 PGMAJFAULT, 1866 }; 1867 1868 void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) 1869 { 1870 unsigned long memory, memsw; 1871 struct mem_cgroup *mi; 1872 unsigned int i; 1873 1874 BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); 1875 1876 mem_cgroup_flush_stats(memcg); 1877 1878 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1879 unsigned long nr; 1880 1881 nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); 1882 seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); 1883 } 1884 1885 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1886 seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]), 1887 memcg_events_local(memcg, memcg1_events[i])); 1888 1889 for (i = 0; i < NR_LRU_LISTS; i++) 1890 seq_buf_printf(s, "%s %lu\n", lru_list_name(i), 1891 memcg_page_state_local(memcg, NR_LRU_BASE + i) * 1892 PAGE_SIZE); 1893 1894 /* Hierarchical information */ 1895 memory = memsw = PAGE_COUNTER_MAX; 1896 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 1897 memory = min(memory, READ_ONCE(mi->memory.max)); 1898 memsw = min(memsw, READ_ONCE(mi->memsw.max)); 1899 } 1900 seq_buf_printf(s, "hierarchical_memory_limit %llu\n", 1901 (u64)memory * PAGE_SIZE); 1902 seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", 1903 (u64)memsw * PAGE_SIZE); 1904 1905 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { 1906 unsigned long nr; 1907 1908 nr = memcg_page_state_output(memcg, memcg1_stats[i]); 1909 seq_buf_printf(s, "total_%s %llu\n", 
memcg1_stat_names[i], 1910 (u64)nr); 1911 } 1912 1913 for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) 1914 seq_buf_printf(s, "total_%s %llu\n", 1915 vm_event_name(memcg1_events[i]), 1916 (u64)memcg_events(memcg, memcg1_events[i])); 1917 1918 for (i = 0; i < NR_LRU_LISTS; i++) 1919 seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i), 1920 (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * 1921 PAGE_SIZE); 1922 1923 #ifdef CONFIG_DEBUG_VM 1924 { 1925 pg_data_t *pgdat; 1926 struct mem_cgroup_per_node *mz; 1927 unsigned long anon_cost = 0; 1928 unsigned long file_cost = 0; 1929 1930 for_each_online_pgdat(pgdat) { 1931 mz = memcg->nodeinfo[pgdat->node_id]; 1932 1933 anon_cost += mz->lruvec.anon_cost; 1934 file_cost += mz->lruvec.file_cost; 1935 } 1936 seq_buf_printf(s, "anon_cost %lu\n", anon_cost); 1937 seq_buf_printf(s, "file_cost %lu\n", file_cost); 1938 } 1939 #endif 1940 } 1941 1942 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 1943 struct cftype *cft) 1944 { 1945 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1946 1947 return mem_cgroup_swappiness(memcg); 1948 } 1949 1950 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 1951 struct cftype *cft, u64 val) 1952 { 1953 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1954 1955 if (val > MAX_SWAPPINESS) 1956 return -EINVAL; 1957 1958 if (!mem_cgroup_is_root(memcg)) { 1959 pr_info_once("Per memcg swappiness does not exist in cgroup v2. 
" 1960 "See memory.reclaim or memory.swap.max there\n "); 1961 WRITE_ONCE(memcg->swappiness, val); 1962 } else 1963 WRITE_ONCE(vm_swappiness, val); 1964 1965 return 0; 1966 } 1967 1968 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 1969 { 1970 struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); 1971 1972 seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable)); 1973 seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); 1974 seq_printf(sf, "oom_kill %lu\n", 1975 atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); 1976 return 0; 1977 } 1978 1979 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 1980 struct cftype *cft, u64 val) 1981 { 1982 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 1983 1984 pr_warn_once("oom_control is deprecated and will be removed. " 1985 "Please report your usecase to linux-mm-@kvack.org if you " 1986 "depend on this functionality.\n"); 1987 1988 /* cannot set to root cgroup and only 0 and 1 are allowed */ 1989 if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) 1990 return -EINVAL; 1991 1992 WRITE_ONCE(memcg->oom_kill_disable, val); 1993 if (!val) 1994 memcg1_oom_recover(memcg); 1995 1996 return 0; 1997 } 1998 1999 #ifdef CONFIG_SLUB_DEBUG 2000 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 2001 { 2002 /* 2003 * Deprecated. 2004 * Please, take a look at tools/cgroup/memcg_slabinfo.py . 
2005 */ 2006 return 0; 2007 } 2008 #endif 2009 2010 struct cftype mem_cgroup_legacy_files[] = { 2011 { 2012 .name = "usage_in_bytes", 2013 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 2014 .read_u64 = mem_cgroup_read_u64, 2015 }, 2016 { 2017 .name = "max_usage_in_bytes", 2018 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 2019 .write = mem_cgroup_reset, 2020 .read_u64 = mem_cgroup_read_u64, 2021 }, 2022 { 2023 .name = "limit_in_bytes", 2024 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 2025 .write = mem_cgroup_write, 2026 .read_u64 = mem_cgroup_read_u64, 2027 }, 2028 { 2029 .name = "soft_limit_in_bytes", 2030 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 2031 .write = mem_cgroup_write, 2032 .read_u64 = mem_cgroup_read_u64, 2033 }, 2034 { 2035 .name = "failcnt", 2036 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2037 .write = mem_cgroup_reset, 2038 .read_u64 = mem_cgroup_read_u64, 2039 }, 2040 { 2041 .name = "stat", 2042 .seq_show = memory_stat_show, 2043 }, 2044 { 2045 .name = "force_empty", 2046 .write = mem_cgroup_force_empty_write, 2047 }, 2048 { 2049 .name = "use_hierarchy", 2050 .write_u64 = mem_cgroup_hierarchy_write, 2051 .read_u64 = mem_cgroup_hierarchy_read, 2052 }, 2053 { 2054 .name = "cgroup.event_control", /* XXX: for compat */ 2055 .write = memcg_write_event_control, 2056 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, 2057 }, 2058 { 2059 .name = "swappiness", 2060 .read_u64 = mem_cgroup_swappiness_read, 2061 .write_u64 = mem_cgroup_swappiness_write, 2062 }, 2063 { 2064 .name = "move_charge_at_immigrate", 2065 .read_u64 = mem_cgroup_move_charge_read, 2066 .write_u64 = mem_cgroup_move_charge_write, 2067 }, 2068 { 2069 .name = "oom_control", 2070 .seq_show = mem_cgroup_oom_control_read, 2071 .write_u64 = mem_cgroup_oom_control_write, 2072 }, 2073 { 2074 .name = "pressure_level", 2075 .seq_show = mem_cgroup_dummy_seq_show, 2076 }, 2077 #ifdef CONFIG_NUMA 2078 { 2079 .name = "numa_stat", 2080 .seq_show = memcg_numa_stat_show, 2081 }, 2082 #endif 
2083 { 2084 .name = "kmem.limit_in_bytes", 2085 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 2086 .write = mem_cgroup_write, 2087 .read_u64 = mem_cgroup_read_u64, 2088 }, 2089 { 2090 .name = "kmem.usage_in_bytes", 2091 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 2092 .read_u64 = mem_cgroup_read_u64, 2093 }, 2094 { 2095 .name = "kmem.failcnt", 2096 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 2097 .write = mem_cgroup_reset, 2098 .read_u64 = mem_cgroup_read_u64, 2099 }, 2100 { 2101 .name = "kmem.max_usage_in_bytes", 2102 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 2103 .write = mem_cgroup_reset, 2104 .read_u64 = mem_cgroup_read_u64, 2105 }, 2106 #ifdef CONFIG_SLUB_DEBUG 2107 { 2108 .name = "kmem.slabinfo", 2109 .seq_show = mem_cgroup_slab_show, 2110 }, 2111 #endif 2112 { 2113 .name = "kmem.tcp.limit_in_bytes", 2114 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), 2115 .write = mem_cgroup_write, 2116 .read_u64 = mem_cgroup_read_u64, 2117 }, 2118 { 2119 .name = "kmem.tcp.usage_in_bytes", 2120 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), 2121 .read_u64 = mem_cgroup_read_u64, 2122 }, 2123 { 2124 .name = "kmem.tcp.failcnt", 2125 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), 2126 .write = mem_cgroup_reset, 2127 .read_u64 = mem_cgroup_read_u64, 2128 }, 2129 { 2130 .name = "kmem.tcp.max_usage_in_bytes", 2131 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), 2132 .write = mem_cgroup_reset, 2133 .read_u64 = mem_cgroup_read_u64, 2134 }, 2135 { }, /* terminate */ 2136 }; 2137 2138 struct cftype memsw_files[] = { 2139 { 2140 .name = "memsw.usage_in_bytes", 2141 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 2142 .read_u64 = mem_cgroup_read_u64, 2143 }, 2144 { 2145 .name = "memsw.max_usage_in_bytes", 2146 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 2147 .write = mem_cgroup_reset, 2148 .read_u64 = mem_cgroup_read_u64, 2149 }, 2150 { 2151 .name = "memsw.limit_in_bytes", 2152 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 2153 .write = mem_cgroup_write, 2154 
.read_u64 = mem_cgroup_read_u64, 2155 }, 2156 { 2157 .name = "memsw.failcnt", 2158 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 2159 .write = mem_cgroup_reset, 2160 .read_u64 = mem_cgroup_read_u64, 2161 }, 2162 { }, /* terminate */ 2163 }; 2164 2165 void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages) 2166 { 2167 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { 2168 if (nr_pages > 0) 2169 page_counter_charge(&memcg->kmem, nr_pages); 2170 else 2171 page_counter_uncharge(&memcg->kmem, -nr_pages); 2172 } 2173 } 2174 2175 bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, 2176 gfp_t gfp_mask) 2177 { 2178 struct page_counter *fail; 2179 2180 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { 2181 memcg->tcpmem_pressure = 0; 2182 return true; 2183 } 2184 memcg->tcpmem_pressure = 1; 2185 if (gfp_mask & __GFP_NOFAIL) { 2186 page_counter_charge(&memcg->tcpmem, nr_pages); 2187 return true; 2188 } 2189 return false; 2190 } 2191 2192 bool memcg1_alloc_events(struct mem_cgroup *memcg) 2193 { 2194 memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu, 2195 GFP_KERNEL_ACCOUNT); 2196 return !!memcg->events_percpu; 2197 } 2198 2199 void memcg1_free_events(struct mem_cgroup *memcg) 2200 { 2201 if (memcg->events_percpu) 2202 free_percpu(memcg->events_percpu); 2203 } 2204 2205 static int __init memcg1_init(void) 2206 { 2207 int node; 2208 2209 for_each_node(node) { 2210 struct mem_cgroup_tree_per_node *rtpn; 2211 2212 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); 2213 2214 rtpn->rb_root = RB_ROOT; 2215 rtpn->rb_rightmost = NULL; 2216 spin_lock_init(&rtpn->lock); 2217 soft_limit_tree.rb_tree_per_node[node] = rtpn; 2218 } 2219 2220 return 0; 2221 } 2222 subsys_initcall(memcg1_init); 2223