1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/inotify.h>
14 #include <sys/socket.h>
15 #include <sys/wait.h>
16 #include <arpa/inet.h>
17 #include <netinet/in.h>
18 #include <netdb.h>
19 #include <errno.h>
20 #include <sys/mman.h>
21
22 #include "kselftest.h"
23 #include "cgroup_util.h"
24
/* Poll count when waiting for socket-memory stats to settle; presumably
 * consumed by the sock accounting test later in this file (not visible
 * in this chunk) — TODO confirm.
 */
#define MEMCG_SOCKSTAT_WAIT_RETRIES 30

/* Kernel-feature flags (memory_localevents / memory_recursiveprot mount
 * options). Read by the tests below; presumably probed and set in main(),
 * which is outside this chunk.
 */
static bool has_localevents;
static bool has_recursiveprot;
29
get_temp_fd(void)30 int get_temp_fd(void)
31 {
32 return open(".", O_TMPFILE | O_RDWR | O_EXCL);
33 }
34
alloc_pagecache(int fd,size_t size)35 int alloc_pagecache(int fd, size_t size)
36 {
37 char buf[PAGE_SIZE];
38 struct stat st;
39 int i;
40
41 if (fstat(fd, &st))
42 goto cleanup;
43
44 size += st.st_size;
45
46 if (ftruncate(fd, size))
47 goto cleanup;
48
49 for (i = 0; i < size; i += sizeof(buf))
50 read(fd, buf, sizeof(buf));
51
52 return 0;
53
54 cleanup:
55 return -1;
56 }
57
alloc_anon(const char * cgroup,void * arg)58 int alloc_anon(const char *cgroup, void *arg)
59 {
60 size_t size = (unsigned long)arg;
61 char *buf, *ptr;
62
63 buf = malloc(size);
64 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
65 *ptr = 0;
66
67 free(buf);
68 return 0;
69 }
70
is_swap_enabled(void)71 int is_swap_enabled(void)
72 {
73 char buf[PAGE_SIZE];
74 const char delim[] = "\n";
75 int cnt = 0;
76 char *line;
77
78 if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
79 return -1;
80
81 for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
82 cnt++;
83
84 return cnt > 1;
85 }
86
set_oom_adj_score(int pid,int score)87 int set_oom_adj_score(int pid, int score)
88 {
89 char path[PATH_MAX];
90 int fd, len;
91
92 sprintf(path, "/proc/%d/oom_score_adj", pid);
93
94 fd = open(path, O_WRONLY | O_APPEND);
95 if (fd < 0)
96 return fd;
97
98 len = dprintf(fd, "%d", score);
99 if (len < 0) {
100 close(fd);
101 return len;
102 }
103
104 close(fd);
105 return 0;
106 }
107
/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	/* With "+memory" delegated, the child must list the controller. */
	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	/* The file itself must still be readable... */
	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	/* ...but "memory" must not appear in it. */
	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

	/*
	 * Cleanup labels deliberately fall through: each later entry point
	 * tears down everything created before it, in reverse order.
	 */
cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
173
alloc_anon_50M_check(const char * cgroup,void * arg)174 static int alloc_anon_50M_check(const char *cgroup, void *arg)
175 {
176 size_t size = MB(50);
177 char *buf, *ptr;
178 long anon, current;
179 int ret = -1;
180
181 buf = malloc(size);
182 if (buf == NULL) {
183 fprintf(stderr, "malloc() failed\n");
184 return -1;
185 }
186
187 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
188 *ptr = 0;
189
190 current = cg_read_long(cgroup, "memory.current");
191 if (current < size)
192 goto cleanup;
193
194 if (!values_close(size, current, 3))
195 goto cleanup;
196
197 anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
198 if (anon < 0)
199 goto cleanup;
200
201 if (!values_close(anon, current, 3))
202 goto cleanup;
203
204 ret = 0;
205 cleanup:
206 free(buf);
207 return ret;
208 }
209
/*
 * Populate 50M of page cache in the current cgroup and verify that
 * memory.current and the "file" entry of memory.stat account for it.
 * Returns 0 on success, -1 otherwise.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long usage, file_stat;
	int tmp_fd, err = -1;

	tmp_fd = get_temp_fd();
	if (tmp_fd < 0)
		return -1;

	if (alloc_pagecache(tmp_fd, size))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (usage < size)
		goto out;

	file_stat = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file_stat < 0)
		goto out;

	if (!values_close(file_stat, usage, 10))
		goto out;

	err = 0;

out:
	close(tmp_fd);
	return err;
}
241
/*
 * This test create a memory cgroup, allocates
 * some anonymous memory and some pagecache
 * and checks memory.current, memory.peak, and some memory.stat values.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* A fresh cgroup must start with zero usage and zero peak. */
	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
	 * We need at least three to be closed in a different order than writes occurred to test
	 * the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		/* Kernels without memory.peak: skip, not fail. */
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place. (by
	 * checking the writable bit on the file's st_mode)
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* any non-empty string resets, but make it clear */
	static const char reset_string[] = "reset\n";

	/* Writing switches each fd into fd-local peak tracking. */
	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Make sure a completely independent read isn't affected by our FD-local reset above*/
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* After the fd-local reset, this fd must only see a small local peak. */
	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	/* Close in a different order than the opens happened (list handling). */
	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
390
/*
 * Populate 50M of page cache via the fd passed in @arg, then stay alive
 * (keeping the cache charged) until the parent process exits.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int parent = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	/* Our parent's death reparents us, changing getppid(): time to go. */
	while (getppid() == parent)
		sleep(1);

	return 0;
}
404
alloc_anon_noexit(const char * cgroup,void * arg)405 static int alloc_anon_noexit(const char *cgroup, void *arg)
406 {
407 int ppid = getppid();
408 size_t size = (unsigned long)arg;
409 char *buf, *ptr;
410
411 buf = malloc(size);
412 if (buf == NULL) {
413 fprintf(stderr, "malloc() failed\n");
414 return -1;
415 }
416
417 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
418 *ptr = 0;
419
420 while (getppid() == ppid)
421 sleep(1);
422
423 free(buf);
424 return 0;
425 }
426
427 /*
428 * Wait until processes are killed asynchronously by the OOM killer
429 * If we exceed a timeout, fail.
430 */
/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * Polls cgroup.procs up to 10 times, 100ms apart.
 * Returns 0 once the cgroup is empty, -1 on timeout.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int attempt = 0;

	while (attempt++ < 10) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}
	return -1;
}
443
444 static bool reclaim_until(const char *memcg, long goal);
445
/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M [memory.events:low > 0]
 * A/B/D  memory.current ~= 21M [memory.events:low > 0]
 * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
 *                               undefined otherwise]
 * A/B/F  memory.current  = 0   [memory.events:low == 0]
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	/* The same scenario exercises either hard (min) or soft (low) protection. */
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* Child 3 (A/B/F above) deliberately stays empty. */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1], attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/* Give the background allocators time to reach ~150M combined usage. */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* Create significant pressure in A via the sibling A/G (parent[2]). */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 15))
		goto cleanup;

	if (!values_close(c[1], MB(21), 20))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	/* An oversized allocation must OOM under min, but succeed under low. */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	/* Reclaiming the target itself must bypass its own protection. */
	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Child 2 has memory.low=0, but some low protection may still be
	 * distributed down from its parent with memory.low=50M if cgroup2
	 * memory_recursiveprot mount option is enabled. Ignore the low
	 * event count in this case.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	/* Destroy children before parents; skip names that were never built. */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
656
/* Exercise hard protection (memory.min). */
static int test_memcg_min(const char *root)
{
	const bool use_min = true;

	return test_memcg_protection(root, use_min);
}
661
/* Exercise soft protection (memory.low). */
static int test_memcg_low(const char *root)
{
	const bool use_min = false;

	return test_memcg_protection(root, use_min);
}
666
/*
 * Try to create 50M of page cache inside a cgroup limited to 30M (via
 * either memory.high or memory.max) and verify usage stays near 30M.
 * Returns 0 on success, -1 on failure or if neither limit is 30M.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long usage, high, max;
	int fd, err = -1;

	/* Sanity check: the caller must have configured a 30M limit. */
	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (!values_close(usage, MB(30), 5))
		goto out;

	err = 0;

out:
	close(fd);
	return err;
}
697
/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* A fresh cgroup starts with memory.high == "max". */
	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	/* memory.high throttles but does not kill: 31M of anon must succeed. */
	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	/* 50M of page cache cannot stay resident under the 30M high limit. */
	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	/* Breaching the limit must have been recorded in memory.events. */
	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
746
alloc_anon_mlock(const char * cgroup,void * arg)747 static int alloc_anon_mlock(const char *cgroup, void *arg)
748 {
749 size_t size = (size_t)arg;
750 void *buf;
751
752 buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
753 0, 0);
754 if (buf == MAP_FAILED)
755 return -1;
756
757 mlock(buf, size);
758 munmap(buf, size);
759 return 0;
760 }
761
/*
 * This test checks that memory.high is able to throttle big single shot
 * allocation i.e. large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* Snapshot the event counters so deltas can be checked afterwards. */
	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	/* Arm a memory.events notification before triggering the allocation. */
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	/* One mmap+mlock of 200M: a single large in-kernel allocation. */
	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	/* The high limit must have fired; the max limit must not have. */
	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
822
/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* A fresh cgroup starts with memory.max == "max". */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	/* Hitting the hard limit must be reflected in memory.events. */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
873
/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char buf[64];
	int retries, err;
	long current, to_reclaim;
	bool reclaimed = false;

	for (retries = 5; retries > 0; retries--) {
		current = cg_read_long(memcg, "memory.current");

		/* Done once usage is at (or close enough to) the goal. */
		if (current < goal || values_close(current, goal, 3))
			break;
		/* Did memory.reclaim return 0 incorrectly? */
		else if (reclaimed)
			return false;

		to_reclaim = current - goal;
		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err)
			reclaimed = true;
		else if (err != -EAGAIN)
			/* -EAGAIN (reclaim fell short) is retryable; anything else isn't. */
			return false;
	}
	return reclaimed;
}
915
/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	/* A background child keeps 50M of page cache charged to the cgroup. */
	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			     expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}
988
alloc_anon_50M_check_swap(const char * cgroup,void * arg)989 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
990 {
991 long mem_max = (long)arg;
992 size_t size = MB(50);
993 char *buf, *ptr;
994 long mem_current, swap_current;
995 int ret = -1;
996
997 buf = malloc(size);
998 if (buf == NULL) {
999 fprintf(stderr, "malloc() failed\n");
1000 return -1;
1001 }
1002
1003 for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
1004 *ptr = 0;
1005
1006 mem_current = cg_read_long(cgroup, "memory.current");
1007 if (!mem_current || !values_close(mem_current, mem_max, 3))
1008 goto cleanup;
1009
1010 swap_current = cg_read_long(cgroup, "memory.swap.current");
1011 if (!swap_current ||
1012 !values_close(mem_current + swap_current, size, 3))
1013 goto cleanup;
1014
1015 ret = 0;
1016 cleanup:
1017 free(buf);
1018 return ret;
1019 }
1020
/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies that
 * memory.swap.peak reflects the high watermark and can be reset.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		/* Kernels without memory.swap.peak: skip, not fail. */
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place. (by checking the writable bit on the file's st_mode)
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	/* Both peaks must start at zero in a fresh cgroup. */
	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode*/
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	/* Memory and swap peaks must both reflect the ~30M high watermark. */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * open, reset and close the peak swap on another FD to make sure
	 * multiple extant fds don't corrupt the linked-list
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	/* The global (non-fd) peaks must survive the fd-local resets. */
	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	/* A failing close() on a peak fd indicates a kernel problem: fail. */
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1219
/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	/* cg_run() must fail: the child should be OOM-killed, not exit 0. */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	/* The cgroup must be empty once the victim has been reaped. */
	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1263
/*
 * Arguments handed to the tcp_server() child process.
 * @port: TCP port to listen on.
 * @ctl:  control-channel fd pair; the server closes ctl[0] and writes its
 *        status (errno or 0) to ctl[1] for the parent to read.
 */
struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};
1268
/*
 * Child-process body: listen on srv_args->port (IPv6 any-address), report
 * readiness (or errno on failure) to the parent via ctl[1], then stream
 * data to the first accepted client until the peer resets the connection.
 * Returns 0 when the client ended the session with ECONNRESET, -1 on any
 * other error.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	/* We are the server side: keep only the write end of the channel. */
	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0) {
		/* Pass back errno to the ctl_fd */
		write(ctl_fd, &errno, sizeof(errno));
		return ret;
	}

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/* A zero status tells the parent it is now safe to connect. */
	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		/* Keep pushing 1M chunks; ECONNRESET is the expected exit. */
		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
1328
/*
 * Connect to localhost:@port from within @cgroup and repeatedly read
 * 1MB chunks, checking after each read that the cgroup's "sock "
 * counter in memory.stat roughly matches the growth of memory.current
 * since before the connection was made.
 *
 * Returns KSFT_PASS on success, KSFT_FAIL on failure, or a non-zero
 * getaddrinfo() error code if name resolution fails.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	/* Baseline: memory already charged before the connection exists */
	allocated = cg_read_long(cgroup, "memory.current");
	/* %hu: port is unsigned; %hd would render ports >= 32768 negative */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0) {
		/* don't leak getaddrinfo's 0 (== KSFT_PASS) as the result */
		ret = KSFT_FAIL;
		goto free_ainfo;
	}

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1379
1380 /*
1381 * This test checks socket memory accounting.
1382 * The test forks a TCP server listens on a random port between 1000
1383 * and 61000. Once it gets a client connection, it starts writing to
1384 * its socket.
1385 * The TCP client interleaves reads from the socket with check whether
1386 * memory.current and memory.stat.sock are similar.
1387 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;
	long sock_post = -1;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/*
	 * Fork the server into the cgroup on a random port, retrying a
	 * few times if the chosen port happens to be taken.
	 */
	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		/* The server reports 0 (listening) or a socket/bind errno */
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		/* Skip if address family not supported by protocol */
		if (err == EAFNOSUPPORT) {
			ret = KSFT_SKIP;
			goto cleanup;
		}

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		/* Reap the failed server before retrying with a new port */
		waitpid(pid, NULL, 0);
	}

	/* All retries hit busy ports: inconclusive, not a failure */
	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	/* The server exits 0 only if the client reset the connection */
	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/*
	 * memory.stat is updated asynchronously via the memcg rstat
	 * flushing worker, which runs periodically (every 2 seconds,
	 * see FLUSH_TIME). On a busy system, the "sock " counter may
	 * stay non-zero for a short period of time after the TCP
	 * connection is closed and all socket memory has been
	 * uncharged.
	 *
	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
	 * scheduling slack) and require that the "sock " counter
	 * eventually drops to zero.
	 */
	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
					  MEMCG_SOCKSTAT_WAIT_RETRIES,
					  DEFAULT_WAIT_INTERVAL_US);
	if (sock_post)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1474
1475 /*
1476 * This test disables swapping and tries to allocate anonymous memory
1477 * up to OOM with memory.group.oom set. Then it checks that all
1478 * processes in the leaf were killed. It also checks that oom_events
1479 * were propagated to the parent level.
1480 */
test_memcg_oom_group_leaf_events(const char * root)1481 static int test_memcg_oom_group_leaf_events(const char *root)
1482 {
1483 int ret = KSFT_FAIL;
1484 char *parent, *child;
1485 long parent_oom_events;
1486
1487 parent = cg_name(root, "memcg_test_0");
1488 child = cg_name(root, "memcg_test_0/memcg_test_1");
1489
1490 if (!parent || !child)
1491 goto cleanup;
1492
1493 if (cg_create(parent))
1494 goto cleanup;
1495
1496 if (cg_create(child))
1497 goto cleanup;
1498
1499 if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1500 goto cleanup;
1501
1502 if (cg_write(child, "memory.max", "50M"))
1503 goto cleanup;
1504
1505 if (cg_write(child, "memory.swap.max", "0"))
1506 goto cleanup;
1507
1508 if (cg_write(child, "memory.oom.group", "1"))
1509 goto cleanup;
1510
1511 cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1512 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1513 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1514 if (!cg_run(child, alloc_anon, (void *)MB(100)))
1515 goto cleanup;
1516
1517 if (cg_test_proc_killed(child))
1518 goto cleanup;
1519
1520 if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1521 goto cleanup;
1522
1523 parent_oom_events = cg_read_key_long(
1524 parent, "memory.events", "oom_kill ");
1525 /*
1526 * If memory_localevents is not enabled (the default), the parent should
1527 * count OOM events in its children groups. Otherwise, it should not
1528 * have observed any events.
1529 */
1530 if (has_localevents && parent_oom_events != 0)
1531 goto cleanup;
1532 else if (!has_localevents && parent_oom_events <= 0)
1533 goto cleanup;
1534
1535 ret = KSFT_PASS;
1536
1537 cleanup:
1538 if (child)
1539 cg_destroy(child);
1540 if (parent)
1541 cg_destroy(parent);
1542 free(child);
1543 free(parent);
1544
1545 return ret;
1546 }
1547
1548 /*
1549 * This test disables swapping and tries to allocate anonymous memory
1550 * up to OOM with memory.group.oom set. Then it checks that all
1551 * processes in the parent and leaf were killed.
1552 */
test_memcg_oom_group_parent_events(const char * root)1553 static int test_memcg_oom_group_parent_events(const char *root)
1554 {
1555 int ret = KSFT_FAIL;
1556 char *parent, *child;
1557
1558 parent = cg_name(root, "memcg_test_0");
1559 child = cg_name(root, "memcg_test_0/memcg_test_1");
1560
1561 if (!parent || !child)
1562 goto cleanup;
1563
1564 if (cg_create(parent))
1565 goto cleanup;
1566
1567 if (cg_create(child))
1568 goto cleanup;
1569
1570 if (cg_write(parent, "memory.max", "80M"))
1571 goto cleanup;
1572
1573 if (cg_write(parent, "memory.swap.max", "0"))
1574 goto cleanup;
1575
1576 if (cg_write(parent, "memory.oom.group", "1"))
1577 goto cleanup;
1578
1579 cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1580 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1581 cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1582
1583 if (!cg_run(child, alloc_anon, (void *)MB(100)))
1584 goto cleanup;
1585
1586 if (cg_test_proc_killed(child))
1587 goto cleanup;
1588 if (cg_test_proc_killed(parent))
1589 goto cleanup;
1590
1591 ret = KSFT_PASS;
1592
1593 cleanup:
1594 if (child)
1595 cg_destroy(child);
1596 if (parent)
1597 cg_destroy(parent);
1598 free(child);
1599 free(parent);
1600
1601 return ret;
1602 }
1603
1604 /*
1605 * This test disables swapping and tries to allocate anonymous memory
1606 * up to OOM with memory.group.oom set. Then it checks that all
1607 * processes were killed except those set with OOM_SCORE_ADJ_MIN
1608 */
test_memcg_oom_group_score_events(const char * root)1609 static int test_memcg_oom_group_score_events(const char *root)
1610 {
1611 int ret = KSFT_FAIL;
1612 char *memcg;
1613 int safe_pid;
1614
1615 memcg = cg_name(root, "memcg_test_0");
1616
1617 if (!memcg)
1618 goto cleanup;
1619
1620 if (cg_create(memcg))
1621 goto cleanup;
1622
1623 if (cg_write(memcg, "memory.max", "50M"))
1624 goto cleanup;
1625
1626 if (cg_write(memcg, "memory.swap.max", "0"))
1627 goto cleanup;
1628
1629 if (cg_write(memcg, "memory.oom.group", "1"))
1630 goto cleanup;
1631
1632 safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1633 if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1634 goto cleanup;
1635
1636 cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1637 if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1638 goto cleanup;
1639
1640 if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1641 goto cleanup;
1642
1643 if (kill(safe_pid, SIGKILL))
1644 goto cleanup;
1645
1646 ret = KSFT_PASS;
1647
1648 cleanup:
1649 if (memcg)
1650 cg_destroy(memcg);
1651 free(memcg);
1652
1653 return ret;
1654 }
1655
/*
 * Read exactly one inotify event from @inotify_fd and verify it matches
 * @expected_event / @expected_wd.  Returns 0 on match, -1 otherwise.
 *
 * The read is deliberately sized to a bare struct inotify_event: the
 * events checked here (IN_DELETE_SELF, IN_IGNORED on the watched object
 * itself) carry no name payload, and a one-event-sized read leaves any
 * queued follow-up event for the next call.
 */
static int read_event(int inotify_fd, int expected_event, int expected_wd)
{
	struct inotify_event event;
	ssize_t len = 0;

	len = read(inotify_fd, &event, sizeof(event));
	if (len < (ssize_t)sizeof(event))
		return -1;

	if (event.mask != expected_event || event.wd != expected_wd) {
		fprintf(stderr,
			"event does not match expected values: mask %d (expected %d) wd %d (expected %d)\n",
			event.mask, expected_event, event.wd, expected_wd);
		return -1;
	}

	return 0;
}
1674
test_memcg_inotify_delete_file(const char * root)1675 static int test_memcg_inotify_delete_file(const char *root)
1676 {
1677 int ret = KSFT_FAIL;
1678 char *memcg = NULL;
1679 int fd, wd;
1680
1681 memcg = cg_name(root, "memcg_test_0");
1682
1683 if (!memcg)
1684 goto cleanup;
1685
1686 if (cg_create(memcg))
1687 goto cleanup;
1688
1689 fd = inotify_init1(0);
1690 if (fd == -1)
1691 goto cleanup;
1692
1693 wd = inotify_add_watch(fd, cg_control(memcg, "memory.events"), IN_DELETE_SELF);
1694 if (wd == -1)
1695 goto cleanup;
1696
1697 if (cg_destroy(memcg))
1698 goto cleanup;
1699 free(memcg);
1700 memcg = NULL;
1701
1702 if (read_event(fd, IN_DELETE_SELF, wd))
1703 goto cleanup;
1704
1705 if (read_event(fd, IN_IGNORED, wd))
1706 goto cleanup;
1707
1708 ret = KSFT_PASS;
1709
1710 cleanup:
1711 if (fd >= 0)
1712 close(fd);
1713 if (memcg)
1714 cg_destroy(memcg);
1715 free(memcg);
1716
1717 return ret;
1718 }
1719
test_memcg_inotify_delete_dir(const char * root)1720 static int test_memcg_inotify_delete_dir(const char *root)
1721 {
1722 int ret = KSFT_FAIL;
1723 char *memcg = NULL;
1724 int fd, wd;
1725
1726 memcg = cg_name(root, "memcg_test_0");
1727
1728 if (!memcg)
1729 goto cleanup;
1730
1731 if (cg_create(memcg))
1732 goto cleanup;
1733
1734 fd = inotify_init1(0);
1735 if (fd == -1)
1736 goto cleanup;
1737
1738 wd = inotify_add_watch(fd, memcg, IN_DELETE_SELF);
1739 if (wd == -1)
1740 goto cleanup;
1741
1742 if (cg_destroy(memcg))
1743 goto cleanup;
1744 free(memcg);
1745 memcg = NULL;
1746
1747 if (read_event(fd, IN_DELETE_SELF, wd))
1748 goto cleanup;
1749
1750 if (read_event(fd, IN_IGNORED, wd))
1751 goto cleanup;
1752
1753 ret = KSFT_PASS;
1754
1755 cleanup:
1756 if (fd >= 0)
1757 close(fd);
1758 if (memcg)
1759 cg_destroy(memcg);
1760 free(memcg);
1761
1762 return ret;
1763 }
1764
/* T(x) registers a test function under its own stringified name */
#define T(x) { x, #x }
/* Registry of memcg selftests; main() runs them in table order */
struct memcg_test {
	int (*fn)(const char *root);	/* returns KSFT_PASS/KSFT_SKIP/KSFT_FAIL */
	const char *name;		/* reported via ksft_test_result_*() */
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
	T(test_memcg_inotify_delete_file),
	T(test_memcg_inotify_delete_dir),
};
#undef T
1788
main(int argc,char ** argv)1789 int main(int argc, char **argv)
1790 {
1791 char root[PATH_MAX];
1792 int i, proc_status;
1793
1794 ksft_print_header();
1795 ksft_set_plan(ARRAY_SIZE(tests));
1796 if (cg_find_unified_root(root, sizeof(root), NULL))
1797 ksft_exit_skip("cgroup v2 isn't mounted\n");
1798
1799 /*
1800 * Check that memory controller is available:
1801 * memory is listed in cgroup.controllers
1802 */
1803 if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1804 ksft_exit_skip("memory controller isn't available\n");
1805
1806 if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1807 if (cg_write(root, "cgroup.subtree_control", "+memory"))
1808 ksft_exit_skip("Failed to set memory controller\n");
1809
1810 proc_status = proc_mount_contains("memory_recursiveprot");
1811 if (proc_status < 0)
1812 ksft_exit_skip("Failed to query cgroup mount option\n");
1813 has_recursiveprot = proc_status;
1814
1815 proc_status = proc_mount_contains("memory_localevents");
1816 if (proc_status < 0)
1817 ksft_exit_skip("Failed to query cgroup mount option\n");
1818 has_localevents = proc_status;
1819
1820 for (i = 0; i < ARRAY_SIZE(tests); i++) {
1821 switch (tests[i].fn(root)) {
1822 case KSFT_PASS:
1823 ksft_test_result_pass("%s\n", tests[i].name);
1824 break;
1825 case KSFT_SKIP:
1826 ksft_test_result_skip("%s\n", tests[i].name);
1827 break;
1828 default:
1829 ksft_test_result_fail("%s\n", tests[i].name);
1830 break;
1831 }
1832 }
1833
1834 ksft_finished();
1835 }
1836