/* xref: /linux/tools/testing/selftests/cgroup/test_memcontrol.c (revision 334fbe734e687404f346eba7d5d96ed2b44d35ab) */
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #define _GNU_SOURCE
3 
4 #include <linux/limits.h>
5 #include <linux/oom.h>
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/stat.h>
11 #include <sys/types.h>
12 #include <unistd.h>
13 #include <sys/inotify.h>
14 #include <sys/socket.h>
15 #include <sys/wait.h>
16 #include <arpa/inet.h>
17 #include <netinet/in.h>
18 #include <netdb.h>
19 #include <errno.h>
20 #include <sys/mman.h>
21 
22 #include "kselftest.h"
23 #include "cgroup_util.h"
24 
/*
 * NOTE(review): retry budget when polling for socket-memory statistics to
 * settle; the consumer is outside this chunk — confirm against the sockmem
 * test before changing.
 */
#define MEMCG_SOCKSTAT_WAIT_RETRIES        30

/*
 * Kernel/mount features detected at startup (presumably set in main(),
 * which is outside this chunk). has_recursiveprot is read by
 * test_memcg_protection() to relax the low-event checks.
 */
static bool has_localevents;
static bool has_recursiveprot;
29 
/*
 * Open an unlinked temporary file in the current directory.
 * Returns an O_RDWR file descriptor, or -1 on failure (e.g. the
 * filesystem does not support O_TMPFILE).
 */
int get_temp_fd(void)
{
	return open(".", O_TMPFILE | O_RDWR | O_EXCL);
}
34 
/*
 * Grow the file behind @fd by @size bytes and read it back page by page,
 * so that @size bytes of page cache are populated (and charged to the
 * caller's cgroup). Returns 0 on success, -1 on failure.
 *
 * Fixes over the previous version: the loop index is size_t (the old int
 * index was compared against a size_t, a signed/unsigned mismatch that
 * also truncated sizes above INT_MAX), and read() errors are no longer
 * silently ignored.
 */
int alloc_pagecache(int fd, size_t size)
{
	char buf[PAGE_SIZE];
	struct stat st;
	size_t i;

	if (fstat(fd, &st))
		goto cleanup;

	/* extend relative to the file's current size */
	size += st.st_size;

	if (ftruncate(fd, size))
		goto cleanup;

	/* touch every page so it lands in the page cache */
	for (i = 0; i < size; i += sizeof(buf))
		if (read(fd, buf, sizeof(buf)) < 0)
			goto cleanup;

	return 0;

cleanup:
	return -1;
}
57 
/*
 * cg_run() payload: allocate (unsigned long)arg bytes of anonymous memory
 * and touch one byte per page so every page is actually faulted in and
 * charged to @cgroup. Returns 0 on success, -1 on allocation failure.
 *
 * Fix: the malloc() result was previously dereferenced unchecked; now it
 * is validated, matching alloc_anon_noexit() and alloc_anon_50M_check().
 */
int alloc_anon(const char *cgroup, void *arg)
{
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	free(buf);
	return 0;
}
70 
is_swap_enabled(void)71 int is_swap_enabled(void)
72 {
73 	char buf[PAGE_SIZE];
74 	const char delim[] = "\n";
75 	int cnt = 0;
76 	char *line;
77 
78 	if (read_text("/proc/swaps", buf, sizeof(buf)) <= 0)
79 		return -1;
80 
81 	for (line = strtok(buf, delim); line; line = strtok(NULL, delim))
82 		cnt++;
83 
84 	return cnt > 1;
85 }
86 
/*
 * Write @score to /proc/@pid/oom_score_adj.
 * Returns 0 on success, the negative open()/dprintf() result on failure.
 */
int set_oom_adj_score(int pid, int score)
{
	char path[PATH_MAX];
	int fd, written;

	snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", pid);

	fd = open(path, O_WRONLY | O_APPEND);
	if (fd < 0)
		return fd;

	written = dprintf(fd, "%d", score);
	close(fd);

	return written < 0 ? written : 0;
}
107 
/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	/* "memory" must show up in the child's controllers */
	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	/* without subtree_control, "memory" must NOT appear in the child */
	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

	/*
	 * The cleanup labels deliberately fall through: entering at
	 * cleanup_all tears down the second hierarchy and then the first;
	 * each later entry point cleans up strictly less. Do not reorder.
	 */
cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}
173 
alloc_anon_50M_check(const char * cgroup,void * arg)174 static int alloc_anon_50M_check(const char *cgroup, void *arg)
175 {
176 	size_t size = MB(50);
177 	char *buf, *ptr;
178 	long anon, current;
179 	int ret = -1;
180 
181 	buf = malloc(size);
182 	if (buf == NULL) {
183 		fprintf(stderr, "malloc() failed\n");
184 		return -1;
185 	}
186 
187 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
188 		*ptr = 0;
189 
190 	current = cg_read_long(cgroup, "memory.current");
191 	if (current < size)
192 		goto cleanup;
193 
194 	if (!values_close(size, current, 3))
195 		goto cleanup;
196 
197 	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
198 	if (anon < 0)
199 		goto cleanup;
200 
201 	if (!values_close(anon, current, 3))
202 		goto cleanup;
203 
204 	ret = 0;
205 cleanup:
206 	free(buf);
207 	return ret;
208 }
209 
/*
 * cg_run() payload: populate 50M of page cache, then verify that
 * memory.current reached at least 50M and that the "file" entry of
 * memory.stat is close to memory.current.
 * Returns 0 on success, -1 on failure.
 */
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long usage, file_stat;
	int fd, err = -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (usage < size)
		goto out;

	file_stat = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file_stat < 0)
		goto out;

	if (!values_close(file_stat, usage, 10))
		goto out;

	err = 0;
out:
	close(fd);
	return err;
}
241 
/*
 * This test create a memory cgroup, allocates
 * some anonymous memory and some pagecache
 * and checks memory.current, memory.peak, and some memory.stat values.
 */
static int test_memcg_current_peak(const char *root)
{
	int ret = KSFT_FAIL;
	long current, peak, peak_reset;
	char *memcg;
	bool fd2_closed = false, fd3_closed = false, fd4_closed = false;
	int peak_fd = -1, peak_fd2 = -1, peak_fd3 = -1, peak_fd4 = -1;
	struct stat ss;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* a freshly-created cgroup must start with zero usage and peak */
	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/*
	 * We'll open a few FDs for the same memory.peak file to exercise the free-path
	 * We need at least three to be closed in a different order than writes occurred to test
	 * the linked-list handling.
	 */
	peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd == -1) {
		/* no memory.peak file at all: old kernel, skip */
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.peak's fd, try to figure out whether
	 * this kernel supports writing to that file in the first place. (by
	 * checking the writable bit on the file's st_mode)
	 */
	if (fstat(peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	peak_fd2 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd2 == -1)
		goto cleanup;

	peak_fd3 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd3 == -1)
		goto cleanup;

	/* any non-empty string resets, but make it clear */
	static const char reset_string[] = "reset\n";

	/* writing switches each fd into FD-local peak tracking mode */
	peak_reset = write(peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd2, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(peak_fd3, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* Make sure a completely independent read isn't affected by our  FD-local reset above*/
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* close fd2 first — out of open order — to exercise list removal */
	fd2_closed = true;
	if (close(peak_fd2))
		goto cleanup;

	peak_fd4 = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (peak_fd4 == -1)
		goto cleanup;

	peak_reset = write(peak_fd4, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	/* fd-local peak was reset after the 50M run, so it must be small */
	peak = cg_read_long_fd(peak_fd);
	if (peak > MB(30) || peak < 0)
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(50))
		goto cleanup;

	/* Make sure everything is back to normal */
	peak = cg_read_long_fd(peak_fd);
	if (peak < MB(50))
		goto cleanup;

	peak = cg_read_long_fd(peak_fd4);
	if (peak < MB(50))
		goto cleanup;

	fd3_closed = true;
	if (close(peak_fd3))
		goto cleanup;

	fd4_closed = true;
	if (close(peak_fd4))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	close(peak_fd);
	if (!fd2_closed)
		close(peak_fd2);
	if (!fd3_closed)
		close(peak_fd3);
	if (!fd4_closed)
		close(peak_fd4);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
390 
/*
 * cg_run_nowait() payload: populate 50M of page cache via @arg (a temp
 * fd smuggled through the pointer), then stay alive until the parent
 * test process exits, keeping the usage charged to the cgroup.
 */
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int parent = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	/* linger until we get reparented, i.e. the parent has exited */
	while (getppid() == parent)
		sleep(1);

	return 0;
}
404 
alloc_anon_noexit(const char * cgroup,void * arg)405 static int alloc_anon_noexit(const char *cgroup, void *arg)
406 {
407 	int ppid = getppid();
408 	size_t size = (unsigned long)arg;
409 	char *buf, *ptr;
410 
411 	buf = malloc(size);
412 	if (buf == NULL) {
413 		fprintf(stderr, "malloc() failed\n");
414 		return -1;
415 	}
416 
417 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
418 		*ptr = 0;
419 
420 	while (getppid() == ppid)
421 		sleep(1);
422 
423 	free(buf);
424 	return 0;
425 }
426 
/*
 * Wait until processes are killed asynchronously by the OOM killer
 * If we exceed a timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int retries = 10;

	/* poll cgroup.procs until it reads back empty, 100ms per attempt */
	while (retries-- > 0) {
		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
			return 0;

		usleep(100000);
	}
	return -1;
}
443 
/* Forward declaration — defined below, next to test_memcg_reclaim(). */
static bool reclaim_until(const char *memcg, long goal);
445 
/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates a significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M [memory.events:low > 0]
 * A/B/D  memory.current ~= 21M [memory.events:low > 0]
 * A/B/E  memory.current ~= 0   [memory.events:low == 0 if !memory_recursiveprot,
 *				 undefined otherwise]
 * A/B/F  memory.current  = 0   [memory.events:low == 0]
 * (for origin of the numbers, see model in memcg_protection.m.)
 *
 * After that it tries to allocate more than there is
 * unprotected memory in A available, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 *
 * Then we try to reclaim from A/B/C using memory.reclaim until its
 * usage reaches 10M.
 * This makes sure that:
 * (a) We ignore the protection of the reclaim target memcg.
 * (b) The previously calculated emin value (~29M) should be dismissed.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	/* parent[0]=A, parent[1]=A/B, parent[2]=A/G in the diagram above */
	char *parent[3] = {NULL};
	/* children[0..3] = A/B/C, A/B/D, A/B/E, A/B/F */
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	/* disable swap so pressure cannot escape via swap-out */
	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		/* the last child (F) intentionally stays empty */
		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	/* give the background allocators time to reach ~150M combined */
	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	/* generate memory pressure in A via the sibling A/G */
	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 15))
		goto cleanup;

	if (!values_close(c[1], MB(21), 20))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	/* over-allocate: must fail under memory.min, succeed under memory.low */
	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents from allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	/* protection of the reclaim target itself must be ignored */
	if (!reclaim_until(children[0], MB(10)))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	/*
	 * Child 2 has memory.low=0, but some low protection may still be
	 * distributed down from its parent with memory.low=50M if cgroup2
	 * memory_recursiveprot mount option is enabled. Ignore the low
	 * event count in this case.
	 */
	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int ignore_low_events_index = has_recursiveprot ? 2 : -1;
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i == ignore_low_events_index)
			continue;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;

	}

	ret = KSFT_PASS;

cleanup:
	/* destroy leaves before their parents */
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}
656 
/* Hard-protection variant: exercises memory.min via test_memcg_protection(). */
static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}
661 
/* Soft-protection variant: exercises memory.low via test_memcg_protection(). */
static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}
666 
/*
 * cg_run() payload: try to populate 50M of page cache inside a cgroup
 * that the caller has capped at 30M (via memory.high or memory.max) and
 * verify that usage is throttled close to 30M.
 * Returns 0 on success, -1 on failure.
 */
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	const size_t size = MB(50);
	long usage, high, max;
	int fd, err = -1;

	/* sanity: the 30M cap must already be in place */
	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto out;

	usage = cg_read_long(cgroup, "memory.current");
	if (!values_close(usage, MB(30), 5))
		goto out;

	err = 0;
out:
	close(fd);
	return err;
}
697 
/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* memory.high must default to "max" */
	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	/* 31M of anon: exceeds high, but must still succeed (high throttles, not kills) */
	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	/* the 50M pagecache check must fail: usage is held near 30M */
	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	/* the "high" event counter must have fired */
	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
746 
alloc_anon_mlock(const char * cgroup,void * arg)747 static int alloc_anon_mlock(const char *cgroup, void *arg)
748 {
749 	size_t size = (size_t)arg;
750 	void *buf;
751 
752 	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
753 		   0, 0);
754 	if (buf == MAP_FAILED)
755 		return -1;
756 
757 	mlock(buf, size);
758 	munmap(buf, size);
759 	return 0;
760 }
761 
/*
 * This test checks that memory.high is able to throttle big single shot
 * allocation i.e. large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* baseline event counts before the allocation */
	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	/* arm a memory.events notification before starting the allocator */
	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	/* 200M mlocked in one go: well above high (30M), below... no — above max too would kill; 140M max */
	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	/* "high" must have fired; "max" must NOT have (throttled, not killed) */
	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
822 
/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* memory.max must default to "max" */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	/* the "max" event counter must have fired */
	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
873 
/*
 * Reclaim from @memcg until usage reaches @goal by writing to
 * memory.reclaim.
 *
 * This function will return false if the usage is already below the
 * goal.
 *
 * This function assumes that writing to memory.reclaim is the only
 * source of change in memory.current (no concurrent allocations or
 * reclaim).
 *
 * This function makes sure memory.reclaim is sane. It will return
 * false if memory.reclaim's error codes do not make sense, even if
 * the usage goal was satisfied.
 */
static bool reclaim_until(const char *memcg, long goal)
{
	char amount[64];
	int attempt, err;
	long usage, delta;
	bool did_reclaim = false;

	for (attempt = 0; attempt < 5; attempt++) {
		usage = cg_read_long(memcg, "memory.current");

		/* done once we are at (or close to) the goal */
		if (usage < goal || values_close(usage, goal, 3))
			break;

		/* memory.reclaim reported success earlier but we are still high */
		if (did_reclaim)
			return false;

		delta = usage - goal;
		snprintf(amount, sizeof(amount), "%ld", delta);
		err = cg_write(memcg, "memory.reclaim", amount);
		if (!err)
			did_reclaim = true;
		else if (err != -EAGAIN)
			return false;
	}
	return did_reclaim;
}
915 
/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL;
	int fd = -1;
	int retries;
	char *memcg;
	long current, expected_usage;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	/* background child keeps 50M of page cache charged to the cgroup */
	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M, this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	if (!reclaim_until(memcg, MB(30)))
		goto cleanup;

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	close(fd);

	return ret;
}
988 
alloc_anon_50M_check_swap(const char * cgroup,void * arg)989 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
990 {
991 	long mem_max = (long)arg;
992 	size_t size = MB(50);
993 	char *buf, *ptr;
994 	long mem_current, swap_current;
995 	int ret = -1;
996 
997 	buf = malloc(size);
998 	if (buf == NULL) {
999 		fprintf(stderr, "malloc() failed\n");
1000 		return -1;
1001 	}
1002 
1003 	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
1004 		*ptr = 0;
1005 
1006 	mem_current = cg_read_long(cgroup, "memory.current");
1007 	if (!mem_current || !values_close(mem_current, mem_max, 3))
1008 		goto cleanup;
1009 
1010 	swap_current = cg_read_long(cgroup, "memory.swap.current");
1011 	if (!swap_current ||
1012 	    !values_close(mem_current + swap_current, size, 3))
1013 		goto cleanup;
1014 
1015 	ret = 0;
1016 cleanup:
1017 	free(buf);
1018 	return ret;
1019 }
1020 
/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out. Additionally, it verifies that
 * memory.swap.peak reflects the high watermark and can be reset.
 */
static int test_memcg_swap_max_peak(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max, peak;
	struct stat ss;
	int swap_peak_fd = -1, mem_peak_fd = -1;

	/* any non-empty string resets */
	static const char reset_string[] = "foobarbaz";

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/* a fresh cgroup should have no swap charged yet */
	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	swap_peak_fd = cg_open(memcg, "memory.swap.peak",
			       O_RDWR | O_APPEND | O_CLOEXEC);

	if (swap_peak_fd == -1) {
		/* no memory.swap.peak file: old kernel, skip */
		if (errno == ENOENT)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	/*
	 * Before we try to use memory.swap.peak's fd, try to figure out
	 * whether this kernel supports writing to that file in the first
	 * place. (by checking the writable bit on the file's st_mode)
	 */
	if (fstat(swap_peak_fd, &ss))
		goto cleanup;

	if ((ss.st_mode & S_IWUSR) == 0) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	mem_peak_fd = cg_open(memcg, "memory.peak", O_RDWR | O_APPEND | O_CLOEXEC);

	if (mem_peak_fd == -1)
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak"))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	/* switch the swap and mem fds into local-peak tracking mode*/
	int peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));

	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(swap_peak_fd))
		goto cleanup;

	if (cg_read_long(memcg, "memory.peak"))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	if (cg_read_long_fd(mem_peak_fd))
		goto cleanup;

	/* both limits must still be at their "max" defaults */
	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	/* both global and fd-local peaks must have reached ~30M */
	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	/*
	 * open, reset and close the peak swap on another FD to make sure
	 * multiple extant fds don't corrupt the linked-list
	 */
	peak_reset = cg_write(memcg, "memory.swap.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	peak_reset = cg_write(memcg, "memory.peak", (char *)reset_string);
	if (peak_reset)
		goto cleanup;

	/* actually reset on the fds */
	peak_reset = write(swap_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak_reset = write(mem_peak_fd, reset_string, sizeof(reset_string));
	if (peak_reset != sizeof(reset_string))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak > MB(10))
		goto cleanup;

	/*
	 * The cgroup is now empty, but there may be a page or two associated
	 * with the open FD accounted to it.
	 */
	peak = cg_read_long_fd(mem_peak_fd);
	if (peak > MB(1))
		goto cleanup;

	/* the fd-local resets must not have touched the global peaks */
	if (cg_read_long(memcg, "memory.peak") < MB(29))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.peak") < MB(29))
		goto cleanup;

	/* 50M anon under a 30M memory.max: the excess must go to swap */
	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	peak = cg_read_long(memcg, "memory.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long(memcg, "memory.swap.peak");
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(mem_peak_fd);
	if (peak < MB(29))
		goto cleanup;

	peak = cg_read_long_fd(swap_peak_fd);
	if (peak < MB(19))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (mem_peak_fd != -1 && close(mem_peak_fd))
		ret = KSFT_FAIL;
	if (swap_peak_fd != -1 && close(swap_peak_fd))
		ret = KSFT_FAIL;
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1219 
1220 /*
1221  * This test disables swapping and tries to allocate anonymous memory
1222  * up to OOM. Then it checks for oom and oom_kill events in
1223  * memory.events.
1224  */
test_memcg_oom_events(const char * root)1225 static int test_memcg_oom_events(const char *root)
1226 {
1227 	int ret = KSFT_FAIL;
1228 	char *memcg;
1229 
1230 	memcg = cg_name(root, "memcg_test");
1231 	if (!memcg)
1232 		goto cleanup;
1233 
1234 	if (cg_create(memcg))
1235 		goto cleanup;
1236 
1237 	if (cg_write(memcg, "memory.max", "30M"))
1238 		goto cleanup;
1239 
1240 	if (cg_write(memcg, "memory.swap.max", "0"))
1241 		goto cleanup;
1242 
1243 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1244 		goto cleanup;
1245 
1246 	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
1247 		goto cleanup;
1248 
1249 	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
1250 		goto cleanup;
1251 
1252 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
1253 		goto cleanup;
1254 
1255 	ret = KSFT_PASS;
1256 
1257 cleanup:
1258 	cg_destroy(memcg);
1259 	free(memcg);
1260 
1261 	return ret;
1262 }
1263 
/* Arguments passed to the forked tcp_server() child. */
struct tcp_server_args {
	unsigned short port;	/* TCP port the server binds to */
	int ctl[2];		/* control pipe: [0] read end (parent), [1] write end (server) */
};
1268 
/*
 * Server half of test_memcg_sock(): bind an IPv6 TCP socket to the
 * requested port, report status back through the control pipe, then
 * stream data at the first accepted client until the connection drops.
 *
 * Pipe protocol (the parent blocks on a single int read): on socket()
 * or bind() failure the current errno is written; once listen()
 * succeeds, 0 is written to signal readiness. The write/check ordering
 * below is what the parent's handshake in test_memcg_sock() relies on.
 *
 * Returns 0 on a clean run (client reset the connection), -1 otherwise.
 */
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	/* The parent keeps the read end; this process only writes. */
	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0) {
		/* Pass back errno to the ctl_fd */
		write(ctl_fd, &errno, sizeof(errno));
		return ret;
	}

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		/* Parent retries on EADDRINUSE, skips on EAFNOSUPPORT. */
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	/* Handshake: a zero tells the parent the server is ready. */
	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	/*
	 * Flood the client with 1M chunks (contents irrelevant) so socket
	 * buffer memory gets charged to the cgroup. ECONNRESET is the
	 * expected, successful end of the exchange.
	 */
	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}
1328 
/*
 * Client half of test_memcg_sock(): connect to the local server on
 * @port, repeatedly read 1M chunks, and after each read compare the
 * cgroup's charge growth (memory.current minus the pre-connect
 * baseline) against the "sock " counter in memory.stat.
 *
 * Returns KSFT_PASS once the two values converge within 10%, a
 * getaddrinfo() error code on resolution failure, or KSFT_FAIL.
 */
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;
	long allocated;

	/* Baseline charge before any socket memory accumulates. */
	allocated = cg_read_long(cgroup, "memory.current");
	/*
	 * Use %hu: port is unsigned and may exceed SHRT_MAX (it is drawn
	 * from 1000..60999), in which case %hd would render a negative
	 * string that overflows the 6-byte buffer and that getaddrinfo()
	 * cannot parse as a service.
	 */
	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		/* exclude the memory not related to socket connection */
		if (values_close(current - allocated, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}
1379 
/*
 * This test checks socket memory accounting.
 * The test forks a TCP server listens on a random port between 1000
 * and 61000. Once it gets a client connection, it starts writing to
 * its socket.
 * The TCP client interleaves reads from the socket with check whether
 * memory.current and memory.stat.sock are similar.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;
	long sock_post = -1;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	/*
	 * Pick a random port and fork a server into the cgroup; retry a
	 * few times if the port happens to be taken (EADDRINUSE).
	 */
	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		/*
		 * Handshake: close our copy of the write end, then block
		 * until the server reports either 0 (listening) or an errno.
		 */
		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		/* Skip if address family not supported by protocol */
		if (err == EAFNOSUPPORT) {
			ret = KSFT_SKIP;
			goto cleanup;
		}

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		/* Port collision: reap the failed server and try again. */
		waitpid(pid, NULL, 0);
	}

	/* All retries lost the port race: inconclusive, not a failure. */
	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	/* The server must have exited cleanly (see tcp_server()). */
	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	/*
	 * memory.stat is updated asynchronously via the memcg rstat
	 * flushing worker, which runs periodically (every 2 seconds,
	 * see FLUSH_TIME). On a busy system, the "sock " counter may
	 * stay non-zero for a short period of time after the TCP
	 * connection is closed and all socket memory has been
	 * uncharged.
	 *
	 * Poll memory.stat for up to 3 seconds (~FLUSH_TIME plus some
	 * scheduling slack) and require that the "sock " counter
	 * eventually drops to zero.
	 */
	sock_post = cg_read_key_long_poll(memcg, "memory.stat", "sock ", 0,
					 MEMCG_SOCKSTAT_WAIT_RETRIES,
					 DEFAULT_WAIT_INTERVAL_US);
	if (sock_post)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}
1474 
1475 /*
1476  * This test disables swapping and tries to allocate anonymous memory
1477  * up to OOM with memory.group.oom set. Then it checks that all
1478  * processes in the leaf were killed. It also checks that oom_events
1479  * were propagated to the parent level.
1480  */
test_memcg_oom_group_leaf_events(const char * root)1481 static int test_memcg_oom_group_leaf_events(const char *root)
1482 {
1483 	int ret = KSFT_FAIL;
1484 	char *parent, *child;
1485 	long parent_oom_events;
1486 
1487 	parent = cg_name(root, "memcg_test_0");
1488 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1489 
1490 	if (!parent || !child)
1491 		goto cleanup;
1492 
1493 	if (cg_create(parent))
1494 		goto cleanup;
1495 
1496 	if (cg_create(child))
1497 		goto cleanup;
1498 
1499 	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
1500 		goto cleanup;
1501 
1502 	if (cg_write(child, "memory.max", "50M"))
1503 		goto cleanup;
1504 
1505 	if (cg_write(child, "memory.swap.max", "0"))
1506 		goto cleanup;
1507 
1508 	if (cg_write(child, "memory.oom.group", "1"))
1509 		goto cleanup;
1510 
1511 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1512 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1513 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1514 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1515 		goto cleanup;
1516 
1517 	if (cg_test_proc_killed(child))
1518 		goto cleanup;
1519 
1520 	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
1521 		goto cleanup;
1522 
1523 	parent_oom_events = cg_read_key_long(
1524 			parent, "memory.events", "oom_kill ");
1525 	/*
1526 	 * If memory_localevents is not enabled (the default), the parent should
1527 	 * count OOM events in its children groups. Otherwise, it should not
1528 	 * have observed any events.
1529 	 */
1530 	if (has_localevents && parent_oom_events != 0)
1531 		goto cleanup;
1532 	else if (!has_localevents && parent_oom_events <= 0)
1533 		goto cleanup;
1534 
1535 	ret = KSFT_PASS;
1536 
1537 cleanup:
1538 	if (child)
1539 		cg_destroy(child);
1540 	if (parent)
1541 		cg_destroy(parent);
1542 	free(child);
1543 	free(parent);
1544 
1545 	return ret;
1546 }
1547 
1548 /*
1549  * This test disables swapping and tries to allocate anonymous memory
1550  * up to OOM with memory.group.oom set. Then it checks that all
1551  * processes in the parent and leaf were killed.
1552  */
test_memcg_oom_group_parent_events(const char * root)1553 static int test_memcg_oom_group_parent_events(const char *root)
1554 {
1555 	int ret = KSFT_FAIL;
1556 	char *parent, *child;
1557 
1558 	parent = cg_name(root, "memcg_test_0");
1559 	child = cg_name(root, "memcg_test_0/memcg_test_1");
1560 
1561 	if (!parent || !child)
1562 		goto cleanup;
1563 
1564 	if (cg_create(parent))
1565 		goto cleanup;
1566 
1567 	if (cg_create(child))
1568 		goto cleanup;
1569 
1570 	if (cg_write(parent, "memory.max", "80M"))
1571 		goto cleanup;
1572 
1573 	if (cg_write(parent, "memory.swap.max", "0"))
1574 		goto cleanup;
1575 
1576 	if (cg_write(parent, "memory.oom.group", "1"))
1577 		goto cleanup;
1578 
1579 	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
1580 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1581 	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
1582 
1583 	if (!cg_run(child, alloc_anon, (void *)MB(100)))
1584 		goto cleanup;
1585 
1586 	if (cg_test_proc_killed(child))
1587 		goto cleanup;
1588 	if (cg_test_proc_killed(parent))
1589 		goto cleanup;
1590 
1591 	ret = KSFT_PASS;
1592 
1593 cleanup:
1594 	if (child)
1595 		cg_destroy(child);
1596 	if (parent)
1597 		cg_destroy(parent);
1598 	free(child);
1599 	free(parent);
1600 
1601 	return ret;
1602 }
1603 
1604 /*
1605  * This test disables swapping and tries to allocate anonymous memory
1606  * up to OOM with memory.group.oom set. Then it checks that all
1607  * processes were killed except those set with OOM_SCORE_ADJ_MIN
1608  */
test_memcg_oom_group_score_events(const char * root)1609 static int test_memcg_oom_group_score_events(const char *root)
1610 {
1611 	int ret = KSFT_FAIL;
1612 	char *memcg;
1613 	int safe_pid;
1614 
1615 	memcg = cg_name(root, "memcg_test_0");
1616 
1617 	if (!memcg)
1618 		goto cleanup;
1619 
1620 	if (cg_create(memcg))
1621 		goto cleanup;
1622 
1623 	if (cg_write(memcg, "memory.max", "50M"))
1624 		goto cleanup;
1625 
1626 	if (cg_write(memcg, "memory.swap.max", "0"))
1627 		goto cleanup;
1628 
1629 	if (cg_write(memcg, "memory.oom.group", "1"))
1630 		goto cleanup;
1631 
1632 	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1633 	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
1634 		goto cleanup;
1635 
1636 	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
1637 	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
1638 		goto cleanup;
1639 
1640 	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
1641 		goto cleanup;
1642 
1643 	if (kill(safe_pid, SIGKILL))
1644 		goto cleanup;
1645 
1646 	ret = KSFT_PASS;
1647 
1648 cleanup:
1649 	if (memcg)
1650 		cg_destroy(memcg);
1651 	free(memcg);
1652 
1653 	return ret;
1654 }
1655 
/*
 * Read one inotify event from @inotify_fd and check that both its mask
 * and watch descriptor match the expected values.
 *
 * Returns 0 on a match, -1 on a short read or mismatch (the mismatch
 * details are logged to stderr).
 */
static int read_event(int inotify_fd, int expected_event, int expected_wd)
{
	struct inotify_event event;
	ssize_t len;

	len = read(inotify_fd, &event, sizeof(event));
	if (len < (ssize_t)sizeof(event))
		return -1;

	if (event.mask == expected_event && event.wd == expected_wd)
		return 0;

	fprintf(stderr,
		"event does not match expected values: mask %d (expected %d) wd %d (expected %d)\n",
		event.mask, expected_event, event.wd, expected_wd);
	return -1;
}
1674 
test_memcg_inotify_delete_file(const char * root)1675 static int test_memcg_inotify_delete_file(const char *root)
1676 {
1677 	int ret = KSFT_FAIL;
1678 	char *memcg = NULL;
1679 	int fd, wd;
1680 
1681 	memcg = cg_name(root, "memcg_test_0");
1682 
1683 	if (!memcg)
1684 		goto cleanup;
1685 
1686 	if (cg_create(memcg))
1687 		goto cleanup;
1688 
1689 	fd = inotify_init1(0);
1690 	if (fd == -1)
1691 		goto cleanup;
1692 
1693 	wd = inotify_add_watch(fd, cg_control(memcg, "memory.events"), IN_DELETE_SELF);
1694 	if (wd == -1)
1695 		goto cleanup;
1696 
1697 	if (cg_destroy(memcg))
1698 		goto cleanup;
1699 	free(memcg);
1700 	memcg = NULL;
1701 
1702 	if (read_event(fd, IN_DELETE_SELF, wd))
1703 		goto cleanup;
1704 
1705 	if (read_event(fd, IN_IGNORED, wd))
1706 		goto cleanup;
1707 
1708 	ret = KSFT_PASS;
1709 
1710 cleanup:
1711 	if (fd >= 0)
1712 		close(fd);
1713 	if (memcg)
1714 		cg_destroy(memcg);
1715 	free(memcg);
1716 
1717 	return ret;
1718 }
1719 
test_memcg_inotify_delete_dir(const char * root)1720 static int test_memcg_inotify_delete_dir(const char *root)
1721 {
1722 	int ret = KSFT_FAIL;
1723 	char *memcg = NULL;
1724 	int fd, wd;
1725 
1726 	memcg = cg_name(root, "memcg_test_0");
1727 
1728 	if (!memcg)
1729 		goto cleanup;
1730 
1731 	if (cg_create(memcg))
1732 		goto cleanup;
1733 
1734 	fd = inotify_init1(0);
1735 	if (fd == -1)
1736 		goto cleanup;
1737 
1738 	wd = inotify_add_watch(fd, memcg, IN_DELETE_SELF);
1739 	if (wd == -1)
1740 		goto cleanup;
1741 
1742 	if (cg_destroy(memcg))
1743 		goto cleanup;
1744 	free(memcg);
1745 	memcg = NULL;
1746 
1747 	if (read_event(fd, IN_DELETE_SELF, wd))
1748 		goto cleanup;
1749 
1750 	if (read_event(fd, IN_IGNORED, wd))
1751 		goto cleanup;
1752 
1753 	ret = KSFT_PASS;
1754 
1755 cleanup:
1756 	if (fd >= 0)
1757 		close(fd);
1758 	if (memcg)
1759 		cg_destroy(memcg);
1760 	free(memcg);
1761 
1762 	return ret;
1763 }
1764 
/* T() expands a test function into a { fn, "name" } table entry. */
#define T(x) { x, #x }
/* Registry of all memcontrol selftests, run in order by main(). */
struct memcg_test {
	int (*fn)(const char *root);	/* test body; receives the cgroup v2 root path */
	const char *name;		/* stringified function name for ksft reporting */
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current_peak),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max_peak),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
	T(test_memcg_inotify_delete_file),
	T(test_memcg_inotify_delete_dir),
};
#undef T
1788 
main(int argc,char ** argv)1789 int main(int argc, char **argv)
1790 {
1791 	char root[PATH_MAX];
1792 	int i, proc_status;
1793 
1794 	ksft_print_header();
1795 	ksft_set_plan(ARRAY_SIZE(tests));
1796 	if (cg_find_unified_root(root, sizeof(root), NULL))
1797 		ksft_exit_skip("cgroup v2 isn't mounted\n");
1798 
1799 	/*
1800 	 * Check that memory controller is available:
1801 	 * memory is listed in cgroup.controllers
1802 	 */
1803 	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
1804 		ksft_exit_skip("memory controller isn't available\n");
1805 
1806 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
1807 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
1808 			ksft_exit_skip("Failed to set memory controller\n");
1809 
1810 	proc_status = proc_mount_contains("memory_recursiveprot");
1811 	if (proc_status < 0)
1812 		ksft_exit_skip("Failed to query cgroup mount option\n");
1813 	has_recursiveprot = proc_status;
1814 
1815 	proc_status = proc_mount_contains("memory_localevents");
1816 	if (proc_status < 0)
1817 		ksft_exit_skip("Failed to query cgroup mount option\n");
1818 	has_localevents = proc_status;
1819 
1820 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
1821 		switch (tests[i].fn(root)) {
1822 		case KSFT_PASS:
1823 			ksft_test_result_pass("%s\n", tests[i].name);
1824 			break;
1825 		case KSFT_SKIP:
1826 			ksft_test_result_skip("%s\n", tests[i].name);
1827 			break;
1828 		default:
1829 			ksft_test_result_fail("%s\n", tests[i].name);
1830 			break;
1831 		}
1832 	}
1833 
1834 	ksft_finished();
1835 }
1836