xref: /linux/tools/testing/selftests/pidfd/pidfd_autoreap_test.c (revision 07c3ef58223e2c75ea209d8c416b976ec30d9413)
1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
3 
4 #define _GNU_SOURCE
5 #include <errno.h>
6 #include <linux/types.h>
7 #include <poll.h>
8 #include <pthread.h>
9 #include <sched.h>
10 #include <signal.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <syscall.h>
15 #include <sys/ioctl.h>
16 #include <sys/prctl.h>
17 #include <sys/socket.h>
18 #include <sys/types.h>
19 #include <sys/wait.h>
20 #include <unistd.h>
21 
22 #include "pidfd.h"
23 #include "kselftest_harness.h"
24 
/*
 * Fallback definitions for the clone3() flags exercised by this test and
 * for the capability ABI version, used only when the headers included
 * above do not already provide them.
 */
#if !defined(CLONE_AUTOREAP)
#define CLONE_AUTOREAP (1ULL << 34)
#endif

#if !defined(CLONE_NNP)
#define CLONE_NNP (1ULL << 35)
#endif

#if !defined(CLONE_PIDFD_AUTOKILL)
#define CLONE_PIDFD_AUTOKILL (1ULL << 36)
#endif

#if !defined(_LINUX_CAPABILITY_VERSION_3)
#define _LINUX_CAPABILITY_VERSION_3 0x20080522
#endif
40 
/*
 * Header argument passed to the raw capset() syscall by drop_all_caps().
 */
struct cap_header {
	/* Capability ABI version; set to _LINUX_CAPABILITY_VERSION_3. */
	__u32 version;
	/* Target pid; drop_all_caps() leaves this at 0. */
	int pid;
};
45 
/*
 * One element of the data argument passed to the raw capset() syscall.
 * drop_all_caps() passes an array of two of these, all zero.
 */
struct cap_data {
	/* Effective capability bits. */
	__u32 effective;
	/* Permitted capability bits. */
	__u32 permitted;
	/* Inheritable capability bits. */
	__u32 inheritable;
};
51 
drop_all_caps(void)52 static int drop_all_caps(void)
53 {
54 	struct cap_header hdr = { .version = _LINUX_CAPABILITY_VERSION_3 };
55 	struct cap_data data[2] = {};
56 
57 	return syscall(__NR_capset, &hdr, data);
58 }
59 
create_autoreap_child(int * pidfd)60 static pid_t create_autoreap_child(int *pidfd)
61 {
62 	struct __clone_args args = {
63 		.flags		= CLONE_PIDFD | CLONE_AUTOREAP,
64 		.exit_signal	= 0,
65 		.pidfd		= ptr_to_u64(pidfd),
66 	};
67 
68 	return sys_clone3(&args, sizeof(args));
69 }
70 
71 /*
72  * Test that CLONE_AUTOREAP works without CLONE_PIDFD (fire-and-forget).
73  */
TEST(autoreap_without_pidfd)74 TEST(autoreap_without_pidfd)
75 {
76 	struct __clone_args args = {
77 		.flags		= CLONE_AUTOREAP,
78 		.exit_signal	= 0,
79 	};
80 	pid_t pid;
81 	int ret;
82 
83 	pid = sys_clone3(&args, sizeof(args));
84 	if (pid < 0 && errno == EINVAL)
85 		SKIP(return, "CLONE_AUTOREAP not supported");
86 	ASSERT_GE(pid, 0);
87 
88 	if (pid == 0)
89 		_exit(0);
90 
91 	/*
92 	 * Give the child a moment to exit and be autoreaped.
93 	 * Then verify no zombie remains.
94 	 */
95 	usleep(200000);
96 	ret = waitpid(pid, NULL, WNOHANG);
97 	ASSERT_EQ(ret, -1);
98 	ASSERT_EQ(errno, ECHILD);
99 }
100 
101 /*
102  * Test that CLONE_AUTOREAP with a non-zero exit_signal fails.
103  */
TEST(autoreap_rejects_exit_signal)104 TEST(autoreap_rejects_exit_signal)
105 {
106 	struct __clone_args args = {
107 		.flags		= CLONE_AUTOREAP,
108 		.exit_signal	= SIGCHLD,
109 	};
110 	pid_t pid;
111 
112 	pid = sys_clone3(&args, sizeof(args));
113 	ASSERT_EQ(pid, -1);
114 	ASSERT_EQ(errno, EINVAL);
115 }
116 
117 /*
118  * Test that CLONE_AUTOREAP with CLONE_PARENT fails.
119  */
TEST(autoreap_rejects_parent)120 TEST(autoreap_rejects_parent)
121 {
122 	struct __clone_args args = {
123 		.flags		= CLONE_AUTOREAP | CLONE_PARENT,
124 		.exit_signal	= 0,
125 	};
126 	pid_t pid;
127 
128 	pid = sys_clone3(&args, sizeof(args));
129 	ASSERT_EQ(pid, -1);
130 	ASSERT_EQ(errno, EINVAL);
131 }
132 
133 /*
134  * Test that CLONE_AUTOREAP with CLONE_THREAD fails.
135  */
TEST(autoreap_rejects_thread)136 TEST(autoreap_rejects_thread)
137 {
138 	struct __clone_args args = {
139 		.flags		= CLONE_AUTOREAP | CLONE_THREAD |
140 				  CLONE_SIGHAND | CLONE_VM,
141 		.exit_signal	= 0,
142 	};
143 	pid_t pid;
144 
145 	pid = sys_clone3(&args, sizeof(args));
146 	ASSERT_EQ(pid, -1);
147 	ASSERT_EQ(errno, EINVAL);
148 }
149 
150 /*
151  * Basic test: create an autoreap child, let it exit, verify:
152  * - pidfd becomes readable (poll returns POLLIN)
153  * - PIDFD_GET_INFO returns the correct exit code
154  * - waitpid() returns -1/ECHILD (no zombie)
155  */
TEST(autoreap_basic)156 TEST(autoreap_basic)
157 {
158 	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
159 	int pidfd = -1, ret;
160 	struct pollfd pfd;
161 	pid_t pid;
162 
163 	pid = create_autoreap_child(&pidfd);
164 	if (pid < 0 && errno == EINVAL)
165 		SKIP(return, "CLONE_AUTOREAP not supported");
166 	ASSERT_GE(pid, 0);
167 
168 	if (pid == 0)
169 		_exit(42);
170 
171 	ASSERT_GE(pidfd, 0);
172 
173 	/* Wait for the child to exit via pidfd poll. */
174 	pfd.fd = pidfd;
175 	pfd.events = POLLIN;
176 	ret = poll(&pfd, 1, 5000);
177 	ASSERT_EQ(ret, 1);
178 	ASSERT_TRUE(pfd.revents & POLLIN);
179 
180 	/* Verify exit info via PIDFD_GET_INFO. */
181 	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
182 	ASSERT_EQ(ret, 0);
183 	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
184 	/*
185 	 * exit_code is in waitpid format: for _exit(42),
186 	 * WIFEXITED is true and WEXITSTATUS is 42.
187 	 */
188 	ASSERT_TRUE(WIFEXITED(info.exit_code));
189 	ASSERT_EQ(WEXITSTATUS(info.exit_code), 42);
190 
191 	/* Verify no zombie: waitpid should fail with ECHILD. */
192 	ret = waitpid(pid, NULL, WNOHANG);
193 	ASSERT_EQ(ret, -1);
194 	ASSERT_EQ(errno, ECHILD);
195 
196 	close(pidfd);
197 }
198 
199 /*
200  * Test that an autoreap child killed by a signal reports
201  * the correct exit info.
202  */
TEST(autoreap_signaled)203 TEST(autoreap_signaled)
204 {
205 	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
206 	int pidfd = -1, ret;
207 	struct pollfd pfd;
208 	pid_t pid;
209 
210 	pid = create_autoreap_child(&pidfd);
211 	if (pid < 0 && errno == EINVAL)
212 		SKIP(return, "CLONE_AUTOREAP not supported");
213 	ASSERT_GE(pid, 0);
214 
215 	if (pid == 0) {
216 		pause();
217 		_exit(1);
218 	}
219 
220 	ASSERT_GE(pidfd, 0);
221 
222 	/* Kill the child. */
223 	ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
224 	ASSERT_EQ(ret, 0);
225 
226 	/* Wait for exit via pidfd. */
227 	pfd.fd = pidfd;
228 	pfd.events = POLLIN;
229 	ret = poll(&pfd, 1, 5000);
230 	ASSERT_EQ(ret, 1);
231 	ASSERT_TRUE(pfd.revents & POLLIN);
232 
233 	/* Verify signal info. */
234 	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
235 	ASSERT_EQ(ret, 0);
236 	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
237 	ASSERT_TRUE(WIFSIGNALED(info.exit_code));
238 	ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL);
239 
240 	/* No zombie. */
241 	ret = waitpid(pid, NULL, WNOHANG);
242 	ASSERT_EQ(ret, -1);
243 	ASSERT_EQ(errno, ECHILD);
244 
245 	close(pidfd);
246 }
247 
248 /*
249  * Test autoreap survives reparenting: middle process creates an
250  * autoreap grandchild, then exits. The grandchild gets reparented
251  * to us (the grandparent, which is a subreaper). When the grandchild
252  * exits, it should still be autoreaped - no zombie under us.
253  */
TEST(autoreap_reparent)254 TEST(autoreap_reparent)
255 {
256 	int ipc_sockets[2], ret;
257 	int pidfd = -1;
258 	struct pollfd pfd;
259 	pid_t mid_pid, grandchild_pid;
260 	char buf[32] = {};
261 
262 	/* Make ourselves a subreaper so reparented children come to us. */
263 	ret = prctl(PR_SET_CHILD_SUBREAPER, 1);
264 	ASSERT_EQ(ret, 0);
265 
266 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
267 	ASSERT_EQ(ret, 0);
268 
269 	mid_pid = fork();
270 	ASSERT_GE(mid_pid, 0);
271 
272 	if (mid_pid == 0) {
273 		/* Middle child: create an autoreap grandchild. */
274 		int gc_pidfd = -1;
275 
276 		close(ipc_sockets[0]);
277 
278 		grandchild_pid = create_autoreap_child(&gc_pidfd);
279 		if (grandchild_pid < 0) {
280 			write_nointr(ipc_sockets[1], "E", 1);
281 			close(ipc_sockets[1]);
282 			_exit(1);
283 		}
284 
285 		if (grandchild_pid == 0) {
286 			/* Grandchild: wait for signal to exit. */
287 			close(ipc_sockets[1]);
288 			if (gc_pidfd >= 0)
289 				close(gc_pidfd);
290 			pause();
291 			_exit(0);
292 		}
293 
294 		/* Send grandchild PID to grandparent. */
295 		snprintf(buf, sizeof(buf), "%d", grandchild_pid);
296 		write_nointr(ipc_sockets[1], buf, strlen(buf));
297 		close(ipc_sockets[1]);
298 		if (gc_pidfd >= 0)
299 			close(gc_pidfd);
300 
301 		/* Middle child exits, grandchild gets reparented. */
302 		_exit(0);
303 	}
304 
305 	close(ipc_sockets[1]);
306 
307 	/* Read grandchild's PID. */
308 	ret = read_nointr(ipc_sockets[0], buf, sizeof(buf) - 1);
309 	close(ipc_sockets[0]);
310 	ASSERT_GT(ret, 0);
311 
312 	if (buf[0] == 'E') {
313 		waitpid(mid_pid, NULL, 0);
314 		prctl(PR_SET_CHILD_SUBREAPER, 0);
315 		SKIP(return, "CLONE_AUTOREAP not supported");
316 	}
317 
318 	grandchild_pid = atoi(buf);
319 	ASSERT_GT(grandchild_pid, 0);
320 
321 	/* Wait for the middle child to exit. */
322 	ret = waitpid(mid_pid, NULL, 0);
323 	ASSERT_EQ(ret, mid_pid);
324 
325 	/*
326 	 * Now the grandchild is reparented to us (subreaper).
327 	 * Open a pidfd for the grandchild and kill it.
328 	 */
329 	pidfd = sys_pidfd_open(grandchild_pid, 0);
330 	ASSERT_GE(pidfd, 0);
331 
332 	ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
333 	ASSERT_EQ(ret, 0);
334 
335 	/* Wait for it to exit via pidfd poll. */
336 	pfd.fd = pidfd;
337 	pfd.events = POLLIN;
338 	ret = poll(&pfd, 1, 5000);
339 	ASSERT_EQ(ret, 1);
340 	ASSERT_TRUE(pfd.revents & POLLIN);
341 
342 	/*
343 	 * The grandchild should have been autoreaped even though
344 	 * we (the new parent) haven't set SA_NOCLDWAIT.
345 	 * waitpid should return -1/ECHILD.
346 	 */
347 	ret = waitpid(grandchild_pid, NULL, WNOHANG);
348 	EXPECT_EQ(ret, -1);
349 	EXPECT_EQ(errno, ECHILD);
350 
351 	close(pidfd);
352 
353 	/* Clean up subreaper status. */
354 	prctl(PR_SET_CHILD_SUBREAPER, 0);
355 }
356 
/* Socket that thread_func() writes its readiness byte to. */
static int thread_sock_fd;

/*
 * Thread body used by autoreap_multithreaded: announce on thread_sock_fd
 * that the thread is running, then sleep for 200ms. @arg is unused.
 *
 * Always returns NULL.
 */
static void *thread_func(void *arg)
{
	(void)arg;

	/* Let the test parent know this thread has started. */
	write_nointr(thread_sock_fd, "1", 1);

	/* Stay around long enough for the main thread to call _exit() first. */
	usleep(200000);

	return NULL;
}
369 
370 /*
371  * Test that an autoreap child with multiple threads is properly
372  * autoreaped only after all threads have exited.
373  */
TEST(autoreap_multithreaded)374 TEST(autoreap_multithreaded)
375 {
376 	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
377 	int ipc_sockets[2], ret;
378 	int pidfd = -1;
379 	struct pollfd pfd;
380 	pid_t pid;
381 	char c;
382 
383 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
384 	ASSERT_EQ(ret, 0);
385 
386 	pid = create_autoreap_child(&pidfd);
387 	if (pid < 0 && errno == EINVAL) {
388 		close(ipc_sockets[0]);
389 		close(ipc_sockets[1]);
390 		SKIP(return, "CLONE_AUTOREAP not supported");
391 	}
392 	ASSERT_GE(pid, 0);
393 
394 	if (pid == 0) {
395 		pthread_t thread;
396 
397 		close(ipc_sockets[0]);
398 
399 		/*
400 		 * Create a sub-thread that outlives the main thread.
401 		 * The thread signals readiness, then sleeps.
402 		 * The main thread waits briefly, then calls _exit().
403 		 */
404 		thread_sock_fd = ipc_sockets[1];
405 		pthread_create(&thread, NULL, thread_func, NULL);
406 		pthread_detach(thread);
407 
408 		/* Wait for thread to be running. */
409 		usleep(100000);
410 
411 		/* Main thread exits; sub-thread is still alive. */
412 		_exit(99);
413 	}
414 
415 	close(ipc_sockets[1]);
416 
417 	/* Wait for the sub-thread to signal readiness. */
418 	ret = read_nointr(ipc_sockets[0], &c, 1);
419 	close(ipc_sockets[0]);
420 	ASSERT_EQ(ret, 1);
421 
422 	/* Wait for the process to fully exit via pidfd poll. */
423 	pfd.fd = pidfd;
424 	pfd.events = POLLIN;
425 	ret = poll(&pfd, 1, 5000);
426 	ASSERT_EQ(ret, 1);
427 	ASSERT_TRUE(pfd.revents & POLLIN);
428 
429 	/* Verify exit info. */
430 	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
431 	ASSERT_EQ(ret, 0);
432 	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
433 	ASSERT_TRUE(WIFEXITED(info.exit_code));
434 	ASSERT_EQ(WEXITSTATUS(info.exit_code), 99);
435 
436 	/* No zombie. */
437 	ret = waitpid(pid, NULL, WNOHANG);
438 	ASSERT_EQ(ret, -1);
439 	ASSERT_EQ(errno, ECHILD);
440 
441 	close(pidfd);
442 }
443 
444 /*
445  * Test that autoreap is NOT inherited by grandchildren.
446  */
TEST(autoreap_no_inherit)447 TEST(autoreap_no_inherit)
448 {
449 	int ipc_sockets[2], ret;
450 	int pidfd = -1;
451 	pid_t pid;
452 	char buf[2] = {};
453 	struct pollfd pfd;
454 
455 	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
456 	ASSERT_EQ(ret, 0);
457 
458 	pid = create_autoreap_child(&pidfd);
459 	if (pid < 0 && errno == EINVAL) {
460 		close(ipc_sockets[0]);
461 		close(ipc_sockets[1]);
462 		SKIP(return, "CLONE_AUTOREAP not supported");
463 	}
464 	ASSERT_GE(pid, 0);
465 
466 	if (pid == 0) {
467 		pid_t gc;
468 		int status;
469 
470 		close(ipc_sockets[0]);
471 
472 		/* Autoreap child forks a grandchild (without autoreap). */
473 		gc = fork();
474 		if (gc < 0) {
475 			write_nointr(ipc_sockets[1], "E", 1);
476 			_exit(1);
477 		}
478 		if (gc == 0) {
479 			/* Grandchild: exit immediately. */
480 			close(ipc_sockets[1]);
481 			_exit(77);
482 		}
483 
484 		/*
485 		 * The grandchild should become a regular zombie
486 		 * since it was NOT created with CLONE_AUTOREAP.
487 		 * Wait for it to verify.
488 		 */
489 		ret = waitpid(gc, &status, 0);
490 		if (ret == gc && WIFEXITED(status) &&
491 		    WEXITSTATUS(status) == 77) {
492 			write_nointr(ipc_sockets[1], "P", 1);
493 		} else {
494 			write_nointr(ipc_sockets[1], "F", 1);
495 		}
496 		close(ipc_sockets[1]);
497 		_exit(0);
498 	}
499 
500 	close(ipc_sockets[1]);
501 
502 	ret = read_nointr(ipc_sockets[0], buf, 1);
503 	close(ipc_sockets[0]);
504 	ASSERT_EQ(ret, 1);
505 
506 	/*
507 	 * 'P' means the autoreap child was able to waitpid() its
508 	 * grandchild (correct - grandchild should be a normal zombie,
509 	 * not autoreaped).
510 	 */
511 	ASSERT_EQ(buf[0], 'P');
512 
513 	/* Wait for the autoreap child to exit. */
514 	pfd.fd = pidfd;
515 	pfd.events = POLLIN;
516 	ret = poll(&pfd, 1, 5000);
517 	ASSERT_EQ(ret, 1);
518 
519 	/* Autoreap child itself should be autoreaped. */
520 	ret = waitpid(pid, NULL, WNOHANG);
521 	ASSERT_EQ(ret, -1);
522 	ASSERT_EQ(errno, ECHILD);
523 
524 	close(pidfd);
525 }
526 
527 /*
528  * Test that CLONE_NNP sets no_new_privs on the child.
529  * The child checks via prctl(PR_GET_NO_NEW_PRIVS) and reports back.
530  * The parent must NOT have no_new_privs set afterwards.
531  */
TEST(nnp_sets_no_new_privs)532 TEST(nnp_sets_no_new_privs)
533 {
534 	struct __clone_args args = {
535 		.flags		= CLONE_PIDFD | CLONE_AUTOREAP | CLONE_NNP,
536 		.exit_signal	= 0,
537 	};
538 	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
539 	int pidfd = -1, ret;
540 	struct pollfd pfd;
541 	pid_t pid;
542 
543 	/* Ensure parent does not already have no_new_privs. */
544 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
545 	ASSERT_EQ(ret, 0) {
546 		TH_LOG("Parent already has no_new_privs set, cannot run test");
547 	}
548 
549 	args.pidfd = ptr_to_u64(&pidfd);
550 
551 	pid = sys_clone3(&args, sizeof(args));
552 	if (pid < 0 && errno == EINVAL)
553 		SKIP(return, "CLONE_NNP not supported");
554 	ASSERT_GE(pid, 0);
555 
556 	if (pid == 0) {
557 		/*
558 		 * Child: check no_new_privs. Exit 0 if set, 1 if not.
559 		 */
560 		ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
561 		_exit(ret == 1 ? 0 : 1);
562 	}
563 
564 	ASSERT_GE(pidfd, 0);
565 
566 	/* Parent must still NOT have no_new_privs. */
567 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
568 	ASSERT_EQ(ret, 0) {
569 		TH_LOG("Parent got no_new_privs after creating CLONE_NNP child");
570 	}
571 
572 	/* Wait for child to exit. */
573 	pfd.fd = pidfd;
574 	pfd.events = POLLIN;
575 	ret = poll(&pfd, 1, 5000);
576 	ASSERT_EQ(ret, 1);
577 
578 	/* Verify child exited with 0 (no_new_privs was set). */
579 	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
580 	ASSERT_EQ(ret, 0);
581 	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
582 	ASSERT_TRUE(WIFEXITED(info.exit_code));
583 	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) {
584 		TH_LOG("Child did not have no_new_privs set");
585 	}
586 
587 	close(pidfd);
588 }
589 
590 /*
591  * Test that CLONE_NNP with CLONE_THREAD fails with EINVAL.
592  */
TEST(nnp_rejects_thread)593 TEST(nnp_rejects_thread)
594 {
595 	struct __clone_args args = {
596 		.flags		= CLONE_NNP | CLONE_THREAD |
597 				  CLONE_SIGHAND | CLONE_VM,
598 		.exit_signal	= 0,
599 	};
600 	pid_t pid;
601 
602 	pid = sys_clone3(&args, sizeof(args));
603 	ASSERT_EQ(pid, -1);
604 	ASSERT_EQ(errno, EINVAL);
605 }
606 
607 /*
608  * Test that a plain CLONE_AUTOREAP child does NOT get no_new_privs.
609  * Only CLONE_NNP should set it.
610  */
TEST(autoreap_no_new_privs_unset)611 TEST(autoreap_no_new_privs_unset)
612 {
613 	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
614 	int pidfd = -1, ret;
615 	struct pollfd pfd;
616 	pid_t pid;
617 
618 	pid = create_autoreap_child(&pidfd);
619 	if (pid < 0 && errno == EINVAL)
620 		SKIP(return, "CLONE_AUTOREAP not supported");
621 	ASSERT_GE(pid, 0);
622 
623 	if (pid == 0) {
624 		/*
625 		 * Child: check no_new_privs. Exit 0 if NOT set, 1 if set.
626 		 */
627 		ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
628 		_exit(ret == 0 ? 0 : 1);
629 	}
630 
631 	ASSERT_GE(pidfd, 0);
632 
633 	pfd.fd = pidfd;
634 	pfd.events = POLLIN;
635 	ret = poll(&pfd, 1, 5000);
636 	ASSERT_EQ(ret, 1);
637 
638 	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
639 	ASSERT_EQ(ret, 0);
640 	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
641 	ASSERT_TRUE(WIFEXITED(info.exit_code));
642 	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) {
643 		TH_LOG("Plain autoreap child unexpectedly has no_new_privs");
644 	}
645 
646 	close(pidfd);
647 }
648 
649 /*
650  * Helper: create a child with CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP | CLONE_NNP.
651  */
create_autokill_child(int * pidfd)652 static pid_t create_autokill_child(int *pidfd)
653 {
654 	struct __clone_args args = {
655 		.flags		= CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
656 				  CLONE_AUTOREAP | CLONE_NNP,
657 		.exit_signal	= 0,
658 		.pidfd		= ptr_to_u64(pidfd),
659 	};
660 
661 	return sys_clone3(&args, sizeof(args));
662 }
663 
664 /*
665  * Basic autokill test: child blocks in pause(), parent closes the
666  * clone3 pidfd, child should be killed and autoreaped.
667  */
TEST(autokill_basic)668 TEST(autokill_basic)
669 {
670 	int pidfd = -1, pollfd_fd = -1, ret;
671 	struct pollfd pfd;
672 	pid_t pid;
673 
674 	pid = create_autokill_child(&pidfd);
675 	if (pid < 0 && errno == EINVAL)
676 		SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
677 	ASSERT_GE(pid, 0);
678 
679 	if (pid == 0) {
680 		pause();
681 		_exit(1);
682 	}
683 
684 	ASSERT_GE(pidfd, 0);
685 
686 	/*
687 	 * Open a second pidfd via pidfd_open() so we can observe the
688 	 * child's death after closing the clone3 pidfd.
689 	 */
690 	pollfd_fd = sys_pidfd_open(pid, 0);
691 	ASSERT_GE(pollfd_fd, 0);
692 
693 	/* Close the clone3 pidfd — this should trigger autokill. */
694 	close(pidfd);
695 
696 	/* Wait for the child to die via the pidfd_open'd fd. */
697 	pfd.fd = pollfd_fd;
698 	pfd.events = POLLIN;
699 	ret = poll(&pfd, 1, 5000);
700 	ASSERT_EQ(ret, 1);
701 	ASSERT_TRUE(pfd.revents & POLLIN);
702 
703 	/* Child should be autoreaped — no zombie. */
704 	usleep(100000);
705 	ret = waitpid(pid, NULL, WNOHANG);
706 	ASSERT_EQ(ret, -1);
707 	ASSERT_EQ(errno, ECHILD);
708 
709 	close(pollfd_fd);
710 }
711 
712 /*
713  * CLONE_PIDFD_AUTOKILL without CLONE_PIDFD must fail with EINVAL.
714  */
TEST(autokill_requires_pidfd)715 TEST(autokill_requires_pidfd)
716 {
717 	struct __clone_args args = {
718 		.flags		= CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP,
719 		.exit_signal	= 0,
720 	};
721 	pid_t pid;
722 
723 	pid = sys_clone3(&args, sizeof(args));
724 	ASSERT_EQ(pid, -1);
725 	ASSERT_EQ(errno, EINVAL);
726 }
727 
728 /*
729  * CLONE_PIDFD_AUTOKILL without CLONE_AUTOREAP must fail with EINVAL.
730  */
TEST(autokill_requires_autoreap)731 TEST(autokill_requires_autoreap)
732 {
733 	int pidfd = -1;
734 	struct __clone_args args = {
735 		.flags		= CLONE_PIDFD | CLONE_PIDFD_AUTOKILL,
736 		.exit_signal	= 0,
737 		.pidfd		= ptr_to_u64(&pidfd),
738 	};
739 	pid_t pid;
740 
741 	pid = sys_clone3(&args, sizeof(args));
742 	ASSERT_EQ(pid, -1);
743 	ASSERT_EQ(errno, EINVAL);
744 }
745 
746 /*
747  * CLONE_PIDFD_AUTOKILL with CLONE_THREAD must fail with EINVAL.
748  */
TEST(autokill_rejects_thread)749 TEST(autokill_rejects_thread)
750 {
751 	int pidfd = -1;
752 	struct __clone_args args = {
753 		.flags		= CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
754 				  CLONE_AUTOREAP | CLONE_THREAD |
755 				  CLONE_SIGHAND | CLONE_VM,
756 		.exit_signal	= 0,
757 		.pidfd		= ptr_to_u64(&pidfd),
758 	};
759 	pid_t pid;
760 
761 	pid = sys_clone3(&args, sizeof(args));
762 	ASSERT_EQ(pid, -1);
763 	ASSERT_EQ(errno, EINVAL);
764 }
765 
766 /*
767  * Test that only the clone3 pidfd triggers autokill, not pidfd_open().
768  * Close the pidfd_open'd fd first — child should survive.
769  * Then close the clone3 pidfd — child should be killed and autoreaped.
770  */
TEST(autokill_pidfd_open_no_effect)771 TEST(autokill_pidfd_open_no_effect)
772 {
773 	int pidfd = -1, open_fd = -1, ret;
774 	struct pollfd pfd;
775 	pid_t pid;
776 
777 	pid = create_autokill_child(&pidfd);
778 	if (pid < 0 && errno == EINVAL)
779 		SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
780 	ASSERT_GE(pid, 0);
781 
782 	if (pid == 0) {
783 		pause();
784 		_exit(1);
785 	}
786 
787 	ASSERT_GE(pidfd, 0);
788 
789 	/* Open a second pidfd via pidfd_open(). */
790 	open_fd = sys_pidfd_open(pid, 0);
791 	ASSERT_GE(open_fd, 0);
792 
793 	/*
794 	 * Close the pidfd_open'd fd — child should survive because
795 	 * only the clone3 pidfd has autokill.
796 	 */
797 	close(open_fd);
798 	usleep(200000);
799 
800 	/* Verify child is still alive by polling the clone3 pidfd. */
801 	pfd.fd = pidfd;
802 	pfd.events = POLLIN;
803 	ret = poll(&pfd, 1, 0);
804 	ASSERT_EQ(ret, 0) {
805 		TH_LOG("Child died after closing pidfd_open fd — should still be alive");
806 	}
807 
808 	/* Open another observation fd before triggering autokill. */
809 	open_fd = sys_pidfd_open(pid, 0);
810 	ASSERT_GE(open_fd, 0);
811 
812 	/* Now close the clone3 pidfd — this triggers autokill. */
813 	close(pidfd);
814 
815 	pfd.fd = open_fd;
816 	pfd.events = POLLIN;
817 	ret = poll(&pfd, 1, 5000);
818 	ASSERT_EQ(ret, 1);
819 	ASSERT_TRUE(pfd.revents & POLLIN);
820 
821 	/* Child should be autoreaped — no zombie. */
822 	usleep(100000);
823 	ret = waitpid(pid, NULL, WNOHANG);
824 	ASSERT_EQ(ret, -1);
825 	ASSERT_EQ(errno, ECHILD);
826 
827 	close(open_fd);
828 }
829 
830 /*
831  * Test that CLONE_PIDFD_AUTOKILL without CLONE_NNP fails with EPERM
832  * for an unprivileged caller.
833  */
TEST(autokill_requires_cap_sys_admin)834 TEST(autokill_requires_cap_sys_admin)
835 {
836 	int pidfd = -1, ret;
837 	struct __clone_args args = {
838 		.flags		= CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
839 				  CLONE_AUTOREAP,
840 		.exit_signal	= 0,
841 		.pidfd		= ptr_to_u64(&pidfd),
842 	};
843 	pid_t pid;
844 
845 	/* Drop all capabilities so we lack CAP_SYS_ADMIN. */
846 	ret = drop_all_caps();
847 	ASSERT_EQ(ret, 0);
848 
849 	pid = sys_clone3(&args, sizeof(args));
850 	ASSERT_EQ(pid, -1);
851 	ASSERT_EQ(errno, EPERM);
852 }
853 
854 /*
855  * Test that CLONE_PIDFD_AUTOKILL without CLONE_NNP succeeds with
856  * CAP_SYS_ADMIN.
857  */
TEST(autokill_without_nnp_with_cap)858 TEST(autokill_without_nnp_with_cap)
859 {
860 	struct __clone_args args = {
861 		.flags		= CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
862 				  CLONE_AUTOREAP,
863 		.exit_signal	= 0,
864 	};
865 	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
866 	int pidfd = -1, ret;
867 	struct pollfd pfd;
868 	pid_t pid;
869 
870 	if (geteuid() != 0)
871 		SKIP(return, "Need root/CAP_SYS_ADMIN");
872 
873 	args.pidfd = ptr_to_u64(&pidfd);
874 
875 	pid = sys_clone3(&args, sizeof(args));
876 	if (pid < 0 && errno == EINVAL)
877 		SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
878 	ASSERT_GE(pid, 0);
879 
880 	if (pid == 0)
881 		_exit(0);
882 
883 	ASSERT_GE(pidfd, 0);
884 
885 	/* Wait for child to exit. */
886 	pfd.fd = pidfd;
887 	pfd.events = POLLIN;
888 	ret = poll(&pfd, 1, 5000);
889 	ASSERT_EQ(ret, 1);
890 
891 	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
892 	ASSERT_EQ(ret, 0);
893 	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
894 	ASSERT_TRUE(WIFEXITED(info.exit_code));
895 	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
896 
897 	close(pidfd);
898 }
899 
900 TEST_HARNESS_MAIN
901