1 // SPDX-License-Identifier: GPL-2.0
2 // Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
3
4 #define _GNU_SOURCE
5 #include <errno.h>
6 #include <linux/types.h>
7 #include <poll.h>
8 #include <pthread.h>
9 #include <sched.h>
10 #include <signal.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <syscall.h>
15 #include <sys/ioctl.h>
16 #include <sys/prctl.h>
17 #include <sys/socket.h>
18 #include <sys/types.h>
19 #include <sys/wait.h>
20 #include <unistd.h>
21
22 #include "pidfd.h"
23 #include "kselftest_harness.h"
24
/*
 * Fallback definitions for the clone3() flag bits exercised by the tests
 * in this file, used only when the included headers do not already
 * provide them.
 */
#ifndef CLONE_AUTOREAP
#define CLONE_AUTOREAP (1ULL << 34)
#endif

#ifndef CLONE_NNP
#define CLONE_NNP (1ULL << 35)
#endif

#ifndef CLONE_PIDFD_AUTOKILL
#define CLONE_PIDFD_AUTOKILL (1ULL << 36)
#endif

/* Fallback for the version value placed in the capset() header below. */
#ifndef _LINUX_CAPABILITY_VERSION_3
#define _LINUX_CAPABILITY_VERSION_3 0x20080522
#endif
40
/*
 * Header argument handed to the raw capset() syscall by drop_all_caps().
 */
struct cap_header {
	__u32 version;	/* set to _LINUX_CAPABILITY_VERSION_3 by drop_all_caps() */
	int pid;	/* left at zero by drop_all_caps() */
};
45
/*
 * One element of the capability-set array handed to the raw capset()
 * syscall; drop_all_caps() passes two of these, all zeroed.
 */
struct cap_data {
	__u32 effective, permitted, inheritable;
};
51
drop_all_caps(void)52 static int drop_all_caps(void)
53 {
54 struct cap_header hdr = { .version = _LINUX_CAPABILITY_VERSION_3 };
55 struct cap_data data[2] = {};
56
57 return syscall(__NR_capset, &hdr, data);
58 }
59
create_autoreap_child(int * pidfd)60 static pid_t create_autoreap_child(int *pidfd)
61 {
62 struct __clone_args args = {
63 .flags = CLONE_PIDFD | CLONE_AUTOREAP,
64 .exit_signal = 0,
65 .pidfd = ptr_to_u64(pidfd),
66 };
67
68 return sys_clone3(&args, sizeof(args));
69 }
70
71 /*
72 * Test that CLONE_AUTOREAP works without CLONE_PIDFD (fire-and-forget).
73 */
TEST(autoreap_without_pidfd)74 TEST(autoreap_without_pidfd)
75 {
76 struct __clone_args args = {
77 .flags = CLONE_AUTOREAP,
78 .exit_signal = 0,
79 };
80 pid_t pid;
81 int ret;
82
83 pid = sys_clone3(&args, sizeof(args));
84 if (pid < 0 && errno == EINVAL)
85 SKIP(return, "CLONE_AUTOREAP not supported");
86 ASSERT_GE(pid, 0);
87
88 if (pid == 0)
89 _exit(0);
90
91 /*
92 * Give the child a moment to exit and be autoreaped.
93 * Then verify no zombie remains.
94 */
95 usleep(200000);
96 ret = waitpid(pid, NULL, WNOHANG);
97 ASSERT_EQ(ret, -1);
98 ASSERT_EQ(errno, ECHILD);
99 }
100
101 /*
102 * Test that CLONE_AUTOREAP with a non-zero exit_signal fails.
103 */
TEST(autoreap_rejects_exit_signal)104 TEST(autoreap_rejects_exit_signal)
105 {
106 struct __clone_args args = {
107 .flags = CLONE_AUTOREAP,
108 .exit_signal = SIGCHLD,
109 };
110 pid_t pid;
111
112 pid = sys_clone3(&args, sizeof(args));
113 ASSERT_EQ(pid, -1);
114 ASSERT_EQ(errno, EINVAL);
115 }
116
117 /*
118 * Test that CLONE_AUTOREAP with CLONE_PARENT fails.
119 */
TEST(autoreap_rejects_parent)120 TEST(autoreap_rejects_parent)
121 {
122 struct __clone_args args = {
123 .flags = CLONE_AUTOREAP | CLONE_PARENT,
124 .exit_signal = 0,
125 };
126 pid_t pid;
127
128 pid = sys_clone3(&args, sizeof(args));
129 ASSERT_EQ(pid, -1);
130 ASSERT_EQ(errno, EINVAL);
131 }
132
133 /*
134 * Test that CLONE_AUTOREAP with CLONE_THREAD fails.
135 */
TEST(autoreap_rejects_thread)136 TEST(autoreap_rejects_thread)
137 {
138 struct __clone_args args = {
139 .flags = CLONE_AUTOREAP | CLONE_THREAD |
140 CLONE_SIGHAND | CLONE_VM,
141 .exit_signal = 0,
142 };
143 pid_t pid;
144
145 pid = sys_clone3(&args, sizeof(args));
146 ASSERT_EQ(pid, -1);
147 ASSERT_EQ(errno, EINVAL);
148 }
149
150 /*
151 * Basic test: create an autoreap child, let it exit, verify:
152 * - pidfd becomes readable (poll returns POLLIN)
153 * - PIDFD_GET_INFO returns the correct exit code
154 * - waitpid() returns -1/ECHILD (no zombie)
155 */
TEST(autoreap_basic)156 TEST(autoreap_basic)
157 {
158 struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
159 int pidfd = -1, ret;
160 struct pollfd pfd;
161 pid_t pid;
162
163 pid = create_autoreap_child(&pidfd);
164 if (pid < 0 && errno == EINVAL)
165 SKIP(return, "CLONE_AUTOREAP not supported");
166 ASSERT_GE(pid, 0);
167
168 if (pid == 0)
169 _exit(42);
170
171 ASSERT_GE(pidfd, 0);
172
173 /* Wait for the child to exit via pidfd poll. */
174 pfd.fd = pidfd;
175 pfd.events = POLLIN;
176 ret = poll(&pfd, 1, 5000);
177 ASSERT_EQ(ret, 1);
178 ASSERT_TRUE(pfd.revents & POLLIN);
179
180 /* Verify exit info via PIDFD_GET_INFO. */
181 ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
182 ASSERT_EQ(ret, 0);
183 ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
184 /*
185 * exit_code is in waitpid format: for _exit(42),
186 * WIFEXITED is true and WEXITSTATUS is 42.
187 */
188 ASSERT_TRUE(WIFEXITED(info.exit_code));
189 ASSERT_EQ(WEXITSTATUS(info.exit_code), 42);
190
191 /* Verify no zombie: waitpid should fail with ECHILD. */
192 ret = waitpid(pid, NULL, WNOHANG);
193 ASSERT_EQ(ret, -1);
194 ASSERT_EQ(errno, ECHILD);
195
196 close(pidfd);
197 }
198
199 /*
200 * Test that an autoreap child killed by a signal reports
201 * the correct exit info.
202 */
TEST(autoreap_signaled)203 TEST(autoreap_signaled)
204 {
205 struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
206 int pidfd = -1, ret;
207 struct pollfd pfd;
208 pid_t pid;
209
210 pid = create_autoreap_child(&pidfd);
211 if (pid < 0 && errno == EINVAL)
212 SKIP(return, "CLONE_AUTOREAP not supported");
213 ASSERT_GE(pid, 0);
214
215 if (pid == 0) {
216 pause();
217 _exit(1);
218 }
219
220 ASSERT_GE(pidfd, 0);
221
222 /* Kill the child. */
223 ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
224 ASSERT_EQ(ret, 0);
225
226 /* Wait for exit via pidfd. */
227 pfd.fd = pidfd;
228 pfd.events = POLLIN;
229 ret = poll(&pfd, 1, 5000);
230 ASSERT_EQ(ret, 1);
231 ASSERT_TRUE(pfd.revents & POLLIN);
232
233 /* Verify signal info. */
234 ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
235 ASSERT_EQ(ret, 0);
236 ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
237 ASSERT_TRUE(WIFSIGNALED(info.exit_code));
238 ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL);
239
240 /* No zombie. */
241 ret = waitpid(pid, NULL, WNOHANG);
242 ASSERT_EQ(ret, -1);
243 ASSERT_EQ(errno, ECHILD);
244
245 close(pidfd);
246 }
247
248 /*
249 * Test autoreap survives reparenting: middle process creates an
250 * autoreap grandchild, then exits. The grandchild gets reparented
251 * to us (the grandparent, which is a subreaper). When the grandchild
252 * exits, it should still be autoreaped - no zombie under us.
253 */
TEST(autoreap_reparent)254 TEST(autoreap_reparent)
255 {
256 int ipc_sockets[2], ret;
257 int pidfd = -1;
258 struct pollfd pfd;
259 pid_t mid_pid, grandchild_pid;
260 char buf[32] = {};
261
262 /* Make ourselves a subreaper so reparented children come to us. */
263 ret = prctl(PR_SET_CHILD_SUBREAPER, 1);
264 ASSERT_EQ(ret, 0);
265
266 ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
267 ASSERT_EQ(ret, 0);
268
269 mid_pid = fork();
270 ASSERT_GE(mid_pid, 0);
271
272 if (mid_pid == 0) {
273 /* Middle child: create an autoreap grandchild. */
274 int gc_pidfd = -1;
275
276 close(ipc_sockets[0]);
277
278 grandchild_pid = create_autoreap_child(&gc_pidfd);
279 if (grandchild_pid < 0) {
280 write_nointr(ipc_sockets[1], "E", 1);
281 close(ipc_sockets[1]);
282 _exit(1);
283 }
284
285 if (grandchild_pid == 0) {
286 /* Grandchild: wait for signal to exit. */
287 close(ipc_sockets[1]);
288 if (gc_pidfd >= 0)
289 close(gc_pidfd);
290 pause();
291 _exit(0);
292 }
293
294 /* Send grandchild PID to grandparent. */
295 snprintf(buf, sizeof(buf), "%d", grandchild_pid);
296 write_nointr(ipc_sockets[1], buf, strlen(buf));
297 close(ipc_sockets[1]);
298 if (gc_pidfd >= 0)
299 close(gc_pidfd);
300
301 /* Middle child exits, grandchild gets reparented. */
302 _exit(0);
303 }
304
305 close(ipc_sockets[1]);
306
307 /* Read grandchild's PID. */
308 ret = read_nointr(ipc_sockets[0], buf, sizeof(buf) - 1);
309 close(ipc_sockets[0]);
310 ASSERT_GT(ret, 0);
311
312 if (buf[0] == 'E') {
313 waitpid(mid_pid, NULL, 0);
314 prctl(PR_SET_CHILD_SUBREAPER, 0);
315 SKIP(return, "CLONE_AUTOREAP not supported");
316 }
317
318 grandchild_pid = atoi(buf);
319 ASSERT_GT(grandchild_pid, 0);
320
321 /* Wait for the middle child to exit. */
322 ret = waitpid(mid_pid, NULL, 0);
323 ASSERT_EQ(ret, mid_pid);
324
325 /*
326 * Now the grandchild is reparented to us (subreaper).
327 * Open a pidfd for the grandchild and kill it.
328 */
329 pidfd = sys_pidfd_open(grandchild_pid, 0);
330 ASSERT_GE(pidfd, 0);
331
332 ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
333 ASSERT_EQ(ret, 0);
334
335 /* Wait for it to exit via pidfd poll. */
336 pfd.fd = pidfd;
337 pfd.events = POLLIN;
338 ret = poll(&pfd, 1, 5000);
339 ASSERT_EQ(ret, 1);
340 ASSERT_TRUE(pfd.revents & POLLIN);
341
342 /*
343 * The grandchild should have been autoreaped even though
344 * we (the new parent) haven't set SA_NOCLDWAIT.
345 * waitpid should return -1/ECHILD.
346 */
347 ret = waitpid(grandchild_pid, NULL, WNOHANG);
348 EXPECT_EQ(ret, -1);
349 EXPECT_EQ(errno, ECHILD);
350
351 close(pidfd);
352
353 /* Clean up subreaper status. */
354 prctl(PR_SET_CHILD_SUBREAPER, 0);
355 }
356
/* Socket the helper thread writes its readiness byte to. */
static int thread_sock_fd;

/*
 * Helper thread body: write a single '1' byte to thread_sock_fd to
 * announce that the thread is running, then sleep for 200ms before
 * returning. The argument is unused.
 */
static void *thread_func(void *arg)
{
	static const char ready = '1';

	(void)arg;

	write_nointr(thread_sock_fd, &ready, 1);

	/* Linger so the main thread gets to call _exit() first. */
	usleep(200000);

	return NULL;
}
369
370 /*
371 * Test that an autoreap child with multiple threads is properly
372 * autoreaped only after all threads have exited.
373 */
TEST(autoreap_multithreaded)374 TEST(autoreap_multithreaded)
375 {
376 struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
377 int ipc_sockets[2], ret;
378 int pidfd = -1;
379 struct pollfd pfd;
380 pid_t pid;
381 char c;
382
383 ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
384 ASSERT_EQ(ret, 0);
385
386 pid = create_autoreap_child(&pidfd);
387 if (pid < 0 && errno == EINVAL) {
388 close(ipc_sockets[0]);
389 close(ipc_sockets[1]);
390 SKIP(return, "CLONE_AUTOREAP not supported");
391 }
392 ASSERT_GE(pid, 0);
393
394 if (pid == 0) {
395 pthread_t thread;
396
397 close(ipc_sockets[0]);
398
399 /*
400 * Create a sub-thread that outlives the main thread.
401 * The thread signals readiness, then sleeps.
402 * The main thread waits briefly, then calls _exit().
403 */
404 thread_sock_fd = ipc_sockets[1];
405 pthread_create(&thread, NULL, thread_func, NULL);
406 pthread_detach(thread);
407
408 /* Wait for thread to be running. */
409 usleep(100000);
410
411 /* Main thread exits; sub-thread is still alive. */
412 _exit(99);
413 }
414
415 close(ipc_sockets[1]);
416
417 /* Wait for the sub-thread to signal readiness. */
418 ret = read_nointr(ipc_sockets[0], &c, 1);
419 close(ipc_sockets[0]);
420 ASSERT_EQ(ret, 1);
421
422 /* Wait for the process to fully exit via pidfd poll. */
423 pfd.fd = pidfd;
424 pfd.events = POLLIN;
425 ret = poll(&pfd, 1, 5000);
426 ASSERT_EQ(ret, 1);
427 ASSERT_TRUE(pfd.revents & POLLIN);
428
429 /* Verify exit info. */
430 ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
431 ASSERT_EQ(ret, 0);
432 ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
433 ASSERT_TRUE(WIFEXITED(info.exit_code));
434 ASSERT_EQ(WEXITSTATUS(info.exit_code), 99);
435
436 /* No zombie. */
437 ret = waitpid(pid, NULL, WNOHANG);
438 ASSERT_EQ(ret, -1);
439 ASSERT_EQ(errno, ECHILD);
440
441 close(pidfd);
442 }
443
444 /*
445 * Test that autoreap is NOT inherited by grandchildren.
446 */
TEST(autoreap_no_inherit)447 TEST(autoreap_no_inherit)
448 {
449 int ipc_sockets[2], ret;
450 int pidfd = -1;
451 pid_t pid;
452 char buf[2] = {};
453 struct pollfd pfd;
454
455 ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
456 ASSERT_EQ(ret, 0);
457
458 pid = create_autoreap_child(&pidfd);
459 if (pid < 0 && errno == EINVAL) {
460 close(ipc_sockets[0]);
461 close(ipc_sockets[1]);
462 SKIP(return, "CLONE_AUTOREAP not supported");
463 }
464 ASSERT_GE(pid, 0);
465
466 if (pid == 0) {
467 pid_t gc;
468 int status;
469
470 close(ipc_sockets[0]);
471
472 /* Autoreap child forks a grandchild (without autoreap). */
473 gc = fork();
474 if (gc < 0) {
475 write_nointr(ipc_sockets[1], "E", 1);
476 _exit(1);
477 }
478 if (gc == 0) {
479 /* Grandchild: exit immediately. */
480 close(ipc_sockets[1]);
481 _exit(77);
482 }
483
484 /*
485 * The grandchild should become a regular zombie
486 * since it was NOT created with CLONE_AUTOREAP.
487 * Wait for it to verify.
488 */
489 ret = waitpid(gc, &status, 0);
490 if (ret == gc && WIFEXITED(status) &&
491 WEXITSTATUS(status) == 77) {
492 write_nointr(ipc_sockets[1], "P", 1);
493 } else {
494 write_nointr(ipc_sockets[1], "F", 1);
495 }
496 close(ipc_sockets[1]);
497 _exit(0);
498 }
499
500 close(ipc_sockets[1]);
501
502 ret = read_nointr(ipc_sockets[0], buf, 1);
503 close(ipc_sockets[0]);
504 ASSERT_EQ(ret, 1);
505
506 /*
507 * 'P' means the autoreap child was able to waitpid() its
508 * grandchild (correct - grandchild should be a normal zombie,
509 * not autoreaped).
510 */
511 ASSERT_EQ(buf[0], 'P');
512
513 /* Wait for the autoreap child to exit. */
514 pfd.fd = pidfd;
515 pfd.events = POLLIN;
516 ret = poll(&pfd, 1, 5000);
517 ASSERT_EQ(ret, 1);
518
519 /* Autoreap child itself should be autoreaped. */
520 ret = waitpid(pid, NULL, WNOHANG);
521 ASSERT_EQ(ret, -1);
522 ASSERT_EQ(errno, ECHILD);
523
524 close(pidfd);
525 }
526
527 /*
528 * Test that CLONE_NNP sets no_new_privs on the child.
529 * The child checks via prctl(PR_GET_NO_NEW_PRIVS) and reports back.
530 * The parent must NOT have no_new_privs set afterwards.
531 */
TEST(nnp_sets_no_new_privs)532 TEST(nnp_sets_no_new_privs)
533 {
534 struct __clone_args args = {
535 .flags = CLONE_PIDFD | CLONE_AUTOREAP | CLONE_NNP,
536 .exit_signal = 0,
537 };
538 struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
539 int pidfd = -1, ret;
540 struct pollfd pfd;
541 pid_t pid;
542
543 /* Ensure parent does not already have no_new_privs. */
544 ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
545 ASSERT_EQ(ret, 0) {
546 TH_LOG("Parent already has no_new_privs set, cannot run test");
547 }
548
549 args.pidfd = ptr_to_u64(&pidfd);
550
551 pid = sys_clone3(&args, sizeof(args));
552 if (pid < 0 && errno == EINVAL)
553 SKIP(return, "CLONE_NNP not supported");
554 ASSERT_GE(pid, 0);
555
556 if (pid == 0) {
557 /*
558 * Child: check no_new_privs. Exit 0 if set, 1 if not.
559 */
560 ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
561 _exit(ret == 1 ? 0 : 1);
562 }
563
564 ASSERT_GE(pidfd, 0);
565
566 /* Parent must still NOT have no_new_privs. */
567 ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
568 ASSERT_EQ(ret, 0) {
569 TH_LOG("Parent got no_new_privs after creating CLONE_NNP child");
570 }
571
572 /* Wait for child to exit. */
573 pfd.fd = pidfd;
574 pfd.events = POLLIN;
575 ret = poll(&pfd, 1, 5000);
576 ASSERT_EQ(ret, 1);
577
578 /* Verify child exited with 0 (no_new_privs was set). */
579 ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
580 ASSERT_EQ(ret, 0);
581 ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
582 ASSERT_TRUE(WIFEXITED(info.exit_code));
583 ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) {
584 TH_LOG("Child did not have no_new_privs set");
585 }
586
587 close(pidfd);
588 }
589
590 /*
591 * Test that CLONE_NNP with CLONE_THREAD fails with EINVAL.
592 */
TEST(nnp_rejects_thread)593 TEST(nnp_rejects_thread)
594 {
595 struct __clone_args args = {
596 .flags = CLONE_NNP | CLONE_THREAD |
597 CLONE_SIGHAND | CLONE_VM,
598 .exit_signal = 0,
599 };
600 pid_t pid;
601
602 pid = sys_clone3(&args, sizeof(args));
603 ASSERT_EQ(pid, -1);
604 ASSERT_EQ(errno, EINVAL);
605 }
606
607 /*
608 * Test that a plain CLONE_AUTOREAP child does NOT get no_new_privs.
609 * Only CLONE_NNP should set it.
610 */
TEST(autoreap_no_new_privs_unset)611 TEST(autoreap_no_new_privs_unset)
612 {
613 struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
614 int pidfd = -1, ret;
615 struct pollfd pfd;
616 pid_t pid;
617
618 pid = create_autoreap_child(&pidfd);
619 if (pid < 0 && errno == EINVAL)
620 SKIP(return, "CLONE_AUTOREAP not supported");
621 ASSERT_GE(pid, 0);
622
623 if (pid == 0) {
624 /*
625 * Child: check no_new_privs. Exit 0 if NOT set, 1 if set.
626 */
627 ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
628 _exit(ret == 0 ? 0 : 1);
629 }
630
631 ASSERT_GE(pidfd, 0);
632
633 pfd.fd = pidfd;
634 pfd.events = POLLIN;
635 ret = poll(&pfd, 1, 5000);
636 ASSERT_EQ(ret, 1);
637
638 ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
639 ASSERT_EQ(ret, 0);
640 ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
641 ASSERT_TRUE(WIFEXITED(info.exit_code));
642 ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) {
643 TH_LOG("Plain autoreap child unexpectedly has no_new_privs");
644 }
645
646 close(pidfd);
647 }
648
649 /*
650 * Helper: create a child with CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP | CLONE_NNP.
651 */
create_autokill_child(int * pidfd)652 static pid_t create_autokill_child(int *pidfd)
653 {
654 struct __clone_args args = {
655 .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
656 CLONE_AUTOREAP | CLONE_NNP,
657 .exit_signal = 0,
658 .pidfd = ptr_to_u64(pidfd),
659 };
660
661 return sys_clone3(&args, sizeof(args));
662 }
663
664 /*
665 * Basic autokill test: child blocks in pause(), parent closes the
666 * clone3 pidfd, child should be killed and autoreaped.
667 */
TEST(autokill_basic)668 TEST(autokill_basic)
669 {
670 int pidfd = -1, pollfd_fd = -1, ret;
671 struct pollfd pfd;
672 pid_t pid;
673
674 pid = create_autokill_child(&pidfd);
675 if (pid < 0 && errno == EINVAL)
676 SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
677 ASSERT_GE(pid, 0);
678
679 if (pid == 0) {
680 pause();
681 _exit(1);
682 }
683
684 ASSERT_GE(pidfd, 0);
685
686 /*
687 * Open a second pidfd via pidfd_open() so we can observe the
688 * child's death after closing the clone3 pidfd.
689 */
690 pollfd_fd = sys_pidfd_open(pid, 0);
691 ASSERT_GE(pollfd_fd, 0);
692
693 /* Close the clone3 pidfd — this should trigger autokill. */
694 close(pidfd);
695
696 /* Wait for the child to die via the pidfd_open'd fd. */
697 pfd.fd = pollfd_fd;
698 pfd.events = POLLIN;
699 ret = poll(&pfd, 1, 5000);
700 ASSERT_EQ(ret, 1);
701 ASSERT_TRUE(pfd.revents & POLLIN);
702
703 /* Child should be autoreaped — no zombie. */
704 usleep(100000);
705 ret = waitpid(pid, NULL, WNOHANG);
706 ASSERT_EQ(ret, -1);
707 ASSERT_EQ(errno, ECHILD);
708
709 close(pollfd_fd);
710 }
711
712 /*
713 * CLONE_PIDFD_AUTOKILL without CLONE_PIDFD must fail with EINVAL.
714 */
TEST(autokill_requires_pidfd)715 TEST(autokill_requires_pidfd)
716 {
717 struct __clone_args args = {
718 .flags = CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP,
719 .exit_signal = 0,
720 };
721 pid_t pid;
722
723 pid = sys_clone3(&args, sizeof(args));
724 ASSERT_EQ(pid, -1);
725 ASSERT_EQ(errno, EINVAL);
726 }
727
728 /*
729 * CLONE_PIDFD_AUTOKILL without CLONE_AUTOREAP must fail with EINVAL.
730 */
TEST(autokill_requires_autoreap)731 TEST(autokill_requires_autoreap)
732 {
733 int pidfd = -1;
734 struct __clone_args args = {
735 .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL,
736 .exit_signal = 0,
737 .pidfd = ptr_to_u64(&pidfd),
738 };
739 pid_t pid;
740
741 pid = sys_clone3(&args, sizeof(args));
742 ASSERT_EQ(pid, -1);
743 ASSERT_EQ(errno, EINVAL);
744 }
745
746 /*
747 * CLONE_PIDFD_AUTOKILL with CLONE_THREAD must fail with EINVAL.
748 */
TEST(autokill_rejects_thread)749 TEST(autokill_rejects_thread)
750 {
751 int pidfd = -1;
752 struct __clone_args args = {
753 .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
754 CLONE_AUTOREAP | CLONE_THREAD |
755 CLONE_SIGHAND | CLONE_VM,
756 .exit_signal = 0,
757 .pidfd = ptr_to_u64(&pidfd),
758 };
759 pid_t pid;
760
761 pid = sys_clone3(&args, sizeof(args));
762 ASSERT_EQ(pid, -1);
763 ASSERT_EQ(errno, EINVAL);
764 }
765
766 /*
767 * Test that only the clone3 pidfd triggers autokill, not pidfd_open().
768 * Close the pidfd_open'd fd first — child should survive.
769 * Then close the clone3 pidfd — child should be killed and autoreaped.
770 */
TEST(autokill_pidfd_open_no_effect)771 TEST(autokill_pidfd_open_no_effect)
772 {
773 int pidfd = -1, open_fd = -1, ret;
774 struct pollfd pfd;
775 pid_t pid;
776
777 pid = create_autokill_child(&pidfd);
778 if (pid < 0 && errno == EINVAL)
779 SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
780 ASSERT_GE(pid, 0);
781
782 if (pid == 0) {
783 pause();
784 _exit(1);
785 }
786
787 ASSERT_GE(pidfd, 0);
788
789 /* Open a second pidfd via pidfd_open(). */
790 open_fd = sys_pidfd_open(pid, 0);
791 ASSERT_GE(open_fd, 0);
792
793 /*
794 * Close the pidfd_open'd fd — child should survive because
795 * only the clone3 pidfd has autokill.
796 */
797 close(open_fd);
798 usleep(200000);
799
800 /* Verify child is still alive by polling the clone3 pidfd. */
801 pfd.fd = pidfd;
802 pfd.events = POLLIN;
803 ret = poll(&pfd, 1, 0);
804 ASSERT_EQ(ret, 0) {
805 TH_LOG("Child died after closing pidfd_open fd — should still be alive");
806 }
807
808 /* Open another observation fd before triggering autokill. */
809 open_fd = sys_pidfd_open(pid, 0);
810 ASSERT_GE(open_fd, 0);
811
812 /* Now close the clone3 pidfd — this triggers autokill. */
813 close(pidfd);
814
815 pfd.fd = open_fd;
816 pfd.events = POLLIN;
817 ret = poll(&pfd, 1, 5000);
818 ASSERT_EQ(ret, 1);
819 ASSERT_TRUE(pfd.revents & POLLIN);
820
821 /* Child should be autoreaped — no zombie. */
822 usleep(100000);
823 ret = waitpid(pid, NULL, WNOHANG);
824 ASSERT_EQ(ret, -1);
825 ASSERT_EQ(errno, ECHILD);
826
827 close(open_fd);
828 }
829
830 /*
831 * Test that CLONE_PIDFD_AUTOKILL without CLONE_NNP fails with EPERM
832 * for an unprivileged caller.
833 */
TEST(autokill_requires_cap_sys_admin)834 TEST(autokill_requires_cap_sys_admin)
835 {
836 int pidfd = -1, ret;
837 struct __clone_args args = {
838 .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
839 CLONE_AUTOREAP,
840 .exit_signal = 0,
841 .pidfd = ptr_to_u64(&pidfd),
842 };
843 pid_t pid;
844
845 /* Drop all capabilities so we lack CAP_SYS_ADMIN. */
846 ret = drop_all_caps();
847 ASSERT_EQ(ret, 0);
848
849 pid = sys_clone3(&args, sizeof(args));
850 ASSERT_EQ(pid, -1);
851 ASSERT_EQ(errno, EPERM);
852 }
853
854 /*
855 * Test that CLONE_PIDFD_AUTOKILL without CLONE_NNP succeeds with
856 * CAP_SYS_ADMIN.
857 */
TEST(autokill_without_nnp_with_cap)858 TEST(autokill_without_nnp_with_cap)
859 {
860 struct __clone_args args = {
861 .flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL |
862 CLONE_AUTOREAP,
863 .exit_signal = 0,
864 };
865 struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
866 int pidfd = -1, ret;
867 struct pollfd pfd;
868 pid_t pid;
869
870 if (geteuid() != 0)
871 SKIP(return, "Need root/CAP_SYS_ADMIN");
872
873 args.pidfd = ptr_to_u64(&pidfd);
874
875 pid = sys_clone3(&args, sizeof(args));
876 if (pid < 0 && errno == EINVAL)
877 SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
878 ASSERT_GE(pid, 0);
879
880 if (pid == 0)
881 _exit(0);
882
883 ASSERT_GE(pidfd, 0);
884
885 /* Wait for child to exit. */
886 pfd.fd = pidfd;
887 pfd.events = POLLIN;
888 ret = poll(&pfd, 1, 5000);
889 ASSERT_EQ(ret, 1);
890
891 ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
892 ASSERT_EQ(ret, 0);
893 ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
894 ASSERT_TRUE(WIFEXITED(info.exit_code));
895 ASSERT_EQ(WEXITSTATUS(info.exit_code), 0);
896
897 close(pidfd);
898 }
899
/*
 * Macro from kselftest_harness.h; presumably expands to the program entry
 * point that runs the TEST() cases defined in this file.
 */
TEST_HARNESS_MAIN
901