1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
4  *
5  * Test code for seccomp bpf.
6  */
7 
8 #define _GNU_SOURCE
9 #include <sys/types.h>
10 
11 /*
12  * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
13  * we need to use the kernel's siginfo.h file and trick glibc
14  * into accepting it.
15  */
16 #if !__GLIBC_PREREQ(2, 26)
17 # include <asm/siginfo.h>
18 # define __have_siginfo_t 1
19 # define __have_sigval_t 1
20 # define __have_sigevent_t 1
21 #endif
22 
23 #include <errno.h>
24 #include <linux/filter.h>
25 #include <sys/prctl.h>
26 #include <sys/ptrace.h>
27 #include <sys/user.h>
28 #include <linux/prctl.h>
29 #include <linux/ptrace.h>
30 #include <linux/seccomp.h>
31 #include <pthread.h>
32 #include <semaphore.h>
33 #include <signal.h>
34 #include <stddef.h>
35 #include <stdbool.h>
36 #include <string.h>
37 #include <time.h>
38 #include <limits.h>
39 #include <linux/elf.h>
40 #include <sys/uio.h>
41 #include <sys/utsname.h>
42 #include <sys/fcntl.h>
43 #include <sys/mman.h>
44 #include <sys/times.h>
45 #include <sys/socket.h>
46 #include <sys/ioctl.h>
47 #include <linux/kcmp.h>
48 #include <sys/resource.h>
49 #include <sys/capability.h>
50 #include <linux/perf_event.h>
51 
52 #include <unistd.h>
53 #include <sys/syscall.h>
54 #include <poll.h>
55 
56 #include "../kselftest_harness.h"
57 #include "../clone3/clone3_selftests.h"
58 
59 /* Attempt to de-conflict with the selftests tree. */
60 #ifndef SKIP
61 #define SKIP(s, ...)	XFAIL(s, ##__VA_ARGS__)
62 #endif
63 
64 #ifndef MIN
65 #define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
66 #endif
67 
68 #ifndef PR_SET_PTRACER
69 # define PR_SET_PTRACER 0x59616d61
70 #endif
71 
72 #ifndef noinline
73 #define noinline __attribute__((noinline))
74 #endif
75 
76 #ifndef PR_SET_NO_NEW_PRIVS
77 #define PR_SET_NO_NEW_PRIVS 38
78 #define PR_GET_NO_NEW_PRIVS 39
79 #endif
80 
81 #ifndef PR_SECCOMP_EXT
82 #define PR_SECCOMP_EXT 43
83 #endif
84 
85 #ifndef SECCOMP_EXT_ACT
86 #define SECCOMP_EXT_ACT 1
87 #endif
88 
89 #ifndef SECCOMP_EXT_ACT_TSYNC
90 #define SECCOMP_EXT_ACT_TSYNC 1
91 #endif
92 
93 #ifndef SECCOMP_MODE_STRICT
94 #define SECCOMP_MODE_STRICT 1
95 #endif
96 
97 #ifndef SECCOMP_MODE_FILTER
98 #define SECCOMP_MODE_FILTER 2
99 #endif
100 
101 #ifndef SECCOMP_RET_ALLOW
102 struct seccomp_data {
103 	int nr;
104 	__u32 arch;
105 	__u64 instruction_pointer;
106 	__u64 args[6];
107 };
108 #endif
109 
110 #ifndef SECCOMP_RET_KILL_PROCESS
111 #define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
112 #define SECCOMP_RET_KILL_THREAD	 0x00000000U /* kill the thread */
113 #endif
114 #ifndef SECCOMP_RET_KILL
115 #define SECCOMP_RET_KILL	 SECCOMP_RET_KILL_THREAD
116 #define SECCOMP_RET_TRAP	 0x00030000U /* disallow and force a SIGSYS */
117 #define SECCOMP_RET_ERRNO	 0x00050000U /* returns an errno */
118 #define SECCOMP_RET_TRACE	 0x7ff00000U /* pass to a tracer or disallow */
119 #define SECCOMP_RET_ALLOW	 0x7fff0000U /* allow */
120 #endif
121 #ifndef SECCOMP_RET_LOG
122 #define SECCOMP_RET_LOG		 0x7ffc0000U /* allow after logging */
123 #endif
124 
125 #ifndef __NR_seccomp
126 # if defined(__i386__)
127 #  define __NR_seccomp 354
128 # elif defined(__x86_64__)
129 #  define __NR_seccomp 317
130 # elif defined(__arm__)
131 #  define __NR_seccomp 383
132 # elif defined(__aarch64__)
133 #  define __NR_seccomp 277
134 # elif defined(__riscv)
135 #  define __NR_seccomp 277
136 # elif defined(__csky__)
137 #  define __NR_seccomp 277
138 # elif defined(__loongarch__)
139 #  define __NR_seccomp 277
140 # elif defined(__hppa__)
141 #  define __NR_seccomp 338
142 # elif defined(__powerpc__)
143 #  define __NR_seccomp 358
144 # elif defined(__s390__)
145 #  define __NR_seccomp 348
146 # elif defined(__xtensa__)
147 #  define __NR_seccomp 337
148 # elif defined(__sh__)
149 #  define __NR_seccomp 372
150 # elif defined(__mc68000__)
151 #  define __NR_seccomp 380
152 # else
153 #  warning "seccomp syscall number unknown for this architecture"
154 #  define __NR_seccomp 0xffff
155 # endif
156 #endif
157 
158 #ifndef __NR_uretprobe
159 # if defined(__x86_64__)
160 #  define __NR_uretprobe 335
161 # endif
162 #endif
163 
164 #ifndef SECCOMP_SET_MODE_STRICT
165 #define SECCOMP_SET_MODE_STRICT 0
166 #endif
167 
168 #ifndef SECCOMP_SET_MODE_FILTER
169 #define SECCOMP_SET_MODE_FILTER 1
170 #endif
171 
172 #ifndef SECCOMP_GET_ACTION_AVAIL
173 #define SECCOMP_GET_ACTION_AVAIL 2
174 #endif
175 
176 #ifndef SECCOMP_GET_NOTIF_SIZES
177 #define SECCOMP_GET_NOTIF_SIZES 3
178 #endif
179 
180 #ifndef SECCOMP_FILTER_FLAG_TSYNC
181 #define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
182 #endif
183 
184 #ifndef SECCOMP_FILTER_FLAG_LOG
185 #define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
186 #endif
187 
188 #ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
189 #define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
190 #endif
191 
192 #ifndef PTRACE_SECCOMP_GET_METADATA
193 #define PTRACE_SECCOMP_GET_METADATA	0x420d
194 
195 struct seccomp_metadata {
196 	__u64 filter_off;       /* Input: which filter */
197 	__u64 flags;             /* Output: filter's flags */
198 };
199 #endif
200 
201 #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
202 #define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
203 #endif
204 
205 #ifndef SECCOMP_RET_USER_NOTIF
206 #define SECCOMP_RET_USER_NOTIF 0x7fc00000U
207 
208 #define SECCOMP_IOC_MAGIC		'!'
209 #define SECCOMP_IO(nr)			_IO(SECCOMP_IOC_MAGIC, nr)
210 #define SECCOMP_IOR(nr, type)		_IOR(SECCOMP_IOC_MAGIC, nr, type)
211 #define SECCOMP_IOW(nr, type)		_IOW(SECCOMP_IOC_MAGIC, nr, type)
212 #define SECCOMP_IOWR(nr, type)		_IOWR(SECCOMP_IOC_MAGIC, nr, type)
213 
214 /* Flags for seccomp notification fd ioctl. */
215 #define SECCOMP_IOCTL_NOTIF_RECV	SECCOMP_IOWR(0, struct seccomp_notif)
216 #define SECCOMP_IOCTL_NOTIF_SEND	SECCOMP_IOWR(1,	\
217 						struct seccomp_notif_resp)
218 #define SECCOMP_IOCTL_NOTIF_ID_VALID	SECCOMP_IOW(2, __u64)
219 
220 struct seccomp_notif {
221 	__u64 id;
222 	__u32 pid;
223 	__u32 flags;
224 	struct seccomp_data data;
225 };
226 
227 struct seccomp_notif_resp {
228 	__u64 id;
229 	__s64 val;
230 	__s32 error;
231 	__u32 flags;
232 };
233 
234 struct seccomp_notif_sizes {
235 	__u16 seccomp_notif;
236 	__u16 seccomp_notif_resp;
237 	__u16 seccomp_data;
238 };
239 #endif
240 
241 #ifndef SECCOMP_IOCTL_NOTIF_ADDFD
242 /* On success, the return value is the remote process's added fd number */
243 #define SECCOMP_IOCTL_NOTIF_ADDFD	SECCOMP_IOW(3,	\
244 						struct seccomp_notif_addfd)
245 
246 /* valid flags for seccomp_notif_addfd */
247 #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
248 
249 struct seccomp_notif_addfd {
250 	__u64 id;
251 	__u32 flags;
252 	__u32 srcfd;
253 	__u32 newfd;
254 	__u32 newfd_flags;
255 };
256 #endif
257 
258 #ifndef SECCOMP_ADDFD_FLAG_SEND
259 #define SECCOMP_ADDFD_FLAG_SEND	(1UL << 1) /* Addfd and return it, atomically */
260 #endif
261 
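/*
 * Deliberately mis-sized variants of struct seccomp_notif_addfd, used by
 * later tests to exercise how the ADDFD ioctl handles argument structures
 * that are smaller or larger than the size the kernel expects.
 */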
262 struct seccomp_notif_addfd_small {
263 	__u64 id;
264 	char weird[4];
265 };
266 #define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL	\
267 	SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
268 
269 struct seccomp_notif_addfd_big {
270 	union {
271 		struct seccomp_notif_addfd addfd;
272 		char buf[sizeof(struct seccomp_notif_addfd) + 8];
273 	};
274 };
275 #define SECCOMP_IOCTL_NOTIF_ADDFD_BIG	\
276 	SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
277 
278 #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
279 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
280 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
281 #endif
282 
283 #ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
284 #define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
285 #endif
286 
287 #ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
288 #define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
289 #endif
290 
291 #ifndef SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV
292 #define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
293 #endif
294 
295 #ifndef seccomp
296 int seccomp(unsigned int op, unsigned int flags, void *args)
297 {
298 	errno = 0;
299 	return syscall(__NR_seccomp, op, flags, args);
300 }
301 #endif
302 
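/*
 * Each seccomp_data args[] entry is 64 bits wide, but these filters load
 * only 32 bits at a time (BPF_W). syscall_arg() picks the offset of the
 * low 32-bit word of an argument, which differs by endianness.
 */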
303 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
304 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
305 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
306 #define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
307 #else
308 #error "wut? Unknown __BYTE_ORDER__?!"
309 #endif
310 
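/* Distinct exit markers so tests can tell how a sibling thread finished. */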
311 #define SIBLING_EXIT_UNKILLED	0xbadbeef
312 #define SIBLING_EXIT_FAILURE	0xbadface
313 #define SIBLING_EXIT_NEWPRIVS	0xbadfeed
314 
315 static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
316 {
317 #ifdef __NR_kcmp
318 	errno = 0;
319 	return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
320 #else
321 	errno = ENOSYS;
322 	return -1;
323 #endif
324 }
325 
326 /* Have TH_LOG report actual location filecmp() is used. */
327 #define filecmp(pid1, pid2, fd1, fd2)	({		\
328 	int _ret;					\
329 							\
330 	_ret = __filecmp(pid1, pid2, fd1, fd2);		\
331 	if (_ret != 0) {				\
332 		if (_ret < 0 && errno == ENOSYS) {	\
333 			TH_LOG("kcmp() syscall missing (test is less accurate)");\
334 			_ret = 0;			\
335 		}					\
336 	}						\
337 	_ret; })
338 
339 TEST(kcmp)
340 {
341 	int ret;
342 
343 	ret = __filecmp(getpid(), getpid(), 1, 1);
344 	EXPECT_EQ(ret, 0);
345 	if (ret != 0 && errno == ENOSYS)
346 		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
347 }
348 
349 TEST(mode_strict_support)
350 {
351 	long ret;
352 
353 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
354 	ASSERT_EQ(0, ret) {
355 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
356 	}
357 	syscall(__NR_exit, 0);
358 }
359 
360 TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
361 {
362 	long ret;
363 
364 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
365 	ASSERT_EQ(0, ret) {
366 		TH_LOG("Kernel does not support CONFIG_SECCOMP");
367 	}
368 	syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
369 		NULL, NULL, NULL);
370 	EXPECT_FALSE(true) {
371 		TH_LOG("Unreachable!");
372 	}
373 }
374 
375 /* Note! This doesn't test no new privs behavior */
376 TEST(no_new_privs_support)
377 {
378 	long ret;
379 
380 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
381 	EXPECT_EQ(0, ret) {
382 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
383 	}
384 }
385 
386 /* Tests kernel support by checking for a copy_from_user() fault on NULL. */
387 TEST(mode_filter_support)
388 {
389 	long ret;
390 
391 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
392 	ASSERT_EQ(0, ret) {
393 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
394 	}
395 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
396 	EXPECT_EQ(-1, ret);
397 	EXPECT_EQ(EFAULT, errno) {
398 		TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
399 	}
400 }
401 
402 TEST(mode_filter_without_nnp)
403 {
404 	struct sock_filter filter[] = {
405 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
406 	};
407 	struct sock_fprog prog = {
408 		.len = (unsigned short)ARRAY_SIZE(filter),
409 		.filter = filter,
410 	};
411 	long ret;
412 	cap_t cap = cap_get_proc();
413 	cap_flag_value_t is_cap_sys_admin = 0;
414 
415 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
416 	ASSERT_LE(0, ret) {
417 		TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
418 	}
419 	errno = 0;
420 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
421 	/* Succeeds with CAP_SYS_ADMIN, fails without */
422 	cap_get_flag(cap, CAP_SYS_ADMIN, CAP_EFFECTIVE, &is_cap_sys_admin);
423 	if (!is_cap_sys_admin) {
424 		EXPECT_EQ(-1, ret);
425 		EXPECT_EQ(EACCES, errno);
426 	} else {
427 		EXPECT_EQ(0, ret);
428 	}
429 }
430 
431 #define MAX_INSNS_PER_PATH 32768
432 
433 TEST(filter_size_limits)
434 {
435 	int i;
436 	int count = BPF_MAXINSNS + 1;
437 	struct sock_filter allow[] = {
438 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
439 	};
440 	struct sock_filter *filter;
441 	struct sock_fprog prog = { };
442 	long ret;
443 
444 	filter = calloc(count, sizeof(*filter));
445 	ASSERT_NE(NULL, filter);
446 
447 	for (i = 0; i < count; i++)
448 		filter[i] = allow[0];
449 
450 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
451 	ASSERT_EQ(0, ret);
452 
453 	prog.filter = filter;
454 	prog.len = count;
455 
456 	/* Too many filter instructions in a single filter. */
457 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
458 	ASSERT_NE(0, ret) {
459 		TH_LOG("Installing %d insn filter was allowed", prog.len);
460 	}
461 
462 	/* One less is okay, though. */
463 	prog.len -= 1;
464 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
465 	ASSERT_EQ(0, ret) {
466 		TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
467 	}
468 }
469 
470 TEST(filter_chain_limits)
471 {
472 	int i;
473 	int count = BPF_MAXINSNS;
474 	struct sock_filter allow[] = {
475 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
476 	};
477 	struct sock_filter *filter;
478 	struct sock_fprog prog = { };
479 	long ret;
480 
481 	filter = calloc(count, sizeof(*filter));
482 	ASSERT_NE(NULL, filter);
483 
484 	for (i = 0; i < count; i++)
485 		filter[i] = allow[0];
486 
487 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
488 	ASSERT_EQ(0, ret);
489 
490 	prog.filter = filter;
491 	prog.len = 1;
492 
493 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
494 	ASSERT_EQ(0, ret);
495 
496 	prog.len = count;
497 
498 	/* Too many total filter instructions. */
499 	for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
500 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
501 		if (ret != 0)
502 			break;
503 	}
504 	ASSERT_NE(0, ret) {
505 		TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
506 		       i, count, i * (count + 4));
507 	}
508 }
509 
510 TEST(mode_filter_cannot_move_to_strict)
511 {
512 	struct sock_filter filter[] = {
513 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
514 	};
515 	struct sock_fprog prog = {
516 		.len = (unsigned short)ARRAY_SIZE(filter),
517 		.filter = filter,
518 	};
519 	long ret;
520 
521 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
522 	ASSERT_EQ(0, ret);
523 
524 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
525 	ASSERT_EQ(0, ret);
526 
527 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
528 	EXPECT_EQ(-1, ret);
529 	EXPECT_EQ(EINVAL, errno);
530 }
531 
532 
533 TEST(mode_filter_get_seccomp)
534 {
535 	struct sock_filter filter[] = {
536 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
537 	};
538 	struct sock_fprog prog = {
539 		.len = (unsigned short)ARRAY_SIZE(filter),
540 		.filter = filter,
541 	};
542 	long ret;
543 
544 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
545 	ASSERT_EQ(0, ret);
546 
547 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
548 	EXPECT_EQ(0, ret);
549 
550 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
551 	ASSERT_EQ(0, ret);
552 
553 	ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
554 	EXPECT_EQ(2, ret);
555 }
556 
557 
558 TEST(ALLOW_all)
559 {
560 	struct sock_filter filter[] = {
561 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
562 	};
563 	struct sock_fprog prog = {
564 		.len = (unsigned short)ARRAY_SIZE(filter),
565 		.filter = filter,
566 	};
567 	long ret;
568 
569 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
570 	ASSERT_EQ(0, ret);
571 
572 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
573 	ASSERT_EQ(0, ret);
574 }
575 
576 TEST(empty_prog)
577 {
578 	struct sock_filter filter[] = {
579 	};
580 	struct sock_fprog prog = {
581 		.len = (unsigned short)ARRAY_SIZE(filter),
582 		.filter = filter,
583 	};
584 	long ret;
585 
586 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
587 	ASSERT_EQ(0, ret);
588 
589 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
590 	EXPECT_EQ(-1, ret);
591 	EXPECT_EQ(EINVAL, errno);
592 }
593 
594 TEST(log_all)
595 {
596 	struct sock_filter filter[] = {
597 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
598 	};
599 	struct sock_fprog prog = {
600 		.len = (unsigned short)ARRAY_SIZE(filter),
601 		.filter = filter,
602 	};
603 	long ret;
604 	pid_t parent = getppid();
605 
606 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
607 	ASSERT_EQ(0, ret);
608 
609 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
610 	ASSERT_EQ(0, ret);
611 
612 	/* getppid() should succeed and be logged (no check for logging) */
613 	EXPECT_EQ(parent, syscall(__NR_getppid));
614 }
615 
616 TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
617 {
618 	struct sock_filter filter[] = {
619 		BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
620 	};
621 	struct sock_fprog prog = {
622 		.len = (unsigned short)ARRAY_SIZE(filter),
623 		.filter = filter,
624 	};
625 	long ret;
626 
627 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
628 	ASSERT_EQ(0, ret);
629 
630 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
631 	ASSERT_EQ(0, ret);
632 	EXPECT_EQ(0, syscall(__NR_getpid)) {
633 		TH_LOG("getpid() shouldn't ever return");
634 	}
635 }
636 
637 /* Return codes >= 0x80000000 are unused. */
638 TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
639 {
640 	struct sock_filter filter[] = {
641 		BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
642 	};
643 	struct sock_fprog prog = {
644 		.len = (unsigned short)ARRAY_SIZE(filter),
645 		.filter = filter,
646 	};
647 	long ret;
648 
649 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
650 	ASSERT_EQ(0, ret);
651 
652 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
653 	ASSERT_EQ(0, ret);
654 	EXPECT_EQ(0, syscall(__NR_getpid)) {
655 		TH_LOG("getpid() shouldn't ever return");
656 	}
657 }
658 
659 TEST_SIGNAL(KILL_all, SIGSYS)
660 {
661 	struct sock_filter filter[] = {
662 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
663 	};
664 	struct sock_fprog prog = {
665 		.len = (unsigned short)ARRAY_SIZE(filter),
666 		.filter = filter,
667 	};
668 	long ret;
669 
670 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
671 	ASSERT_EQ(0, ret);
672 
673 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
674 	ASSERT_EQ(0, ret);
675 }
676 
677 TEST_SIGNAL(KILL_one, SIGSYS)
678 {
679 	struct sock_filter filter[] = {
680 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
681 			offsetof(struct seccomp_data, nr)),
682 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
683 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
684 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
685 	};
686 	struct sock_fprog prog = {
687 		.len = (unsigned short)ARRAY_SIZE(filter),
688 		.filter = filter,
689 	};
690 	long ret;
691 	pid_t parent = getppid();
692 
693 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
694 	ASSERT_EQ(0, ret);
695 
696 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
697 	ASSERT_EQ(0, ret);
698 
699 	EXPECT_EQ(parent, syscall(__NR_getppid));
700 	/* getpid() should never return. */
701 	EXPECT_EQ(0, syscall(__NR_getpid));
702 }
703 
704 TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
705 {
706 	void *fatal_address;
707 	struct sock_filter filter[] = {
708 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
709 			offsetof(struct seccomp_data, nr)),
710 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
711 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
712 		/* Only bother with the lower 32 bits for now. */
713 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
714 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
715 			(unsigned long)&fatal_address, 0, 1),
716 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
717 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
718 	};
719 	struct sock_fprog prog = {
720 		.len = (unsigned short)ARRAY_SIZE(filter),
721 		.filter = filter,
722 	};
723 	long ret;
724 	pid_t parent = getppid();
725 	struct tms timebuf;
726 	clock_t clock = times(&timebuf);
727 
728 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
729 	ASSERT_EQ(0, ret);
730 
731 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
732 	ASSERT_EQ(0, ret);
733 
734 	EXPECT_EQ(parent, syscall(__NR_getppid));
735 	EXPECT_LE(clock, syscall(__NR_times, &timebuf));
736 	/* times() should never return. */
737 	EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
738 }
739 
740 TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
741 {
742 #ifndef __NR_mmap2
743 	int sysno = __NR_mmap;
744 #else
745 	int sysno = __NR_mmap2;
746 #endif
747 	struct sock_filter filter[] = {
748 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
749 			offsetof(struct seccomp_data, nr)),
750 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
751 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
752 		/* Only bother with the lower 32 bits for now. */
753 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
754 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
755 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
756 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
757 	};
758 	struct sock_fprog prog = {
759 		.len = (unsigned short)ARRAY_SIZE(filter),
760 		.filter = filter,
761 	};
762 	long ret;
763 	pid_t parent = getppid();
764 	int fd;
765 	void *map1, *map2;
766 	int page_size = sysconf(_SC_PAGESIZE);
767 
768 	ASSERT_LT(0, page_size);
769 
770 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
771 	ASSERT_EQ(0, ret);
772 
773 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
774 	ASSERT_EQ(0, ret);
775 
776 	fd = open("/dev/zero", O_RDONLY);
777 	ASSERT_NE(-1, fd);
778 
779 	EXPECT_EQ(parent, syscall(__NR_getppid));
780 	map1 = (void *)syscall(sysno,
781 		NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
782 	EXPECT_NE(MAP_FAILED, map1);
783 	/* mmap2() should never return. */
784 	map2 = (void *)syscall(sysno,
785 		 NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
786 	EXPECT_EQ(MAP_FAILED, map2);
787 
788 	/* The test failed, so clean up the resources. */
789 	munmap(map1, page_size);
790 	munmap(map2, page_size);
791 	close(fd);
792 }
793 
794 /* This is a thread task that dies via a seccomp filter violation. */
795 void *kill_thread(void *data)
796 {
797 	bool die = (bool)data;
798 
799 	if (die) {
800 		syscall(__NR_getpid);
801 		return (void *)SIBLING_EXIT_FAILURE;
802 	}
803 
804 	return (void *)SIBLING_EXIT_UNKILLED;
805 }
806 
807 enum kill_t {
808 	KILL_THREAD,
809 	KILL_PROCESS,
810 	RET_UNKNOWN
811 };
812 
813 /* Prepare a thread that will kill itself or both of us. */
814 void kill_thread_or_group(struct __test_metadata *_metadata,
815 			  enum kill_t kill_how)
816 {
817 	pthread_t thread;
818 	void *status;
819 	/* Kill only when calling __NR_getpid. */
820 	struct sock_filter filter_thread[] = {
821 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
822 			offsetof(struct seccomp_data, nr)),
823 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
824 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
825 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
826 	};
827 	struct sock_fprog prog_thread = {
828 		.len = (unsigned short)ARRAY_SIZE(filter_thread),
829 		.filter = filter_thread,
830 	};
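	/*
	 * 0xAAAAAAAA is not a valid seccomp action; it is used here to check
	 * that an unknown return value is treated like SECCOMP_RET_KILL_PROCESS.
	 */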
831 	int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
832 	struct sock_filter filter_process[] = {
833 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
834 			offsetof(struct seccomp_data, nr)),
835 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
836 		BPF_STMT(BPF_RET|BPF_K, kill),
837 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
838 	};
839 	struct sock_fprog prog_process = {
840 		.len = (unsigned short)ARRAY_SIZE(filter_process),
841 		.filter = filter_process,
842 	};
843 
844 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
845 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
846 	}
847 
848 	ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
849 			     kill_how == KILL_THREAD ? &prog_thread
850 						     : &prog_process));
851 
852 	/*
853 	 * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
854 	 * flag cannot be downgraded by a new filter.
855 	 */
856 	if (kill_how == KILL_PROCESS)
857 		ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
858 
859 	/* Start a thread that will exit immediately. */
860 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
861 	ASSERT_EQ(0, pthread_join(thread, &status));
862 	ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
863 
864 	/* Start a thread that will die immediately. */
865 	ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
866 	ASSERT_EQ(0, pthread_join(thread, &status));
867 	ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
868 
869 	/*
870 	 * If we get here, only the spawned thread died. Let the parent know
871 	 * the whole process didn't die (i.e. this thread, the spawner,
872 	 * stayed running).
873 	 */
874 	exit(42);
875 }
876 
877 TEST(KILL_thread)
878 {
879 	int status;
880 	pid_t child_pid;
881 
882 	child_pid = fork();
883 	ASSERT_LE(0, child_pid);
884 	if (child_pid == 0) {
885 		kill_thread_or_group(_metadata, KILL_THREAD);
886 		_exit(38);
887 	}
888 
889 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
890 
891 	/* If only the thread was killed, we'll see exit 42. */
892 	ASSERT_TRUE(WIFEXITED(status));
893 	ASSERT_EQ(42, WEXITSTATUS(status));
894 }
895 
896 TEST(KILL_process)
897 {
898 	int status;
899 	pid_t child_pid;
900 
901 	child_pid = fork();
902 	ASSERT_LE(0, child_pid);
903 	if (child_pid == 0) {
904 		kill_thread_or_group(_metadata, KILL_PROCESS);
905 		_exit(38);
906 	}
907 
908 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
909 
910 	/* If the entire process was killed, we'll see SIGSYS. */
911 	ASSERT_TRUE(WIFSIGNALED(status));
912 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
913 }
914 
915 TEST(KILL_unknown)
916 {
917 	int status;
918 	pid_t child_pid;
919 
920 	child_pid = fork();
921 	ASSERT_LE(0, child_pid);
922 	if (child_pid == 0) {
923 		kill_thread_or_group(_metadata, RET_UNKNOWN);
924 		_exit(38);
925 	}
926 
927 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
928 
929 	/* If the entire process was killed, we'll see SIGSYS. */
930 	EXPECT_TRUE(WIFSIGNALED(status)) {
931 		TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
932 	}
933 	ASSERT_EQ(SIGSYS, WTERMSIG(status));
934 }
935 
936 /* TODO(wad) add 64-bit versus 32-bit arg tests. */
937 TEST(arg_out_of_range)
938 {
939 	struct sock_filter filter[] = {
940 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
941 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
942 	};
943 	struct sock_fprog prog = {
944 		.len = (unsigned short)ARRAY_SIZE(filter),
945 		.filter = filter,
946 	};
947 	long ret;
948 
949 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
950 	ASSERT_EQ(0, ret);
951 
952 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
953 	EXPECT_EQ(-1, ret);
954 	EXPECT_EQ(EINVAL, errno);
955 }
956 
957 #define ERRNO_FILTER(name, errno)					\
958 	struct sock_filter _read_filter_##name[] = {			\
959 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,				\
960 			offsetof(struct seccomp_data, nr)),		\
961 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),	\
962 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno),	\
963 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),		\
964 	};								\
965 	struct sock_fprog prog_##name = {				\
966 		.len = (unsigned short)ARRAY_SIZE(_read_filter_##name),	\
967 		.filter = _read_filter_##name,				\
968 	}
969 
970 /* Make sure basic errno values are correctly passed through a filter. */
971 TEST(ERRNO_valid)
972 {
973 	ERRNO_FILTER(valid, E2BIG);
974 	long ret;
975 	pid_t parent = getppid();
976 
977 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
978 	ASSERT_EQ(0, ret);
979 
980 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
981 	ASSERT_EQ(0, ret);
982 
983 	EXPECT_EQ(parent, syscall(__NR_getppid));
984 	EXPECT_EQ(-1, read(-1, NULL, 0));
985 	EXPECT_EQ(E2BIG, errno);
986 }
987 
988 /* Make sure an errno of zero is correctly handled by the arch code. */
989 TEST(ERRNO_zero)
990 {
991 	ERRNO_FILTER(zero, 0);
992 	long ret;
993 	pid_t parent = getppid();
994 
995 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
996 	ASSERT_EQ(0, ret);
997 
998 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
999 	ASSERT_EQ(0, ret);
1000 
1001 	EXPECT_EQ(parent, syscall(__NR_getppid));
1002 	/* "errno" of 0 is ok. */
1003 	EXPECT_EQ(0, read(-1, NULL, 0));
1004 }
1005 
1006 /*
1007  * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
1008  * This tests that the errno value gets capped correctly, fixed by
1009  * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
1010  */
1011 TEST(ERRNO_capped)
1012 {
1013 	ERRNO_FILTER(capped, 4096);
1014 	long ret;
1015 	pid_t parent = getppid();
1016 
1017 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1018 	ASSERT_EQ(0, ret);
1019 
1020 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
1021 	ASSERT_EQ(0, ret);
1022 
1023 	EXPECT_EQ(parent, syscall(__NR_getppid));
1024 	EXPECT_EQ(-1, read(-1, NULL, 0));
1025 	EXPECT_EQ(4095, errno);
1026 }
1027 
1028 /*
1029  * Filters are processed in reverse order: last applied is executed first.
1030  * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
1031  * SECCOMP_RET_DATA mask results will follow the most recently applied
1032  * matching filter return (and not the lowest or highest value).
1033  */
1034 TEST(ERRNO_order)
1035 {
1036 	ERRNO_FILTER(first,  11);
1037 	ERRNO_FILTER(second, 13);
1038 	ERRNO_FILTER(third,  12);
1039 	long ret;
1040 	pid_t parent = getppid();
1041 
1042 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1043 	ASSERT_EQ(0, ret);
1044 
1045 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
1046 	ASSERT_EQ(0, ret);
1047 
1048 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
1049 	ASSERT_EQ(0, ret);
1050 
1051 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
1052 	ASSERT_EQ(0, ret);
1053 
1054 	EXPECT_EQ(parent, syscall(__NR_getppid));
1055 	EXPECT_EQ(-1, read(-1, NULL, 0));
1056 	EXPECT_EQ(12, errno);
1057 }
1058 
1059 FIXTURE(TRAP) {
1060 	struct sock_fprog prog;
1061 };
1062 
1063 FIXTURE_SETUP(TRAP)
1064 {
1065 	struct sock_filter filter[] = {
1066 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1067 			offsetof(struct seccomp_data, nr)),
1068 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
1069 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1070 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1071 	};
1072 
1073 	memset(&self->prog, 0, sizeof(self->prog));
1074 	self->prog.filter = malloc(sizeof(filter));
1075 	ASSERT_NE(NULL, self->prog.filter);
1076 	memcpy(self->prog.filter, filter, sizeof(filter));
1077 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1078 }
1079 
1080 FIXTURE_TEARDOWN(TRAP)
1081 {
1082 	if (self->prog.filter)
1083 		free(self->prog.filter);
1084 }
1085 
1086 TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
1087 {
1088 	long ret;
1089 
1090 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1091 	ASSERT_EQ(0, ret);
1092 
1093 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1094 	ASSERT_EQ(0, ret);
1095 	syscall(__NR_getpid);
1096 }
1097 
1098 /* Ensure that SIGSYS overrides SIG_IGN */
1099 TEST_F_SIGNAL(TRAP, ign, SIGSYS)
1100 {
1101 	long ret;
1102 
1103 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1104 	ASSERT_EQ(0, ret);
1105 
1106 	signal(SIGSYS, SIG_IGN);
1107 
1108 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1109 	ASSERT_EQ(0, ret);
1110 	syscall(__NR_getpid);
1111 }
1112 
1113 static siginfo_t TRAP_info;
1114 static volatile int TRAP_nr;
1115 static void TRAP_action(int nr, siginfo_t *info, void *void_context)
1116 {
1117 	memcpy(&TRAP_info, info, sizeof(TRAP_info));
1118 	TRAP_nr = nr;
1119 }
1120 
1121 TEST_F(TRAP, handler)
1122 {
1123 	int ret, test;
1124 	struct sigaction act;
1125 	sigset_t mask;
1126 
1127 	memset(&act, 0, sizeof(act));
1128 	sigemptyset(&mask);
1129 	sigaddset(&mask, SIGSYS);
1130 
1131 	act.sa_sigaction = &TRAP_action;
1132 	act.sa_flags = SA_SIGINFO;
1133 	ret = sigaction(SIGSYS, &act, NULL);
1134 	ASSERT_EQ(0, ret) {
1135 		TH_LOG("sigaction failed");
1136 	}
1137 	ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
1138 	ASSERT_EQ(0, ret) {
1139 		TH_LOG("sigprocmask failed");
1140 	}
1141 
1142 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1143 	ASSERT_EQ(0, ret);
1144 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
1145 	ASSERT_EQ(0, ret);
1146 	TRAP_nr = 0;
1147 	memset(&TRAP_info, 0, sizeof(TRAP_info));
1148 	/* Expect the registers to be rolled back. (nr = error) may vary
1149 	 * based on arch. */
1150 	ret = syscall(__NR_getpid);
1151 	/* Silence gcc warning about volatile. */
1152 	test = TRAP_nr;
1153 	EXPECT_EQ(SIGSYS, test);
1154 	struct local_sigsys {
1155 		void *_call_addr;	/* calling user insn */
1156 		int _syscall;		/* triggering system call number */
1157 		unsigned int _arch;	/* AUDIT_ARCH_* of syscall */
1158 	} *sigsys = (struct local_sigsys *)
1159 #ifdef si_syscall
1160 		&(TRAP_info.si_call_addr);
1161 #else
1162 		&TRAP_info.si_pid;
1163 #endif
1164 	EXPECT_EQ(__NR_getpid, sigsys->_syscall);
1165 	/* Make sure arch is non-zero. */
1166 	EXPECT_NE(0, sigsys->_arch);
1167 	EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
1168 }
1169 
1170 FIXTURE(precedence) {
1171 	struct sock_fprog allow;
1172 	struct sock_fprog log;
1173 	struct sock_fprog trace;
1174 	struct sock_fprog error;
1175 	struct sock_fprog trap;
1176 	struct sock_fprog kill;
1177 };
1178 
1179 FIXTURE_SETUP(precedence)
1180 {
1181 	struct sock_filter allow_insns[] = {
1182 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1183 	};
1184 	struct sock_filter log_insns[] = {
1185 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1186 			offsetof(struct seccomp_data, nr)),
1187 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1188 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1189 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
1190 	};
1191 	struct sock_filter trace_insns[] = {
1192 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1193 			offsetof(struct seccomp_data, nr)),
1194 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1195 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1196 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
1197 	};
1198 	struct sock_filter error_insns[] = {
1199 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1200 			offsetof(struct seccomp_data, nr)),
1201 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1202 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1203 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
1204 	};
1205 	struct sock_filter trap_insns[] = {
1206 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1207 			offsetof(struct seccomp_data, nr)),
1208 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1209 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1210 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
1211 	};
1212 	struct sock_filter kill_insns[] = {
1213 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1214 			offsetof(struct seccomp_data, nr)),
1215 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
1216 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1217 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
1218 	};
1219 
1220 	memset(self, 0, sizeof(*self));
1221 #define FILTER_ALLOC(_x) \
1222 	self->_x.filter = malloc(sizeof(_x##_insns)); \
1223 	ASSERT_NE(NULL, self->_x.filter); \
1224 	memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
1225 	self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
1226 	FILTER_ALLOC(allow);
1227 	FILTER_ALLOC(log);
1228 	FILTER_ALLOC(trace);
1229 	FILTER_ALLOC(error);
1230 	FILTER_ALLOC(trap);
1231 	FILTER_ALLOC(kill);
1232 }
1233 
1234 FIXTURE_TEARDOWN(precedence)
1235 {
1236 #define FILTER_FREE(_x) if (self->_x.filter) free(self->_x.filter)
1237 	FILTER_FREE(allow);
1238 	FILTER_FREE(log);
1239 	FILTER_FREE(trace);
1240 	FILTER_FREE(error);
1241 	FILTER_FREE(trap);
1242 	FILTER_FREE(kill);
1243 }
1244 
1245 TEST_F(precedence, allow_ok)
1246 {
1247 	pid_t parent, res = 0;
1248 	long ret;
1249 
1250 	parent = getppid();
1251 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1252 	ASSERT_EQ(0, ret);
1253 
1254 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1255 	ASSERT_EQ(0, ret);
1256 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1257 	ASSERT_EQ(0, ret);
1258 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1259 	ASSERT_EQ(0, ret);
1260 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1261 	ASSERT_EQ(0, ret);
1262 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1263 	ASSERT_EQ(0, ret);
1264 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1265 	ASSERT_EQ(0, ret);
1266 	/* Should work just fine. */
1267 	res = syscall(__NR_getppid);
1268 	EXPECT_EQ(parent, res);
1269 }
1270 
1271 TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
1272 {
1273 	pid_t parent, res = 0;
1274 	long ret;
1275 
1276 	parent = getppid();
1277 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1278 	ASSERT_EQ(0, ret);
1279 
1280 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1281 	ASSERT_EQ(0, ret);
1282 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1283 	ASSERT_EQ(0, ret);
1284 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1285 	ASSERT_EQ(0, ret);
1286 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1287 	ASSERT_EQ(0, ret);
1288 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1289 	ASSERT_EQ(0, ret);
1290 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1291 	ASSERT_EQ(0, ret);
1292 	/* Should work just fine. */
1293 	res = syscall(__NR_getppid);
1294 	EXPECT_EQ(parent, res);
1295 	/* getpid() should never return. */
1296 	res = syscall(__NR_getpid);
1297 	EXPECT_EQ(0, res);
1298 }
1299 
1300 TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
1301 {
1302 	pid_t parent;
1303 	long ret;
1304 
1305 	parent = getppid();
1306 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1307 	ASSERT_EQ(0, ret);
1308 
1309 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1310 	ASSERT_EQ(0, ret);
1311 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
1312 	ASSERT_EQ(0, ret);
1313 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1314 	ASSERT_EQ(0, ret);
1315 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1316 	ASSERT_EQ(0, ret);
1317 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1318 	ASSERT_EQ(0, ret);
1319 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1320 	ASSERT_EQ(0, ret);
1321 	/* Should work just fine. */
1322 	EXPECT_EQ(parent, syscall(__NR_getppid));
1323 	/* getpid() should never return. */
1324 	EXPECT_EQ(0, syscall(__NR_getpid));
1325 }
1326 
1327 TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
1328 {
1329 	pid_t parent;
1330 	long ret;
1331 
1332 	parent = getppid();
1333 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1334 	ASSERT_EQ(0, ret);
1335 
1336 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1337 	ASSERT_EQ(0, ret);
1338 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1339 	ASSERT_EQ(0, ret);
1340 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1341 	ASSERT_EQ(0, ret);
1342 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1343 	ASSERT_EQ(0, ret);
1344 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1345 	ASSERT_EQ(0, ret);
1346 	/* Should work just fine. */
1347 	EXPECT_EQ(parent, syscall(__NR_getppid));
1348 	/* getpid() should never return. */
1349 	EXPECT_EQ(0, syscall(__NR_getpid));
1350 }
1351 
1352 TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
1353 {
1354 	pid_t parent;
1355 	long ret;
1356 
1357 	parent = getppid();
1358 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1359 	ASSERT_EQ(0, ret);
1360 
1361 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1362 	ASSERT_EQ(0, ret);
1363 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
1364 	ASSERT_EQ(0, ret);
1365 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1366 	ASSERT_EQ(0, ret);
1367 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1368 	ASSERT_EQ(0, ret);
1369 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1370 	ASSERT_EQ(0, ret);
1371 	/* Should work just fine. */
1372 	EXPECT_EQ(parent, syscall(__NR_getppid));
1373 	/* getpid() should never return. */
1374 	EXPECT_EQ(0, syscall(__NR_getpid));
1375 }
1376 
1377 TEST_F(precedence, errno_is_third)
1378 {
1379 	pid_t parent;
1380 	long ret;
1381 
1382 	parent = getppid();
1383 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1384 	ASSERT_EQ(0, ret);
1385 
1386 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1387 	ASSERT_EQ(0, ret);
1388 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1389 	ASSERT_EQ(0, ret);
1390 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1391 	ASSERT_EQ(0, ret);
1392 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1393 	ASSERT_EQ(0, ret);
1394 	/* Should work just fine. */
1395 	EXPECT_EQ(parent, syscall(__NR_getppid));
1396 	EXPECT_EQ(0, syscall(__NR_getpid));
1397 }
1398 
1399 TEST_F(precedence, errno_is_third_in_any_order)
1400 {
1401 	pid_t parent;
1402 	long ret;
1403 
1404 	parent = getppid();
1405 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1406 	ASSERT_EQ(0, ret);
1407 
1408 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1409 	ASSERT_EQ(0, ret);
1410 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
1411 	ASSERT_EQ(0, ret);
1412 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1413 	ASSERT_EQ(0, ret);
1414 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1415 	ASSERT_EQ(0, ret);
1416 	/* Should work just fine. */
1417 	EXPECT_EQ(parent, syscall(__NR_getppid));
1418 	EXPECT_EQ(0, syscall(__NR_getpid));
1419 }
1420 
1421 TEST_F(precedence, trace_is_fourth)
1422 {
1423 	pid_t parent;
1424 	long ret;
1425 
1426 	parent = getppid();
1427 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1428 	ASSERT_EQ(0, ret);
1429 
1430 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1431 	ASSERT_EQ(0, ret);
1432 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1433 	ASSERT_EQ(0, ret);
1434 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1435 	ASSERT_EQ(0, ret);
1436 	/* Should work just fine. */
1437 	EXPECT_EQ(parent, syscall(__NR_getppid));
1438 	/* No ptracer */
1439 	EXPECT_EQ(-1, syscall(__NR_getpid));
1440 }
1441 
1442 TEST_F(precedence, trace_is_fourth_in_any_order)
1443 {
1444 	pid_t parent;
1445 	long ret;
1446 
1447 	parent = getppid();
1448 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1449 	ASSERT_EQ(0, ret);
1450 
1451 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
1452 	ASSERT_EQ(0, ret);
1453 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1454 	ASSERT_EQ(0, ret);
1455 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1456 	ASSERT_EQ(0, ret);
1457 	/* Should work just fine. */
1458 	EXPECT_EQ(parent, syscall(__NR_getppid));
1459 	/* No ptracer */
1460 	EXPECT_EQ(-1, syscall(__NR_getpid));
1461 }
1462 
1463 TEST_F(precedence, log_is_fifth)
1464 {
1465 	pid_t mypid, parent;
1466 	long ret;
1467 
1468 	mypid = getpid();
1469 	parent = getppid();
1470 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1471 	ASSERT_EQ(0, ret);
1472 
1473 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1474 	ASSERT_EQ(0, ret);
1475 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1476 	ASSERT_EQ(0, ret);
1477 	/* Should work just fine. */
1478 	EXPECT_EQ(parent, syscall(__NR_getppid));
1479 	/* Should also work just fine */
1480 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1481 }
1482 
1483 TEST_F(precedence, log_is_fifth_in_any_order)
1484 {
1485 	pid_t mypid, parent;
1486 	long ret;
1487 
1488 	mypid = getpid();
1489 	parent = getppid();
1490 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1491 	ASSERT_EQ(0, ret);
1492 
1493 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
1494 	ASSERT_EQ(0, ret);
1495 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
1496 	ASSERT_EQ(0, ret);
1497 	/* Should work just fine. */
1498 	EXPECT_EQ(parent, syscall(__NR_getppid));
1499 	/* Should also work just fine */
1500 	EXPECT_EQ(mypid, syscall(__NR_getpid));
1501 }
1502 
1503 #ifndef PTRACE_O_TRACESECCOMP
1504 #define PTRACE_O_TRACESECCOMP	0x00000080
1505 #endif
1506 
1507 /* Catch the Ubuntu 12.04 value error. */
1508 #if PTRACE_EVENT_SECCOMP != 7
1509 #undef PTRACE_EVENT_SECCOMP
1510 #endif
1511 
1512 #ifndef PTRACE_EVENT_SECCOMP
1513 #define PTRACE_EVENT_SECCOMP 7
1514 #endif
1515 
1516 #define PTRACE_EVENT_MASK(status) ((status) >> 16)
1517 bool tracer_running;
1518 void tracer_stop(int sig)
1519 {
1520 	tracer_running = false;
1521 }
1522 
1523 typedef void tracer_func_t(struct __test_metadata *_metadata,
1524 			   pid_t tracee, int status, void *args);
1525 
1526 void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
1527 	    tracer_func_t tracer_func, void *args, bool ptrace_syscall)
1528 {
1529 	int ret = -1;
1530 	struct sigaction action = {
1531 		.sa_handler = tracer_stop,
1532 	};
1533 
1534 	/* Allow external shutdown. */
1535 	tracer_running = true;
1536 	ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
1537 
1538 	errno = 0;
1539 	while (ret == -1 && errno != EINVAL)
1540 		ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
1541 	ASSERT_EQ(0, ret) {
1542 		kill(tracee, SIGKILL);
1543 	}
1544 	/* Wait for attach stop */
1545 	wait(NULL);
1546 
1547 	ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
1548 						      PTRACE_O_TRACESYSGOOD :
1549 						      PTRACE_O_TRACESECCOMP);
1550 	ASSERT_EQ(0, ret) {
1551 		TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
1552 		kill(tracee, SIGKILL);
1553 	}
1554 	ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1555 		     tracee, NULL, 0);
1556 	ASSERT_EQ(0, ret);
1557 
1558 	/* Unblock the tracee */
1559 	ASSERT_EQ(1, write(fd, "A", 1));
1560 	ASSERT_EQ(0, close(fd));
1561 
1562 	/* Run until we're shut down. Must assert to stop execution. */
1563 	while (tracer_running) {
1564 		int status;
1565 
1566 		if (wait(&status) != tracee)
1567 			continue;
1568 
1569 		if (WIFSIGNALED(status)) {
1570 			/* Child caught a fatal signal. */
1571 			return;
1572 		}
1573 		if (WIFEXITED(status)) {
1574 			/* Child exited with code. */
1575 			return;
1576 		}
1577 
1578 		/* Check if we got an expected event. */
1579 		ASSERT_EQ(WIFCONTINUED(status), false);
1580 		ASSERT_EQ(WIFSTOPPED(status), true);
1581 		ASSERT_EQ(WSTOPSIG(status) & SIGTRAP, SIGTRAP) {
1582 			TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
1583 		}
1584 
1585 		tracer_func(_metadata, tracee, status, args);
1586 
1587 		ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
1588 			     tracee, NULL, 0);
1589 		ASSERT_EQ(0, ret);
1590 	}
1591 	/* Directly report the status of our test harness results. */
1592 	syscall(__NR_exit, _metadata->exit_code);
1593 }
1594 
1595 /* Common tracer setup/teardown functions. */
1596 void cont_handler(int num)
1597 { }
1598 pid_t setup_trace_fixture(struct __test_metadata *_metadata,
1599 			  tracer_func_t func, void *args, bool ptrace_syscall)
1600 {
1601 	char sync;
1602 	int pipefd[2];
1603 	pid_t tracer_pid;
1604 	pid_t tracee = getpid();
1605 
1606 	/* Setup a pipe for clean synchronization. */
1607 	ASSERT_EQ(0, pipe(pipefd));
1608 
1609 	/* Fork a child which we'll promote to tracer */
1610 	tracer_pid = fork();
1611 	ASSERT_LE(0, tracer_pid);
1612 	signal(SIGALRM, cont_handler);
1613 	if (tracer_pid == 0) {
1614 		close(pipefd[0]);
1615 		start_tracer(_metadata, pipefd[1], tracee, func, args,
1616 			     ptrace_syscall);
1617 		syscall(__NR_exit, 0);
1618 	}
1619 	close(pipefd[1]);
1620 	prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
1621 	read(pipefd[0], &sync, 1);
1622 	close(pipefd[0]);
1623 
1624 	return tracer_pid;
1625 }
1626 
1627 void teardown_trace_fixture(struct __test_metadata *_metadata,
1628 			    pid_t tracer)
1629 {
1630 	if (tracer) {
1631 		int status;
1632 		ASSERT_EQ(0, kill(tracer, SIGUSR1));
1633 		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
1634 	}
1635 }
1636 
1637 /* "poke" tracer arguments and function. */
1638 struct tracer_args_poke_t {
1639 	unsigned long poke_addr;
1640 };
1641 
1642 void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
1643 		 void *args)
1644 {
1645 	int ret;
1646 	unsigned long msg;
1647 	struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
1648 
1649 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
1650 	EXPECT_EQ(0, ret);
1651 	/* If this fails, don't try to recover. */
1652 	ASSERT_EQ(0x1001, msg) {
1653 		kill(tracee, SIGKILL);
1654 	}
1655 	/*
1656 	 * Poke in the message.
1657 	 * Registers are not touched to try to keep this relatively arch
1658 	 * agnostic.
1659 	 */
1660 	ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
1661 	EXPECT_EQ(0, ret);
1662 }
1663 
1664 FIXTURE(TRACE_poke) {
1665 	struct sock_fprog prog;
1666 	pid_t tracer;
1667 	long poked;
1668 	struct tracer_args_poke_t tracer_args;
1669 };
1670 
1671 FIXTURE_SETUP(TRACE_poke)
1672 {
1673 	struct sock_filter filter[] = {
1674 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
1675 			offsetof(struct seccomp_data, nr)),
1676 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
1677 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
1678 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
1679 	};
1680 
1681 	self->poked = 0;
1682 	memset(&self->prog, 0, sizeof(self->prog));
1683 	self->prog.filter = malloc(sizeof(filter));
1684 	ASSERT_NE(NULL, self->prog.filter);
1685 	memcpy(self->prog.filter, filter, sizeof(filter));
1686 	self->prog.len = (unsigned short)ARRAY_SIZE(filter);
1687 
1688 	/* Set up tracer args. */
1689 	self->tracer_args.poke_addr = (unsigned long)&self->poked;
1690 
1691 	/* Launch tracer. */
1692 	self->tracer = setup_trace_fixture(_metadata, tracer_poke,
1693 					   &self->tracer_args, false);
1694 }
1695 
1696 FIXTURE_TEARDOWN(TRACE_poke)
1697 {
1698 	teardown_trace_fixture(_metadata, self->tracer);
1699 	if (self->prog.filter)
1700 		free(self->prog.filter);
1701 }
1702 
1703 TEST_F(TRACE_poke, read_has_side_effects)
1704 {
1705 	ssize_t ret;
1706 
1707 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1708 	ASSERT_EQ(0, ret);
1709 
1710 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1711 	ASSERT_EQ(0, ret);
1712 
1713 	EXPECT_EQ(0, self->poked);
1714 	ret = read(-1, NULL, 0);
1715 	EXPECT_EQ(-1, ret);
1716 	EXPECT_EQ(0x1001, self->poked);
1717 }
1718 
1719 TEST_F(TRACE_poke, getpid_runs_normally)
1720 {
1721 	long ret;
1722 
1723 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
1724 	ASSERT_EQ(0, ret);
1725 
1726 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
1727 	ASSERT_EQ(0, ret);
1728 
1729 	EXPECT_EQ(0, self->poked);
1730 	EXPECT_NE(0, syscall(__NR_getpid));
1731 	EXPECT_EQ(0, self->poked);
1732 }
1733 
1734 #if defined(__x86_64__)
1735 # define ARCH_REGS		struct user_regs_struct
1736 # define SYSCALL_NUM(_regs)	(_regs).orig_rax
1737 # define SYSCALL_RET(_regs)	(_regs).rax
1738 #elif defined(__i386__)
1739 # define ARCH_REGS		struct user_regs_struct
1740 # define SYSCALL_NUM(_regs)	(_regs).orig_eax
1741 # define SYSCALL_RET(_regs)	(_regs).eax
1742 #elif defined(__arm__)
1743 # define ARCH_REGS		struct pt_regs
1744 # define SYSCALL_NUM(_regs)	(_regs).ARM_r7
1745 # ifndef PTRACE_SET_SYSCALL
1746 #  define PTRACE_SET_SYSCALL   23
1747 # endif
1748 # define SYSCALL_NUM_SET(_regs, _nr)	\
1749 		EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
1750 # define SYSCALL_RET(_regs)	(_regs).ARM_r0
1751 #elif defined(__aarch64__)
1752 # define ARCH_REGS		struct user_pt_regs
1753 # define SYSCALL_NUM(_regs)	(_regs).regs[8]
1754 # ifndef NT_ARM_SYSTEM_CALL
1755 #  define NT_ARM_SYSTEM_CALL 0x404
1756 # endif
1757 # define SYSCALL_NUM_SET(_regs, _nr)				\
1758 	do {							\
1759 		struct iovec __v;				\
1760 		typeof(_nr) __nr = (_nr);			\
1761 		__v.iov_base = &__nr;				\
1762 		__v.iov_len = sizeof(__nr);			\
1763 		EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee,	\
1764 				    NT_ARM_SYSTEM_CALL, &__v));	\
1765 	} while (0)
1766 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1767 #elif defined(__loongarch__)
1768 # define ARCH_REGS		struct user_pt_regs
1769 # define SYSCALL_NUM(_regs)	(_regs).regs[11]
1770 # define SYSCALL_RET(_regs)	(_regs).regs[4]
1771 #elif defined(__riscv) && __riscv_xlen == 64
1772 # define ARCH_REGS		struct user_regs_struct
1773 # define SYSCALL_NUM(_regs)	(_regs).a7
1774 # define SYSCALL_RET(_regs)	(_regs).a0
1775 #elif defined(__csky__)
1776 # define ARCH_REGS		struct pt_regs
1777 #  if defined(__CSKYABIV2__)
1778 #   define SYSCALL_NUM(_regs)	(_regs).regs[3]
1779 #  else
1780 #   define SYSCALL_NUM(_regs)	(_regs).regs[9]
1781 #  endif
1782 # define SYSCALL_RET(_regs)	(_regs).a0
1783 #elif defined(__hppa__)
1784 # define ARCH_REGS		struct user_regs_struct
1785 # define SYSCALL_NUM(_regs)	(_regs).gr[20]
1786 # define SYSCALL_RET(_regs)	(_regs).gr[28]
1787 #elif defined(__powerpc__)
1788 # define ARCH_REGS		struct pt_regs
1789 # define SYSCALL_NUM(_regs)	(_regs).gpr[0]
1790 # define SYSCALL_RET(_regs)	(_regs).gpr[3]
1791 # define SYSCALL_RET_SET(_regs, _val)				\
1792 	do {							\
1793 		typeof(_val) _result = (_val);			\
1794 		if ((_regs.trap & 0xfff0) == 0x3000) {		\
1795 			/*					\
1796 			 * scv 0 system call uses -ve result	\
1797 			 * for error, so no need to adjust.	\
1798 			 */					\
1799 			SYSCALL_RET(_regs) = _result;		\
1800 		} else {					\
1801 			/*					\
1802 			 * A syscall error is signaled by the	\
1803 			 * CR0 SO bit and the code is stored as	\
1804 			 * a positive value.			\
1805 			 */					\
1806 			if (_result < 0) {			\
1807 				SYSCALL_RET(_regs) = -_result;	\
1808 				(_regs).ccr |= 0x10000000;	\
1809 			} else {				\
1810 				SYSCALL_RET(_regs) = _result;	\
1811 				(_regs).ccr &= ~0x10000000;	\
1812 			}					\
1813 		}						\
1814 	} while (0)
1815 # define SYSCALL_RET_SET_ON_PTRACE_EXIT
1816 #elif defined(__s390__)
1817 # define ARCH_REGS		s390_regs
1818 # define SYSCALL_NUM(_regs)	(_regs).gprs[2]
1819 # define SYSCALL_RET_SET(_regs, _val)			\
1820 		TH_LOG("Can't modify syscall return on this architecture")
1821 #elif defined(__mips__)
1822 # include <asm/unistd_nr_n32.h>
1823 # include <asm/unistd_nr_n64.h>
1824 # include <asm/unistd_nr_o32.h>
1825 # define ARCH_REGS		struct pt_regs
1826 # define SYSCALL_NUM(_regs)				\
1827 	({						\
1828 		typeof((_regs).regs[2]) _nr;		\
1829 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1830 			_nr = (_regs).regs[4];		\
1831 		else					\
1832 			_nr = (_regs).regs[2];		\
1833 		_nr;					\
1834 	})
1835 # define SYSCALL_NUM_SET(_regs, _nr)			\
1836 	do {						\
1837 		if ((_regs).regs[2] == __NR_O32_Linux)	\
1838 			(_regs).regs[4] = _nr;		\
1839 		else					\
1840 			(_regs).regs[2] = _nr;		\
1841 	} while (0)
1842 # define SYSCALL_RET_SET(_regs, _val)			\
1843 		TH_LOG("Can't modify syscall return on this architecture")
1844 #elif defined(__xtensa__)
1845 # define ARCH_REGS		struct user_pt_regs
1846 # define SYSCALL_NUM(_regs)	(_regs).syscall
1847 /*
1848  * On xtensa, the syscall return value is in register
1849  * a2 of the current window, which is not fixed.
1850  */
1851 # define SYSCALL_RET(_regs)	(_regs).a[(_regs).windowbase * 4 + 2]
1852 #elif defined(__sh__)
1853 # define ARCH_REGS		struct pt_regs
1854 # define SYSCALL_NUM(_regs)	(_regs).regs[3]
1855 # define SYSCALL_RET(_regs)	(_regs).regs[0]
1856 #elif defined(__mc68000__)
1857 # define ARCH_REGS		struct user_regs_struct
1858 # define SYSCALL_NUM(_regs)	(_regs).orig_d0
1859 # define SYSCALL_RET(_regs)	(_regs).d0
1860 #else
1861 # error "Do not know how to find your architecture's registers and syscalls"
1862 #endif
1863 
1864 /*
1865  * Most architectures can change the syscall by just updating the
1866  * associated register. This is the default if not defined above.
1867  */
1868 #ifndef SYSCALL_NUM_SET
1869 # define SYSCALL_NUM_SET(_regs, _nr)		\
1870 	do {					\
1871 		SYSCALL_NUM(_regs) = (_nr);	\
1872 	} while (0)
1873 #endif
1874 /*
1875  * Most architectures can change the syscall return value by just
1876  * writing to the SYSCALL_RET register. This is the default if not
1877  * defined above. If an architecture cannot set the return value
1878  * (for example when the syscall and return value register is
1879  * shared), report it with TH_LOG() in an arch-specific definition
1880  * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
1881  */
1882 #if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
1883 # error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
1884 #endif
1885 #ifndef SYSCALL_RET_SET
1886 # define SYSCALL_RET_SET(_regs, _val)		\
1887 	do {					\
1888 		SYSCALL_RET(_regs) = (_val);	\
1889 	} while (0)
1890 #endif
1891 
1892 /* When the syscall return can't be changed, stub out the tests for it. */
1893 #ifndef SYSCALL_RET
1894 # define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
1895 #else
1896 # define EXPECT_SYSCALL_RETURN(val, action)		\
1897 	do {						\
1898 		errno = 0;				\
1899 		if (val < 0) {				\
1900 			EXPECT_EQ(-1, action);		\
1901 			EXPECT_EQ(-(val), errno);	\
1902 		} else {				\
1903 			EXPECT_EQ(val, action);		\
1904 		}					\
1905 	} while (0)
1906 #endif
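/*
 * For example, EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat)) expects
 * the call to fail with errno ESRCH, while a non-negative value expects the
 * call to return that value directly (or just -1 when SYSCALL_RET is
 * unavailable and the check is stubbed out above).
 */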
1907 
1908 /*
1909  * Some architectures (e.g. powerpc) can only set syscall
1910  * return values on syscall exit during ptrace.
1911  */
1912 const bool ptrace_entry_set_syscall_nr = true;
1913 const bool ptrace_entry_set_syscall_ret =
1914 #ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
1915 	true;
1916 #else
1917 	false;
1918 #endif
1919 
1920 /*
1921  * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
1922  * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
1923  */
1924 #if defined(__x86_64__) || defined(__i386__) || defined(__mips__) || defined(__mc68000__)
1925 # define ARCH_GETREGS(_regs)	ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
1926 # define ARCH_SETREGS(_regs)	ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
1927 #else
1928 # define ARCH_GETREGS(_regs)	({					\
1929 		struct iovec __v;					\
1930 		__v.iov_base = &(_regs);				\
1931 		__v.iov_len = sizeof(_regs);				\
1932 		ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v);	\
1933 	})
1934 # define ARCH_SETREGS(_regs)	({					\
1935 		struct iovec __v;					\
1936 		__v.iov_base = &(_regs);				\
1937 		__v.iov_len = sizeof(_regs);				\
1938 		ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v);	\
1939 	})
1940 #endif
1941 
1942 /* Architecture-specific syscall fetching routine. */
1943 int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
1944 {
1945 	ARCH_REGS regs;
1946 
1947 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1948 		return -1;
1949 	}
1950 
1951 	return SYSCALL_NUM(regs);
1952 }
1953 
1954 /* Architecture-specific syscall changing routine. */
1955 void __change_syscall(struct __test_metadata *_metadata,
1956 		    pid_t tracee, long *syscall, long *ret)
1957 {
1958 	ARCH_REGS orig, regs;
1959 
1960 	/* Do not get/set registers if we have nothing to do. */
1961 	if (!syscall && !ret)
1962 		return;
1963 
1964 	EXPECT_EQ(0, ARCH_GETREGS(regs)) {
1965 		return;
1966 	}
1967 	orig = regs;
1968 
1969 	if (syscall)
1970 		SYSCALL_NUM_SET(regs, *syscall);
1971 
1972 	if (ret)
1973 		SYSCALL_RET_SET(regs, *ret);
1974 
1975 	/* Flush any register changes made. */
1976 	if (memcmp(&orig, &regs, sizeof(orig)) != 0)
1977 		EXPECT_EQ(0, ARCH_SETREGS(regs));
1978 }
1979 
1980 /* Change only syscall number. */
1981 void change_syscall_nr(struct __test_metadata *_metadata,
1982 		       pid_t tracee, long syscall)
1983 {
1984 	__change_syscall(_metadata, tracee, &syscall, NULL);
1985 }
1986 
1987 /* Change syscall return value (and set syscall number to -1). */
1988 void change_syscall_ret(struct __test_metadata *_metadata,
1989 			pid_t tracee, long ret)
1990 {
1991 	long syscall = -1;
1992 
1993 	__change_syscall(_metadata, tracee, &syscall, &ret);
1994 }
1995 
1996 void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
1997 		    int status, void *args)
1998 {
1999 	int ret;
2000 	unsigned long msg;
2001 
2002 	EXPECT_EQ(PTRACE_EVENT_MASK(status), PTRACE_EVENT_SECCOMP) {
2003 		TH_LOG("Unexpected ptrace event: %d", PTRACE_EVENT_MASK(status));
2004 		return;
2005 	}
2006 
2007 	/* Make sure we got the right message (the filter's SECCOMP_RET_DATA). */
2008 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2009 	EXPECT_EQ(0, ret);
2010 
2011 	/* Validate and take action on expected syscalls. */
2012 	switch (msg) {
2013 	case 0x1002:
2014 		/* change getpid to getppid. */
2015 		EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
2016 		change_syscall_nr(_metadata, tracee, __NR_getppid);
2017 		break;
2018 	case 0x1003:
2019 		/* skip gettid with valid return code. */
2020 		EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
2021 		change_syscall_ret(_metadata, tracee, 45000);
2022 		break;
2023 	case 0x1004:
2024 		/* skip openat with error. */
2025 		EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
2026 		change_syscall_ret(_metadata, tracee, -ESRCH);
2027 		break;
2028 	case 0x1005:
2029 		/* do nothing (allow getppid) */
2030 		EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
2031 		break;
2032 	default:
2033 		EXPECT_EQ(0, msg) {
2034 			TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
2035 			kill(tracee, SIGKILL);
2036 		}
2037 	}
2038 
2039 }
2040 
2041 FIXTURE(TRACE_syscall) {
2042 	struct sock_fprog prog;
2043 	pid_t tracer, mytid, mypid, parent;
2044 	long syscall_nr;
2045 };
2046 
2047 void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
2048 		   int status, void *args)
2049 {
2050 	int ret;
2051 	unsigned long msg;
2052 	static bool entry;
2053 	long syscall_nr_val, syscall_ret_val;
2054 	long *syscall_nr = NULL, *syscall_ret = NULL;
2055 	FIXTURE_DATA(TRACE_syscall) *self = args;
2056 
2057 	EXPECT_EQ(WSTOPSIG(status) & 0x80, 0x80) {
2058 		TH_LOG("Unexpected WSTOPSIG: %d", WSTOPSIG(status));
2059 		return;
2060 	}
2061 
2062 	/*
2063 	 * The traditional way to tell PTRACE_SYSCALL entry/exit
2064 	 * is by counting.
2065 	 */
2066 	entry = !entry;
2067 
2068 	/* Make sure we got an appropriate message. */
2069 	ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
2070 	EXPECT_EQ(0, ret);
2071 	EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
2072 			: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
2073 
2074 	/*
2075 	 * Some architectures only support setting return values during
2076 	 * syscall exit under ptrace, and on exit the syscall number may
2077 	 * no longer be available. Therefore, save the initial syscall
2078 	 * number here, so it can be examined during both entry and exit
2079 	 * phases.
2080 	 */
2081 	if (entry)
2082 		self->syscall_nr = get_syscall(_metadata, tracee);
2083 
2084 	/*
2085 	 * Depending on the architecture's syscall setting abilities, we
2086 	 * pick which things to set during this phase (entry or exit).
2087 	 */
2088 	if (entry == ptrace_entry_set_syscall_nr)
2089 		syscall_nr = &syscall_nr_val;
2090 	if (entry == ptrace_entry_set_syscall_ret)
2091 		syscall_ret = &syscall_ret_val;
2092 
2093 	/* Now handle the actual rewriting cases. */
2094 	switch (self->syscall_nr) {
2095 	case __NR_getpid:
2096 		syscall_nr_val = __NR_getppid;
2097 		/* Never change syscall return for this case. */
2098 		syscall_ret = NULL;
2099 		break;
2100 	case __NR_gettid:
2101 		syscall_nr_val = -1;
2102 		syscall_ret_val = 45000;
2103 		break;
2104 	case __NR_openat:
2105 		syscall_nr_val = -1;
2106 		syscall_ret_val = -ESRCH;
2107 		break;
2108 	default:
2109 		/* Unhandled, do nothing. */
2110 		return;
2111 	}
2112 
2113 	__change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
2114 }
2115 
2116 FIXTURE_VARIANT(TRACE_syscall) {
2117 	/*
2118 	 * All of the SECCOMP_RET_TRACE behaviors can be tested with either
2119 	 * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
2120 	 * This indicates if we should use SECCOMP_RET_TRACE (false), or
2121 	 * ptrace (true).
2122 	 */
2123 	bool use_ptrace;
2124 };
2125 
2126 FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
2127 	.use_ptrace = true,
2128 };
2129 
2130 FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
2131 	.use_ptrace = false,
2132 };
2133 
2134 FIXTURE_SETUP(TRACE_syscall)
2135 {
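	/*
	 * Trace getpid/gettid/openat/getppid with distinct SECCOMP_RET_DATA
	 * values (0x1002-0x1005); tracer_seccomp() reads these back via
	 * PTRACE_GETEVENTMSG to decide how to rewrite each syscall.
	 */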
2136 	struct sock_filter filter[] = {
2137 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2138 			offsetof(struct seccomp_data, nr)),
2139 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
2140 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
2141 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
2142 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
2143 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
2144 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
2145 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2146 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
2147 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2148 	};
2149 	struct sock_fprog prog = {
2150 		.len = (unsigned short)ARRAY_SIZE(filter),
2151 		.filter = filter,
2152 	};
2153 	long ret;
2154 
2155 	/* Prepare some testable syscall results. */
2156 	self->mytid = syscall(__NR_gettid);
2157 	ASSERT_GT(self->mytid, 0);
2158 	ASSERT_NE(self->mytid, 1) {
2159 		TH_LOG("Running this test as init is not supported. :)");
2160 	}
2161 
2162 	self->mypid = getpid();
2163 	ASSERT_GT(self->mypid, 0);
2164 	ASSERT_EQ(self->mytid, self->mypid);
2165 
2166 	self->parent = getppid();
2167 	ASSERT_GT(self->parent, 0);
2168 	ASSERT_NE(self->parent, self->mypid);
2169 
2170 	/* Launch tracer. */
2171 	self->tracer = setup_trace_fixture(_metadata,
2172 					   variant->use_ptrace ? tracer_ptrace
2173 							       : tracer_seccomp,
2174 					   self, variant->use_ptrace);
2175 
2176 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2177 	ASSERT_EQ(0, ret);
2178 
2179 	/* Do not install seccomp rewrite filters, as we'll use ptrace instead. */
2180 	if (variant->use_ptrace)
2181 		return;
2182 
2183 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2184 	ASSERT_EQ(0, ret);
2185 }
2186 
2187 FIXTURE_TEARDOWN(TRACE_syscall)
2188 {
2189 	teardown_trace_fixture(_metadata, self->tracer);
2190 }
2191 
2192 TEST(negative_ENOSYS)
2193 {
2194 #if defined(__arm__)
2195 	SKIP(return, "arm32 does not support calling syscall -1");
2196 #endif
2197 	/*
2198 	 * There should be no difference between an "internal" skip
2199 	 * and userspace asking for syscall "-1".
2200 	 */
2201 	errno = 0;
2202 	EXPECT_EQ(-1, syscall(-1));
2203 	EXPECT_EQ(errno, ENOSYS);
2204 	/* And no difference for "still not valid but not -1". */
2205 	errno = 0;
2206 	EXPECT_EQ(-1, syscall(-101));
2207 	EXPECT_EQ(errno, ENOSYS);
2208 }
2209 
2210 TEST_F(TRACE_syscall, negative_ENOSYS)
2211 {
2212 	negative_ENOSYS(_metadata);
2213 }
2214 
2215 TEST_F(TRACE_syscall, syscall_allowed)
2216 {
2217 	/* getppid works as expected (no changes). */
2218 	EXPECT_EQ(self->parent, syscall(__NR_getppid));
2219 	EXPECT_NE(self->mypid, syscall(__NR_getppid));
2220 }
2221 
2222 TEST_F(TRACE_syscall, syscall_redirected)
2223 {
2224 	/* getpid has been redirected to getppid as expected. */
2225 	EXPECT_EQ(self->parent, syscall(__NR_getpid));
2226 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2227 }
2228 
2229 TEST_F(TRACE_syscall, syscall_errno)
2230 {
2231 	/* Tracer should skip the open syscall, resulting in ESRCH. */
2232 	EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
2233 }
2234 
2235 TEST_F(TRACE_syscall, syscall_faked)
2236 {
2237 	/* Tracer skips the gettid syscall and stores an altered return value. */
2238 	EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
2239 }
2240 
2241 TEST_F_SIGNAL(TRACE_syscall, kill_immediate, SIGSYS)
2242 {
2243 	struct sock_filter filter[] = {
2244 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2245 			offsetof(struct seccomp_data, nr)),
2246 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_mknodat, 0, 1),
2247 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
2248 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2249 	};
2250 	struct sock_fprog prog = {
2251 		.len = (unsigned short)ARRAY_SIZE(filter),
2252 		.filter = filter,
2253 	};
2254 	long ret;
2255 
2256 	/* Install "kill on mknodat" filter. */
2257 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2258 	ASSERT_EQ(0, ret);
2259 
2260 	/* This should immediately die with SIGSYS, regardless of tracer. */
2261 	EXPECT_EQ(-1, syscall(__NR_mknodat, -1, NULL, 0, 0));
2262 }
2263 
2264 TEST_F(TRACE_syscall, skip_after)
2265 {
2266 	struct sock_filter filter[] = {
2267 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2268 			offsetof(struct seccomp_data, nr)),
2269 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2270 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
2271 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2272 	};
2273 	struct sock_fprog prog = {
2274 		.len = (unsigned short)ARRAY_SIZE(filter),
2275 		.filter = filter,
2276 	};
2277 	long ret;
2278 
2279 	/* Install additional "errno on getppid" filter. */
2280 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2281 	ASSERT_EQ(0, ret);
2282 
2283 	/* Tracer will redirect getpid to getppid, and we should see EPERM. */
2284 	errno = 0;
2285 	EXPECT_EQ(-1, syscall(__NR_getpid));
2286 	EXPECT_EQ(EPERM, errno);
2287 }
2288 
2289 TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
2290 {
2291 	struct sock_filter filter[] = {
2292 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2293 			offsetof(struct seccomp_data, nr)),
2294 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
2295 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2296 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2297 	};
2298 	struct sock_fprog prog = {
2299 		.len = (unsigned short)ARRAY_SIZE(filter),
2300 		.filter = filter,
2301 	};
2302 	long ret;
2303 
2304 	/* Install additional "death on getppid" filter. */
2305 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
2306 	ASSERT_EQ(0, ret);
2307 
2308 	/* Tracer will redirect getpid to getppid, and we should die. */
2309 	EXPECT_NE(self->mypid, syscall(__NR_getpid));
2310 }
2311 
2312 TEST(seccomp_syscall)
2313 {
2314 	struct sock_filter filter[] = {
2315 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2316 	};
2317 	struct sock_fprog prog = {
2318 		.len = (unsigned short)ARRAY_SIZE(filter),
2319 		.filter = filter,
2320 	};
2321 	long ret;
2322 
2323 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
2324 	ASSERT_EQ(0, ret) {
2325 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2326 	}
2327 
2328 	/* Reject insane operation. */
2329 	ret = seccomp(-1, 0, &prog);
2330 	ASSERT_NE(ENOSYS, errno) {
2331 		TH_LOG("Kernel does not support seccomp syscall!");
2332 	}
2333 	EXPECT_EQ(EINVAL, errno) {
2334 		TH_LOG("Did not reject crazy op value!");
2335 	}
2336 
2337 	/* Reject strict with flags or pointer. */
2338 	ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
2339 	EXPECT_EQ(EINVAL, errno) {
2340 		TH_LOG("Did not reject mode strict with flags!");
2341 	}
2342 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
2343 	EXPECT_EQ(EINVAL, errno) {
2344 		TH_LOG("Did not reject mode strict with uargs!");
2345 	}
2346 
2347 	/* Reject insane args for filter. */
2348 	ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
2349 	EXPECT_EQ(EINVAL, errno) {
2350 		TH_LOG("Did not reject crazy filter flags!");
2351 	}
2352 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
2353 	EXPECT_EQ(EFAULT, errno) {
2354 		TH_LOG("Did not reject NULL filter!");
2355 	}
2356 
2357 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2358 	EXPECT_EQ(0, errno) {
2359 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
2360 			strerror(errno));
2361 	}
2362 }
2363 
2364 TEST(seccomp_syscall_mode_lock)
2365 {
2366 	struct sock_filter filter[] = {
2367 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2368 	};
2369 	struct sock_fprog prog = {
2370 		.len = (unsigned short)ARRAY_SIZE(filter),
2371 		.filter = filter,
2372 	};
2373 	long ret;
2374 
2375 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2376 	ASSERT_EQ(0, ret) {
2377 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2378 	}
2379 
2380 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2381 	ASSERT_NE(ENOSYS, errno) {
2382 		TH_LOG("Kernel does not support seccomp syscall!");
2383 	}
2384 	EXPECT_EQ(0, ret) {
2385 		TH_LOG("Could not install filter!");
2386 	}
2387 
2388 	/* Make sure neither entry point will switch to strict. */
2389 	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
2390 	EXPECT_EQ(EINVAL, errno) {
2391 		TH_LOG("Switched to mode strict!");
2392 	}
2393 
2394 	ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
2395 	EXPECT_EQ(EINVAL, errno) {
2396 		TH_LOG("Switched to mode strict!");
2397 	}
2398 }
2399 
2400 /*
2401  * Test detection of known and unknown filter flags. Userspace needs to be able
2402  * to check if a filter flag is supported by the current kernel and a good way
2403  * of doing that is by attempting to enter filter mode, with the flag bit in
2404  * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
2405  * that the flag is valid and EINVAL indicates that the flag is invalid.
2406  */
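/*
 * A minimal userspace probe along these lines (illustrative sketch only,
 * not used by the test below) could check a single flag:
 *
 *	static bool filter_flag_supported(unsigned int flag)
 *	{
 *		return seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL) == -1 &&
 *		       errno == EFAULT;
 *	}
 */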
2407 TEST(detect_seccomp_filter_flags)
2408 {
2409 	unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
2410 				 SECCOMP_FILTER_FLAG_LOG,
2411 				 SECCOMP_FILTER_FLAG_SPEC_ALLOW,
2412 				 SECCOMP_FILTER_FLAG_NEW_LISTENER,
2413 				 SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
2414 	unsigned int exclusive[] = {
2415 				SECCOMP_FILTER_FLAG_TSYNC,
2416 				SECCOMP_FILTER_FLAG_NEW_LISTENER };
2417 	unsigned int flag, all_flags, exclusive_mask;
2418 	int i;
2419 	long ret;
2420 
2421 	/* Test detection of individual known-good filter flags */
2422 	for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
2423 		int bits = 0;
2424 
2425 		flag = flags[i];
2426 		/* Make sure the flag is a single bit! */
2427 		while (flag) {
2428 			if (flag & 0x1)
2429 				bits++;
2430 			flag >>= 1;
2431 		}
2432 		ASSERT_EQ(1, bits);
2433 		flag = flags[i];
2434 
2435 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2436 		ASSERT_NE(ENOSYS, errno) {
2437 			TH_LOG("Kernel does not support seccomp syscall!");
2438 		}
2439 		EXPECT_EQ(-1, ret);
2440 		EXPECT_EQ(EFAULT, errno) {
2441 			TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
2442 			       flag);
2443 		}
2444 
2445 		all_flags |= flag;
2446 	}
2447 
2448 	/*
2449 	 * Test detection of all known-good filter flags combined. But
2450 	 * for the exclusive flags we need to mask them out and try them
2451 	 * individually for the "all flags" testing.
2452 	 */
2453 	exclusive_mask = 0;
2454 	for (i = 0; i < ARRAY_SIZE(exclusive); i++)
2455 		exclusive_mask |= exclusive[i];
2456 	for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
2457 		flag = all_flags & ~exclusive_mask;
2458 		flag |= exclusive[i];
2459 
2460 		ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2461 		EXPECT_EQ(-1, ret);
2462 		EXPECT_EQ(EFAULT, errno) {
2463 			TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
2464 			       flag);
2465 		}
2466 	}
2467 
2468 	/* Test detection of an unknown filter flag, without exclusives. */
2469 	flag = -1;
2470 	flag &= ~exclusive_mask;
2471 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2472 	EXPECT_EQ(-1, ret);
2473 	EXPECT_EQ(EINVAL, errno) {
2474 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
2475 		       flag);
2476 	}
2477 
2478 	/*
2479 	 * Test detection of an unknown filter flag that may simply need to be
2480 	 * added to this test
2481 	 */
2482 	flag = flags[ARRAY_SIZE(flags) - 1] << 1;
2483 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
2484 	EXPECT_EQ(-1, ret);
2485 	EXPECT_EQ(EINVAL, errno) {
2486 		TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
2487 		       flag);
2488 	}
2489 }
2490 
2491 TEST(TSYNC_first)
2492 {
2493 	struct sock_filter filter[] = {
2494 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2495 	};
2496 	struct sock_fprog prog = {
2497 		.len = (unsigned short)ARRAY_SIZE(filter),
2498 		.filter = filter,
2499 	};
2500 	long ret;
2501 
2502 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
2503 	ASSERT_EQ(0, ret) {
2504 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2505 	}
2506 
2507 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2508 		      &prog);
2509 	ASSERT_NE(ENOSYS, errno) {
2510 		TH_LOG("Kernel does not support seccomp syscall!");
2511 	}
2512 	EXPECT_EQ(0, ret) {
2513 		TH_LOG("Could not install initial filter with TSYNC!");
2514 	}
2515 }
2516 
2517 #define TSYNC_SIBLINGS 2
2518 struct tsync_sibling {
2519 	pthread_t tid;
2520 	pid_t system_tid;
2521 	sem_t *started;
2522 	pthread_cond_t *cond;
2523 	pthread_mutex_t *mutex;
2524 	int diverge;
2525 	int num_waits;
2526 	struct sock_fprog *prog;
2527 	struct __test_metadata *metadata;
2528 };
2529 
2530 /*
2531  * To avoid joining joined threads (which is not allowed by Bionic),
2532  * make sure we both successfully join and clear the tid to skip a
2533  * later join attempt during fixture teardown. Any remaining threads
2534  * will be directly killed during teardown.
2535  */
2536 #define PTHREAD_JOIN(tid, status)					\
2537 	do {								\
2538 		int _rc = pthread_join(tid, status);			\
2539 		if (_rc) {						\
2540 			TH_LOG("pthread_join of tid %u failed: %d\n",	\
2541 				(unsigned int)tid, _rc);		\
2542 		} else {						\
2543 			tid = 0;					\
2544 		}							\
2545 	} while (0)
2546 
2547 FIXTURE(TSYNC) {
2548 	struct sock_fprog root_prog, apply_prog;
2549 	struct tsync_sibling sibling[TSYNC_SIBLINGS];
2550 	sem_t started;
2551 	pthread_cond_t cond;
2552 	pthread_mutex_t mutex;
2553 	int sibling_count;
2554 };
2555 
2556 FIXTURE_SETUP(TSYNC)
2557 {
2558 	struct sock_filter root_filter[] = {
2559 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2560 	};
2561 	struct sock_filter apply_filter[] = {
2562 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2563 			offsetof(struct seccomp_data, nr)),
2564 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
2565 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
2566 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2567 	};
2568 
2569 	memset(&self->root_prog, 0, sizeof(self->root_prog));
2570 	memset(&self->apply_prog, 0, sizeof(self->apply_prog));
2571 	memset(&self->sibling, 0, sizeof(self->sibling));
2572 	self->root_prog.filter = malloc(sizeof(root_filter));
2573 	ASSERT_NE(NULL, self->root_prog.filter);
2574 	memcpy(self->root_prog.filter, &root_filter, sizeof(root_filter));
2575 	self->root_prog.len = (unsigned short)ARRAY_SIZE(root_filter);
2576 
2577 	self->apply_prog.filter = malloc(sizeof(apply_filter));
2578 	ASSERT_NE(NULL, self->apply_prog.filter);
2579 	memcpy(self->apply_prog.filter, &apply_filter, sizeof(apply_filter));
2580 	self->apply_prog.len = (unsigned short)ARRAY_SIZE(apply_filter);
2581 
2582 	self->sibling_count = 0;
2583 	pthread_mutex_init(&self->mutex, NULL);
2584 	pthread_cond_init(&self->cond, NULL);
2585 	sem_init(&self->started, 0, 0);
2586 	self->sibling[0].tid = 0;
2587 	self->sibling[0].cond = &self->cond;
2588 	self->sibling[0].started = &self->started;
2589 	self->sibling[0].mutex = &self->mutex;
2590 	self->sibling[0].diverge = 0;
2591 	self->sibling[0].num_waits = 1;
2592 	self->sibling[0].prog = &self->root_prog;
2593 	self->sibling[0].metadata = _metadata;
2594 	self->sibling[1].tid = 0;
2595 	self->sibling[1].cond = &self->cond;
2596 	self->sibling[1].started = &self->started;
2597 	self->sibling[1].mutex = &self->mutex;
2598 	self->sibling[1].diverge = 0;
2599 	self->sibling[1].prog = &self->root_prog;
2600 	self->sibling[1].num_waits = 1;
2601 	self->sibling[1].metadata = _metadata;
2602 }
2603 
2604 FIXTURE_TEARDOWN(TSYNC)
2605 {
2606 	int sib = 0;
2607 
2608 	if (self->root_prog.filter)
2609 		free(self->root_prog.filter);
2610 	if (self->apply_prog.filter)
2611 		free(self->apply_prog.filter);
2612 
2613 	for ( ; sib < self->sibling_count; ++sib) {
2614 		struct tsync_sibling *s = &self->sibling[sib];
2615 
2616 		if (!s->tid)
2617 			continue;
2618 		/*
2619 		 * If a thread is still running, it may be stuck, so hit
2620 		 * it over the head really hard.
2621 		 */
2622 		pthread_kill(s->tid, 9);
2623 	}
2624 	pthread_mutex_destroy(&self->mutex);
2625 	pthread_cond_destroy(&self->cond);
2626 	sem_destroy(&self->started);
2627 }
2628 
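/*
 * Sibling thread body: record the kernel TID, optionally diverge by
 * installing a private copy of the root filter, wait on the shared
 * condvar until num_waits drops to zero, then report SIBLING_EXIT_NEWPRIVS
 * if no_new_privs is missing, and finally issue a read() that is fatal
 * if the apply_prog filter was synchronized onto this thread.
 */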
2629 void *tsync_sibling(void *data)
2630 {
2631 	long ret = 0;
2632 	struct tsync_sibling *me = data;
2633 
2634 	me->system_tid = syscall(__NR_gettid);
2635 
2636 	pthread_mutex_lock(me->mutex);
2637 	if (me->diverge) {
2638 		/* Just re-apply the root prog to fork the tree */
2639 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2640 				me->prog, 0, 0);
2641 	}
2642 	sem_post(me->started);
2643 	/* Return only after posting "started" so the parent notices failures. */
2644 	if (ret) {
2645 		pthread_mutex_unlock(me->mutex);
2646 		return (void *)SIBLING_EXIT_FAILURE;
2647 	}
2648 	do {
2649 		pthread_cond_wait(me->cond, me->mutex);
2650 		me->num_waits = me->num_waits - 1;
2651 	} while (me->num_waits);
2652 	pthread_mutex_unlock(me->mutex);
2653 
2654 	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
2655 	if (!ret)
2656 		return (void *)SIBLING_EXIT_NEWPRIVS;
2657 	read(-1, NULL, 0);
2658 	return (void *)SIBLING_EXIT_UNKILLED;
2659 }
2660 
2661 void tsync_start_sibling(struct tsync_sibling *sibling)
2662 {
2663 	pthread_create(&sibling->tid, NULL, tsync_sibling, (void *)sibling);
2664 }
2665 
2666 TEST_F(TSYNC, siblings_fail_prctl)
2667 {
2668 	long ret;
2669 	void *status;
2670 	struct sock_filter filter[] = {
2671 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
2672 			offsetof(struct seccomp_data, nr)),
2673 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
2674 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
2675 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
2676 	};
2677 	struct sock_fprog prog = {
2678 		.len = (unsigned short)ARRAY_SIZE(filter),
2679 		.filter = filter,
2680 	};
2681 
2682 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2683 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2684 	}
2685 
2686 	/* Check prctl failure detection by requesting sib 0 diverge. */
2687 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
2688 	ASSERT_NE(ENOSYS, errno) {
2689 		TH_LOG("Kernel does not support seccomp syscall!");
2690 	}
2691 	ASSERT_EQ(0, ret) {
2692 		TH_LOG("setting filter failed");
2693 	}
2694 
2695 	self->sibling[0].diverge = 1;
2696 	tsync_start_sibling(&self->sibling[0]);
2697 	tsync_start_sibling(&self->sibling[1]);
2698 
2699 	while (self->sibling_count < TSYNC_SIBLINGS) {
2700 		sem_wait(&self->started);
2701 		self->sibling_count++;
2702 	}
2703 
2704 	/* Signal the threads to clean up. */
2705 	pthread_mutex_lock(&self->mutex);
2706 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2707 		TH_LOG("cond broadcast non-zero");
2708 	}
2709 	pthread_mutex_unlock(&self->mutex);
2710 
2711 	/* Ensure diverging sibling failed to call prctl. */
2712 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2713 	EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
2714 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2715 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2716 }
2717 
2718 TEST_F(TSYNC, two_siblings_with_ancestor)
2719 {
2720 	long ret;
2721 	void *status;
2722 
2723 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2724 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2725 	}
2726 
2727 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2728 	ASSERT_NE(ENOSYS, errno) {
2729 		TH_LOG("Kernel does not support seccomp syscall!");
2730 	}
2731 	ASSERT_EQ(0, ret) {
2732 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2733 	}
2734 	tsync_start_sibling(&self->sibling[0]);
2735 	tsync_start_sibling(&self->sibling[1]);
2736 
2737 	while (self->sibling_count < TSYNC_SIBLINGS) {
2738 		sem_wait(&self->started);
2739 		self->sibling_count++;
2740 	}
2741 
2742 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2743 		      &self->apply_prog);
2744 	ASSERT_EQ(0, ret) {
2745 		TH_LOG("Could not install filter on all threads!");
2746 	}
2747 	/* Tell the siblings to test the policy */
2748 	pthread_mutex_lock(&self->mutex);
2749 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2750 		TH_LOG("cond broadcast non-zero");
2751 	}
2752 	pthread_mutex_unlock(&self->mutex);
2753 	/* Ensure they are both killed and don't exit cleanly. */
2754 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2755 	EXPECT_EQ(0x0, (long)status);
2756 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2757 	EXPECT_EQ(0x0, (long)status);
2758 }
2759 
2760 TEST_F(TSYNC, two_sibling_want_nnp)
2761 {
2762 	void *status;
2763 
2764 	/* start siblings before any prctl() operations */
2765 	tsync_start_sibling(&self->sibling[0]);
2766 	tsync_start_sibling(&self->sibling[1]);
2767 	while (self->sibling_count < TSYNC_SIBLINGS) {
2768 		sem_wait(&self->started);
2769 		self->sibling_count++;
2770 	}
2771 
2772 	/* Tell the siblings to test no policy */
2773 	pthread_mutex_lock(&self->mutex);
2774 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2775 		TH_LOG("cond broadcast non-zero");
2776 	}
2777 	pthread_mutex_unlock(&self->mutex);
2778 
2779 	/* Ensure they are both upset about lacking nnp. */
2780 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2781 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2782 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2783 	EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
2784 }
2785 
2786 TEST_F(TSYNC, two_siblings_with_no_filter)
2787 {
2788 	long ret;
2789 	void *status;
2790 
2791 	/* start siblings before any prctl() operations */
2792 	tsync_start_sibling(&self->sibling[0]);
2793 	tsync_start_sibling(&self->sibling[1]);
2794 	while (self->sibling_count < TSYNC_SIBLINGS) {
2795 		sem_wait(&self->started);
2796 		self->sibling_count++;
2797 	}
2798 
2799 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2800 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2801 	}
2802 
2803 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2804 		      &self->apply_prog);
2805 	ASSERT_NE(ENOSYS, errno) {
2806 		TH_LOG("Kernel does not support seccomp syscall!");
2807 	}
2808 	ASSERT_EQ(0, ret) {
2809 		TH_LOG("Could not install filter on all threads!");
2810 	}
2811 
2812 	/* Tell the siblings to test the policy */
2813 	pthread_mutex_lock(&self->mutex);
2814 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2815 		TH_LOG("cond broadcast non-zero");
2816 	}
2817 	pthread_mutex_unlock(&self->mutex);
2818 
2819 	/* Ensure they are both killed and don't exit cleanly. */
2820 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2821 	EXPECT_EQ(0x0, (long)status);
2822 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2823 	EXPECT_EQ(0x0, (long)status);
2824 }
2825 
2826 TEST_F(TSYNC, two_siblings_with_one_divergence)
2827 {
2828 	long ret;
2829 	void *status;
2830 
2831 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2832 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2833 	}
2834 
2835 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2836 	ASSERT_NE(ENOSYS, errno) {
2837 		TH_LOG("Kernel does not support seccomp syscall!");
2838 	}
2839 	ASSERT_EQ(0, ret) {
2840 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2841 	}
2842 	self->sibling[0].diverge = 1;
2843 	tsync_start_sibling(&self->sibling[0]);
2844 	tsync_start_sibling(&self->sibling[1]);
2845 
2846 	while (self->sibling_count < TSYNC_SIBLINGS) {
2847 		sem_wait(&self->started);
2848 		self->sibling_count++;
2849 	}
2850 
2851 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2852 		      &self->apply_prog);
2853 	ASSERT_EQ(self->sibling[0].system_tid, ret) {
2854 		TH_LOG("Did not fail on diverged sibling.");
2855 	}
2856 
2857 	/* Wake the threads */
2858 	pthread_mutex_lock(&self->mutex);
2859 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2860 		TH_LOG("cond broadcast non-zero");
2861 	}
2862 	pthread_mutex_unlock(&self->mutex);
2863 
2864 	/* Ensure they are both unkilled. */
2865 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2866 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2867 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2868 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2869 }
2870 
2871 TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
2872 {
2873 	long ret, flags;
2874 	void *status;
2875 
2876 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2877 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2878 	}
2879 
2880 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2881 	ASSERT_NE(ENOSYS, errno) {
2882 		TH_LOG("Kernel does not support seccomp syscall!");
2883 	}
2884 	ASSERT_EQ(0, ret) {
2885 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2886 	}
2887 	self->sibling[0].diverge = 1;
2888 	tsync_start_sibling(&self->sibling[0]);
2889 	tsync_start_sibling(&self->sibling[1]);
2890 
2891 	while (self->sibling_count < TSYNC_SIBLINGS) {
2892 		sem_wait(&self->started);
2893 		self->sibling_count++;
2894 	}
2895 
2896 	flags = SECCOMP_FILTER_FLAG_TSYNC | \
2897 		SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
2898 	ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
2899 	ASSERT_EQ(ESRCH, errno) {
2900 		TH_LOG("Did not return ESRCH for diverged sibling.");
2901 	}
2902 	ASSERT_EQ(-1, ret) {
2903 		TH_LOG("Did not fail on diverged sibling.");
2904 	}
2905 
2906 	/* Wake the threads */
2907 	pthread_mutex_lock(&self->mutex);
2908 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2909 		TH_LOG("cond broadcast non-zero");
2910 	}
2911 	pthread_mutex_unlock(&self->mutex);
2912 
2913 	/* Ensure they are both unkilled. */
2914 	PTHREAD_JOIN(self->sibling[0].tid, &status);
2915 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2916 	PTHREAD_JOIN(self->sibling[1].tid, &status);
2917 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2918 }
2919 
2920 TEST_F(TSYNC, two_siblings_not_under_filter)
2921 {
2922 	long ret, sib;
2923 	void *status;
2924 	struct timespec delay = { .tv_nsec = 100000000 };
2925 
2926 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
2927 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
2928 	}
2929 
2930 	/*
2931 	 * Sibling 0 will have its own seccomp policy
2932 	 * and Sibling 1 will not be under seccomp at
2933 	 * all. TSYNC would pull Sibling 1 under seccomp,
2934 	 * but the diverged Sibling 0 makes the sync fail.
2935 	 */
2936 	self->sibling[0].diverge = 1;
2937 	tsync_start_sibling(&self->sibling[0]);
2938 	tsync_start_sibling(&self->sibling[1]);
2939 
2940 	while (self->sibling_count < TSYNC_SIBLINGS) {
2941 		sem_wait(&self->started);
2942 		self->sibling_count++;
2943 	}
2944 
2945 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
2946 	ASSERT_NE(ENOSYS, errno) {
2947 		TH_LOG("Kernel does not support seccomp syscall!");
2948 	}
2949 	ASSERT_EQ(0, ret) {
2950 		TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
2951 	}
2952 
2953 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2954 		      &self->apply_prog);
2955 	ASSERT_EQ(ret, self->sibling[0].system_tid) {
2956 		TH_LOG("Did not fail on diverged sibling.");
2957 	}
2958 	sib = 1;
2959 	if (ret == self->sibling[0].system_tid)
2960 		sib = 0;
2961 
2962 	pthread_mutex_lock(&self->mutex);
2963 
2964 	/* Increment the other sibling's num_waits so we can clean up
2965 	 * the one we just saw.
2966 	 */
2967 	self->sibling[!sib].num_waits += 1;
2968 
2969 	/* Signal the thread to clean up. */
2970 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2971 		TH_LOG("cond broadcast non-zero");
2972 	}
2973 	pthread_mutex_unlock(&self->mutex);
2974 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
2975 	EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
2976 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
2977 	while (!kill(self->sibling[sib].system_tid, 0))
2978 		nanosleep(&delay, NULL);
2979 	/* Switch to the remaining sibling */
2980 	sib = !sib;
2981 
2982 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
2983 		      &self->apply_prog);
2984 	ASSERT_EQ(0, ret) {
2985 		TH_LOG("Expected the remaining sibling to sync");
2986 	};
2987 
2988 	pthread_mutex_lock(&self->mutex);
2989 
2990 	/* If the remaining sibling didn't have a chance to wake up during
2991 	 * the first broadcast, manually reduce the num_waits now.
2992 	 */
2993 	if (self->sibling[sib].num_waits > 1)
2994 		self->sibling[sib].num_waits = 1;
2995 	ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
2996 		TH_LOG("cond broadcast non-zero");
2997 	}
2998 	pthread_mutex_unlock(&self->mutex);
2999 	PTHREAD_JOIN(self->sibling[sib].tid, &status);
3000 	EXPECT_EQ(0, (long)status);
3001 	/* Poll for actual task death. pthread_join doesn't guarantee it. */
3002 	while (!kill(self->sibling[sib].system_tid, 0))
3003 		nanosleep(&delay, NULL);
3004 
3005 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
3006 		      &self->apply_prog);
3007 	ASSERT_EQ(0, ret);  /* just us chickens */
3008 }
3009 
3010 /* Make sure restarted syscalls are seen directly as "restart_syscall". */
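/*
 * Flow: the traced child installs a TRACE filter, starts a 1s nanosleep,
 * the parent interrupts it with SIGSTOP/SIGCONT, and the filter should
 * then report restart_syscall (except on native arm32, see below).
 */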
3011 TEST(syscall_restart)
3012 {
3013 	long ret;
3014 	unsigned long msg;
3015 	pid_t child_pid;
3016 	int pipefd[2];
3017 	int status;
3018 	siginfo_t info = { };
3019 	struct sock_filter filter[] = {
3020 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3021 			 offsetof(struct seccomp_data, nr)),
3022 
3023 #ifdef __NR_sigreturn
3024 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
3025 #endif
3026 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
3027 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
3028 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
3029 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
3030 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
3031 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
3032 
3033 		/* Allow __NR_write for easy logging. */
3034 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
3035 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3036 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3037 		/* The nanosleep jump target. */
3038 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
3039 		/* The restart_syscall jump target. */
3040 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
3041 	};
3042 	struct sock_fprog prog = {
3043 		.len = (unsigned short)ARRAY_SIZE(filter),
3044 		.filter = filter,
3045 	};
3046 #if defined(__arm__)
3047 	struct utsname utsbuf;
3048 #endif
3049 
3050 	ASSERT_EQ(0, pipe(pipefd));
3051 
3052 	child_pid = fork();
3053 	ASSERT_LE(0, child_pid);
3054 	if (child_pid == 0) {
3055 		/* Child uses EXPECT not ASSERT to deliver status correctly. */
3056 		char buf = ' ';
3057 		struct timespec timeout = { };
3058 
3059 		/* Attach parent as tracer and stop. */
3060 		EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
3061 		EXPECT_EQ(0, raise(SIGSTOP));
3062 
3063 		EXPECT_EQ(0, close(pipefd[1]));
3064 
3065 		EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
3066 			TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3067 		}
3068 
3069 		ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
3070 		EXPECT_EQ(0, ret) {
3071 			TH_LOG("Failed to install filter!");
3072 		}
3073 
3074 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3075 			TH_LOG("Failed to read() sync from parent");
3076 		}
3077 		EXPECT_EQ('.', buf) {
3078 			TH_LOG("Failed to get sync data from read()");
3079 		}
3080 
3081 		/* Start nanosleep to be interrupted. */
3082 		timeout.tv_sec = 1;
3083 		errno = 0;
3084 		EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
3085 			TH_LOG("Call to nanosleep() failed (errno %d: %s)",
3086 				errno, strerror(errno));
3087 		}
3088 
3089 		/* Read final sync from parent. */
3090 		EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
3091 			TH_LOG("Failed final read() from parent");
3092 		}
3093 		EXPECT_EQ('!', buf) {
3094 			TH_LOG("Failed to get final data from read()");
3095 		}
3096 
3097 		/* Directly report the status of our test harness results. */
3098 		syscall(__NR_exit, _metadata->exit_code);
3099 	}
3100 	EXPECT_EQ(0, close(pipefd[0]));
3101 
3102 	/* Attach to child, setup options, and release. */
3103 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3104 	ASSERT_EQ(true, WIFSTOPPED(status));
3105 	ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
3106 			    PTRACE_O_TRACESECCOMP));
3107 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3108 	ASSERT_EQ(1, write(pipefd[1], ".", 1));
3109 
3110 	/* Wait for nanosleep() to start. */
3111 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3112 	ASSERT_EQ(true, WIFSTOPPED(status));
3113 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3114 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3115 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3116 	ASSERT_EQ(0x100, msg);
3117 	ret = get_syscall(_metadata, child_pid);
3118 	EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
3119 
3120 	/* Might as well check siginfo for sanity while we're here. */
3121 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3122 	ASSERT_EQ(SIGTRAP, info.si_signo);
3123 	ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
3124 	EXPECT_EQ(0, info.si_errno);
3125 	EXPECT_EQ(getuid(), info.si_uid);
3126 	/* Verify signal delivery came from child (seccomp-triggered). */
3127 	EXPECT_EQ(child_pid, info.si_pid);
3128 
3129 	/* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
3130 	ASSERT_EQ(0, kill(child_pid, SIGSTOP));
3131 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3132 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3133 	ASSERT_EQ(true, WIFSTOPPED(status));
3134 	ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
3135 	ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
3136 	/*
3137 	 * There is no siginfo on SIGSTOP any more, so we can't verify
3138 	 * signal delivery came from parent now (getpid() == info.si_pid).
3139 	 * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
3140 	 * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
3141 	 */
3142 	EXPECT_EQ(SIGSTOP, info.si_signo);
3143 
3144 	/* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
3145 	ASSERT_EQ(0, kill(child_pid, SIGCONT));
3146 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3147 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3148 	ASSERT_EQ(true, WIFSTOPPED(status));
3149 	ASSERT_EQ(SIGCONT, WSTOPSIG(status));
3150 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3151 
3152 	/* Wait for restart_syscall() to start. */
3153 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3154 	ASSERT_EQ(true, WIFSTOPPED(status));
3155 	ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
3156 	ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
3157 	ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
3158 
3159 	ASSERT_EQ(0x200, msg);
3160 	ret = get_syscall(_metadata, child_pid);
3161 #if defined(__arm__)
3162 	/*
3163 	 * - native ARM registers do NOT expose true syscall.
3164 	 * - compat ARM registers on ARM64 DO expose true syscall.
3165 	 * - values of utsbuf.machine include 'armv8l' or 'armv8b'
3166 	 *   for ARM64 running in compat mode.
3167 	 */
3168 	ASSERT_EQ(0, uname(&utsbuf));
3169 	if ((strncmp(utsbuf.machine, "arm", 3) == 0) &&
3170 	    (strncmp(utsbuf.machine, "armv8l", 6) != 0) &&
3171 	    (strncmp(utsbuf.machine, "armv8b", 6) != 0)) {
3172 		EXPECT_EQ(__NR_nanosleep, ret);
3173 	} else
3174 #endif
3175 	{
3176 		EXPECT_EQ(__NR_restart_syscall, ret);
3177 	}
3178 
3179 	/* Write again to end test. */
3180 	ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
3181 	ASSERT_EQ(1, write(pipefd[1], "!", 1));
3182 	EXPECT_EQ(0, close(pipefd[1]));
3183 
3184 	ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
3185 	if (WIFSIGNALED(status) || WEXITSTATUS(status))
3186 		_metadata->exit_code = KSFT_FAIL;
3187 }
3188 
3189 TEST_SIGNAL(filter_flag_log, SIGSYS)
3190 {
3191 	struct sock_filter allow_filter[] = {
3192 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3193 	};
3194 	struct sock_filter kill_filter[] = {
3195 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3196 			offsetof(struct seccomp_data, nr)),
3197 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
3198 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
3199 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3200 	};
3201 	struct sock_fprog allow_prog = {
3202 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
3203 		.filter = allow_filter,
3204 	};
3205 	struct sock_fprog kill_prog = {
3206 		.len = (unsigned short)ARRAY_SIZE(kill_filter),
3207 		.filter = kill_filter,
3208 	};
3209 	long ret;
3210 	pid_t parent = getppid();
3211 
3212 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3213 	ASSERT_EQ(0, ret);
3214 
3215 	/* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
3216 	ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
3217 		      &allow_prog);
3218 	ASSERT_NE(ENOSYS, errno) {
3219 		TH_LOG("Kernel does not support seccomp syscall!");
3220 	}
3221 	EXPECT_NE(0, ret) {
3222 		TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
3223 	}
3224 	EXPECT_EQ(EINVAL, errno) {
3225 		TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
3226 	}
3227 
3228 	/* Verify that a simple, permissive filter can be added with no flags */
3229 	ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
3230 	EXPECT_EQ(0, ret);
3231 
3232 	/* See if the same filter can be added with the FILTER_FLAG_LOG flag */
3233 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3234 		      &allow_prog);
3235 	ASSERT_NE(EINVAL, errno) {
3236 		TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
3237 	}
3238 	EXPECT_EQ(0, ret);
3239 
3240 	/* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
3241 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
3242 		      &kill_prog);
3243 	EXPECT_EQ(0, ret);
3244 
3245 	EXPECT_EQ(parent, syscall(__NR_getppid));
3246 	/* getpid() should never return. */
3247 	EXPECT_EQ(0, syscall(__NR_getpid));
3248 }
3249 
3250 TEST(get_action_avail)
3251 {
3252 	__u32 actions[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
3253 			    SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
3254 			    SECCOMP_RET_LOG,   SECCOMP_RET_ALLOW };
3255 	__u32 unknown_action = 0x10000000U;
3256 	int i;
3257 	long ret;
3258 
3259 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[0]);
3260 	ASSERT_NE(ENOSYS, errno) {
3261 		TH_LOG("Kernel does not support seccomp syscall!");
3262 	}
3263 	ASSERT_NE(EINVAL, errno) {
3264 		TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
3265 	}
3266 	EXPECT_EQ(ret, 0);
3267 
3268 	for (i = 0; i < ARRAY_SIZE(actions); i++) {
3269 		ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &actions[i]);
3270 		EXPECT_EQ(ret, 0) {
3271 			TH_LOG("Expected action (0x%X) not available!",
3272 			       actions[i]);
3273 		}
3274 	}
3275 
3276 	/* Check that an unknown action is handled properly (EOPNOTSUPP) */
3277 	ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &unknown_action);
3278 	EXPECT_EQ(ret, -1);
3279 	EXPECT_EQ(errno, EOPNOTSUPP);
3280 }
3281 
3282 TEST(get_metadata)
3283 {
3284 	pid_t pid;
3285 	int pipefd[2];
3286 	char buf;
3287 	struct seccomp_metadata md;
3288 	long ret;
3289 
3290 	/* Only real root can get metadata. */
3291 	if (geteuid()) {
3292 		SKIP(return, "get_metadata requires real root");
3293 		return;
3294 	}
3295 
3296 	ASSERT_EQ(0, pipe(pipefd));
3297 
3298 	pid = fork();
3299 	ASSERT_GE(pid, 0);
3300 	if (pid == 0) {
3301 		struct sock_filter filter[] = {
3302 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3303 		};
3304 		struct sock_fprog prog = {
3305 			.len = (unsigned short)ARRAY_SIZE(filter),
3306 			.filter = filter,
3307 		};
3308 
3309 		/* one with log, one without */
3310 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
3311 				     SECCOMP_FILTER_FLAG_LOG, &prog));
3312 		EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
3313 
3314 		EXPECT_EQ(0, close(pipefd[0]));
3315 		ASSERT_EQ(1, write(pipefd[1], "1", 1));
3316 		ASSERT_EQ(0, close(pipefd[1]));
3317 
3318 		while (1)
3319 			sleep(100);
3320 	}
3321 
3322 	ASSERT_EQ(0, close(pipefd[1]));
3323 	ASSERT_EQ(1, read(pipefd[0], &buf, 1));
3324 
3325 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
3326 	ASSERT_EQ(pid, waitpid(pid, NULL, 0));
3327 
3328 	/* Past here must not use ASSERT or child process is never killed. */
3329 
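	/*
	 * filter_off selects which filter in the tracee's stack to report;
	 * offset 0 here is expected to be the first (LOG-flagged) filter
	 * installed above, and offset 1 the later unflagged one.
	 */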
3330 	md.filter_off = 0;
3331 	errno = 0;
3332 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3333 	EXPECT_EQ(sizeof(md), ret) {
3334 		if (errno == EINVAL)
3335 			SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
3336 	}
3337 
3338 	EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
3339 	EXPECT_EQ(md.filter_off, 0);
3340 
3341 	md.filter_off = 1;
3342 	ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
3343 	EXPECT_EQ(sizeof(md), ret);
3344 	EXPECT_EQ(md.flags, 0);
3345 	EXPECT_EQ(md.filter_off, 1);
3346 
3347 skip:
3348 	ASSERT_EQ(0, kill(pid, SIGKILL));
3349 }
3350 
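/*
 * Install a filter that returns SECCOMP_RET_USER_NOTIF for syscall "nr"
 * and allows everything else. With SECCOMP_FILTER_FLAG_NEW_LISTENER in
 * "flags", the seccomp() return value is the notification listener fd.
 */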
3351 static int user_notif_syscall(int nr, unsigned int flags)
3352 {
3353 	struct sock_filter filter[] = {
3354 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
3355 			offsetof(struct seccomp_data, nr)),
3356 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
3357 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
3358 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3359 	};
3360 
3361 	struct sock_fprog prog = {
3362 		.len = (unsigned short)ARRAY_SIZE(filter),
3363 		.filter = filter,
3364 	};
3365 
3366 	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
3367 }
3368 
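/*
 * Sentinel value the supervisor injects via resp.val so tracees can
 * verify that a notification made the full round trip.
 */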
3369 #define USER_NOTIF_MAGIC INT_MAX
3370 TEST(user_notification_basic)
3371 {
3372 	pid_t pid;
3373 	long ret;
3374 	int status, listener;
3375 	struct seccomp_notif req = {};
3376 	struct seccomp_notif_resp resp = {};
3377 	struct pollfd pollfd;
3378 
3379 	struct sock_filter filter[] = {
3380 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
3381 	};
3382 	struct sock_fprog prog = {
3383 		.len = (unsigned short)ARRAY_SIZE(filter),
3384 		.filter = filter,
3385 	};
3386 
3387 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3388 	ASSERT_EQ(0, ret) {
3389 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3390 	}
3391 
3392 	pid = fork();
3393 	ASSERT_GE(pid, 0);
3394 
3395 	/* Check that we get -ENOSYS with no listener attached */
3396 	if (pid == 0) {
3397 		if (user_notif_syscall(__NR_getppid, 0) < 0)
3398 			exit(1);
3399 		ret = syscall(__NR_getppid);
3400 		exit(ret >= 0 || errno != ENOSYS);
3401 	}
3402 
3403 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3404 	EXPECT_EQ(true, WIFEXITED(status));
3405 	EXPECT_EQ(0, WEXITSTATUS(status));
3406 
3407 	/* Add some no-op filters for grins. */
3408 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3409 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3410 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3411 	EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
3412 
3413 	/* Check that the basic notification machinery works */
3414 	listener = user_notif_syscall(__NR_getppid,
3415 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3416 	ASSERT_GE(listener, 0);
3417 
3418 	/* Installing a second listener in the chain should EBUSY */
3419 	EXPECT_EQ(user_notif_syscall(__NR_getppid,
3420 				     SECCOMP_FILTER_FLAG_NEW_LISTENER),
3421 		  -1);
3422 	EXPECT_EQ(errno, EBUSY);
3423 
3424 	pid = fork();
3425 	ASSERT_GE(pid, 0);
3426 
3427 	if (pid == 0) {
3428 		ret = syscall(__NR_getppid);
3429 		exit(ret != USER_NOTIF_MAGIC);
3430 	}
3431 
3432 	pollfd.fd = listener;
3433 	pollfd.events = POLLIN | POLLOUT;
3434 
3435 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3436 	EXPECT_EQ(pollfd.revents, POLLIN);
3437 
3438 	/* Test that we can't pass garbage to the kernel. */
3439 	memset(&req, 0, sizeof(req));
3440 	req.pid = -1;
3441 	errno = 0;
3442 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
3443 	EXPECT_EQ(-1, ret);
3444 	EXPECT_EQ(EINVAL, errno);
3445 
3446 	if (ret) {
3447 		req.pid = 0;
3448 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3449 	}
3450 
3451 	pollfd.fd = listener;
3452 	pollfd.events = POLLIN | POLLOUT;
3453 
3454 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3455 	EXPECT_EQ(pollfd.revents, POLLOUT);
3456 
3457 	EXPECT_EQ(req.data.nr,  __NR_getppid);
3458 
3459 	resp.id = req.id;
3460 	resp.error = 0;
3461 	resp.val = USER_NOTIF_MAGIC;
3462 
3463 	/* check that we make sure flags == 0 */
3464 	resp.flags = 1;
3465 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3466 	EXPECT_EQ(errno, EINVAL);
3467 
3468 	resp.flags = 0;
3469 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3470 
3471 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3472 	EXPECT_EQ(true, WIFEXITED(status));
3473 	EXPECT_EQ(0, WEXITSTATUS(status));
3474 }
3475 
3476 TEST(user_notification_with_tsync)
3477 {
3478 	int ret;
3479 	unsigned int flags;
3480 
3481 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3482 	ASSERT_EQ(0, ret) {
3483 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3484 	}
3485 
3486 	/* these were exclusive */
3487 	flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
3488 		SECCOMP_FILTER_FLAG_TSYNC;
3489 	ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
3490 	ASSERT_EQ(EINVAL, errno);
3491 
3492 	/* but now they're not */
3493 	flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
3494 	ret = user_notif_syscall(__NR_getppid, flags);
3495 	close(ret);
3496 	ASSERT_LE(0, ret);
3497 }
3498 
3499 TEST(user_notification_kill_in_middle)
3500 {
3501 	pid_t pid;
3502 	long ret;
3503 	int listener;
3504 	struct seccomp_notif req = {};
3505 	struct seccomp_notif_resp resp = {};
3506 
3507 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3508 	ASSERT_EQ(0, ret) {
3509 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3510 	}
3511 
3512 	listener = user_notif_syscall(__NR_getppid,
3513 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3514 	ASSERT_GE(listener, 0);
3515 
3516 	/*
3517 	 * Check that nothing bad happens when we kill the task in the middle
3518 	 * of a syscall.
3519 	 */
3520 	pid = fork();
3521 	ASSERT_GE(pid, 0);
3522 
3523 	if (pid == 0) {
3524 		ret = syscall(__NR_getppid);
3525 		exit(ret != USER_NOTIF_MAGIC);
3526 	}
3527 
3528 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3529 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
3530 
3531 	EXPECT_EQ(kill(pid, SIGKILL), 0);
3532 	EXPECT_EQ(waitpid(pid, NULL, 0), pid);
3533 
3534 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
3535 
3536 	resp.id = req.id;
3537 	ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
3538 	EXPECT_EQ(ret, -1);
3539 	EXPECT_EQ(errno, ENOENT);
3540 }
3541 
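/*
 * Written to by signal_handler() below; tests point this at one end of a
 * socketpair so the other end can observe that the handler actually ran.
 */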
3542 static int handled = -1;
3543 
3544 static void signal_handler(int signal)
3545 {
3546 	if (write(handled, "c", 1) != 1)
3547 		perror("write from signal");
3548 }
3549 
3550 TEST(user_notification_signal)
3551 {
3552 	pid_t pid;
3553 	long ret;
3554 	int status, listener, sk_pair[2];
3555 	struct seccomp_notif req = {};
3556 	struct seccomp_notif_resp resp = {};
3557 	char c;
3558 
3559 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3560 	ASSERT_EQ(0, ret) {
3561 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3562 	}
3563 
3564 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
3565 
3566 	listener = user_notif_syscall(__NR_gettid,
3567 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3568 	ASSERT_GE(listener, 0);
3569 
3570 	pid = fork();
3571 	ASSERT_GE(pid, 0);
3572 
3573 	if (pid == 0) {
3574 		close(sk_pair[0]);
3575 		handled = sk_pair[1];
3576 		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
3577 			perror("signal");
3578 			exit(1);
3579 		}
3580 		/*
3581 		 * ERESTARTSYS behavior is a bit hard to test, because we need
3582 		 * to rely on a signal that has not yet been handled. Let's at
3583 		 * least check that the error code gets propagated through, and
3584 		 * hope that it doesn't break when there is actually a signal :)
3585 		 */
3586 		ret = syscall(__NR_gettid);
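		/* 512 is the kernel-internal ERESTARTSYS value leaking out here. */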
3587 		exit(!(ret == -1 && errno == 512));
3588 	}
3589 
3590 	close(sk_pair[1]);
3591 
3592 	memset(&req, 0, sizeof(req));
3593 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3594 
3595 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
3596 
3597 	/*
3598 	 * Make sure the signal really is delivered, which means we're not
3599 	 * stuck in the user notification code any more and the notification
3600 	 * should be dead.
3601 	 */
3602 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
3603 
3604 	resp.id = req.id;
3605 	resp.error = -EPERM;
3606 	resp.val = 0;
3607 
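	/*
	 * The handler ran, so the original notification was aborted: replying
	 * to it now fails with ENOENT. The interrupted syscall queues a fresh
	 * notification, which we receive and fail with -ERESTARTSYS below.
	 */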
3608 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3609 	EXPECT_EQ(errno, ENOENT);
3610 
3611 	memset(&req, 0, sizeof(req));
3612 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3613 
3614 	resp.id = req.id;
3615 	resp.error = -512; /* -ERESTARTSYS */
3616 	resp.val = 0;
3617 
3618 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3619 
3620 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3621 	EXPECT_EQ(true, WIFEXITED(status));
3622 	EXPECT_EQ(0, WEXITSTATUS(status));
3623 }
3624 
3625 TEST(user_notification_closed_listener)
3626 {
3627 	pid_t pid;
3628 	long ret;
3629 	int status, listener;
3630 
3631 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3632 	ASSERT_EQ(0, ret) {
3633 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3634 	}
3635 
3636 	listener = user_notif_syscall(__NR_getppid,
3637 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3638 	ASSERT_GE(listener, 0);
3639 
3640 	/*
3641 	 * Check that we get an ENOSYS when the listener is closed.
3642 	 */
3643 	pid = fork();
3644 	ASSERT_GE(pid, 0);
3645 	if (pid == 0) {
3646 		close(listener);
3647 		ret = syscall(__NR_getppid);
3648 		exit(ret != -1 && errno != ENOSYS);
3649 	}
3650 
3651 	close(listener);
3652 
3653 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3654 	EXPECT_EQ(true, WIFEXITED(status));
3655 	EXPECT_EQ(0, WEXITSTATUS(status));
3656 }
3657 
3658 /*
3659  * Check that a pid in a child namespace still shows up as valid in ours.
3660  */
3661 TEST(user_notification_child_pid_ns)
3662 {
3663 	pid_t pid;
3664 	int status, listener;
3665 	struct seccomp_notif req = {};
3666 	struct seccomp_notif_resp resp = {};
3667 
3668 	ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
3669 		if (errno == EINVAL)
3670 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3671 	};
3672 
3673 	listener = user_notif_syscall(__NR_getppid,
3674 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3675 	ASSERT_GE(listener, 0);
3676 
3677 	pid = fork();
3678 	ASSERT_GE(pid, 0);
3679 
3680 	if (pid == 0)
3681 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3682 
3683 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3684 	EXPECT_EQ(req.pid, pid);
3685 
3686 	resp.id = req.id;
3687 	resp.error = 0;
3688 	resp.val = USER_NOTIF_MAGIC;
3689 
3690 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3691 
3692 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3693 	EXPECT_EQ(true, WIFEXITED(status));
3694 	EXPECT_EQ(0, WEXITSTATUS(status));
3695 	close(listener);
3696 }
3697 
3698 /*
3699  * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
3700  * invalid.
3701  */
3702 TEST(user_notification_sibling_pid_ns)
3703 {
3704 	pid_t pid, pid2;
3705 	int status, listener;
3706 	struct seccomp_notif req = {};
3707 	struct seccomp_notif_resp resp = {};
3708 
3709 	ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
3710 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3711 	}
3712 
3713 	listener = user_notif_syscall(__NR_getppid,
3714 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3715 	ASSERT_GE(listener, 0);
3716 
3717 	pid = fork();
3718 	ASSERT_GE(pid, 0);
3719 
3720 	if (pid == 0) {
3721 		ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3722 			if (errno == EPERM)
3723 				SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3724 			else if (errno == EINVAL)
3725 				SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
3726 		}
3727 
3728 		pid2 = fork();
3729 		ASSERT_GE(pid2, 0);
3730 
3731 		if (pid2 == 0)
3732 			exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3733 
3734 		EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3735 		EXPECT_EQ(true, WIFEXITED(status));
3736 		EXPECT_EQ(0, WEXITSTATUS(status));
3737 		exit(WEXITSTATUS(status));
3738 	}
3739 
3740 	/* Create the sibling ns, and sibling in it. */
3741 	ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
3742 		if (errno == EPERM)
3743 			SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
3744 		else if (errno == EINVAL)
3745 			SKIP(return, "CLONE_NEWPID is invalid (missing CONFIG_PID_NS?)");
3746 	}
3747 	ASSERT_EQ(errno, 0);
3748 
3749 	pid2 = fork();
3750 	ASSERT_GE(pid2, 0);
3751 
3752 	if (pid2 == 0) {
3753 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3754 		/*
3755 		 * The pid should be 0, i.e. the task is in some namespace that
3756 		 * we can't "see".
3757 		 */
3758 		EXPECT_EQ(req.pid, 0);
3759 
3760 		resp.id = req.id;
3761 		resp.error = 0;
3762 		resp.val = USER_NOTIF_MAGIC;
3763 
3764 		ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3765 		exit(0);
3766 	}
3767 
3768 	close(listener);
3769 
3770 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3771 	EXPECT_EQ(true, WIFEXITED(status));
3772 	EXPECT_EQ(0, WEXITSTATUS(status));
3773 
3774 	EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
3775 	EXPECT_EQ(true, WIFEXITED(status));
3776 	EXPECT_EQ(0, WEXITSTATUS(status));
3777 }
3778 
3779 TEST(user_notification_fault_recv)
3780 {
3781 	pid_t pid;
3782 	int status, listener;
3783 	struct seccomp_notif req = {};
3784 	struct seccomp_notif_resp resp = {};
3785 
3786 	ASSERT_EQ(unshare(CLONE_NEWUSER), 0) {
3787 		if (errno == EINVAL)
3788 			SKIP(return, "kernel missing CLONE_NEWUSER support");
3789 	}
3790 
3791 	listener = user_notif_syscall(__NR_getppid,
3792 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
3793 	ASSERT_GE(listener, 0);
3794 
3795 	pid = fork();
3796 	ASSERT_GE(pid, 0);
3797 
3798 	if (pid == 0)
3799 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
3800 
3801 	/* Do a bad recv() */
3802 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
3803 	EXPECT_EQ(errno, EFAULT);
3804 
3805 	/* We should still be able to receive this notification, though. */
3806 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3807 	EXPECT_EQ(req.pid, pid);
3808 
3809 	resp.id = req.id;
3810 	resp.error = 0;
3811 	resp.val = USER_NOTIF_MAGIC;
3812 
3813 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
3814 
3815 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3816 	EXPECT_EQ(true, WIFEXITED(status));
3817 	EXPECT_EQ(0, WEXITSTATUS(status));
3818 }
3819 
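/*
 * SECCOMP_GET_NOTIF_SIZES reports the kernel's sizes for the notification
 * structures, letting userspace detect ABI growth before using the listener.
 */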
3820 TEST(seccomp_get_notif_sizes)
3821 {
3822 	struct seccomp_notif_sizes sizes;
3823 
3824 	ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
3825 	EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
3826 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
3827 }
3828 
3829 TEST(user_notification_continue)
3830 {
3831 	pid_t pid;
3832 	long ret;
3833 	int status, listener;
3834 	struct seccomp_notif req = {};
3835 	struct seccomp_notif_resp resp = {};
3836 	struct pollfd pollfd;
3837 
3838 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3839 	ASSERT_EQ(0, ret) {
3840 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3841 	}
3842 
3843 	listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3844 	ASSERT_GE(listener, 0);
3845 
3846 	pid = fork();
3847 	ASSERT_GE(pid, 0);
3848 
3849 	if (pid == 0) {
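		/*
		 * In the child, dup() is the filtered syscall. If the listener
		 * answers with SECCOMP_USER_NOTIF_FLAG_CONTINUE, the real
		 * dup() runs and filecmp() (a kcmp()-based helper defined
		 * earlier in this file) should see both fds backed by the same
		 * struct file.
		 */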
3850 		int dup_fd, pipe_fds[2];
3851 		pid_t self;
3852 
3853 		ASSERT_GE(pipe(pipe_fds), 0);
3854 
3855 		dup_fd = dup(pipe_fds[0]);
3856 		ASSERT_GE(dup_fd, 0);
3857 		EXPECT_NE(pipe_fds[0], dup_fd);
3858 
3859 		self = getpid();
3860 		ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
3861 		exit(0);
3862 	}
3863 
3864 	pollfd.fd = listener;
3865 	pollfd.events = POLLIN | POLLOUT;
3866 
3867 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3868 	EXPECT_EQ(pollfd.revents, POLLIN);
3869 
3870 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
3871 
3872 	pollfd.fd = listener;
3873 	pollfd.events = POLLIN | POLLOUT;
3874 
3875 	EXPECT_GT(poll(&pollfd, 1, -1), 0);
3876 	EXPECT_EQ(pollfd.revents, POLLOUT);
3877 
3878 	EXPECT_EQ(req.data.nr, __NR_dup);
3879 
3880 	resp.id = req.id;
3881 	resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
3882 
3883 	/*
3884 	 * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
3885 	 * args be set to 0.
3886 	 */
3887 	resp.error = 0;
3888 	resp.val = USER_NOTIF_MAGIC;
3889 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3890 	EXPECT_EQ(errno, EINVAL);
3891 
3892 	resp.error = USER_NOTIF_MAGIC;
3893 	resp.val = 0;
3894 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
3895 	EXPECT_EQ(errno, EINVAL);
3896 
3897 	resp.error = 0;
3898 	resp.val = 0;
3899 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
3900 		if (errno == EINVAL)
3901 			SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
3902 	}
3903 
3904 skip:
3905 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3906 	EXPECT_EQ(true, WIFEXITED(status));
3907 	EXPECT_EQ(0, WEXITSTATUS(status)) {
3908 		if (WEXITSTATUS(status) == 2) {
3909 			SKIP(return, "Kernel does not support kcmp() syscall");
3910 			return;
3911 		}
3912 	}
3913 }
3914 
3915 TEST(user_notification_filter_empty)
3916 {
3917 	pid_t pid;
3918 	long ret;
3919 	int status;
3920 	struct pollfd pollfd;
3921 	struct __clone_args args = {
3922 		.flags = CLONE_FILES,
3923 		.exit_signal = SIGCHLD,
3924 	};
3925 
3926 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3927 	ASSERT_EQ(0, ret) {
3928 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3929 	}
3930 
3931 	if (__NR_clone3 < 0)
3932 		SKIP(return, "Test not built with clone3 support");
3933 
3934 	pid = sys_clone3(&args, sizeof(args));
3935 	ASSERT_GE(pid, 0);
3936 
3937 	if (pid == 0) {
3938 		int listener;
3939 
3940 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3941 		if (listener < 0)
3942 			_exit(EXIT_FAILURE);
3943 
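		/*
		 * The child shares our fd table (CLONE_FILES), so parking the
		 * listener at fd 200 keeps it reachable from the parent after
		 * the child exits.
		 */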
3944 		if (dup2(listener, 200) != 200)
3945 			_exit(EXIT_FAILURE);
3946 
3947 		close(listener);
3948 
3949 		_exit(EXIT_SUCCESS);
3950 	}
3951 
3952 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
3953 	EXPECT_EQ(true, WIFEXITED(status));
3954 	EXPECT_EQ(0, WEXITSTATUS(status));
3955 
3956 	/*
3957 	 * The seccomp filter has become unused so we should be notified once
3958 	 * the kernel gets around to cleaning up task struct.
3959 	 */
3960 	pollfd.fd = 200;
3961 	pollfd.events = POLLHUP;
3962 
3963 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
3964 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
3965 }
3966 
3967 TEST(user_ioctl_notification_filter_empty)
3968 {
3969 	pid_t pid;
3970 	long ret;
3971 	int status, p[2];
3972 	struct __clone_args args = {
3973 		.flags = CLONE_FILES,
3974 		.exit_signal = SIGCHLD,
3975 	};
3976 	struct seccomp_notif req = {};
3977 
3978 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
3979 	ASSERT_EQ(0, ret) {
3980 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
3981 	}
3982 
3983 	if (__NR_clone3 < 0)
3984 		SKIP(return, "Test not built with clone3 support");
3985 
3986 	ASSERT_EQ(0, pipe(p));
3987 
3988 	pid = sys_clone3(&args, sizeof(args));
3989 	ASSERT_GE(pid, 0);
3990 
3991 	if (pid == 0) {
3992 		int listener;
3993 
3994 		listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
3995 		if (listener < 0)
3996 			_exit(EXIT_FAILURE);
3997 
3998 		if (dup2(listener, 200) != 200)
3999 			_exit(EXIT_FAILURE);
4000 		close(p[1]);
4001 		close(listener);
4002 		sleep(1);
4003 
4004 		_exit(EXIT_SUCCESS);
4005 	}
4006 	if (read(p[0], &status, 1) != 0)
4007 		_exit(EXIT_SUCCESS);
4008 	close(p[0]);
4009 	/*
4010 	 * The seccomp filter has become unused so we should be notified once
4011 	 * the kernel gets around to cleaning up task struct.
4012 	 */
4013 	EXPECT_EQ(ioctl(200, SECCOMP_IOCTL_NOTIF_RECV, &req), -1);
4014 	EXPECT_EQ(errno, ENOENT);
4015 
4016 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4017 	EXPECT_EQ(true, WIFEXITED(status));
4018 	EXPECT_EQ(0, WEXITSTATUS(status));
4019 }
4020 
4021 static void *do_thread(void *data)
4022 {
4023 	return NULL;
4024 }
4025 
4026 TEST(user_notification_filter_empty_threaded)
4027 {
4028 	pid_t pid;
4029 	long ret;
4030 	int status;
4031 	struct pollfd pollfd;
4032 	struct __clone_args args = {
4033 		.flags = CLONE_FILES,
4034 		.exit_signal = SIGCHLD,
4035 	};
4036 
4037 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4038 	ASSERT_EQ(0, ret) {
4039 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4040 	}
4041 
4042 	if (__NR_clone3 < 0)
4043 		SKIP(return, "Test not built with clone3 support");
4044 
4045 	pid = sys_clone3(&args, sizeof(args));
4046 	ASSERT_GE(pid, 0);
4047 
4048 	if (pid == 0) {
4049 		pid_t pid1, pid2;
4050 		int listener, status;
4051 		pthread_t thread;
4052 
4053 		listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
4054 		if (listener < 0)
4055 			_exit(EXIT_FAILURE);
4056 
4057 		if (dup2(listener, 200) != 200)
4058 			_exit(EXIT_FAILURE);
4059 
4060 		close(listener);
4061 
4062 		pid1 = fork();
4063 		if (pid1 < 0)
4064 			_exit(EXIT_FAILURE);
4065 
4066 		if (pid1 == 0)
4067 			_exit(EXIT_SUCCESS);
4068 
4069 		pid2 = fork();
4070 		if (pid2 < 0)
4071 			_exit(EXIT_FAILURE);
4072 
4073 		if (pid2 == 0)
4074 			_exit(EXIT_SUCCESS);
4075 
4076 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
4077 		    pthread_join(thread, NULL))
4078 			_exit(EXIT_FAILURE);
4079 
4080 		if (pthread_create(&thread, NULL, do_thread, NULL) ||
4081 		    pthread_join(thread, NULL))
4082 			_exit(EXIT_FAILURE);
4083 
4084 		if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
4085 		    WEXITSTATUS(status))
4086 			_exit(EXIT_FAILURE);
4087 
4088 		if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
4089 		    WEXITSTATUS(status))
4090 			_exit(EXIT_FAILURE);
4091 
4092 		exit(EXIT_SUCCESS);
4093 	}
4094 
4095 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4096 	EXPECT_EQ(true, WIFEXITED(status));
4097 	EXPECT_EQ(0, WEXITSTATUS(status));
4098 
4099 	/*
4100 	 * The seccomp filter has become unused so we should be notified once
4101 	 * the kernel gets around to cleaning up task struct.
4102 	 */
4103 	pollfd.fd = 200;
4104 	pollfd.events = POLLHUP;
4105 
4106 	EXPECT_GT(poll(&pollfd, 1, 2000), 0);
4107 	EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
4108 }
4109 
4110 
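/*
 * Return the lowest fd above prev_fd that is not currently open (probed with
 * fcntl(F_GETFD)); bail out of the test if every fd up to FD_SETSIZE is taken.
 */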
4111 int get_next_fd(int prev_fd)
4112 {
4113 	for (int i = prev_fd + 1; i < FD_SETSIZE; ++i) {
4114 		if (fcntl(i, F_GETFD) == -1)
4115 			return i;
4116 	}
4117 	_exit(EXIT_FAILURE);
4118 }
4119 
4120 TEST(user_notification_addfd)
4121 {
4122 	pid_t pid;
4123 	long ret;
4124 	int status, listener, memfd, fd, nextfd;
4125 	struct seccomp_notif_addfd addfd = {};
4126 	struct seccomp_notif_addfd_small small = {};
4127 	struct seccomp_notif_addfd_big big = {};
4128 	struct seccomp_notif req = {};
4129 	struct seccomp_notif_resp resp = {};
4130 	/* 100 ms */
4131 	struct timespec delay = { .tv_nsec = 100000000 };
4132 
4133 	/* There may be arbitrary already-open fds at test start. */
4134 	memfd = memfd_create("test", 0);
4135 	ASSERT_GE(memfd, 0);
4136 	nextfd = get_next_fd(memfd);
4137 
4138 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4139 	ASSERT_EQ(0, ret) {
4140 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4141 	}
4142 
4143 	/* fd: 4 */
4144 	/* Check that the basic notification machinery works */
4145 	listener = user_notif_syscall(__NR_getppid,
4146 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4147 	ASSERT_EQ(listener, nextfd);
4148 	nextfd = get_next_fd(nextfd);
4149 
4150 	pid = fork();
4151 	ASSERT_GE(pid, 0);
4152 
4153 	if (pid == 0) {
4154 		/* fds will be added and this value is expected */
4155 		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
4156 			exit(1);
4157 
4158 		/* Atomic addfd+send is received here. Check it is a valid fd */
4159 		if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
4160 			exit(1);
4161 
4162 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4163 	}
4164 
4165 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4166 
4167 	addfd.srcfd = memfd;
4168 	addfd.newfd = 0;
4169 	addfd.id = req.id;
4170 	addfd.flags = 0x0;
4171 
4172 	/* Verify bad newfd_flags cannot be set */
4173 	addfd.newfd_flags = ~O_CLOEXEC;
4174 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4175 	EXPECT_EQ(errno, EINVAL);
4176 	addfd.newfd_flags = O_CLOEXEC;
4177 
4178 	/* Verify bad flags cannot be set */
4179 	addfd.flags = 0xff;
4180 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4181 	EXPECT_EQ(errno, EINVAL);
4182 	addfd.flags = 0;
4183 
4184 	/* Verify that remote_fd cannot be set without setting flags */
4185 	addfd.newfd = 1;
4186 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4187 	EXPECT_EQ(errno, EINVAL);
4188 	addfd.newfd = 0;
4189 
4190 	/* Verify small size cannot be set */
4191 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
4192 	EXPECT_EQ(errno, EINVAL);
4193 
4194 	/* Verify we can't send bits filled in unknown buffer area */
4195 	memset(&big, 0xAA, sizeof(big));
4196 	big.addfd = addfd;
4197 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
4198 	EXPECT_EQ(errno, E2BIG);
4199 
4200 
4201 	/* Verify we can set an arbitrary remote fd */
4202 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4203 	EXPECT_EQ(fd, nextfd);
4204 	nextfd = get_next_fd(nextfd);
4205 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4206 
4207 	/* Verify we can set an arbitrary remote fd with large size */
4208 	memset(&big, 0x0, sizeof(big));
4209 	big.addfd = addfd;
4210 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
4211 	EXPECT_EQ(fd, nextfd);
4212 	nextfd = get_next_fd(nextfd);
4213 
4214 	/* Verify we can set a specific remote fd */
4215 	addfd.newfd = 42;
4216 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4217 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4218 	EXPECT_EQ(fd, 42);
4219 	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4220 
4221 	/* Resume syscall */
4222 	resp.id = req.id;
4223 	resp.error = 0;
4224 	resp.val = USER_NOTIF_MAGIC;
4225 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4226 
4227 	/*
4228 	 * This sets the ID of the ADD FD to the last request plus 1. The
4229 	 * notification ID increments 1 per notification.
4230 	 */
4231 	addfd.id = req.id + 1;
4232 
4233 	/* This spins until the underlying notification is generated */
4234 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4235 	       errno != -EINPROGRESS)
4236 		nanosleep(&delay, NULL);
4237 
4238 	memset(&req, 0, sizeof(req));
4239 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4240 	ASSERT_EQ(addfd.id, req.id);
4241 
4242 	/* Verify we can do an atomic addfd and send */
4243 	addfd.newfd = 0;
4244 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4245 	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
4246 	/*
4247 	 * Child has earlier "low" fds and now 42, so we expect the next
4248 	 * lowest available fd to be assigned here.
4249 	 */
4250 	EXPECT_EQ(fd, nextfd);
4251 	nextfd = get_next_fd(nextfd);
4252 	ASSERT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
4253 
4254 	/*
4255 	 * This sets the ID of the ADD FD to the last request plus 1. The
4256 	 * notification ID increments 1 per notification.
4257 	 */
4258 	addfd.id = req.id + 1;
4259 
4260 	/* This spins until the underlying notification is generated */
4261 	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 &&
4262 	       errno != -EINPROGRESS)
4263 		nanosleep(&delay, NULL);
4264 
4265 	memset(&req, 0, sizeof(req));
4266 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4267 	ASSERT_EQ(addfd.id, req.id);
4268 
4269 	resp.id = req.id;
4270 	resp.error = 0;
4271 	resp.val = USER_NOTIF_MAGIC;
4272 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4273 
4274 	/* Wait for child to finish. */
4275 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4276 	EXPECT_EQ(true, WIFEXITED(status));
4277 	EXPECT_EQ(0, WEXITSTATUS(status));
4278 
4279 	close(memfd);
4280 }
4281 
4282 TEST(user_notification_addfd_rlimit)
4283 {
4284 	pid_t pid;
4285 	long ret;
4286 	int status, listener, memfd;
4287 	struct seccomp_notif_addfd addfd = {};
4288 	struct seccomp_notif req = {};
4289 	struct seccomp_notif_resp resp = {};
4290 	const struct rlimit lim = {
4291 		.rlim_cur	= 0,
4292 		.rlim_max	= 0,
4293 	};
4294 
4295 	memfd = memfd_create("test", 0);
4296 	ASSERT_GE(memfd, 0);
4297 
4298 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4299 	ASSERT_EQ(0, ret) {
4300 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4301 	}
4302 
4303 	/* Check that the basic notification machinery works */
4304 	listener = user_notif_syscall(__NR_getppid,
4305 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4306 	ASSERT_GE(listener, 0);
4307 
4308 	pid = fork();
4309 	ASSERT_GE(pid, 0);
4310 
4311 	if (pid == 0)
4312 		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
4313 
4314 
4315 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4316 
4317 	ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
4318 
4319 	addfd.srcfd = memfd;
4320 	addfd.newfd_flags = O_CLOEXEC;
4321 	addfd.newfd = 0;
4322 	addfd.id = req.id;
4323 	addfd.flags = 0;
4324 
4325 	/* Should probably spot check /proc/sys/fs/file-nr */
4326 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4327 	EXPECT_EQ(errno, EMFILE);
4328 
4329 	addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
4330 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4331 	EXPECT_EQ(errno, EMFILE);
4332 
4333 	addfd.newfd = 100;
4334 	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
4335 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
4336 	EXPECT_EQ(errno, EBADF);
4337 
4338 	resp.id = req.id;
4339 	resp.error = 0;
4340 	resp.val = USER_NOTIF_MAGIC;
4341 
4342 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4343 
4344 	/* Wait for child to finish. */
4345 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4346 	EXPECT_EQ(true, WIFEXITED(status));
4347 	EXPECT_EQ(0, WEXITSTATUS(status));
4348 
4349 	close(memfd);
4350 }
4351 
4352 #ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
4353 #define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
4354 #define SECCOMP_IOCTL_NOTIF_SET_FLAGS  SECCOMP_IOW(4, __u64)
4355 #endif
4356 
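/*
 * SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP asks the kernel to use synchronous
 * wake-ups between the notifying thread and the listener, a scheduling hint
 * intended to reduce notification round-trip latency.
 */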
4357 TEST(user_notification_sync)
4358 {
4359 	struct seccomp_notif req = {};
4360 	struct seccomp_notif_resp resp = {};
4361 	int status, listener;
4362 	pid_t pid;
4363 	long ret;
4364 
4365 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4366 	ASSERT_EQ(0, ret) {
4367 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4368 	}
4369 
4370 	listener = user_notif_syscall(__NR_getppid,
4371 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4372 	ASSERT_GE(listener, 0);
4373 
4374 	/* Try to set invalid flags. */
4375 	EXPECT_SYSCALL_RETURN(-EINVAL,
4376 		ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0));
4377 
4378 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
4379 			SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0);
4380 
4381 	pid = fork();
4382 	ASSERT_GE(pid, 0);
4383 	if (pid == 0) {
4384 		ret = syscall(__NR_getppid);
4385 		ASSERT_EQ(ret, USER_NOTIF_MAGIC) {
4386 			_exit(1);
4387 		}
4388 		_exit(0);
4389 	}
4390 
4391 	req.pid = 0;
4392 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4393 
4394 	ASSERT_EQ(req.data.nr,  __NR_getppid);
4395 
4396 	resp.id = req.id;
4397 	resp.error = 0;
4398 	resp.val = USER_NOTIF_MAGIC;
4399 	resp.flags = 0;
4400 	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4401 
4402 	ASSERT_EQ(waitpid(pid, &status, 0), pid);
4403 	ASSERT_EQ(status, 0);
4404 }
4405 
4406 
4407 /* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */
4408 FIXTURE(O_SUSPEND_SECCOMP) {
4409 	pid_t pid;
4410 };
4411 
4412 FIXTURE_SETUP(O_SUSPEND_SECCOMP)
4413 {
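	/*
	 * ERRNO_FILTER() (defined earlier in this file) declares
	 * prog_block_read, a filter that fails read() with the given errno.
	 */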
4414 	ERRNO_FILTER(block_read, E2BIG);
4415 	cap_value_t cap_list[] = { CAP_SYS_ADMIN };
4416 	cap_t caps;
4417 
4418 	self->pid = 0;
4419 
4420 	/* make sure we don't have CAP_SYS_ADMIN */
4421 	caps = cap_get_proc();
4422 	ASSERT_NE(NULL, caps);
4423 	ASSERT_EQ(0, cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR));
4424 	ASSERT_EQ(0, cap_set_proc(caps));
4425 	cap_free(caps);
4426 
4427 	ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
4428 	ASSERT_EQ(0, prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_block_read));
4429 
4430 	self->pid = fork();
4431 	ASSERT_GE(self->pid, 0);
4432 
4433 	if (self->pid == 0) {
4434 		while (1)
4435 			pause();
4436 		_exit(127);
4437 	}
4438 }
4439 
4440 FIXTURE_TEARDOWN(O_SUSPEND_SECCOMP)
4441 {
4442 	if (self->pid)
4443 		kill(self->pid, SIGKILL);
4444 }
4445 
4446 TEST_F(O_SUSPEND_SECCOMP, setoptions)
4447 {
4448 	int wstatus;
4449 
4450 	ASSERT_EQ(0, ptrace(PTRACE_ATTACH, self->pid, NULL, 0));
4451 	ASSERT_EQ(self->pid, wait(&wstatus));
4452 	ASSERT_EQ(-1, ptrace(PTRACE_SETOPTIONS, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP));
4453 	if (errno == EINVAL)
4454 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4455 	ASSERT_EQ(EPERM, errno);
4456 }
4457 
4458 TEST_F(O_SUSPEND_SECCOMP, seize)
4459 {
4460 	int ret;
4461 
4462 	ret = ptrace(PTRACE_SEIZE, self->pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
4463 	ASSERT_EQ(-1, ret);
4464 	if (errno == EINVAL)
4465 		SKIP(return, "Kernel does not support PTRACE_O_SUSPEND_SECCOMP (missing CONFIG_CHECKPOINT_RESTORE?)");
4466 	ASSERT_EQ(EPERM, errno);
4467 }
4468 
4469 /*
4470  * get_nth - Get the nth, space separated entry in a file.
4471  *
4472  * Returns the length of the read field.
4473  * Fails the test if the field is zero-length.
4474  */
4475 static ssize_t get_nth(struct __test_metadata *_metadata, const char *path,
4476 		     const unsigned int position, char **entry)
4477 {
4478 	char *line = NULL;
4479 	unsigned int i;
4480 	ssize_t nread;
4481 	size_t len = 0;
4482 	FILE *f;
4483 
4484 	f = fopen(path, "r");
4485 	ASSERT_NE(f, NULL) {
4486 		TH_LOG("Could not open %s: %s", path, strerror(errno));
4487 	}
4488 
4489 	for (i = 0; i < position; i++) {
4490 		nread = getdelim(&line, &len, ' ', f);
4491 		ASSERT_GE(nread, 0) {
4492 			TH_LOG("Failed to read %d entry in file %s", i, path);
4493 		}
4494 	}
4495 	fclose(f);
4496 
4497 	ASSERT_GT(nread, 0) {
4498 		TH_LOG("Entry in file %s had zero length", path);
4499 	}
4500 
4501 	*entry = line;
4502 	return nread - 1;
4503 }
4504 
4505 /* For a given PID, get the task state (D, R, etc...) */
4506 static char get_proc_stat(struct __test_metadata *_metadata, pid_t pid)
4507 {
4508 	char proc_path[100] = {0};
4509 	char status;
4510 	char *line;
4511 
4512 	snprintf(proc_path, sizeof(proc_path), "/proc/%d/stat", pid);
4513 	ASSERT_EQ(get_nth(_metadata, proc_path, 3, &line), 1);
4514 
4515 	status = *line;
4516 	free(line);
4517 
4518 	return status;
4519 }
4520 
4521 TEST(user_notification_fifo)
4522 {
4523 	struct seccomp_notif_resp resp = {};
4524 	struct seccomp_notif req = {};
4525 	int i, status, listener;
4526 	pid_t pid, pids[3];
4527 	__u64 baseid;
4528 	long ret;
4529 	/* 100 ms */
4530 	struct timespec delay = { .tv_nsec = 100000000 };
4531 
4532 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4533 	ASSERT_EQ(0, ret) {
4534 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4535 	}
4536 
4537 	/* Setup a listener */
4538 	listener = user_notif_syscall(__NR_getppid,
4539 				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
4540 	ASSERT_GE(listener, 0);
4541 
4542 	pid = fork();
4543 	ASSERT_GE(pid, 0);
4544 
4545 	if (pid == 0) {
4546 		ret = syscall(__NR_getppid);
4547 		exit(ret != USER_NOTIF_MAGIC);
4548 	}
4549 
4550 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
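	/*
	 * Notification ids increase by one per notification, so record the id
	 * the three children forked below should start from; the loop further
	 * down expects their requests in exactly that order.
	 */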
4551 	baseid = req.id + 1;
4552 
4553 	resp.id = req.id;
4554 	resp.error = 0;
4555 	resp.val = USER_NOTIF_MAGIC;
4556 
4557 	/* Respond to the first notification so the child can exit. */
4558 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4559 
4560 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4561 	EXPECT_EQ(true, WIFEXITED(status));
4562 	EXPECT_EQ(0, WEXITSTATUS(status));
4563 
4564 	/* Start children, and generate notifications */
4565 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4566 		pid = fork();
4567 		if (pid == 0) {
4568 			ret = syscall(__NR_getppid);
4569 			exit(ret != USER_NOTIF_MAGIC);
4570 		}
4571 		pids[i] = pid;
4572 	}
4573 
4574 	/* This spins until all of the children are sleeping */
4575 restart_wait:
4576 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4577 		if (get_proc_stat(_metadata, pids[i]) != 'S') {
4578 			nanosleep(&delay, NULL);
4579 			goto restart_wait;
4580 		}
4581 	}
4582 
4583 	/* Read the notifications in order (and respond) */
4584 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4585 		memset(&req, 0, sizeof(req));
4586 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4587 		EXPECT_EQ(req.id, baseid + i);
4588 		resp.id = req.id;
4589 		EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4590 	}
4591 
4592 	/* Make sure notifications were received */
4593 	for (i = 0; i < ARRAY_SIZE(pids); i++) {
4594 		EXPECT_EQ(waitpid(pids[i], &status, 0), pids[i]);
4595 		EXPECT_EQ(true, WIFEXITED(status));
4596 		EXPECT_EQ(0, WEXITSTATUS(status));
4597 	}
4598 }
4599 
4600 /* get_proc_syscall - Get the syscall in progress for a given pid
4601  *
4602  * Returns the current syscall number for a given process
4603  * Returns -1 if not in syscall (running or blocked)
4604  */
4605 static long get_proc_syscall(struct __test_metadata *_metadata, int pid)
4606 {
4607 	char proc_path[100] = {0};
4608 	long ret = -1;
4609 	ssize_t nread;
4610 	char *line;
4611 
4612 	snprintf(proc_path, sizeof(proc_path), "/proc/%d/syscall", pid);
4613 	nread = get_nth(_metadata, proc_path, 1, &line);
4614 	ASSERT_GT(nread, 0);
4615 
4616 	if (!strncmp("running", line, MIN(7, nread)))
4617 		ret = strtol(line, NULL, 16);
4618 
4619 	free(line);
4620 	return ret;
4621 }
4622 
4623 /* Ensure non-fatal signals prior to receive are unmodified */
4624 TEST(user_notification_wait_killable_pre_notification)
4625 {
4626 	struct sigaction new_action = {
4627 		.sa_handler = signal_handler,
4628 	};
4629 	int listener, status, sk_pair[2];
4630 	pid_t pid;
4631 	long ret;
4632 	char c;
4633 	/* 100 ms */
4634 	struct timespec delay = { .tv_nsec = 100000000 };
4635 
4636 	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4637 
4638 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4639 	ASSERT_EQ(0, ret)
4640 	{
4641 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4642 	}
4643 
4644 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4645 
4646 	listener = user_notif_syscall(
4647 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4648 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4649 	ASSERT_GE(listener, 0);
4650 
4651 	/*
4652 	 * Check that we can kill the process with SIGUSR1 prior to receiving
4653 	 * the notification. SIGUSR1 is wired up to a custom signal handler,
4654 	 * and we verify that the handler actually runs.
4655 	 */
4656 	pid = fork();
4657 	ASSERT_GE(pid, 0);
4658 
4659 	if (pid == 0) {
4660 		close(sk_pair[0]);
4661 		handled = sk_pair[1];
4662 
4663 		/* Setup the non-fatal sigaction without SA_RESTART */
4664 		if (sigaction(SIGUSR1, &new_action, NULL)) {
4665 			perror("sigaction");
4666 			exit(1);
4667 		}
4668 
4669 		ret = syscall(__NR_getppid);
4670 		/* Make sure we got a return from a signal interruption */
4671 		exit(ret != -1 || errno != EINTR);
4672 	}
4673 
4674 	/*
4675 	 * Make sure we've gotten to the seccomp user notification wait
4676 	 * from getppid prior to sending any signals
4677 	 */
4678 	while (get_proc_syscall(_metadata, pid) != __NR_getppid &&
4679 	       get_proc_stat(_metadata, pid) != 'S')
4680 		nanosleep(&delay, NULL);
4681 
4682 	/* Send non-fatal kill signal */
4683 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
4684 
4685 	/* wait for process to exit (exit checks for EINTR) */
4686 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4687 	EXPECT_EQ(true, WIFEXITED(status));
4688 	EXPECT_EQ(0, WEXITSTATUS(status));
4689 
4690 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4691 }
4692 
4693 /* Ensure non-fatal signals after receive are blocked */
4694 TEST(user_notification_wait_killable)
4695 {
4696 	struct sigaction new_action = {
4697 		.sa_handler = signal_handler,
4698 	};
4699 	struct seccomp_notif_resp resp = {};
4700 	struct seccomp_notif req = {};
4701 	int listener, status, sk_pair[2];
4702 	pid_t pid;
4703 	long ret;
4704 	char c;
4705 	/* 100 ms */
4706 	struct timespec delay = { .tv_nsec = 100000000 };
4707 
4708 	ASSERT_EQ(sigemptyset(&new_action.sa_mask), 0);
4709 
4710 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4711 	ASSERT_EQ(0, ret)
4712 	{
4713 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4714 	}
4715 
4716 	ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
4717 
4718 	listener = user_notif_syscall(
4719 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4720 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4721 	ASSERT_GE(listener, 0);
4722 
4723 	pid = fork();
4724 	ASSERT_GE(pid, 0);
4725 
4726 	if (pid == 0) {
4727 		close(sk_pair[0]);
4728 		handled = sk_pair[1];
4729 
4730 		/* Setup the sigaction without SA_RESTART */
4731 		if (sigaction(SIGUSR1, &new_action, NULL)) {
4732 			perror("sigaction");
4733 			exit(1);
4734 		}
4735 
4736 		/* Make sure that the syscall is completed (no EINTR) */
4737 		ret = syscall(__NR_getppid);
4738 		exit(ret != USER_NOTIF_MAGIC);
4739 	}
4740 
4741 	/*
4742 	 * Get the notification, to move the notifying process into a
4743 	 * non-preemptible (TASK_KILLABLE) state.
4744 	 */
4745 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4746 	/* Send non-fatal kill signal */
4747 	EXPECT_EQ(kill(pid, SIGUSR1), 0);
4748 
4749 	/*
4750 	 * Make sure the task moves to TASK_KILLABLE by waiting for the
4751 	 * D (Disk Sleep) state after receiving the non-fatal signal.
4752 	 */
4753 	while (get_proc_stat(_metadata, pid) != 'D')
4754 		nanosleep(&delay, NULL);
4755 
4756 	resp.id = req.id;
4757 	resp.val = USER_NOTIF_MAGIC;
4758 	/* Make sure the notification is found and able to be replied to */
4759 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
4760 
4761 	/*
4762 	 * Make sure that the signal handler does get called once we're back in
4763 	 * userspace.
4764 	 */
4765 	EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
4766 	/* wait for process to exit (exit checks for USER_NOTIF_MAGIC) */
4767 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4768 	EXPECT_EQ(true, WIFEXITED(status));
4769 	EXPECT_EQ(0, WEXITSTATUS(status));
4770 }
4771 
4772 /* Ensure fatal signals after receive are not blocked */
4773 TEST(user_notification_wait_killable_fatal)
4774 {
4775 	struct seccomp_notif req = {};
4776 	int listener, status;
4777 	pid_t pid;
4778 	long ret;
4779 	/* 100 ms */
4780 	struct timespec delay = { .tv_nsec = 100000000 };
4781 
4782 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4783 	ASSERT_EQ(0, ret)
4784 	{
4785 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4786 	}
4787 
4788 	listener = user_notif_syscall(
4789 		__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER |
4790 				      SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV);
4791 	ASSERT_GE(listener, 0);
4792 
4793 	pid = fork();
4794 	ASSERT_GE(pid, 0);
4795 
4796 	if (pid == 0) {
4797 		/* This should never complete as it should get a SIGTERM */
4798 		syscall(__NR_getppid);
4799 		exit(1);
4800 	}
4801 
4802 	while (get_proc_stat(_metadata, pid) != 'S')
4803 		nanosleep(&delay, NULL);
4804 
4805 	/*
4806 	 * Get the notification, to move the notifying process into a
4807 	 * non-preemptible (TASK_KILLABLE) state.
4808 	 */
4809 	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
4810 	/* Kill the process with a fatal signal */
4811 	EXPECT_EQ(kill(pid, SIGTERM), 0);
4812 
4813 	/*
4814 	 * Wait for the process to exit, and make sure the process terminated
4815 	 * due to the SIGTERM signal.
4816 	 */
4817 	EXPECT_EQ(waitpid(pid, &status, 0), pid);
4818 	EXPECT_EQ(true, WIFSIGNALED(status));
4819 	EXPECT_EQ(SIGTERM, WTERMSIG(status));
4820 }
4821 
4822 struct tsync_vs_thread_leader_args {
4823 	pthread_t leader;
4824 };
4825 
4826 static void *tsync_vs_dead_thread_leader_sibling(void *_args)
4827 {
4828 	struct sock_filter allow_filter[] = {
4829 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
4830 	};
4831 	struct sock_fprog allow_prog = {
4832 		.len = (unsigned short)ARRAY_SIZE(allow_filter),
4833 		.filter = allow_filter,
4834 	};
4835 	struct tsync_vs_thread_leader_args *args = _args;
4836 	void *retval;
4837 	long ret;
4838 
4839 	ret = pthread_join(args->leader, &retval);
4840 	if (ret)
4841 		exit(1);
4842 	if (retval != _args)
4843 		exit(2);
4844 	ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &allow_prog);
4845 	if (ret)
4846 		exit(3);
4847 
4848 	exit(0);
4849 }
4850 
4851 /*
4852  * Ensure that a dead thread leader doesn't prevent installing new filters with
4853  * SECCOMP_FILTER_FLAG_TSYNC from other threads.
4854  */
4855 TEST(tsync_vs_dead_thread_leader)
4856 {
4857 	int status;
4858 	pid_t pid;
4859 	long ret;
4860 
4861 	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
4862 	ASSERT_EQ(0, ret) {
4863 		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
4864 	}
4865 
4866 	pid = fork();
4867 	ASSERT_GE(pid, 0);
4868 
4869 	if (pid == 0) {
4870 		struct sock_filter allow_filter[] = {
4871 			BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
4872 		};
4873 		struct sock_fprog allow_prog = {
4874 			.len = (unsigned short)ARRAY_SIZE(allow_filter),
4875 			.filter = allow_filter,
4876 		};
4877 		struct  tsync_vs_thread_leader_args *args;
4878 		pthread_t sibling;
4879 
4880 		args = malloc(sizeof(*args));
4881 		ASSERT_NE(NULL, args);
4882 		args->leader = pthread_self();
4883 
4884 		ret = pthread_create(&sibling, NULL,
4885 				     tsync_vs_dead_thread_leader_sibling, args);
4886 		ASSERT_EQ(0, ret);
4887 
4888 		/* Install a new filter just to the leader thread. */
4889 		ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
4890 		ASSERT_EQ(0, ret);
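		/*
		 * Exit only the leader thread; the process lives on through
		 * the sibling, which joins the dead leader and then installs
		 * a filter with SECCOMP_FILTER_FLAG_TSYNC.
		 */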
4891 		pthread_exit(args);
4892 		exit(1);
4893 	}
4894 
4895 	EXPECT_EQ(pid, waitpid(pid, &status, 0));
4896 	EXPECT_EQ(0, status);
4897 }
4898 
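/*
 * Target function for the URETPROBE fixture below: depending on the variant,
 * a uretprobe is attached to it, and the tests then call it under various
 * seccomp filters.
 */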
4899 noinline int probed(void)
4900 {
4901 	return 1;
4902 }
4903 
4904 static int parse_uint_from_file(const char *file, const char *fmt)
4905 {
4906 	int err = -1, ret;
4907 	FILE *f;
4908 
4909 	f = fopen(file, "re");
4910 	if (f) {
4911 		err = fscanf(f, fmt, &ret);
4912 		fclose(f);
4913 	}
4914 	return err == 1 ? ret : err;
4915 }
4916 
4917 static int determine_uprobe_perf_type(void)
4918 {
4919 	const char *file = "/sys/bus/event_source/devices/uprobe/type";
4920 
4921 	return parse_uint_from_file(file, "%d\n");
4922 }
4923 
4924 static int determine_uprobe_retprobe_bit(void)
4925 {
4926 	const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe";
4927 
4928 	return parse_uint_from_file(file, "config:%d\n");
4929 }
4930 
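/*
 * Translate a virtual address in our own executable mapping into the file
 * offset the uprobe PMU expects: addr - vma start + file offset of the vma.
 */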
4931 static ssize_t get_uprobe_offset(const void *addr)
4932 {
4933 	size_t start, base, end;
4934 	bool found = false;
4935 	char buf[256];
4936 	FILE *f;
4937 
4938 	f = fopen("/proc/self/maps", "r");
4939 	if (!f)
4940 		return -1;
4941 
4942 	while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
4943 		if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
4944 			found = true;
4945 			break;
4946 		}
4947 	}
4948 	fclose(f);
4949 	return found ? (uintptr_t)addr - start + base : -1;
4950 }
4951 
4952 FIXTURE(URETPROBE) {
4953 	int fd;
4954 };
4955 
4956 FIXTURE_VARIANT(URETPROBE) {
4957 	/*
4958 	 * All of the URETPROBE behaviors can be tested with either
4959 	 * uretprobe attached or not
4960 	 */
4961 	bool attach;
4962 };
4963 
4964 FIXTURE_VARIANT_ADD(URETPROBE, attached) {
4965 	.attach = true,
4966 };
4967 
4968 FIXTURE_VARIANT_ADD(URETPROBE, not_attached) {
4969 	.attach = false,
4970 };
4971 
4972 FIXTURE_SETUP(URETPROBE)
4973 {
4974 	const size_t attr_sz = sizeof(struct perf_event_attr);
4975 	struct perf_event_attr attr;
4976 	ssize_t offset;
4977 	int type, bit;
4978 
4979 #ifndef __NR_uretprobe
4980 	SKIP(return, "__NR_uretprobe syscall not defined");
4981 #endif
4982 
4983 	if (!variant->attach)
4984 		return;
4985 
4986 	memset(&attr, 0, attr_sz);
4987 
4988 	type = determine_uprobe_perf_type();
4989 	ASSERT_GE(type, 0);
4990 	bit = determine_uprobe_retprobe_bit();
4991 	ASSERT_GE(bit, 0);
4992 	offset = get_uprobe_offset(probed);
4993 	ASSERT_GE(offset, 0);
4994 
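	/*
	 * uprobe PMU layout: the retprobe format bit in attr.config selects a
	 * uretprobe, config1 points at the binary's path and config2 holds
	 * the probe offset within it.
	 */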
4995 	attr.config |= 1 << bit;
4996 	attr.size = attr_sz;
4997 	attr.type = type;
4998 	attr.config1 = ptr_to_u64("/proc/self/exe");
4999 	attr.config2 = offset;
5000 
5001 	self->fd = syscall(__NR_perf_event_open, &attr,
5002 			   getpid() /* pid */, -1 /* cpu */, -1 /* group_fd */,
5003 			   PERF_FLAG_FD_CLOEXEC);
5004 }
5005 
5006 FIXTURE_TEARDOWN(URETPROBE)
5007 {
5008 	/* We could call close(self->fd), but that would need an extra filter
5009 	 * entry to allow it, and the test calls _exit() right away anyway.
5010 	 */
5011 }
5012 
5013 static int run_probed_with_filter(struct sock_fprog *prog)
5014 {
5015 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
5016 	    seccomp(SECCOMP_SET_MODE_FILTER, 0, prog)) {
5017 		return -1;
5018 	}
5019 
5020 	probed();
5021 	return 0;
5022 }
5023 
5024 TEST_F(URETPROBE, uretprobe_default_allow)
5025 {
5026 	struct sock_filter filter[] = {
5027 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5028 	};
5029 	struct sock_fprog prog = {
5030 		.len = (unsigned short)ARRAY_SIZE(filter),
5031 		.filter = filter,
5032 	};
5033 
5034 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5035 }
5036 
5037 TEST_F(URETPROBE, uretprobe_default_block)
5038 {
5039 	struct sock_filter filter[] = {
5040 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5041 			offsetof(struct seccomp_data, nr)),
5042 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
5043 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5044 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5045 	};
5046 	struct sock_fprog prog = {
5047 		.len = (unsigned short)ARRAY_SIZE(filter),
5048 		.filter = filter,
5049 	};
5050 
5051 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5052 }
5053 
5054 TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
5055 {
5056 	struct sock_filter filter[] = {
5057 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5058 			offsetof(struct seccomp_data, nr)),
5059 #ifdef __NR_uretprobe
5060 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
5061 #endif
5062 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5063 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5064 	};
5065 	struct sock_fprog prog = {
5066 		.len = (unsigned short)ARRAY_SIZE(filter),
5067 		.filter = filter,
5068 	};
5069 
5070 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5071 }
5072 
5073 TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
5074 {
5075 	struct sock_filter filter[] = {
5076 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
5077 			offsetof(struct seccomp_data, nr)),
5078 #ifdef __NR_uretprobe
5079 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
5080 #endif
5081 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit_group, 1, 0),
5082 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
5083 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
5084 	};
5085 	struct sock_fprog prog = {
5086 		.len = (unsigned short)ARRAY_SIZE(filter),
5087 		.filter = filter,
5088 	};
5089 
5090 	ASSERT_EQ(0, run_probed_with_filter(&prog));
5091 }
5092 
5093 /*
5094  * TODO:
5095  * - expand NNP testing
5096  * - better arch-specific TRACE and TRAP handlers.
5097  * - endianness checking when appropriate
5098  * - 64-bit arg prodding
5099  * - arch value testing (x86 modes especially)
5100  * - verify that FILTER_FLAG_LOG filters generate log messages
5101  * - verify that RET_LOG generates log messages
5102  */
5103 
5104 TEST_HARNESS_MAIN
5105