1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> 4 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 5 */ 6 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <stdarg.h> 10 #include <unistd.h> 11 #include <errno.h> 12 #include <fcntl.h> 13 #include <sched.h> 14 #include <signal.h> 15 #include <string.h> 16 #include <sys/mman.h> 17 #include <sys/stat.h> 18 #include <sys/wait.h> 19 #include <sys/time.h> 20 #include <sys/resource.h> 21 #include <asm/ldt.h> 22 #include <asm/unistd.h> 23 #include <init.h> 24 #include <os.h> 25 #include <kern_util.h> 26 #include <mem_user.h> 27 #include <ptrace_user.h> 28 #include <stdbool.h> 29 #include <stub-data.h> 30 #include <sys/prctl.h> 31 #include <linux/seccomp.h> 32 #include <linux/filter.h> 33 #include <sysdep/mcontext.h> 34 #include <sysdep/stub.h> 35 #include <registers.h> 36 #include <skas.h> 37 #include "internal.h" 38 39 static void ptrace_child(void) 40 { 41 int ret; 42 /* Calling os_getpid because some libcs cached getpid incorrectly */ 43 int pid = os_getpid(), ppid = getppid(); 44 int sc_result; 45 46 if (change_sig(SIGWINCH, 0) < 0 || 47 ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) { 48 perror("ptrace"); 49 kill(pid, SIGKILL); 50 } 51 kill(pid, SIGSTOP); 52 53 /* 54 * This syscall will be intercepted by the parent. Don't call more than 55 * once, please. 56 */ 57 sc_result = os_getpid(); 58 59 if (sc_result == pid) 60 /* Nothing modified by the parent, we are running normally. */ 61 ret = 1; 62 else if (sc_result == ppid) 63 /* 64 * Expected in check_ptrace and check_sysemu when they succeed 65 * in modifying the stack frame 66 */ 67 ret = 0; 68 else 69 /* Serious trouble! This could be caused by a bug in host 2.6 70 * SKAS3/2.6 patch before release -V6, together with a bug in 71 * the UML code itself. 72 */ 73 ret = 2; 74 75 exit(ret); 76 } 77 78 static void fatal_perror(const char *str) 79 { 80 perror(str); 81 exit(1); 82 } 83 84 static void fatal(char *fmt, ...) 85 { 86 va_list list; 87 88 va_start(list, fmt); 89 vfprintf(stderr, fmt, list); 90 va_end(list); 91 92 exit(1); 93 } 94 95 static void non_fatal(char *fmt, ...) 96 { 97 va_list list; 98 99 va_start(list, fmt); 100 vfprintf(stderr, fmt, list); 101 va_end(list); 102 } 103 104 static int start_ptraced_child(void) 105 { 106 int pid, n, status; 107 108 fflush(stdout); 109 110 pid = fork(); 111 if (pid == 0) 112 ptrace_child(); 113 else if (pid < 0) 114 fatal_perror("start_ptraced_child : fork failed"); 115 116 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); 117 if (n < 0) 118 fatal_perror("check_ptrace : waitpid failed"); 119 if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) 120 fatal("check_ptrace : expected SIGSTOP, got status = %d", 121 status); 122 123 return pid; 124 } 125 126 static void stop_ptraced_child(int pid, int exitcode) 127 { 128 int status, n; 129 130 if (ptrace(PTRACE_CONT, pid, 0, 0) < 0) 131 fatal_perror("stop_ptraced_child : ptrace failed"); 132 133 CATCH_EINTR(n = waitpid(pid, &status, 0)); 134 if (!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) { 135 int exit_with = WEXITSTATUS(status); 136 fatal("stop_ptraced_child : child exited with exitcode %d, " 137 "while expecting %d; status 0x%x\n", exit_with, 138 exitcode, status); 139 } 140 } 141 142 static void __init check_sysemu(void) 143 { 144 int pid, n, status, count=0; 145 146 os_info("Checking syscall emulation for ptrace..."); 147 pid = start_ptraced_child(); 148 149 if ((ptrace(PTRACE_SETOPTIONS, pid, 0, 150 (void *) PTRACE_O_TRACESYSGOOD) < 0)) 151 fatal_perror("check_sysemu: PTRACE_SETOPTIONS failed"); 152 153 while (1) { 154 count++; 155 if (ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0) 156 goto fail; 157 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); 158 if (n < 0) 159 fatal_perror("check_sysemu: wait failed"); 160 161 if (WIFSTOPPED(status) && 162 (WSTOPSIG(status) == (SIGTRAP|0x80))) { 163 if (!count) { 164 non_fatal("check_sysemu: SYSEMU_SINGLESTEP " 165 "doesn't singlestep"); 166 goto fail; 167 } 168 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, 169 os_getpid()); 170 if (n < 0) 171 fatal_perror("check_sysemu : failed to modify " 172 "system call return"); 173 break; 174 } 175 else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) 176 count++; 177 else { 178 non_fatal("check_sysemu: expected SIGTRAP or " 179 "(SIGTRAP | 0x80), got status = %d\n", 180 status); 181 goto fail; 182 } 183 } 184 stop_ptraced_child(pid, 0); 185 186 os_info("OK\n"); 187 188 return; 189 190 fail: 191 stop_ptraced_child(pid, 1); 192 fatal("missing\n"); 193 } 194 195 static void __init check_ptrace(void) 196 { 197 int pid, syscall, n, status; 198 199 os_info("Checking that ptrace can change system call numbers..."); 200 pid = start_ptraced_child(); 201 202 if ((ptrace(PTRACE_SETOPTIONS, pid, 0, 203 (void *) PTRACE_O_TRACESYSGOOD) < 0)) 204 fatal_perror("check_ptrace: PTRACE_SETOPTIONS failed"); 205 206 while (1) { 207 if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) 208 fatal_perror("check_ptrace : ptrace failed"); 209 210 CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); 211 if (n < 0) 212 fatal_perror("check_ptrace : wait failed"); 213 214 if (!WIFSTOPPED(status) || 215 (WSTOPSIG(status) != (SIGTRAP | 0x80))) 216 fatal("check_ptrace : expected (SIGTRAP|0x80), " 217 "got status = %d", status); 218 219 syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, 220 0); 221 if (syscall == __NR_getpid) { 222 n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, 223 __NR_getppid); 224 if (n < 0) 225 fatal_perror("check_ptrace : failed to modify " 226 "system call"); 227 break; 228 } 229 } 230 stop_ptraced_child(pid, 0); 231 os_info("OK\n"); 232 check_sysemu(); 233 } 234 235 extern unsigned long host_fp_size; 236 extern unsigned long exec_regs[MAX_REG_NR]; 237 extern unsigned long *exec_fp_regs; 238 239 __initdata static struct stub_data *seccomp_test_stub_data; 240 241 static void __init sigsys_handler(int sig, siginfo_t *info, void *p) 242 { 243 ucontext_t *uc = p; 244 245 /* Stow away the location of the mcontext in the stack */ 246 seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - 247 (unsigned long)&seccomp_test_stub_data->sigstack[0]; 248 249 /* Prevent libc from clearing memory (mctx_offset in particular) */ 250 syscall(__NR_exit, 0); 251 } 252 253 static int __init seccomp_helper(void *data) 254 { 255 static struct sock_filter filter[] = { 256 BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 257 offsetof(struct seccomp_data, nr)), 258 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), 259 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), 260 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), 261 }; 262 static struct sock_fprog prog = { 263 .len = ARRAY_SIZE(filter), 264 .filter = filter, 265 }; 266 struct sigaction sa; 267 268 /* close_range is needed for the stub */ 269 if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) 270 exit(1); 271 272 set_sigstack(seccomp_test_stub_data->sigstack, 273 sizeof(seccomp_test_stub_data->sigstack)); 274 275 sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; 276 sa.sa_sigaction = (void *) sigsys_handler; 277 sa.sa_restorer = NULL; 278 if (sigaction(SIGSYS, &sa, NULL) < 0) 279 exit(2); 280 281 prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 282 if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 283 SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) 284 exit(3); 285 286 sleep(0); 287 288 /* Never reached. */ 289 _exit(4); 290 } 291 292 static bool __init init_seccomp(void) 293 { 294 int pid; 295 int status; 296 int n; 297 unsigned long sp; 298 299 /* 300 * We check that we can install a seccomp filter and then exit(0) 301 * from a trapped syscall. 302 * 303 * Note that we cannot verify that no seccomp filter already exists 304 * for a syscall that results in the process/thread to be killed. 305 */ 306 307 os_info("Checking that seccomp filters can be installed..."); 308 309 seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), 310 PROT_READ | PROT_WRITE, 311 MAP_SHARED | MAP_ANON, 0, 0); 312 313 /* Use the syscall data area as stack, we just need something */ 314 sp = (unsigned long)&seccomp_test_stub_data->syscall_data + 315 sizeof(seccomp_test_stub_data->syscall_data) - 316 sizeof(void *); 317 pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); 318 319 if (pid < 0) 320 fatal_perror("check_seccomp : clone failed"); 321 322 CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); 323 if (n < 0) 324 fatal_perror("check_seccomp : waitpid failed"); 325 326 if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { 327 struct uml_pt_regs *regs; 328 unsigned long fp_size; 329 int r; 330 331 /* Fill in the host_fp_size from the mcontext. */ 332 regs = calloc(1, sizeof(struct uml_pt_regs)); 333 get_stub_state(regs, seccomp_test_stub_data, &fp_size); 334 host_fp_size = fp_size; 335 free(regs); 336 337 /* Repeat with the correct size */ 338 regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); 339 r = get_stub_state(regs, seccomp_test_stub_data, NULL); 340 341 /* Store as the default startup registers */ 342 exec_fp_regs = malloc(host_fp_size); 343 memcpy(exec_regs, regs->gp, sizeof(exec_regs)); 344 memcpy(exec_fp_regs, regs->fp, host_fp_size); 345 346 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); 347 348 free(regs); 349 350 if (r) { 351 os_info("failed to fetch registers: %d\n", r); 352 return false; 353 } 354 355 os_info("OK\n"); 356 return true; 357 } 358 359 if (WIFEXITED(status) && WEXITSTATUS(status) == 2) 360 os_info("missing\n"); 361 else 362 os_info("error\n"); 363 364 munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); 365 return false; 366 } 367 368 369 static void __init check_coredump_limit(void) 370 { 371 struct rlimit lim; 372 int err = getrlimit(RLIMIT_CORE, &lim); 373 374 if (err) { 375 perror("Getting core dump limit"); 376 return; 377 } 378 379 os_info("Core dump limits :\n\tsoft - "); 380 if (lim.rlim_cur == RLIM_INFINITY) 381 os_info("NONE\n"); 382 else 383 os_info("%llu\n", (unsigned long long)lim.rlim_cur); 384 385 os_info("\thard - "); 386 if (lim.rlim_max == RLIM_INFINITY) 387 os_info("NONE\n"); 388 else 389 os_info("%llu\n", (unsigned long long)lim.rlim_max); 390 } 391 392 void __init get_host_cpu_features( 393 void (*flags_helper_func)(char *line), 394 void (*cache_helper_func)(char *line)) 395 { 396 FILE *cpuinfo; 397 char *line = NULL; 398 size_t len = 0; 399 int done_parsing = 0; 400 401 cpuinfo = fopen("/proc/cpuinfo", "r"); 402 if (cpuinfo == NULL) { 403 os_info("Failed to get host CPU features\n"); 404 } else { 405 while ((getline(&line, &len, cpuinfo)) != -1) { 406 if (strstr(line, "flags")) { 407 flags_helper_func(line); 408 done_parsing++; 409 } 410 if (strstr(line, "cache_alignment")) { 411 cache_helper_func(line); 412 done_parsing++; 413 } 414 free(line); 415 line = NULL; 416 if (done_parsing > 1) 417 break; 418 } 419 fclose(cpuinfo); 420 } 421 } 422 423 static int seccomp_config __initdata; 424 425 static int __init uml_seccomp_config(char *line, int *add) 426 { 427 *add = 0; 428 429 if (strcmp(line, "off") == 0) 430 seccomp_config = 0; 431 else if (strcmp(line, "auto") == 0) 432 seccomp_config = 1; 433 else if (strcmp(line, "on") == 0) 434 seccomp_config = 2; 435 else 436 fatal("Invalid seccomp option '%s', expected on/auto/off\n", 437 line); 438 439 return 0; 440 } 441 442 __uml_setup("seccomp=", uml_seccomp_config, 443 "seccomp=<on/auto/off>\n" 444 " Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" 445 " processes work collaboratively with the kernel instead of being\n" 446 " traced using ptrace. All syscalls from the application are caught and\n" 447 " redirected using a signal. This signal handler in turn is permitted to\n" 448 " do the selected set of syscalls to communicate with the UML kernel and\n" 449 " do the required memory management.\n" 450 "\n" 451 " This method is overall faster than the ptrace based userspace, primarily\n" 452 " because it reduces the number of context switches for (minor) page faults.\n" 453 "\n" 454 " However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" 455 " userspace from reading and writing all physical memory. Userspace\n" 456 " processes could also trick the stub into disabling SIGALRM which\n" 457 " prevents it from being interrupted for scheduling purposes.\n" 458 "\n" 459 " This is insecure and should only be used with a trusted userspace\n\n" 460 ); 461 462 void __init os_early_checks(void) 463 { 464 int pid; 465 466 /* Print out the core dump limits early */ 467 check_coredump_limit(); 468 469 /* Need to check this early because mmapping happens before the 470 * kernel is running. 471 */ 472 check_tmpexec(); 473 474 if (seccomp_config) { 475 if (init_seccomp()) { 476 using_seccomp = 1; 477 return; 478 } 479 480 if (seccomp_config == 2) 481 fatal("SECCOMP userspace requested but not functional!\n"); 482 } 483 484 using_seccomp = 0; 485 check_ptrace(); 486 487 pid = start_ptraced_child(); 488 if (init_pid_registers(pid)) 489 fatal("Failed to initialize default registers"); 490 stop_ptraced_child(pid, 1); 491 } 492 493 int __init parse_iomem(char *str, int *add) 494 { 495 struct iomem_region *new; 496 struct stat64 buf; 497 char *file, *driver; 498 int fd, size; 499 500 driver = str; 501 file = strchr(str,','); 502 if (file == NULL) { 503 os_warn("parse_iomem : failed to parse iomem\n"); 504 goto out; 505 } 506 *file = '\0'; 507 file++; 508 fd = open(file, O_RDWR, 0); 509 if (fd < 0) { 510 perror("parse_iomem - Couldn't open io file"); 511 goto out; 512 } 513 514 if (fstat64(fd, &buf) < 0) { 515 perror("parse_iomem - cannot stat_fd file"); 516 goto out_close; 517 } 518 519 new = malloc(sizeof(*new)); 520 if (new == NULL) { 521 perror("Couldn't allocate iomem_region struct"); 522 goto out_close; 523 } 524 525 size = (buf.st_size + UM_KERN_PAGE_SIZE) & ~(UM_KERN_PAGE_SIZE - 1); 526 527 *new = ((struct iomem_region) { .next = iomem_regions, 528 .driver = driver, 529 .fd = fd, 530 .size = size, 531 .phys = 0, 532 .virt = 0 }); 533 iomem_regions = new; 534 iomem_size += new->size + UM_KERN_PAGE_SIZE; 535 536 return 0; 537 out_close: 538 close(fd); 539 out: 540 return 1; 541 } 542