1 /* 2 * QEMU seccomp mode 2 support with libseccomp 3 * 4 * Copyright IBM, Corp. 2012 5 * 6 * Authors: 7 * Eduardo Otubo <eotubo@br.ibm.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2. See 10 * the COPYING file in the top-level directory. 11 * 12 * Contributions after 2012-01-13 are licensed under the terms of the 13 * GNU GPL, version 2 or (at your option) any later version. 14 */ 15 16 #include "qemu/osdep.h" 17 #include "qapi/error.h" 18 #include "qemu/config-file.h" 19 #include "qemu/option.h" 20 #include "qemu/module.h" 21 #include <sys/prctl.h> 22 #include <seccomp.h> 23 #include "sysemu/seccomp.h" 24 #include <linux/seccomp.h> 25 26 /* For some architectures (notably ARM) cacheflush is not supported until 27 * libseccomp 2.2.3, but configure enforces that we are using a more recent 28 * version on those hosts, so it is OK for this check to be less strict. 29 */ 30 #if SCMP_VER_MAJOR >= 3 31 #define HAVE_CACHEFLUSH 32 #elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 2 33 #define HAVE_CACHEFLUSH 34 #endif 35 36 struct QemuSeccompSyscall { 37 int32_t num; 38 uint8_t set; 39 uint8_t narg; 40 const struct scmp_arg_cmp *arg_cmp; 41 uint32_t action; 42 }; 43 44 const struct scmp_arg_cmp sched_setscheduler_arg[] = { 45 /* was SCMP_A1(SCMP_CMP_NE, SCHED_IDLE), but expanded due to GCC 4.x bug */ 46 { .arg = 1, .op = SCMP_CMP_NE, .datum_a = SCHED_IDLE } 47 }; 48 49 /* 50 * See 'NOTES' in 'man 2 clone' - s390 & cross have 'flags' in 51 * different position to other architectures 52 */ 53 #if defined(HOST_S390X) || defined(HOST_S390) || defined(HOST_CRIS) 54 #define CLONE_FLAGS_ARG 1 55 #else 56 #define CLONE_FLAGS_ARG 0 57 #endif 58 59 #ifndef CLONE_PIDFD 60 # define CLONE_PIDFD 0x00001000 61 #endif 62 63 #define REQUIRE_CLONE_FLAG(flag) \ 64 const struct scmp_arg_cmp clone_arg ## flag[] = { \ 65 { .arg = CLONE_FLAGS_ARG, \ 66 .op = SCMP_CMP_MASKED_EQ, \ 67 .datum_a = flag, .datum_b = 0 } } 68 69 #define FORBID_CLONE_FLAG(flag) \ 70 const struct scmp_arg_cmp clone_arg ## flag[] = { \ 71 { .arg = CLONE_FLAGS_ARG, \ 72 .op = SCMP_CMP_MASKED_EQ, \ 73 .datum_a = flag, .datum_b = flag } } 74 75 #define RULE_CLONE_FLAG(flag) \ 76 { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, \ 77 ARRAY_SIZE(clone_arg ## flag), clone_arg ## flag, SCMP_ACT_TRAP } 78 79 /* If no CLONE_* flags are set, except CSIGNAL, deny */ 80 const struct scmp_arg_cmp clone_arg_none[] = { 81 { .arg = CLONE_FLAGS_ARG, 82 .op = SCMP_CMP_MASKED_EQ, 83 .datum_a = ~(CSIGNAL), .datum_b = 0 } 84 }; 85 86 /* 87 * pthread_create should always set all of these. 88 */ 89 REQUIRE_CLONE_FLAG(CLONE_VM); 90 REQUIRE_CLONE_FLAG(CLONE_FS); 91 REQUIRE_CLONE_FLAG(CLONE_FILES); 92 REQUIRE_CLONE_FLAG(CLONE_SIGHAND); 93 REQUIRE_CLONE_FLAG(CLONE_THREAD); 94 REQUIRE_CLONE_FLAG(CLONE_SYSVSEM); 95 REQUIRE_CLONE_FLAG(CLONE_SETTLS); 96 REQUIRE_CLONE_FLAG(CLONE_PARENT_SETTID); 97 REQUIRE_CLONE_FLAG(CLONE_CHILD_CLEARTID); 98 /* 99 * Musl sets this in pthread_create too, but it is 100 * obsolete and harmless since its behaviour is 101 * subsumed under CLONE_THREAD 102 */ 103 /*REQUIRE_CLONE_FLAG(CLONE_DETACHED);*/ 104 105 106 /* 107 * These all indicate an attempt to spawn a process 108 * instead of a thread, or other undesirable scenarios 109 */ 110 FORBID_CLONE_FLAG(CLONE_PIDFD); 111 FORBID_CLONE_FLAG(CLONE_PTRACE); 112 FORBID_CLONE_FLAG(CLONE_VFORK); 113 FORBID_CLONE_FLAG(CLONE_PARENT); 114 FORBID_CLONE_FLAG(CLONE_NEWNS); 115 FORBID_CLONE_FLAG(CLONE_UNTRACED); 116 FORBID_CLONE_FLAG(CLONE_NEWCGROUP); 117 FORBID_CLONE_FLAG(CLONE_NEWUTS); 118 FORBID_CLONE_FLAG(CLONE_NEWIPC); 119 FORBID_CLONE_FLAG(CLONE_NEWUSER); 120 FORBID_CLONE_FLAG(CLONE_NEWPID); 121 FORBID_CLONE_FLAG(CLONE_NEWNET); 122 FORBID_CLONE_FLAG(CLONE_IO); 123 124 125 static const struct QemuSeccompSyscall denylist[] = { 126 /* default set of syscalls that should get blocked */ 127 { SCMP_SYS(reboot), QEMU_SECCOMP_SET_DEFAULT, 128 0, NULL, SCMP_ACT_TRAP }, 129 { SCMP_SYS(swapon), QEMU_SECCOMP_SET_DEFAULT, 130 0, NULL, SCMP_ACT_TRAP }, 131 { SCMP_SYS(swapoff), QEMU_SECCOMP_SET_DEFAULT, 132 0, NULL, SCMP_ACT_TRAP }, 133 { SCMP_SYS(syslog), QEMU_SECCOMP_SET_DEFAULT, 134 0, NULL, SCMP_ACT_TRAP }, 135 { SCMP_SYS(mount), QEMU_SECCOMP_SET_DEFAULT, 136 0, NULL, SCMP_ACT_TRAP }, 137 { SCMP_SYS(umount), QEMU_SECCOMP_SET_DEFAULT, 138 0, NULL, SCMP_ACT_TRAP }, 139 { SCMP_SYS(kexec_load), QEMU_SECCOMP_SET_DEFAULT, 140 0, NULL, SCMP_ACT_TRAP }, 141 { SCMP_SYS(afs_syscall), QEMU_SECCOMP_SET_DEFAULT, 142 0, NULL, SCMP_ACT_TRAP }, 143 { SCMP_SYS(break), QEMU_SECCOMP_SET_DEFAULT, 144 0, NULL, SCMP_ACT_TRAP }, 145 { SCMP_SYS(ftime), QEMU_SECCOMP_SET_DEFAULT, 146 0, NULL, SCMP_ACT_TRAP }, 147 { SCMP_SYS(getpmsg), QEMU_SECCOMP_SET_DEFAULT, 148 0, NULL, SCMP_ACT_TRAP }, 149 { SCMP_SYS(gtty), QEMU_SECCOMP_SET_DEFAULT, 150 0, NULL, SCMP_ACT_TRAP }, 151 { SCMP_SYS(lock), QEMU_SECCOMP_SET_DEFAULT, 152 0, NULL, SCMP_ACT_TRAP }, 153 { SCMP_SYS(mpx), QEMU_SECCOMP_SET_DEFAULT, 154 0, NULL, SCMP_ACT_TRAP }, 155 { SCMP_SYS(prof), QEMU_SECCOMP_SET_DEFAULT, 156 0, NULL, SCMP_ACT_TRAP }, 157 { SCMP_SYS(profil), QEMU_SECCOMP_SET_DEFAULT, 158 0, NULL, SCMP_ACT_TRAP }, 159 { SCMP_SYS(putpmsg), QEMU_SECCOMP_SET_DEFAULT, 160 0, NULL, SCMP_ACT_TRAP }, 161 { SCMP_SYS(security), QEMU_SECCOMP_SET_DEFAULT, 162 0, NULL, SCMP_ACT_TRAP }, 163 { SCMP_SYS(stty), QEMU_SECCOMP_SET_DEFAULT, 164 0, NULL, SCMP_ACT_TRAP }, 165 { SCMP_SYS(tuxcall), QEMU_SECCOMP_SET_DEFAULT, 166 0, NULL, SCMP_ACT_TRAP }, 167 { SCMP_SYS(ulimit), QEMU_SECCOMP_SET_DEFAULT, 168 0, NULL, SCMP_ACT_TRAP }, 169 { SCMP_SYS(vserver), QEMU_SECCOMP_SET_DEFAULT, 170 0, NULL, SCMP_ACT_TRAP }, 171 /* obsolete */ 172 { SCMP_SYS(readdir), QEMU_SECCOMP_SET_OBSOLETE, 173 0, NULL, SCMP_ACT_TRAP }, 174 { SCMP_SYS(_sysctl), QEMU_SECCOMP_SET_OBSOLETE, 175 0, NULL, SCMP_ACT_TRAP }, 176 { SCMP_SYS(bdflush), QEMU_SECCOMP_SET_OBSOLETE, 177 0, NULL, SCMP_ACT_TRAP }, 178 { SCMP_SYS(create_module), QEMU_SECCOMP_SET_OBSOLETE, 179 0, NULL, SCMP_ACT_TRAP }, 180 { SCMP_SYS(get_kernel_syms), QEMU_SECCOMP_SET_OBSOLETE, 181 0, NULL, SCMP_ACT_TRAP }, 182 { SCMP_SYS(query_module), QEMU_SECCOMP_SET_OBSOLETE, 183 0, NULL, SCMP_ACT_TRAP }, 184 { SCMP_SYS(sgetmask), QEMU_SECCOMP_SET_OBSOLETE, 185 0, NULL, SCMP_ACT_TRAP }, 186 { SCMP_SYS(ssetmask), QEMU_SECCOMP_SET_OBSOLETE, 187 0, NULL, SCMP_ACT_TRAP }, 188 { SCMP_SYS(sysfs), QEMU_SECCOMP_SET_OBSOLETE, 189 0, NULL, SCMP_ACT_TRAP }, 190 { SCMP_SYS(uselib), QEMU_SECCOMP_SET_OBSOLETE, 191 0, NULL, SCMP_ACT_TRAP }, 192 { SCMP_SYS(ustat), QEMU_SECCOMP_SET_OBSOLETE, 193 0, NULL, SCMP_ACT_TRAP }, 194 /* privileged */ 195 { SCMP_SYS(setuid), QEMU_SECCOMP_SET_PRIVILEGED, 196 0, NULL, SCMP_ACT_TRAP }, 197 { SCMP_SYS(setgid), QEMU_SECCOMP_SET_PRIVILEGED, 198 0, NULL, SCMP_ACT_TRAP }, 199 { SCMP_SYS(setpgid), QEMU_SECCOMP_SET_PRIVILEGED, 200 0, NULL, SCMP_ACT_TRAP }, 201 { SCMP_SYS(setsid), QEMU_SECCOMP_SET_PRIVILEGED, 202 0, NULL, SCMP_ACT_TRAP }, 203 { SCMP_SYS(setreuid), QEMU_SECCOMP_SET_PRIVILEGED, 204 0, NULL, SCMP_ACT_TRAP }, 205 { SCMP_SYS(setregid), QEMU_SECCOMP_SET_PRIVILEGED, 206 0, NULL, SCMP_ACT_TRAP }, 207 { SCMP_SYS(setresuid), QEMU_SECCOMP_SET_PRIVILEGED, 208 0, NULL, SCMP_ACT_TRAP }, 209 { SCMP_SYS(setresgid), QEMU_SECCOMP_SET_PRIVILEGED, 210 0, NULL, SCMP_ACT_TRAP }, 211 { SCMP_SYS(setfsuid), QEMU_SECCOMP_SET_PRIVILEGED, 212 0, NULL, SCMP_ACT_TRAP }, 213 { SCMP_SYS(setfsgid), QEMU_SECCOMP_SET_PRIVILEGED, 214 0, NULL, SCMP_ACT_TRAP }, 215 /* spawn */ 216 { SCMP_SYS(fork), QEMU_SECCOMP_SET_SPAWN, 217 0, NULL, SCMP_ACT_TRAP }, 218 { SCMP_SYS(vfork), QEMU_SECCOMP_SET_SPAWN, 219 0, NULL, SCMP_ACT_TRAP }, 220 { SCMP_SYS(execve), QEMU_SECCOMP_SET_SPAWN, 221 0, NULL, SCMP_ACT_TRAP }, 222 { SCMP_SYS(clone), QEMU_SECCOMP_SET_SPAWN, 223 ARRAY_SIZE(clone_arg_none), clone_arg_none, SCMP_ACT_TRAP }, 224 RULE_CLONE_FLAG(CLONE_VM), 225 RULE_CLONE_FLAG(CLONE_FS), 226 RULE_CLONE_FLAG(CLONE_FILES), 227 RULE_CLONE_FLAG(CLONE_SIGHAND), 228 RULE_CLONE_FLAG(CLONE_THREAD), 229 RULE_CLONE_FLAG(CLONE_SYSVSEM), 230 RULE_CLONE_FLAG(CLONE_SETTLS), 231 RULE_CLONE_FLAG(CLONE_PARENT_SETTID), 232 RULE_CLONE_FLAG(CLONE_CHILD_CLEARTID), 233 /*RULE_CLONE_FLAG(CLONE_DETACHED),*/ 234 RULE_CLONE_FLAG(CLONE_PIDFD), 235 RULE_CLONE_FLAG(CLONE_PTRACE), 236 RULE_CLONE_FLAG(CLONE_VFORK), 237 RULE_CLONE_FLAG(CLONE_PARENT), 238 RULE_CLONE_FLAG(CLONE_NEWNS), 239 RULE_CLONE_FLAG(CLONE_UNTRACED), 240 RULE_CLONE_FLAG(CLONE_NEWCGROUP), 241 RULE_CLONE_FLAG(CLONE_NEWUTS), 242 RULE_CLONE_FLAG(CLONE_NEWIPC), 243 RULE_CLONE_FLAG(CLONE_NEWUSER), 244 RULE_CLONE_FLAG(CLONE_NEWPID), 245 RULE_CLONE_FLAG(CLONE_NEWNET), 246 RULE_CLONE_FLAG(CLONE_IO), 247 #ifdef __SNR_clone3 248 { SCMP_SYS(clone3), QEMU_SECCOMP_SET_SPAWN, 249 0, NULL, SCMP_ACT_ERRNO(ENOSYS) }, 250 #endif 251 /* resource control */ 252 { SCMP_SYS(setpriority), QEMU_SECCOMP_SET_RESOURCECTL, 253 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 254 { SCMP_SYS(sched_setparam), QEMU_SECCOMP_SET_RESOURCECTL, 255 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 256 { SCMP_SYS(sched_setscheduler), QEMU_SECCOMP_SET_RESOURCECTL, 257 ARRAY_SIZE(sched_setscheduler_arg), sched_setscheduler_arg, 258 SCMP_ACT_ERRNO(EPERM) }, 259 { SCMP_SYS(sched_setaffinity), QEMU_SECCOMP_SET_RESOURCECTL, 260 0, NULL, SCMP_ACT_ERRNO(EPERM) }, 261 }; 262 263 static inline __attribute__((unused)) int 264 qemu_seccomp(unsigned int operation, unsigned int flags, void *args) 265 { 266 #ifdef __NR_seccomp 267 return syscall(__NR_seccomp, operation, flags, args); 268 #else 269 errno = ENOSYS; 270 return -1; 271 #endif 272 } 273 274 static uint32_t qemu_seccomp_update_action(uint32_t action) 275 { 276 #if defined(SECCOMP_GET_ACTION_AVAIL) && defined(SCMP_ACT_KILL_PROCESS) && \ 277 defined(SECCOMP_RET_KILL_PROCESS) 278 if (action == SCMP_ACT_TRAP) { 279 static int kill_process = -1; 280 if (kill_process == -1) { 281 uint32_t action = SECCOMP_RET_KILL_PROCESS; 282 283 if (qemu_seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &action) == 0) { 284 kill_process = 1; 285 } else { 286 kill_process = 0; 287 } 288 } 289 if (kill_process == 1) { 290 return SCMP_ACT_KILL_PROCESS; 291 } 292 } 293 #endif 294 return action; 295 } 296 297 298 static int seccomp_start(uint32_t seccomp_opts, Error **errp) 299 { 300 int rc = -1; 301 unsigned int i = 0; 302 scmp_filter_ctx ctx; 303 304 ctx = seccomp_init(SCMP_ACT_ALLOW); 305 if (ctx == NULL) { 306 error_setg(errp, "failed to initialize seccomp context"); 307 goto seccomp_return; 308 } 309 310 rc = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_TSYNC, 1); 311 if (rc != 0) { 312 error_setg_errno(errp, -rc, 313 "failed to set seccomp thread synchronization"); 314 goto seccomp_return; 315 } 316 317 for (i = 0; i < ARRAY_SIZE(denylist); i++) { 318 uint32_t action; 319 if (!(seccomp_opts & denylist[i].set)) { 320 continue; 321 } 322 323 action = qemu_seccomp_update_action(denylist[i].action); 324 rc = seccomp_rule_add_array(ctx, action, denylist[i].num, 325 denylist[i].narg, denylist[i].arg_cmp); 326 if (rc < 0) { 327 error_setg_errno(errp, -rc, 328 "failed to add seccomp denylist rules"); 329 goto seccomp_return; 330 } 331 } 332 333 rc = seccomp_load(ctx); 334 if (rc < 0) { 335 error_setg_errno(errp, -rc, 336 "failed to load seccomp syscall filter in kernel"); 337 } 338 339 seccomp_return: 340 seccomp_release(ctx); 341 return rc < 0 ? -1 : 0; 342 } 343 344 int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp) 345 { 346 if (qemu_opt_get_bool(opts, "enable", false)) { 347 uint32_t seccomp_opts = QEMU_SECCOMP_SET_DEFAULT 348 | QEMU_SECCOMP_SET_OBSOLETE; 349 const char *value = NULL; 350 351 value = qemu_opt_get(opts, "obsolete"); 352 if (value) { 353 if (g_str_equal(value, "allow")) { 354 seccomp_opts &= ~QEMU_SECCOMP_SET_OBSOLETE; 355 } else if (g_str_equal(value, "deny")) { 356 /* this is the default option, this if is here 357 * to provide a little bit of consistency for 358 * the command line */ 359 } else { 360 error_setg(errp, "invalid argument for obsolete"); 361 return -1; 362 } 363 } 364 365 value = qemu_opt_get(opts, "elevateprivileges"); 366 if (value) { 367 if (g_str_equal(value, "deny")) { 368 seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED; 369 } else if (g_str_equal(value, "children")) { 370 seccomp_opts |= QEMU_SECCOMP_SET_PRIVILEGED; 371 372 /* calling prctl directly because we're 373 * not sure if host has CAP_SYS_ADMIN set*/ 374 if (prctl(PR_SET_NO_NEW_PRIVS, 1)) { 375 error_setg(errp, "failed to set no_new_privs aborting"); 376 return -1; 377 } 378 } else if (g_str_equal(value, "allow")) { 379 /* default value */ 380 } else { 381 error_setg(errp, "invalid argument for elevateprivileges"); 382 return -1; 383 } 384 } 385 386 value = qemu_opt_get(opts, "spawn"); 387 if (value) { 388 if (g_str_equal(value, "deny")) { 389 seccomp_opts |= QEMU_SECCOMP_SET_SPAWN; 390 } else if (g_str_equal(value, "allow")) { 391 /* default value */ 392 } else { 393 error_setg(errp, "invalid argument for spawn"); 394 return -1; 395 } 396 } 397 398 value = qemu_opt_get(opts, "resourcecontrol"); 399 if (value) { 400 if (g_str_equal(value, "deny")) { 401 seccomp_opts |= QEMU_SECCOMP_SET_RESOURCECTL; 402 } else if (g_str_equal(value, "allow")) { 403 /* default value */ 404 } else { 405 error_setg(errp, "invalid argument for resourcecontrol"); 406 return -1; 407 } 408 } 409 410 if (seccomp_start(seccomp_opts, errp) < 0) { 411 return -1; 412 } 413 } 414 415 return 0; 416 } 417 418 static QemuOptsList qemu_sandbox_opts = { 419 .name = "sandbox", 420 .implied_opt_name = "enable", 421 .head = QTAILQ_HEAD_INITIALIZER(qemu_sandbox_opts.head), 422 .desc = { 423 { 424 .name = "enable", 425 .type = QEMU_OPT_BOOL, 426 }, 427 { 428 .name = "obsolete", 429 .type = QEMU_OPT_STRING, 430 }, 431 { 432 .name = "elevateprivileges", 433 .type = QEMU_OPT_STRING, 434 }, 435 { 436 .name = "spawn", 437 .type = QEMU_OPT_STRING, 438 }, 439 { 440 .name = "resourcecontrol", 441 .type = QEMU_OPT_STRING, 442 }, 443 { /* end of list */ } 444 }, 445 }; 446 447 static void seccomp_register(void) 448 { 449 bool add = false; 450 451 /* FIXME: use seccomp_api_get() >= 2 check when released */ 452 453 #if defined(SECCOMP_FILTER_FLAG_TSYNC) 454 int check; 455 456 /* check host TSYNC capability, it returns errno == ENOSYS if unavailable */ 457 check = qemu_seccomp(SECCOMP_SET_MODE_FILTER, 458 SECCOMP_FILTER_FLAG_TSYNC, NULL); 459 if (check < 0 && errno == EFAULT) { 460 add = true; 461 } 462 #endif 463 464 if (add) { 465 qemu_add_opts(&qemu_sandbox_opts); 466 } 467 } 468 opts_init(seccomp_register); 469