// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)

#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
	return (struct rseq *) t->rseq_fields;
}

static int rseq_validate_ro_fields(struct task_struct *t)
{
	static DEFINE_RATELIMIT_STATE(_rs,
				      DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	u32 cpu_id_start, cpu_id, node_id, mm_cid;
	struct rseq __user *rseq = t->rseq;

	/*
	 * Validate fields which are required to be read-only by
	 * user-space.
	 */
	if (!user_read_access_begin(rseq, t->rseq_len))
		goto efault;
	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
	unsafe_get_user(node_id, &rseq->node_id, efault_end);
	unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
	user_read_access_end();

	if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
	     cpu_id != rseq_kernel_fields(t)->cpu_id ||
	     node_id != rseq_kernel_fields(t)->node_id ||
	     mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {

		pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
			"\tcpu_id_start: %u ?= %u\n"
			"\tcpu_id: %u ?= %u\n"
			"\tnode_id: %u ?= %u\n"
			"\tmm_cid: %u ?= %u\n",
			t->pid, t->comm,
			cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
			cpu_id, rseq_kernel_fields(t)->cpu_id,
			node_id, rseq_kernel_fields(t)->node_id,
			mm_cid, rseq_kernel_fields(t)->mm_cid);
	}

	/* For now, only print a console warning on mismatch. */
	return 0;

efault_end:
	user_read_access_end();
efault:
	return -EFAULT;
}

/*
 * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
 * state.
 */
#define rseq_unsafe_put_user(t, value, field, error_label)		\
	do {								\
		unsafe_put_user(value, &t->rseq->field, error_label);	\
		rseq_kernel_fields(t)->field = value;			\
	} while (0)

#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
	return 0;
}

#define rseq_unsafe_put_user(t, value, field, error_label)		\
	unsafe_put_user(value, &t->rseq->field, error_label)
#endif

/*
 *
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
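 *
 * Illustrative sketch (hypothetical user-space pseudo-code; the names are
 * placeholders, and real implementations express steps [1]-[3] below in
 * inline assembly): incrementing a per-cpu counter without atomics.
 *
 *   cpu = __rseq_abi.cpu_id_start;
 *   __rseq_abi.rseq_cs = &counter_inc_cs;     <- [1] enter critical section
 *   if (cpu != __rseq_abi.cpu_id)
 *           goto abort;                        <- [2] migrated, branch to abort
 *   counters[cpu]++;                           <- [3] single committing store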
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */

static int rseq_update_cpu_node_id(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq;
	u32 cpu_id = raw_smp_processor_id();
	u32 node_id = cpu_to_node(cpu_id);
	u32 mm_cid = task_mm_cid(t);

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		goto efault;
	WARN_ON_ONCE((int) mm_cid < 0);
	if (!user_write_access_begin(rseq, t->rseq_len))
		goto efault;

	rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);

	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally updated only if
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	user_write_access_end();
	trace_rseq_update(t);
	return 0;

efault_end:
	user_write_access_end();
efault:
	return -EFAULT;
}

static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq;
	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
	    mm_cid = 0;

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		goto efault;

	if (!user_write_access_begin(rseq, t->rseq_len))
		goto efault;

	/*
	 * Reset all fields to their initial state.
	 *
	 * All fields have an initial state of 0 except cpu_id which is set to
	 * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
	 * unregistration can figure out that rseq needs to be registered
	 * again.
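	 *
	 * A user-space library can thus detect this state with a check such
	 * as (illustrative; rseq_area and register_rseq() are placeholders):
	 *
	 *   if (rseq_area->cpu_id == RSEQ_CPU_ID_UNINITIALIZED)
	 *           register_rseq(rseq_area);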
	 */
	rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);

	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally reset only if
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	user_write_access_end();
	return 0;

efault_end:
	user_write_access_end();
efault:
	return -EFAULT;
}

/*
 * Get the user-space pointer value stored in the 'rseq_cs' field.
 */
static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
{
	if (!rseq_cs)
		return -EFAULT;

#ifdef CONFIG_64BIT
	if (get_user(*rseq_cs, &rseq->rseq_cs))
		return -EFAULT;
#else
	if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
		return -EFAULT;
#endif

	return 0;
}

/*
 * If the rseq_cs field of 'struct rseq' contains a valid pointer to
 * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
 */
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
	struct rseq_cs __user *urseq_cs;
	u64 ptr;
	u32 __user *usig;
	u32 sig;
	int ret;

	ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
	if (ret)
		return ret;

	/* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
	if (!ptr) {
		memset(rseq_cs, 0, sizeof(*rseq_cs));
		return 0;
	}
	/* Check that the pointer value fits in the user-space process space. */
	if (ptr >= TASK_SIZE)
		return -EINVAL;
	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
		return -EFAULT;

	if (rseq_cs->start_ip >= TASK_SIZE ||
	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
	    rseq_cs->abort_ip >= TASK_SIZE ||
	    rseq_cs->version > 0)
		return -EINVAL;
	/* Check for overflow. */
	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
		return -EINVAL;
	/* Ensure that abort_ip is not in the critical section. */
	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
		return -EINVAL;

	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
	ret = get_user(sig, usig);
	if (ret)
		return ret;

	if (current->rseq_sig != sig) {
		printk_ratelimited(KERN_WARNING
			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
			sig, current->rseq_sig, current->pid, usig);
		return -EINVAL;
	}
	return 0;
}

static bool rseq_warn_flags(const char *str, u32 flags)
{
	u32 test_flags;

	if (!flags)
		return false;
	test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
	if (test_flags)
		pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
	test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
	if (test_flags)
		pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
	return true;
}

static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
	u32 flags, event_mask;
	int ret;

	if (rseq_warn_flags("rseq_cs", cs_flags))
		return -EINVAL;

	/* Get thread flags. */
	ret = get_user(flags, &t->rseq->flags);
	if (ret)
		return ret;

	if (rseq_warn_flags("rseq", flags))
		return -EINVAL;

	/*
	 * Load and clear event mask atomically with respect to
	 * scheduler preemption.
	 */
	preempt_disable();
	event_mask = t->rseq_event_mask;
	t->rseq_event_mask = 0;
	preempt_enable();

	return !!event_mask;
}

static int clear_rseq_cs(struct rseq __user *rseq)
{
	/*
	 * The rseq_cs field is set to NULL on preemption or signal
	 * delivery on top of rseq assembly block, as well as on top
	 * of code outside of the rseq assembly block. This performs
	 * a lazy clear of the rseq_cs field.
	 *
	 * Set rseq_cs to NULL.
	 */
#ifdef CONFIG_64BIT
	return put_user(0UL, &rseq->rseq_cs);
#else
	if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
		return -EFAULT;
	return 0;
#endif
}

/*
 * Unsigned comparison will be true when ip >= start_ip, and when
 * ip < start_ip + post_commit_offset.
 */
static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
{
	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}

static int rseq_ip_fixup(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);
	struct task_struct *t = current;
	struct rseq_cs rseq_cs;
	int ret;

	ret = rseq_get_rseq_cs(t, &rseq_cs);
	if (ret)
		return ret;

	/*
	 * Handle potentially not being within a critical section.
	 * If not nested over a rseq critical section, restart is useless.
	 * Clear the rseq_cs pointer and return.
	 */
	if (!in_rseq_cs(ip, &rseq_cs))
		return clear_rseq_cs(t->rseq);
	ret = rseq_need_restart(t, rseq_cs.flags);
	if (ret <= 0)
		return ret;
	ret = clear_rseq_cs(t->rseq);
	if (ret)
		return ret;
	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
			    rseq_cs.abort_ip);
	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
	return 0;
}

/*
 * This resume handler must always be executed between any of:
 * - preemption,
 * - signal delivery,
 * and return to user-space.
 *
 * This is how we can ensure that the entire rseq critical section
 * will issue the commit instruction only if executed atomically with
 * respect to other threads scheduled on the same CPU, and with respect
 * to signal handlers.
 */
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
	struct task_struct *t = current;
	int ret, sig;

	if (unlikely(t->flags & PF_EXITING))
		return;

	/*
	 * regs is NULL if and only if the caller is in a syscall path. Skip
	 * fixup and leave rseq_cs as is so that rseq_syscall() will detect and
	 * kill a misbehaving userspace on debug kernels.
	 */
	if (regs) {
		ret = rseq_ip_fixup(regs);
		if (unlikely(ret < 0))
			goto error;
	}
	if (unlikely(rseq_update_cpu_node_id(t)))
		goto error;
	return;

error:
	sig = ksig ? ksig->sig : 0;
	force_sigsegv(sig);
}

#ifdef CONFIG_DEBUG_RSEQ

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void rseq_syscall(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);
	struct task_struct *t = current;
	struct rseq_cs rseq_cs;

	if (!t->rseq)
		return;
	if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
		force_sig(SIGSEGV);
}

#endif

/*
 * sys_rseq - setup restartable sequences for caller thread.
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
		int, flags, u32, sig)
{
	int ret;
	u64 rseq_cs;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq != rseq || !current->rseq)
			return -EINVAL;
		if (rseq_len != current->rseq_len)
			return -EINVAL;
		if (current->rseq_sig != sig)
			return -EPERM;
		ret = rseq_reset_rseq_cpu_node_id(current);
		if (ret)
			return ret;
		current->rseq = NULL;
		current->rseq_sig = 0;
		current->rseq_len = 0;
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq != rseq || rseq_len != current->rseq_len)
			return -EINVAL;
		if (current->rseq_sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	/*
	 * If the rseq_cs pointer is non-NULL on registration, clear it to
	 * avoid a potential segfault on return to user-space. The proper thing
	 * to do would have been to fail the registration but this would break
	 * older libcs that reuse the rseq area for new threads without
	 * clearing the fields.
	 */
	if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
		return -EFAULT;
	if (rseq_cs && clear_rseq_cs(rseq))
		return -EFAULT;

#ifdef CONFIG_DEBUG_RSEQ
	/*
	 * Initialize the in-kernel rseq fields copy for validation of
	 * read-only fields.
	 */
	if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
	    get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
	    get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
	    get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
		return -EFAULT;
#endif
	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq = rseq;
	current->rseq_len = rseq_len;
	current->rseq_sig = sig;

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	rseq_set_notify_resume(current);

	return 0;
}
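
/*
 * Illustrative user-space registration sketch (hypothetical: the signature
 * value, symbol names and direct syscall use are placeholders, and a real
 * program must first check whether its libc has already registered rseq):
 *
 *	#include <linux/rseq.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	#define MY_RSEQ_SIG	0x53053053
 *
 *	static __thread struct rseq my_rseq __attribute__((aligned(32)));
 *
 *	static int my_rseq_register(void)
 *	{
 *		return syscall(__NR_rseq, &my_rseq, sizeof(my_rseq),
 *			       0, MY_RSEQ_SIG);
 *	}
 *
 * The same area is torn down by passing flags == RSEQ_FLAG_UNREGISTER with
 * the matching length and signature, mirroring the checks in sys_rseq()
 * above.
 */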