// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

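/*
 * Restrict which register opcodes, SQE opcodes and SQE flags the ring will
 * accept once it is enabled. Only allowed while the ring is still disabled
 * (IORING_SETUP_R_DISABLED), and only once per ring.
 */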
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
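	 * Indices are masked with each ring's own size while copying, so the
	 * old and new rings may legitimately differ in size.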
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
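	 * Requiring IORING_SETUP_R_DISABLED for the wait-arg case means the
	 * ring has not been enabled yet, so no waiters can exist.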
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
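		 * A reference is still taken on the file below, so the
		 * caller's fput() works the same for both lookup paths.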
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}