1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Code related to the io_uring_register() syscall
4 *
5 * Copyright (C) 2023 Jens Axboe
6 */
7 #include <linux/kernel.h>
8 #include <linux/errno.h>
9 #include <linux/syscalls.h>
10 #include <linux/refcount.h>
11 #include <linux/bits.h>
12 #include <linux/fs.h>
13 #include <linux/file.h>
14 #include <linux/slab.h>
15 #include <linux/uaccess.h>
16 #include <linux/nospec.h>
17 #include <linux/compat.h>
18 #include <linux/io_uring.h>
19 #include <linux/io_uring_types.h>
20
21 #include "filetable.h"
22 #include "io_uring.h"
23 #include "opdef.h"
24 #include "tctx.h"
25 #include "rsrc.h"
26 #include "sqpoll.h"
27 #include "register.h"
28 #include "cancel.h"
29 #include "kbuf.h"
30 #include "napi.h"
31 #include "eventfd.h"
32 #include "msg_ring.h"
33 #include "memmap.h"
34 #include "zcrx.h"
35 #include "query.h"
36 #include "bpf_filter.h"
37
/*
 * Upper bound on the number of entries accepted by io_parse_restrictions():
 * the sum of the SQE opcode, register opcode and restriction opcode counts.
 */
#define IORING_MAX_RESTRICTIONS	(IORING_OP_LAST + IORING_REGISTER_LAST + \
				 IORING_RESTRICTION_LAST)
40
io_probe(struct io_ring_ctx * ctx,void __user * arg,unsigned nr_args)41 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
42 unsigned nr_args)
43 {
44 struct io_uring_probe *p;
45 size_t size;
46 int i, ret;
47
48 if (nr_args > IORING_OP_LAST)
49 nr_args = IORING_OP_LAST;
50
51 size = struct_size(p, ops, nr_args);
52 p = memdup_user(arg, size);
53 if (IS_ERR(p))
54 return PTR_ERR(p);
55 ret = -EINVAL;
56 if (memchr_inv(p, 0, size))
57 goto out;
58
59 p->last_op = IORING_OP_LAST - 1;
60
61 for (i = 0; i < nr_args; i++) {
62 p->ops[i].op = i;
63 if (io_uring_op_supported(i))
64 p->ops[i].flags = IO_URING_OP_SUPPORTED;
65 }
66 p->ops_len = i;
67
68 ret = 0;
69 if (copy_to_user(arg, p, size))
70 ret = -EFAULT;
71 out:
72 kfree(p);
73 return ret;
74 }
75
io_unregister_personality(struct io_ring_ctx * ctx,unsigned id)76 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
77 {
78 const struct cred *creds;
79
80 creds = xa_erase(&ctx->personalities, id);
81 if (creds) {
82 put_cred(creds);
83 return 0;
84 }
85
86 return -EINVAL;
87 }
88
89
io_register_personality(struct io_ring_ctx * ctx)90 static int io_register_personality(struct io_ring_ctx *ctx)
91 {
92 const struct cred *creds;
93 u32 id;
94 int ret;
95
96 creds = get_current_cred();
97
98 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
99 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
100 if (ret < 0) {
101 put_cred(creds);
102 return ret;
103 }
104 return id;
105 }
106
107 /*
108 * Returns number of restrictions parsed and added on success, or < 0 for
109 * an error.
110 */
io_parse_restrictions(void __user * arg,unsigned int nr_args,struct io_restriction * restrictions)111 static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
112 struct io_restriction *restrictions)
113 {
114 struct io_uring_restriction *res;
115 size_t size;
116 int i, ret;
117
118 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
119 return -EINVAL;
120
121 size = array_size(nr_args, sizeof(*res));
122 if (size == SIZE_MAX)
123 return -EOVERFLOW;
124
125 res = memdup_user(arg, size);
126 if (IS_ERR(res))
127 return PTR_ERR(res);
128
129 ret = -EINVAL;
130
131 for (i = 0; i < nr_args; i++) {
132 switch (res[i].opcode) {
133 case IORING_RESTRICTION_REGISTER_OP:
134 if (res[i].register_op >= IORING_REGISTER_LAST)
135 goto err;
136 __set_bit(res[i].register_op, restrictions->register_op);
137 restrictions->reg_registered = true;
138 break;
139 case IORING_RESTRICTION_SQE_OP:
140 if (res[i].sqe_op >= IORING_OP_LAST)
141 goto err;
142 __set_bit(res[i].sqe_op, restrictions->sqe_op);
143 restrictions->op_registered = true;
144 break;
145 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
146 restrictions->sqe_flags_allowed = res[i].sqe_flags;
147 restrictions->op_registered = true;
148 break;
149 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
150 restrictions->sqe_flags_required = res[i].sqe_flags;
151 restrictions->op_registered = true;
152 break;
153 default:
154 goto err;
155 }
156 }
157 ret = nr_args;
158 if (!nr_args) {
159 restrictions->op_registered = true;
160 restrictions->reg_registered = true;
161 }
162 err:
163 kfree(res);
164 return ret;
165 }
166
io_register_restrictions(struct io_ring_ctx * ctx,void __user * arg,unsigned int nr_args)167 static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
168 void __user *arg, unsigned int nr_args)
169 {
170 int ret;
171
172 /* Restrictions allowed only if rings started disabled */
173 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
174 return -EBADFD;
175
176 /* We allow only a single restrictions registration */
177 if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered)
178 return -EBUSY;
179
180 ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
181 /*
182 * Reset all restrictions if an error happened, but retain any COW'ed
183 * settings.
184 */
185 if (ret < 0) {
186 struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters;
187 bool cowed = ctx->restrictions.bpf_filters_cow;
188
189 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
190 ctx->restrictions.bpf_filters = bpf;
191 ctx->restrictions.bpf_filters_cow = cowed;
192 return ret;
193 }
194 if (ctx->restrictions.op_registered)
195 ctx->op_restricted = 1;
196 if (ctx->restrictions.reg_registered)
197 ctx->reg_restricted = 1;
198 return 0;
199 }
200
/*
 * Attach a set of restrictions to the current task rather than to a ring.
 *
 * @arg points to a single struct io_uring_task_restriction (@nr_args must be
 * 1) whose flags and reserved fields must be zero. Fails with -EPERM if the
 * task already has restrictions, and with -EACCES if the task neither has
 * no_new_privs set nor holds CAP_SYS_ADMIN in its user namespace.
 */
static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
{
	struct io_uring_task_restriction __user *ures = arg;
	struct io_uring_task_restriction tres;
	struct io_restriction *res;
	int ret;

	/* Disallow if task already has registered restrictions */
	if (current->io_uring_restrict)
		return -EPERM;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!(task_no_new_privs(current) ||
	      ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&tres, ures, sizeof(tres)))
		return -EFAULT;
	if (tres.flags || !mem_is_zero(tres.resv, sizeof(tres.resv)))
		return -EINVAL;

	res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
	if (!res)
		return -ENOMEM;

	ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
	if (ret >= 0) {
		current->io_uring_restrict = res;
		return 0;
	}

	kfree(res);
	return ret;
}
241
/*
 * Register a BPF filter against the current task's restriction set,
 * allocating that set first if the task has none yet. A freshly allocated
 * set is only attached to the task once the filter registered successfully,
 * and is freed again otherwise.
 */
static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
{
	struct io_restriction *res;
	int ret;

	/*
	 * Similar to seccomp, disallow setting a filter if task_no_new_privs
	 * is false and we're not CAP_SYS_ADMIN.
	 */
	if (!(task_no_new_privs(current) ||
	      ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)))
		return -EACCES;

	if (nr_args != 1)
		return -EINVAL;

	/* If no task restrictions exist, setup a new set */
	res = current->io_uring_restrict;
	if (!res) {
		res = kzalloc_obj(*res, GFP_KERNEL_ACCOUNT);
		if (!res)
			return -ENOMEM;
	}

	ret = io_register_bpf_filter(res, arg);
	if (!ret) {
		if (!current->io_uring_restrict)
			current->io_uring_restrict = res;
		return 0;
	}

	/* only free what was allocated above, never the task's own set */
	if (res != current->io_uring_restrict)
		kfree(res);
	return ret;
}
276
io_register_enable_rings(struct io_ring_ctx * ctx)277 static int io_register_enable_rings(struct io_ring_ctx *ctx)
278 {
279 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
280 return -EBADFD;
281
282 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) {
283 ctx->submitter_task = get_task_struct(current);
284 /*
285 * Lazy activation attempts would fail if it was polled before
286 * submitter_task is set.
287 */
288 if (wq_has_sleeper(&ctx->poll_wq))
289 io_activate_pollwq(ctx);
290 }
291
292 /* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */
293 smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED);
294 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
295 wake_up(&ctx->sq_data->wait);
296 return 0;
297 }
298
__io_register_iowq_aff(struct io_ring_ctx * ctx,cpumask_var_t new_mask)299 static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
300 cpumask_var_t new_mask)
301 {
302 int ret;
303
304 if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
305 ret = io_wq_cpu_affinity(current->io_uring, new_mask);
306 } else {
307 mutex_unlock(&ctx->uring_lock);
308 ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
309 mutex_lock(&ctx->uring_lock);
310 }
311
312 return ret;
313 }
314
io_register_iowq_aff(struct io_ring_ctx * ctx,void __user * arg,unsigned len)315 static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
316 void __user *arg, unsigned len)
317 {
318 cpumask_var_t new_mask;
319 int ret;
320
321 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
322 return -ENOMEM;
323
324 cpumask_clear(new_mask);
325 if (len > cpumask_size())
326 len = cpumask_size();
327
328 #ifdef CONFIG_COMPAT
329 if (in_compat_syscall())
330 ret = compat_get_bitmap(cpumask_bits(new_mask),
331 (const compat_ulong_t __user *)arg,
332 len * 8 /* CHAR_BIT */);
333 else
334 #endif
335 ret = copy_from_user(new_mask, arg, len);
336
337 if (ret) {
338 free_cpumask_var(new_mask);
339 return -EFAULT;
340 }
341
342 ret = __io_register_iowq_aff(ctx, new_mask);
343 free_cpumask_var(new_mask);
344 return ret;
345 }
346
io_unregister_iowq_aff(struct io_ring_ctx * ctx)347 static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
348 {
349 return __io_register_iowq_aff(ctx, NULL);
350 }
351
io_register_iowq_max_workers(struct io_ring_ctx * ctx,void __user * arg)352 static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
353 void __user *arg)
354 __must_hold(&ctx->uring_lock)
355 {
356 struct io_tctx_node *node;
357 struct io_uring_task *tctx = NULL;
358 struct io_sq_data *sqd = NULL;
359 __u32 new_count[2];
360 int i, ret;
361
362 if (copy_from_user(new_count, arg, sizeof(new_count)))
363 return -EFAULT;
364 for (i = 0; i < ARRAY_SIZE(new_count); i++)
365 if (new_count[i] > INT_MAX)
366 return -EINVAL;
367
368 if (ctx->flags & IORING_SETUP_SQPOLL) {
369 sqd = ctx->sq_data;
370 if (sqd) {
371 struct task_struct *tsk;
372
373 /*
374 * Observe the correct sqd->lock -> ctx->uring_lock
375 * ordering. Fine to drop uring_lock here, we hold
376 * a ref to the ctx.
377 */
378 refcount_inc(&sqd->refs);
379 mutex_unlock(&ctx->uring_lock);
380 mutex_lock(&sqd->lock);
381 mutex_lock(&ctx->uring_lock);
382 tsk = sqpoll_task_locked(sqd);
383 if (tsk)
384 tctx = tsk->io_uring;
385 }
386 } else {
387 tctx = current->io_uring;
388 }
389
390 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
391
392 for (i = 0; i < ARRAY_SIZE(new_count); i++)
393 if (new_count[i])
394 ctx->iowq_limits[i] = new_count[i];
395 ctx->iowq_limits_set = true;
396
397 if (tctx && tctx->io_wq) {
398 ret = io_wq_max_workers(tctx->io_wq, new_count);
399 if (ret)
400 goto err;
401 } else {
402 memset(new_count, 0, sizeof(new_count));
403 }
404
405 if (sqd) {
406 mutex_unlock(&ctx->uring_lock);
407 mutex_unlock(&sqd->lock);
408 io_put_sq_data(sqd);
409 mutex_lock(&ctx->uring_lock);
410 }
411
412 if (copy_to_user(arg, new_count, sizeof(new_count)))
413 return -EFAULT;
414
415 /* that's it for SQPOLL, only the SQPOLL task creates requests */
416 if (sqd)
417 return 0;
418
419 /* now propagate the restriction to all registered users */
420 mutex_lock(&ctx->tctx_lock);
421 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
422 tctx = node->task->io_uring;
423 if (WARN_ON_ONCE(!tctx->io_wq))
424 continue;
425
426 for (i = 0; i < ARRAY_SIZE(new_count); i++)
427 new_count[i] = ctx->iowq_limits[i];
428 /* ignore errors, it always returns zero anyway */
429 (void)io_wq_max_workers(tctx->io_wq, new_count);
430 }
431 mutex_unlock(&ctx->tctx_lock);
432 return 0;
433 err:
434 if (sqd) {
435 mutex_unlock(&ctx->uring_lock);
436 mutex_unlock(&sqd->lock);
437 io_put_sq_data(sqd);
438 mutex_lock(&ctx->uring_lock);
439 }
440 return ret;
441 }
442
/*
 * Select the clock used by @ctx.
 *
 * @arg points to a struct io_uring_clock_register whose reserved field must
 * be zero. Only CLOCK_MONOTONIC and CLOCK_BOOTTIME are accepted; the matching
 * timekeeping offset is stored alongside the clock id.
 *
 * Returns 0 on success, -EFAULT if @arg can't be read, or -EINVAL for a
 * non-zero reserved field or an unsupported clock id.
 */
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}
467
468 /*
469 * State to maintain until we can swap. Both new and old state, used for
470 * either mapping or freeing.
471 */
472 struct io_ring_ctx_rings {
473 struct io_rings *rings;
474 struct io_uring_sqe *sq_sqes;
475
476 struct io_mapped_region sq_region;
477 struct io_mapped_region ring_region;
478 };
479
io_register_free_rings(struct io_ring_ctx * ctx,struct io_ring_ctx_rings * r)480 static void io_register_free_rings(struct io_ring_ctx *ctx,
481 struct io_ring_ctx_rings *r)
482 {
483 io_free_region(ctx->user, &r->sq_region);
484 io_free_region(ctx->user, &r->ring_region);
485 }
486
/* Save (ctx)->field into (o).field, then install (n).field in its place. */
#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

/* setup flags userspace may pass to a resize request */
#define RESIZE_FLAGS	(IORING_SETUP_CLAMP | IORING_SETUP_CQSIZE)
/* setup flags a resize always inherits from the existing ring */
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_NO_MMAP | \
			 IORING_SETUP_SQE128 | IORING_SETUP_SQE_MIXED | \
			 IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)
497
/*
 * Replace the SQ/CQ rings of @ctx with newly sized ones, as described by the
 * struct io_uring_params at @arg.
 *
 * Only rings set up with IORING_SETUP_DEFER_TASKRUN are accepted, and only
 * RESIZE_FLAGS may be passed in; the COPY_FLAGS properties are inherited from
 * the existing ring. The new regions are created first and the resulting
 * parameters copied back to @arg. Pending SQ and CQ entries are then copied
 * over under ->mmap_lock and ->completion_lock before the ring pointers are
 * swapped, with the SQPOLL thread (if any) parked for the duration.
 *
 * Returns 0 on success, -EOVERFLOW if the pending entries don't fit into the
 * new rings (the old rings are restored and the new ones freed), or another
 * negative error code.
 */
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_ctx_config config;
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	unsigned i, tail, old_head;
	struct io_uring_params *p = &config.p;
	struct io_rings_layout *rl = &config.layout;
	int ret;

	memset(&config, 0, sizeof(config));

	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(p, arg, sizeof(*p)))
		return -EFAULT;
	if (p->flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p->flags |= (ctx->flags & COPY_FLAGS);

	ret = io_prepare_config(&config);
	if (unlikely(ret))
		return ret;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->rings_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret)
		return ret;

	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries);

	if (copy_to_user(arg, p, sizeof(*p))) {
		io_register_free_rings(ctx, &n);
		return -EFAULT;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(rl->sq_size);
	if (p->flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p->sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p->sq_entries)
		goto overflow;
	/*
	 * head and tail are unmasked unsigned counters (slots are derived by
	 * masking), so walk with '!=' rather than '<': once tail has wrapped
	 * past zero it is numerically smaller than head and '<' would copy
	 * nothing. The distance check above bounds the iteration count.
	 */
	for (i = old_head; i != tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p->sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p->cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	/* same wrap-safe walk as for the SQ entries above */
	for (i = old_head; i != tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p->cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset);

	ctx->sq_entries = p->sq_entries;
	ctx->cq_entries = p->cq_entries;

	/*
	 * Just mark any flag we may have missed and that the application
	 * should act on unconditionally. Worst case it'll be an extra
	 * syscall.
	 */
	atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
	ctx->rings = n.rings;
	rcu_assign_pointer(ctx->rings_rcu, n.rings);

	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	/* Wait for concurrent io_ctx_mark_taskrun() */
	if (to_free == &o)
		synchronize_rcu_expedited();
	io_register_free_rings(ctx, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}
671
/*
 * Register the parameter memory region of @ctx.
 *
 * @uarg points to a struct io_uring_mem_region_reg, which in turn carries a
 * user pointer to the struct io_uring_region_desc describing the region. Only
 * one such region may be set per ring (-EBUSY otherwise). The region
 * descriptor is copied back to userspace after the region was created.
 *
 * With IORING_MEM_REGION_REG_WAIT_ARG the region additionally becomes the CQ
 * wait argument area, which is only allowed while the ring is still
 * IORING_SETUP_R_DISABLED.
 *
 * Returns 0 on success or a negative error code.
 */
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	struct io_mapped_region region = {};
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx->user, &region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&region);
		ctx->cq_wait_size = rd.size;
	}

	io_region_publish(ctx, &region, &ctx->param_region);
	return 0;
}
718
/*
 * Dispatch one io_uring_register() opcode against @ctx.
 *
 * Before dispatching, the call is refused with -EEXIST if the ring has a
 * submitter task other than the caller, and with -EACCES if register
 * restrictions are active on an enabled ring and @opcode is not in the
 * allowed set. Each case validates @arg / @nr_args the way its opcode
 * requires and returns the handler's result; unknown opcodes yield -EINVAL.
 */
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		if (!arg)
			return -EFAULT;
		return io_sqe_buffers_register(ctx, arg, nr_args, NULL);
	case IORING_UNREGISTER_BUFFERS:
		if (arg || nr_args)
			return -EINVAL;
		return io_sqe_buffers_unregister(ctx);
	case IORING_REGISTER_FILES:
		if (!arg)
			return -EFAULT;
		return io_sqe_files_register(ctx, arg, nr_args, NULL);
	case IORING_UNREGISTER_FILES:
		if (arg || nr_args)
			return -EINVAL;
		return io_sqe_files_unregister(ctx);
	case IORING_REGISTER_FILES_UPDATE:
		return io_register_files_update(ctx, arg, nr_args);
	case IORING_REGISTER_EVENTFD:
		if (nr_args != 1)
			return -EINVAL;
		return io_eventfd_register(ctx, arg, 0);
	case IORING_REGISTER_EVENTFD_ASYNC:
		if (nr_args != 1)
			return -EINVAL;
		return io_eventfd_register(ctx, arg, 1);
	case IORING_UNREGISTER_EVENTFD:
		if (arg || nr_args)
			return -EINVAL;
		return io_eventfd_unregister(ctx);
	case IORING_REGISTER_PROBE:
		if (!arg || nr_args > 256)
			return -EINVAL;
		return io_probe(ctx, arg, nr_args);
	case IORING_REGISTER_PERSONALITY:
		if (arg || nr_args)
			return -EINVAL;
		return io_register_personality(ctx);
	case IORING_UNREGISTER_PERSONALITY:
		/* the personality id travels in nr_args */
		if (arg)
			return -EINVAL;
		return io_unregister_personality(ctx, nr_args);
	case IORING_REGISTER_ENABLE_RINGS:
		if (arg || nr_args)
			return -EINVAL;
		return io_register_enable_rings(ctx);
	case IORING_REGISTER_RESTRICTIONS:
		return io_register_restrictions(ctx, arg, nr_args);
	case IORING_REGISTER_FILES2:
		return io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
	case IORING_REGISTER_FILES_UPDATE2:
		return io_register_rsrc_update(ctx, arg, nr_args,
					       IORING_RSRC_FILE);
	case IORING_REGISTER_BUFFERS2:
		return io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
	case IORING_REGISTER_BUFFERS_UPDATE:
		return io_register_rsrc_update(ctx, arg, nr_args,
					       IORING_RSRC_BUFFER);
	case IORING_REGISTER_IOWQ_AFF:
		if (!arg || !nr_args)
			return -EINVAL;
		return io_register_iowq_aff(ctx, arg, nr_args);
	case IORING_UNREGISTER_IOWQ_AFF:
		if (arg || nr_args)
			return -EINVAL;
		return io_unregister_iowq_aff(ctx);
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		if (!arg || nr_args != 2)
			return -EINVAL;
		return io_register_iowq_max_workers(ctx, arg);
	case IORING_REGISTER_RING_FDS:
		return io_ringfd_register(ctx, arg, nr_args);
	case IORING_UNREGISTER_RING_FDS:
		return io_ringfd_unregister(ctx, arg, nr_args);
	case IORING_REGISTER_PBUF_RING:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_pbuf_ring(ctx, arg);
	case IORING_UNREGISTER_PBUF_RING:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_unregister_pbuf_ring(ctx, arg);
	case IORING_REGISTER_SYNC_CANCEL:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_sync_cancel(ctx, arg);
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		if (!arg || nr_args)
			return -EINVAL;
		return io_register_file_alloc_range(ctx, arg);
	case IORING_REGISTER_PBUF_STATUS:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_pbuf_status(ctx, arg);
	case IORING_REGISTER_NAPI:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_napi(ctx, arg);
	case IORING_UNREGISTER_NAPI:
		if (nr_args != 1)
			return -EINVAL;
		return io_unregister_napi(ctx, arg);
	case IORING_REGISTER_CLOCK:
		if (!arg || nr_args)
			return -EINVAL;
		return io_register_clock(ctx, arg);
	case IORING_REGISTER_CLONE_BUFFERS:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_clone_buffers(ctx, arg);
	case IORING_REGISTER_ZCRX_IFQ:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_zcrx_ifq(ctx, arg);
	case IORING_REGISTER_RESIZE_RINGS:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_resize_rings(ctx, arg);
	case IORING_REGISTER_MEM_REGION:
		if (!arg || nr_args != 1)
			return -EINVAL;
		return io_register_mem_region(ctx, arg);
	case IORING_REGISTER_QUERY:
		return io_query(arg, nr_args);
	case IORING_REGISTER_ZCRX_CTRL:
		return io_zcrx_ctrl(ctx, arg, nr_args);
	case IORING_REGISTER_BPF_FILTER:
		if (nr_args != 1)
			return -EINVAL;
		ret = io_register_bpf_filter(&ctx->restrictions, arg);
		/* on success, publish the filter table on the ctx itself */
		if (!ret)
			WRITE_ONCE(ctx->bpf_filters,
				   ctx->restrictions.bpf_filters->filters);
		return ret;
	default:
		return -EINVAL;
	}
}
948
949 /*
950 * Given an 'fd' value, return the ctx associated with if. If 'registered' is
951 * true, then the registered index is used. Otherwise, the normal fd table.
952 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
953 */
io_uring_register_get_file(unsigned int fd,bool registered)954 struct file *io_uring_register_get_file(unsigned int fd, bool registered)
955 {
956 struct file *file;
957
958 if (registered) {
959 /*
960 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
961 * need only dereference our task private array to find it.
962 */
963 struct io_uring_task *tctx = current->io_uring;
964
965 if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
966 return ERR_PTR(-EINVAL);
967 fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
968 file = tctx->registered_rings[fd];
969 if (file)
970 get_file(file);
971 } else {
972 file = fget(fd);
973 }
974
975 if (unlikely(!file))
976 return ERR_PTR(-EBADF);
977 if (io_is_uring_fops(file))
978 return file;
979 fput(file);
980 return ERR_PTR(-EOPNOTSUPP);
981 }
982
/*
 * Issue a single IORING_OP_MSG_RING request without a source ring. @arg must
 * point to exactly one SQE (@nr_args == 1) with no flags set and the
 * MSG_RING opcode; it is then passed to io_uring_sync_msg_ring().
 */
static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args)
{
	struct io_uring_sqe sqe;

	if (!arg || nr_args != 1)
		return -EINVAL;
	if (copy_from_user(&sqe, arg, sizeof(sqe)))
		return -EFAULT;

	/* no flags supported, and nothing but MSG_RING is accepted */
	if (sqe.flags || sqe.opcode != IORING_OP_MSG_RING)
		return -EINVAL;

	return io_uring_sync_msg_ring(&sqe);
}
999
1000 /*
1001 * "blind" registration opcodes are ones where there's no ring given, and
1002 * hence the source fd must be -1.
1003 */
io_uring_register_blind(unsigned int opcode,void __user * arg,unsigned int nr_args)1004 static int io_uring_register_blind(unsigned int opcode, void __user *arg,
1005 unsigned int nr_args)
1006 {
1007 switch (opcode) {
1008 case IORING_REGISTER_SEND_MSG_RING:
1009 return io_uring_register_send_msg_ring(arg, nr_args);
1010 case IORING_REGISTER_QUERY:
1011 return io_query(arg, nr_args);
1012 case IORING_REGISTER_RESTRICTIONS:
1013 return io_register_restrictions_task(arg, nr_args);
1014 case IORING_REGISTER_BPF_FILTER:
1015 return io_register_bpf_filter_task(arg, nr_args);
1016 }
1017 return -EINVAL;
1018 }
1019
/*
 * io_uring_register() syscall entry point.
 *
 * The IORING_REGISTER_USE_REGISTERED_RING bit is stripped from @opcode and
 * selects whether @fd is a registered ring index or a regular descriptor. An
 * @fd of -1 routes to the ring-less "blind" opcodes. Otherwise the ring is
 * looked up and the opcode dispatched with ->uring_lock held.
 */
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	const bool use_registered_ring =
		!!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	struct io_ring_ctx *ctx;
	struct file *file;
	long ret;

	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}
1052