// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

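/*
 * Report which opcodes this kernel supports by filling in the
 * userspace-supplied probe structure.
 */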
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

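/*
 * Remove a previously registered personality by id and drop its credential
 * reference.
 */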
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

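/*
 * Register the current task's credentials and return the allocated
 * personality id.
 */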
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

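/*
 * Parse the userspace array of io_uring_restriction entries into the
 * in-kernel restriction bitmaps and SQE flag masks.
 */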
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

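/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED: arm any
 * registered restrictions, clear the disabled flag, and wake a waiting
 * SQPOLL thread.
 */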
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

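/*
 * Apply (or clear, if new_mask is NULL) the CPU affinity for the io-wq of
 * the current task, or for the SQPOLL thread when IORING_SETUP_SQPOLL is
 * used.
 */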
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

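/*
 * Set per-ring limits on the number of [bounded, unbounded] io-wq workers.
 * A zero entry leaves that limit untouched, and the resulting values are
 * copied back to userspace.
 */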
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

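/*
 * Select the clock source (CLOCK_MONOTONIC or CLOCK_BOOTTIME) used by this
 * ring, as described by the userspace struct io_uring_clock_register
 * argument.
 */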
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

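/*
 * Resize the SQ/CQ rings of an existing ring: allocate new ring regions,
 * copy over any pending SQ and CQ entries, and swap the mappings in under
 * ctx->mmap_lock and the completion lock.
 */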
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

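/*
 * Register a userspace-described memory region with the ring, optionally
 * exposing it as the CQ wait argument area.
 */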
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

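/*
 * Dispatch a single register opcode. Called with ctx->uring_lock held;
 * rejects callers other than the submitter task once one is set, and
 * enforces any registered restrictions.
 */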
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

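/*
 * Entry point for io_uring_register(2): resolve the ring file (normal or
 * registered fd, or a "blind" opcode when fd == -1), then dispatch with
 * uring_lock held.
 */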
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}