// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

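/*
 * Fill in an io_uring_probe structure for userspace, marking each opcode
 * up to IORING_OP_LAST with IO_URING_OP_SUPPORTED if the kernel supports it.
 */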
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

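/*
 * Remove a previously registered personality from the ctx and drop the
 * credential reference it held. Returns -EINVAL if the id isn't registered.
 */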
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

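/*
 * Register the current task's credentials with the ctx, returning the
 * allocated personality id (or a negative error) for later reference.
 */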
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

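/*
 * Copy an array of io_uring_restriction entries from userspace and apply
 * them to @restrictions, validating each opcode along the way.
 */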
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

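/*
 * Register ring restrictions. Only allowed while the ring is disabled
 * (IORING_SETUP_R_DISABLED), and only a single registration is permitted.
 */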
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

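/*
 * Enable a ring that was created with IORING_SETUP_R_DISABLED, making any
 * previously registered restrictions effective and waking the SQPOLL thread
 * if one is waiting.
 */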
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

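/*
 * Copy a CPU affinity mask from userspace and apply it to the io-wq (or
 * SQPOLL) workers associated with this ring.
 */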
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

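/*
 * Set the maximum number of bounded/unbounded io-wq workers for this ring
 * and report the previous limits back to userspace.
 */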
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

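/*
 * Select the clock source (CLOCK_MONOTONIC or CLOCK_BOOTTIME) used by
 * this ring.
 */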
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

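/*
 * Resize the SQ/CQ rings. Allocates new rings, copies over any pending SQ
 * and CQ entries, and swaps the new rings in under ctx->mmap_lock and the
 * completion lock. Currently limited to DEFER_TASKRUN rings.
 */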
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is. We don't expect userspace to modify it while a resize is in
	 * progress, and it would most likely shoot itself in the foot if it
	 * did, but we can't always assume good intent... Use read/write once
	 * helpers from here on to indicate the shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

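/*
 * Register a user-provided memory region with the ring, optionally used as
 * the CQ wait argument area (IORING_MEM_REGION_REG_WAIT_ARG).
 */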
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

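/*
 * Core of io_uring_register(): called with ctx->uring_lock held, checks
 * restrictions and dispatches to the handler for the given opcode.
 */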
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the file associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

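/*
 * Entry point for the io_uring_register() syscall. Resolves the ring file
 * (normal or registered fd), grabs ctx->uring_lock, and hands off to
 * __io_uring_register().
 */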
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}