// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

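/*
 * Charge @nr_pages against the user's RLIMIT_MEMLOCK. The check and update
 * are done with a cmpxchg loop on user->locked_vm, so no lock is needed;
 * -ENOMEM is returned if the new total would exceed the limit.
 */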
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}

int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++)
		unpin_user_page(imu->bvec[i].bv_page);
}

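/*
 * Allocate an imu with room for @nr_bvecs bvec entries. Buffers that fit in
 * IO_CACHED_BVECS_SEGS segments come from the per-ctx imu_cache, which is
 * sized for exactly that many bvecs; larger ones fall back to kvmalloc().
 */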
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (!refcount_dec_and_test(&imu->refs))
		return;

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}

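/*
 * Update registered file slots starting at up->offset. For each entry, an
 * fd of -1 drops the existing file, IORING_REGISTER_FILES_SKIP leaves the
 * slot untouched, and any other fd replaces the slot with the new file.
 * Returns the number of entries processed, or an error if none were.
 */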
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}

static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

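/*
 * IORING_OP_FILES_UPDATE: update registered files from an SQE. An offset of
 * IORING_FILE_INDEX_ALLOC installs the given fds into free slots (returning
 * the chosen slots to userspace) instead of updating fixed offsets.
 */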
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}

/*
 * Not super efficient, but this is just registration time. And we do cache
 * the last compound head, so generally we'll only do a full search if we
 * don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

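/*
 * Work out how many pages to charge for this buffer. Normal pages count one
 * each; for a compound (huge) page the full page size is charged once, with
 * headpage_already_acct() making sure the same head page isn't counted
 * twice across registrations.
 */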
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}

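/*
 * Collapse the per-page array into one entry per folio, keeping only the
 * head page of each folio, so a huge-page backed buffer needs far fewer
 * bvec entries.
 */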
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;
	int nr_folios = data->nr_folios;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}

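/*
 * Register a single user buffer: pin its pages, optionally coalesce them
 * into per-folio entries, account the pinned memory, and build the bvec
 * table in a freshly allocated node/imu pair. A NULL iov_base yields a NULL
 * node, which represents a sparse (empty) slot.
 */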
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

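/*
 * Register the bio_vecs of a kernel request @rq as a fixed buffer at slot
 * @index. The buffer is marked is_kbuf and its direction is restricted to
 * the data direction of @rq; @release is invoked when the buffer is finally
 * unmapped.
 */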
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);

int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}

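/*
 * Import a kernel-registered (is_kbuf) buffer. The iter covers the leading
 * @offset plus @len bytes; if that is less than the full buffer, trim
 * nr_segs so the iter doesn't reference bvecs beyond what is needed.
 */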
static int io_import_kbuf(int ddir, struct iov_iter *iter,
			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
{
	size_t count = len + offset;

	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
	iov_iter_advance(iter, offset);

	if (count < imu->len) {
		const struct bio_vec *bvec = iter->bvec;

		while (len > bvec->bv_len) {
			len -= bvec->bv_len;
			bvec++;
		}
		iter->nr_segs = 1 + bvec - iter->bvec;
	}
	return 0;
}

static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	const struct bio_vec *bvec;
	size_t folio_mask;
	unsigned nr_segs;
	size_t offset;
	int ret;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	offset = buf_addr - imu->ubuf;

	if (imu->is_kbuf)
		return io_import_kbuf(ddir, iter, imu, len, offset);

	/*
	 * Don't use iov_iter_advance() here, as it's really slow for
	 * using the latter parts of a big fixed buffer - it iterates
	 * over each segment manually. We can cheat a bit here for user
	 * registered nodes, because we know that:
	 *
	 * 1) it's a BVEC iter, we set it up
	 * 2) all bvecs are the same in size, except potentially the
	 *    first and last bvec
	 */
	folio_mask = (1UL << imu->folio_shift) - 1;
	bvec = imu->bvec;
	if (offset >= bvec->bv_len) {
		unsigned long seg_skip;

		/* skip first vec */
		offset -= bvec->bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		bvec += seg_skip;
		offset &= folio_mask;
	}
	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
	iter->iov_offset = offset;
	return 0;
}

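/*
 * Look up the registered buffer node for this request. If the request
 * already holds a node reference (REQ_F_BUF_NODE), reuse it; otherwise look
 * it up by req->buf_index under the ring lock and attach it to the request.
 */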
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node)
		io_req_assign_buf_node(req, node);
	io_ring_submit_unlock(ctx, issue_flags);
	return node;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
			u64 buf_addr, size_t len, int ddir,
			unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}

/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

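/*
 * Translate a user-supplied iovec array, where each entry must fall inside
 * the registered buffer @imu, into bvecs in @vec. Offsets are computed
 * relative to the folio-aligned start of the registration, so the bvec
 * offsets are folded in automatically.
 */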
static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
				struct io_mapped_ubuf *imu,
				struct iovec *iovec, unsigned nr_iovs,
				struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	u64 folio_addr = imu->ubuf & ~folio_mask;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* by using folio address it also accounts for bvec offset */
		offset = buf_addr - folio_addr;
		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

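/*
 * Upper bound on the number of bvecs needed for these iovecs: each iovec
 * can span iov_len >> folio_shift full folios plus a partial folio at
 * either end, hence the +2.
 */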
static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}

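/*
 * Fill bvecs for a kernel (is_kbuf) buffer. Here iov_base is interpreted as
 * a byte offset into the registered buffer rather than a user address, and
 * the source bvecs are walked with a bvec_iter.
 */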
static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

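/*
 * Count how many source bvecs a single iovec spans within a kbuf
 * registration, after range-checking it against the registered length.
 */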
static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}

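/*
 * Import a registered buffer for a vectored request. The user iovecs sit at
 * the tail of @vec (see io_prep_reg_iovec()); the resulting bvec array is
 * built at the front of the same allocation, growing it first if the
 * estimated segment count doesn't fit.
 */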
int io_import_reg_vec(int ddir, struct iov_iter *iter,
			struct io_kiocb *req, struct iou_vec *vec,
			unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}