// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/hugetlb.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
#include "register.h"

struct io_rsrc_update {
	struct file			*file;
	u64				arg;
	u32				nr_args;
	u32				offset;
};

static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
			struct iovec *iov, struct page **last_hpage);

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 20)
#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define IO_CACHED_BVECS_SEGS	32

int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	if (!nr_pages)
		return 0;

	/* Don't allow more pages than we can safely lock */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	cur_pages = atomic_long_read(&user->locked_vm);
	do {
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (!atomic_long_try_cmpxchg(&user->locked_vm,
					  &cur_pages, new_pages));
	return 0;
}

static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	if (ctx->user)
		__io_unaccount_mem(ctx->user, nr_pages);

	if (ctx->mm_account)
		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
}

static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
{
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}

	if (ctx->mm_account)
		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);

	return 0;
}
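/*
 * Illustrative userspace sketch (not part of this file): the accounting above
 * charges registered buffers against RLIMIT_MEMLOCK. Assuming an unprivileged
 * task with a 64 MiB memlock limit:
 *
 *	struct rlimit rl = { .rlim_cur = 64 << 20, .rlim_max = 64 << 20 };
 *	setrlimit(RLIMIT_MEMLOCK, &rl);
 *
 * attempts to register more than 64 MiB of fixed buffers will then fail the
 * cmpxchg loop in __io_account_mem() and the registration returns -ENOMEM.
 */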
int io_buffer_validate(struct iovec *iov)
{
	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);

	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
	if (!iov->iov_base)
		return iov->iov_len ? -EFAULT : 0;
	if (!iov->iov_len)
		return -EFAULT;

	/* arbitrary limit, but we need something */
	if (iov->iov_len > SZ_1G)
		return -EFAULT;

	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
		return -EOVERFLOW;

	return 0;
}

static void io_release_ubuf(void *priv)
{
	struct io_mapped_ubuf *imu = priv;
	unsigned int i;

	for (i = 0; i < imu->nr_bvecs; i++)
		unpin_user_page(imu->bvec[i].bv_page);
}

static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
					   int nr_bvecs)
{
	if (nr_bvecs <= IO_CACHED_BVECS_SEGS)
		return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL);
	return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs),
			GFP_KERNEL);
}

static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS)
		io_cache_free(&ctx->imu_cache, imu);
	else
		kvfree(imu);
}

static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
{
	if (!refcount_dec_and_test(&imu->refs))
		return;

	if (imu->acct_pages)
		io_unaccount_mem(ctx, imu->acct_pages);
	imu->release(imu->priv);
	io_free_imu(ctx, imu);
}

struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
	struct io_rsrc_node *node;

	node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL);
	if (node) {
		node->type = type;
		node->refs = 1;
		node->tag = 0;
		node->file_ptr = 0;
	}
	return node;
}

bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
{
	const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec,
						 IO_CACHED_BVECS_SEGS);
	const int node_size = sizeof(struct io_rsrc_node);
	bool ret;

	ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX,
				  node_size, 0);
	ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX,
				   imu_cache_size, 0);
	return ret;
}

void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
	io_alloc_cache_free(&ctx->node_cache, kfree);
	io_alloc_cache_free(&ctx->imu_cache, kfree);
}

static void io_clear_table_tags(struct io_rsrc_data *data)
{
	int i;

	for (i = 0; i < data->nr; i++) {
		struct io_rsrc_node *node = data->nodes[i];

		if (node)
			node->tag = 0;
	}
}

__cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
			      struct io_rsrc_data *data)
{
	if (!data->nr)
		return;
	while (data->nr--) {
		if (data->nodes[data->nr])
			io_put_rsrc_node(ctx, data->nodes[data->nr]);
	}
	kvfree(data->nodes);
	data->nodes = NULL;
	data->nr = 0;
}

__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
				     GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (data->nodes) {
		data->nr = nr;
		return 0;
	}
	return -ENOMEM;
}
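/*
 * Note (summary of the code above and below): every registered file or buffer
 * slot is backed by an io_rsrc_node from ctx->node_cache. If userspace
 * attached a non-zero tag to a slot, freeing that node posts the tag as a CQE
 * (see io_free_rsrc_node() below), which is how applications learn that an
 * unregistered or replaced resource is no longer in use.
 */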
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
				 struct io_uring_rsrc_update2 *up,
				 unsigned nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	__s32 __user *fds = u64_to_user_ptr(up->data);
	int fd, i, err = 0;
	unsigned int done;

	if (!ctx->file_table.data.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->file_table.data.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		u64 tag = 0;

		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
			err = -EFAULT;
			break;
		}
		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
			err = -EINVAL;
			break;
		}
		if (fd == IORING_REGISTER_FILES_SKIP)
			continue;

		i = up->offset + done;
		if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
			io_file_bitmap_clear(&ctx->file_table, i);

		if (fd != -1) {
			struct file *file = fget(fd);
			struct io_rsrc_node *node;

			if (!file) {
				err = -EBADF;
				break;
			}
			/*
			 * Don't allow io_uring instances to be registered.
			 */
			if (io_is_uring_fops(file)) {
				fput(file);
				err = -EBADF;
				break;
			}
			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
			if (!node) {
				err = -ENOMEM;
				fput(file);
				break;
			}
			ctx->file_table.data.nodes[i] = node;
			if (tag)
				node->tag = tag;
			io_fixed_file_set(node, file);
			io_file_bitmap_set(&ctx->file_table, i);
		}
	}
	return done ? done : err;
}

static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
				   struct io_uring_rsrc_update2 *up,
				   unsigned int nr_args)
{
	u64 __user *tags = u64_to_user_ptr(up->tags);
	struct iovec fast_iov, *iov;
	struct page *last_hpage = NULL;
	struct iovec __user *uvec;
	u64 user_data = up->data;
	__u32 done;
	int i, err;

	if (!ctx->buf_table.nr)
		return -ENXIO;
	if (up->offset + nr_args > ctx->buf_table.nr)
		return -EINVAL;

	for (done = 0; done < nr_args; done++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		uvec = u64_to_user_ptr(user_data);
		iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
		if (IS_ERR(iov)) {
			err = PTR_ERR(iov);
			break;
		}
		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
			err = -EFAULT;
			break;
		}
		err = io_buffer_validate(iov);
		if (err)
			break;
		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				err = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
		io_reset_rsrc_node(ctx, &ctx->buf_table, i);
		ctx->buf_table.nodes[i] = node;
		if (ctx->compat)
			user_data += sizeof(struct compat_iovec);
		else
			user_data += sizeof(struct iovec);
	}
	return done ? done : err;
}
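/*
 * Illustrative userspace sketch (liburing, error handling omitted): update
 * two consecutive slots of an existing fixed-file table, leaving the first of
 * them untouched via IORING_REGISTER_FILES_SKIP:
 *
 *	int fds[2] = { IORING_REGISTER_FILES_SKIP, new_fd };
 *	io_uring_register_files_update(&ring, 4, fds, 2);
 *
 * Passing -1 clears a slot instead; combining a non-zero tag with a skipped
 * or cleared slot is rejected with -EINVAL by the checks above.
 */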
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args)
{
	__u32 tmp;

	lockdep_assert_held(&ctx->uring_lock);

	if (check_add_overflow(up->offset, nr_args, &tmp))
		return -EOVERFLOW;

	switch (type) {
	case IORING_RSRC_FILE:
		return __io_sqe_files_update(ctx, up, nr_args);
	case IORING_RSRC_BUFFER:
		return __io_sqe_buffers_update(ctx, up, nr_args);
	}
	return -EINVAL;
}

int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
			     unsigned nr_args)
{
	struct io_uring_rsrc_update2 up;

	if (!nr_args)
		return -EINVAL;
	memset(&up, 0, sizeof(up));
	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
		return -EFAULT;
	if (up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
}

int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned size, unsigned type)
{
	struct io_uring_rsrc_update2 up;

	if (size != sizeof(up))
		return -EINVAL;
	if (copy_from_user(&up, arg, sizeof(up)))
		return -EFAULT;
	if (!up.nr || up.resv || up.resv2)
		return -EINVAL;
	return __io_register_rsrc_update(ctx, type, &up, up.nr);
}

__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int size, unsigned int type)
{
	struct io_uring_rsrc_register rr;

	/* keep it extendible */
	if (size != sizeof(rr))
		return -EINVAL;

	memset(&rr, 0, sizeof(rr));
	if (copy_from_user(&rr, arg, size))
		return -EFAULT;
	if (!rr.nr || rr.resv2)
		return -EINVAL;
	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
		return -EINVAL;

	switch (type) {
	case IORING_RSRC_FILE:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
					     rr.nr, u64_to_user_ptr(rr.tags));
	case IORING_RSRC_BUFFER:
		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
			break;
		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
					       rr.nr, u64_to_user_ptr(rr.tags));
	}
	return -EINVAL;
}

int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);

	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	up->offset = READ_ONCE(sqe->off);
	up->nr_args = READ_ONCE(sqe->len);
	if (!up->nr_args)
		return -EINVAL;
	up->arg = READ_ONCE(sqe->addr);
	return 0;
}
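/*
 * Illustrative SQE layout for IORING_OP_FILES_UPDATE, matching the prep
 * handler above (liburing sketch, values made up):
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_files_update(sqe, fds, nr_fds, offset);
 *
 * sqe->addr carries the fd array, sqe->len the count and sqe->off the first
 * slot; IORING_FILE_INDEX_ALLOC as the offset asks the kernel to pick free
 * slots itself (handled by io_files_update() below).
 */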
static int io_files_update_with_index_alloc(struct io_kiocb *req,
					    unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	__s32 __user *fds = u64_to_user_ptr(up->arg);
	unsigned int done;
	struct file *file;
	int ret, fd;

	if (!req->ctx->file_table.data.nr)
		return -ENXIO;

	for (done = 0; done < up->nr_args; done++) {
		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
			ret = -EFAULT;
			break;
		}

		file = fget(fd);
		if (!file) {
			ret = -EBADF;
			break;
		}
		ret = io_fixed_fd_install(req, issue_flags, file,
					  IORING_FILE_INDEX_ALLOC);
		if (ret < 0)
			break;
		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
			__io_close_fixed(req->ctx, issue_flags, ret);
			ret = -EFAULT;
			break;
		}
	}

	if (done)
		return done;
	return ret;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_uring_rsrc_update2 up2;
	int ret;

	up2.offset = up->offset;
	up2.data = up->arg;
	up2.nr = 0;
	up2.tags = 0;
	up2.resv = 0;
	up2.resv2 = 0;

	if (up->offset == IORING_FILE_INDEX_ALLOC) {
		ret = io_files_update_with_index_alloc(req, issue_flags);
	} else {
		io_ring_submit_lock(ctx, issue_flags);
		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
						&up2, up->nr_args);
		io_ring_submit_unlock(ctx, issue_flags);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
	if (node->tag)
		io_post_aux_cqe(ctx, node->tag, 0, 0);

	switch (node->type) {
	case IORING_RSRC_FILE:
		fput(io_slot_file(node));
		break;
	case IORING_RSRC_BUFFER:
		io_buffer_unmap(ctx, node->buf);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

	io_cache_free(&ctx->node_cache, node);
}

int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->file_table.data.nr)
		return -ENXIO;

	io_free_file_tables(ctx, &ctx->file_table);
	io_file_table_set_alloc_range(ctx, 0, 0);
	return 0;
}

int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
			  unsigned nr_args, u64 __user *tags)
{
	__s32 __user *fds = (__s32 __user *) arg;
	struct file *file;
	int fd, ret;
	unsigned i;

	if (ctx->file_table.data.nr)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;
	if (nr_args > rlimit(RLIMIT_NOFILE))
		return -EMFILE;
	if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		ret = -EFAULT;
		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
			goto fail;
		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
			goto fail;
		/* allow sparse sets */
		if (!fds || fd == -1) {
			ret = -EINVAL;
			if (tag)
				goto fail;
			continue;
		}

		file = fget(fd);
		ret = -EBADF;
		if (unlikely(!file))
			goto fail;

		/*
		 * Don't allow io_uring instances to be registered.
		 */
		if (io_is_uring_fops(file)) {
			fput(file);
			goto fail;
		}
		ret = -ENOMEM;
		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
		if (!node) {
			fput(file);
			goto fail;
		}
		if (tag)
			node->tag = tag;
		ctx->file_table.data.nodes[i] = node;
		io_fixed_file_set(node, file);
		io_file_bitmap_set(&ctx->file_table, i);
	}

	/* default it to the whole table */
	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
	return 0;
fail:
	io_clear_table_tags(&ctx->file_table.data);
	io_sqe_files_unregister(ctx);
	return ret;
}

int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
	if (!ctx->buf_table.nr)
		return -ENXIO;
	io_rsrc_data_free(ctx, &ctx->buf_table);
	return 0;
}
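/*
 * Illustrative userspace sketch (liburing, error handling omitted): register
 * a four-slot fixed-file table with one real fd and three sparse entries; the
 * -1 slots stay empty and can be filled later with IORING_OP_FILES_UPDATE or
 * IORING_REGISTER_FILES_UPDATE:
 *
 *	int fds[4] = { some_fd, -1, -1, -1 };
 *	io_uring_register_files(&ring, fds, 4);
 */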
/*
 * Not super efficient, but this only runs at registration time. And we do
 * cache the last compound head, so generally we'll only do a full search if
 * we don't match that one.
 *
 * We check if the given compound head page has already been accounted, to
 * avoid double accounting it. This allows us to account the full size of the
 * page, not just the constituent pages of a huge page.
 */
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
				  int nr_pages, struct page *hpage)
{
	int i, j;

	/* check current page array */
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i]))
			continue;
		if (compound_head(pages[i]) == hpage)
			return true;
	}

	/* check previously registered pages */
	for (i = 0; i < ctx->buf_table.nr; i++) {
		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
		struct io_mapped_ubuf *imu;

		if (!node)
			continue;
		imu = node->buf;
		for (j = 0; j < imu->nr_bvecs; j++) {
			if (!PageCompound(imu->bvec[j].bv_page))
				continue;
			if (compound_head(imu->bvec[j].bv_page) == hpage)
				return true;
		}
	}

	return false;
}

static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
				 int nr_pages, struct io_mapped_ubuf *imu,
				 struct page **last_hpage)
{
	int i, ret;

	imu->acct_pages = 0;
	for (i = 0; i < nr_pages; i++) {
		if (!PageCompound(pages[i])) {
			imu->acct_pages++;
		} else {
			struct page *hpage;

			hpage = compound_head(pages[i]);
			if (hpage == *last_hpage)
				continue;
			*last_hpage = hpage;
			if (headpage_already_acct(ctx, pages, i, hpage))
				continue;
			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
		}
	}

	if (!imu->acct_pages)
		return 0;

	ret = io_account_mem(ctx, imu->acct_pages);
	if (ret)
		imu->acct_pages = 0;
	return ret;
}
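/*
 * Worked example (assuming 4K base pages and one 2 MiB huge page): pinning
 * any range backed by that huge page accounts all 512 constituent pages once.
 * If a later buffer registration hits the same compound head, the scans above
 * find it and nothing extra is charged, so the memlock accounting tracks the
 * physical memory that is actually locked rather than the sum of ranges.
 */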
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
			       struct io_imu_folio_data *data)
{
	struct page **page_array = *pages, **new_array = NULL;
	int nr_pages_left = *nr_pages, i, j;
	int nr_folios = data->nr_folios;

	/* Store head pages only */
	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
				   GFP_KERNEL);
	if (!new_array)
		return false;

	new_array[0] = compound_head(page_array[0]);
	/*
	 * The pages are bound to the folio, it doesn't
	 * actually unpin them but drops all but one reference,
	 * which is usually put down by io_buffer_unmap().
	 * Note, needs a better helper.
	 */
	if (data->nr_pages_head > 1)
		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);

	j = data->nr_pages_head;
	nr_pages_left -= data->nr_pages_head;
	for (i = 1; i < nr_folios; i++) {
		unsigned int nr_unpin;

		new_array[i] = page_array[j];
		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
				 data->nr_pages_mid - 1);
		if (nr_unpin)
			unpin_user_pages(&page_array[j+1], nr_unpin);
		j += data->nr_pages_mid;
		nr_pages_left -= data->nr_pages_mid;
	}
	kvfree(page_array);
	*pages = new_array;
	*nr_pages = nr_folios;
	return true;
}

bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
			      struct io_imu_folio_data *data)
{
	struct folio *folio = page_folio(page_array[0]);
	unsigned int count = 1, nr_folios = 1;
	int i;

	data->nr_pages_mid = folio_nr_pages(folio);
	data->folio_shift = folio_shift(folio);

	/*
	 * Check if pages are contiguous inside a folio, and all folios have
	 * the same page count except for the head and tail.
	 */
	for (i = 1; i < nr_pages; i++) {
		if (page_folio(page_array[i]) == folio &&
		    page_array[i] == page_array[i-1] + 1) {
			count++;
			continue;
		}

		if (nr_folios == 1) {
			if (folio_page_idx(folio, page_array[i-1]) !=
			    data->nr_pages_mid - 1)
				return false;

			data->nr_pages_head = count;
		} else if (count != data->nr_pages_mid) {
			return false;
		}

		folio = page_folio(page_array[i]);
		if (folio_size(folio) != (1UL << data->folio_shift) ||
		    folio_page_idx(folio, page_array[i]) != 0)
			return false;

		count = 1;
		nr_folios++;
	}
	if (nr_folios == 1)
		data->nr_pages_head = count;

	data->nr_folios = nr_folios;
	return true;
}
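/*
 * Worked example (assuming a buffer fully backed by 2 MiB THP folios on a 4K
 * page size kernel): pinning 4 MiB yields 1024 page pointers, the check above
 * reports two folios of 512 contiguous pages each, and io_coalesce_buffer()
 * shrinks the array to the 2 head pages. The registration path below then
 * uses folio_shift = 21, so the buffer needs 2 bvec entries instead of 1024.
 */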
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
						   struct iovec *iov,
						   struct page **last_hpage)
{
	struct io_mapped_ubuf *imu = NULL;
	struct page **pages = NULL;
	struct io_rsrc_node *node;
	unsigned long off;
	size_t size;
	int ret, nr_pages, i;
	struct io_imu_folio_data data;
	bool coalesced = false;

	if (!iov->iov_base)
		return NULL;

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
			     &nr_pages);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		pages = NULL;
		goto done;
	}

	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
	if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
		if (data.nr_pages_mid != 1)
			coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
	}

	imu = io_alloc_imu(ctx, nr_pages);
	if (!imu)
		goto done;

	imu->nr_bvecs = nr_pages;
	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
	if (ret) {
		unpin_user_pages(pages, nr_pages);
		goto done;
	}

	size = iov->iov_len;
	/* store original address for later verification */
	imu->ubuf = (unsigned long) iov->iov_base;
	imu->len = iov->iov_len;
	imu->folio_shift = PAGE_SHIFT;
	imu->release = io_release_ubuf;
	imu->priv = imu;
	imu->is_kbuf = false;
	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
	if (coalesced)
		imu->folio_shift = data.folio_shift;
	refcount_set(&imu->refs, 1);
	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
	node->buf = imu;
	ret = 0;

	for (i = 0; i < nr_pages; i++) {
		size_t vec_len;

		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
		off = 0;
		size -= vec_len;
	}
done:
	if (ret) {
		if (imu)
			io_free_imu(ctx, imu);
		io_cache_free(&ctx->node_cache, node);
		node = ERR_PTR(ret);
	}
	kvfree(pages);
	return node;
}

int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
			    unsigned int nr_args, u64 __user *tags)
{
	struct page *last_hpage = NULL;
	struct io_rsrc_data data;
	struct iovec fast_iov, *iov = &fast_iov;
	const struct iovec __user *uvec;
	int i, ret;

	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));

	if (ctx->buf_table.nr)
		return -EBUSY;
	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	ret = io_rsrc_data_alloc(&data, nr_args);
	if (ret)
		return ret;

	if (!arg)
		memset(iov, 0, sizeof(*iov));

	for (i = 0; i < nr_args; i++) {
		struct io_rsrc_node *node;
		u64 tag = 0;

		if (arg) {
			uvec = (struct iovec __user *) arg;
			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
			if (IS_ERR(iov)) {
				ret = PTR_ERR(iov);
				break;
			}
			ret = io_buffer_validate(iov);
			if (ret)
				break;
			if (ctx->compat)
				arg += sizeof(struct compat_iovec);
			else
				arg += sizeof(struct iovec);
		}

		if (tags) {
			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
				ret = -EFAULT;
				break;
			}
		}

		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
		if (IS_ERR(node)) {
			ret = PTR_ERR(node);
			break;
		}
		if (tag) {
			if (!node) {
				ret = -EINVAL;
				break;
			}
			node->tag = tag;
		}
		data.nodes[i] = node;
	}

	ctx->buf_table = data;
	if (ret) {
		io_clear_table_tags(&ctx->buf_table);
		io_sqe_buffers_unregister(ctx);
	}
	return ret;
}

int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
			    void (*release)(void *), unsigned int index,
			    unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct req_iterator rq_iter;
	struct io_mapped_ubuf *imu;
	struct io_rsrc_node *node;
	struct bio_vec bv, *bvec;
	u16 nr_bvecs;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	if (data->nodes[index]) {
		ret = -EBUSY;
		goto unlock;
	}

	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
	if (!node) {
		ret = -ENOMEM;
		goto unlock;
	}

	nr_bvecs = blk_rq_nr_phys_segments(rq);
	imu = io_alloc_imu(ctx, nr_bvecs);
	if (!imu) {
		kfree(node);
		ret = -ENOMEM;
		goto unlock;
	}

	imu->ubuf = 0;
	imu->len = blk_rq_bytes(rq);
	imu->acct_pages = 0;
	imu->folio_shift = PAGE_SHIFT;
	imu->nr_bvecs = nr_bvecs;
	refcount_set(&imu->refs, 1);
	imu->release = release;
	imu->priv = rq;
	imu->is_kbuf = true;
	imu->dir = 1 << rq_data_dir(rq);

	bvec = imu->bvec;
	rq_for_each_bvec(bv, rq, rq_iter)
		*bvec++ = bv;

	node->buf = imu;
	data->nodes[index] = node;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
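/*
 * Illustrative driver-side sketch (names made up, error handling omitted): a
 * uring_cmd handler can expose a block request's bvecs as a fixed buffer in
 * the submitter's table for zero-copy use:
 *
 *	static void my_release(void *priv)
 *	{
 *		struct request *rq = priv;
 *		// drop whatever reference kept the request's pages alive
 *	}
 *
 *	ret = io_buffer_register_bvec(cmd, rq, my_release, buf_index,
 *				      issue_flags);
 *
 * The release callback runs from io_buffer_unmap() once the last user of the
 * buffer drops its reference.
 */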
int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
			      unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
	struct io_rsrc_data *data = &ctx->buf_table;
	struct io_rsrc_node *node;
	int ret = 0;

	io_ring_submit_lock(ctx, issue_flags);
	if (index >= data->nr) {
		ret = -EINVAL;
		goto unlock;
	}
	index = array_index_nospec(index, data->nr);

	node = data->nodes[index];
	if (!node) {
		ret = -EINVAL;
		goto unlock;
	}
	if (!node->buf->is_kbuf) {
		ret = -EBUSY;
		goto unlock;
	}

	io_put_rsrc_node(ctx, node);
	data->nodes[index] = NULL;
unlock:
	io_ring_submit_unlock(ctx, issue_flags);
	return ret;
}
EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);

static int validate_fixed_range(u64 buf_addr, size_t len,
				const struct io_mapped_ubuf *imu)
{
	u64 buf_end;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
		return -EFAULT;
	if (unlikely(len > MAX_RW_COUNT))
		return -EFAULT;
	return 0;
}
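/*
 * Worked example (made-up numbers): with imu->ubuf = 0x10000 and
 * imu->len = 8192, a request for buf_addr = 0x11000, len = 4096 passes since
 * it ends exactly at 0x12000, while buf_addr = 0x11800, len = 4096 fails with
 * -EFAULT because it runs 2 KiB past the registered region.
 */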
static int io_import_fixed(int ddir, struct iov_iter *iter,
			   struct io_mapped_ubuf *imu,
			   u64 buf_addr, size_t len)
{
	size_t offset;
	int ret;

	if (WARN_ON_ONCE(!imu))
		return -EFAULT;
	ret = validate_fixed_range(buf_addr, len, imu);
	if (unlikely(ret))
		return ret;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	/*
	 * Might not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here for user
		 * registered nodes, because we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are the same in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or is the whole
		 * first bvec), just use iov_iter_advance(). This makes it
		 * easier since we can just skip the first segment, which may
		 * not be folio_size aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		/*
		 * Kernel buffer bvecs, on the other hand, don't necessarily
		 * have the size property of user registered ones, so we have
		 * to use the slow iter advance.
		 */
		if (offset < bvec->bv_len) {
			iter->count -= offset;
			iter->iov_offset = offset;
		} else if (imu->is_kbuf) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> imu->folio_shift);

			iter->bvec += seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
		}
	}

	return 0;
}
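/*
 * Worked example for the fast path above (made-up numbers, folio_shift = 12):
 * with offset = 10000 and a full 4096-byte first bvec, offset becomes 5904
 * after the first segment is skipped, seg_skip = 1 + (5904 >> 12) = 2, and
 * iov_offset = 5904 & 4095 = 1808. Two skipped 4K segments plus 1808 bytes
 * add back up to 10000, without walking every intermediate bvec.
 */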
inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
					     unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *node;

	if (req->flags & REQ_F_BUF_NODE)
		return req->buf_node;

	io_ring_submit_lock(ctx, issue_flags);
	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
	if (node)
		io_req_assign_buf_node(req, node);
	io_ring_submit_unlock(ctx, issue_flags);
	return node;
}

int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
		      u64 buf_addr, size_t len, int ddir,
		      unsigned issue_flags)
{
	struct io_rsrc_node *node;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	return io_import_fixed(ddir, iter, node->buf, buf_addr, len);
}

/* Lock two rings at once. The rings must be different! */
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
{
	if (ctx1 > ctx2)
		swap(ctx1, ctx2);
	mutex_lock(&ctx1->uring_lock);
	mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
}

/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
			    struct io_uring_clone_buffers *arg)
{
	struct io_rsrc_data data;
	int i, ret, off, nr;
	unsigned int nbufs;

	lockdep_assert_held(&ctx->uring_lock);
	lockdep_assert_held(&src_ctx->uring_lock);

	/*
	 * Accounting state is shared between the two rings; that only works if
	 * both rings are accounted towards the same counters.
	 */
	if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
		return -EINVAL;

	/* if offsets are given, must have nr specified too */
	if (!arg->nr && (arg->dst_off || arg->src_off))
		return -EINVAL;
	/* not allowed unless REPLACE is set */
	if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
		return -EBUSY;

	nbufs = src_ctx->buf_table.nr;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		return -EINVAL;
	else if (arg->nr > IORING_MAX_REG_BUFFERS)
		return -EINVAL;
	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
		return -EOVERFLOW;

	ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
	if (ret)
		return ret;

	/* Fill entries in data from dst that won't overlap with src */
	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];

		if (src_node) {
			data.nodes[i] = src_node;
			src_node->refs++;
		}
	}

	ret = -ENXIO;
	nbufs = src_ctx->buf_table.nr;
	if (!nbufs)
		goto out_free;
	ret = -EINVAL;
	if (!arg->nr)
		arg->nr = nbufs;
	else if (arg->nr > nbufs)
		goto out_free;
	ret = -EOVERFLOW;
	if (check_add_overflow(arg->nr, arg->src_off, &off))
		goto out_free;
	if (off > nbufs)
		goto out_free;

	off = arg->dst_off;
	i = arg->src_off;
	nr = arg->nr;
	while (nr--) {
		struct io_rsrc_node *dst_node, *src_node;

		src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
		if (!src_node) {
			dst_node = NULL;
		} else {
			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
			if (!dst_node) {
				ret = -ENOMEM;
				goto out_free;
			}

			refcount_inc(&src_node->buf->refs);
			dst_node->buf = src_node->buf;
		}
		data.nodes[off++] = dst_node;
		i++;
	}

	/*
	 * If asked for replace, put the old table. data->nodes[] holds both
	 * old and new nodes at this point.
	 */
	if (arg->flags & IORING_REGISTER_DST_REPLACE)
		io_rsrc_data_free(ctx, &ctx->buf_table);

	/*
	 * ctx->buf_table must be empty now - either the contents are being
	 * replaced and we just freed the table, or the contents are being
	 * copied to a ring that does not have buffers yet (checked at function
	 * entry).
	 */
	WARN_ON_ONCE(ctx->buf_table.nr);
	ctx->buf_table = data;
	return 0;

out_free:
	io_rsrc_data_free(ctx, &data);
	return ret;
}
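/*
 * Note on the helper above: cloning copies io_rsrc_node references, not the
 * underlying pages; source and destination end up sharing the same
 * io_mapped_ubuf, whose refcount is bumped per cloned slot. For example,
 * cloning nr = 8 with src_off = 0 and dst_off = 16 fills destination slots
 * 16..23, keeps existing destination slots below 16, and (with
 * IORING_REGISTER_DST_REPLACE) drops whatever the old table held at slot 16
 * and above.
 */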
/*
 * Copy the registered buffers from the source ring whose file descriptor
 * is given in the src_fd to the current ring. This is identical to registering
 * the buffers with ctx, except faster as mappings already exist.
 *
 * Since the memory is already accounted once, don't account it again.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_clone_buffers buf;
	struct io_ring_ctx *src_ctx;
	bool registered_src;
	struct file *file;
	int ret;

	if (copy_from_user(&buf, arg, sizeof(buf)))
		return -EFAULT;
	if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
		return -EINVAL;
	if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
		return -EBUSY;
	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
		return -EINVAL;

	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
	file = io_uring_register_get_file(buf.src_fd, registered_src);
	if (IS_ERR(file))
		return PTR_ERR(file);

	src_ctx = file->private_data;
	if (src_ctx != ctx) {
		mutex_unlock(&ctx->uring_lock);
		lock_two_rings(ctx, src_ctx);
	}

	ret = io_clone_buffers(ctx, src_ctx, &buf);

	if (src_ctx != ctx)
		mutex_unlock(&src_ctx->uring_lock);

	fput(file);
	return ret;
}

void io_vec_free(struct iou_vec *iv)
{
	if (!iv->iovec)
		return;
	kfree(iv->iovec);
	iv->iovec = NULL;
	iv->nr = 0;
}

int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct iovec *iov;

	iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
	if (!iov)
		return -ENOMEM;

	io_vec_free(iv);
	iv->iovec = iov;
	iv->nr = nr_entries;
	return 0;
}

static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
			    struct io_mapped_ubuf *imu,
			    struct iovec *iovec, unsigned nr_iovs,
			    struct iou_vec *vec)
{
	unsigned long folio_size = 1 << imu->folio_shift;
	unsigned long folio_mask = folio_size - 1;
	u64 folio_addr = imu->ubuf & ~folio_mask;
	struct bio_vec *res_bvec = vec->bvec;
	size_t total_len = 0;
	unsigned bvec_idx = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t iov_len = iovec[iov_idx].iov_len;
		u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
		struct bio_vec *src_bvec;
		size_t offset;
		int ret;

		ret = validate_fixed_range(buf_addr, iov_len, imu);
		if (unlikely(ret))
			return ret;

		if (unlikely(!iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
			return -EOVERFLOW;

		/* by using folio address it also accounts for bvec offset */
		offset = buf_addr - folio_addr;
		src_bvec = imu->bvec + (offset >> imu->folio_shift);
		offset &= folio_mask;

		for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
			size_t seg_size = min_t(size_t, iov_len,
						folio_size - offset);

			bvec_set_page(&res_bvec[bvec_idx],
				      src_bvec->bv_page, seg_size, offset);
			iov_len -= seg_size;
		}
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;

	iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
	return 0;
}

static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
				 struct io_mapped_ubuf *imu)
{
	unsigned shift = imu->folio_shift;
	size_t max_segs = 0;
	unsigned i;

	for (i = 0; i < nr_iovs; i++)
		max_segs += (iov[i].iov_len >> shift) + 2;
	return max_segs;
}
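/*
 * Note on the estimate above: an iovec may start and end in the middle of a
 * folio, so the "+ 2" per entry covers a partial head and a partial tail
 * segment. For example, 10000 bytes over 4K folios gives (10000 >> 12) + 2 =
 * 4 segments, enough for any alignment of that range.
 */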
static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter,
				 struct io_mapped_ubuf *imu,
				 struct iovec *iovec, unsigned nr_iovs,
				 struct iou_vec *vec)
{
	const struct bio_vec *src_bvec = imu->bvec;
	struct bio_vec *res_bvec = vec->bvec;
	unsigned res_idx = 0;
	size_t total_len = 0;
	unsigned iov_idx;

	for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
		size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base;
		size_t iov_len = iovec[iov_idx].iov_len;
		struct bvec_iter bi = {
			.bi_size = offset + iov_len,
		};
		struct bio_vec bv;

		bvec_iter_advance(src_bvec, &bi, offset);
		for_each_mp_bvec(bv, src_bvec, bi, bi)
			res_bvec[res_idx++] = bv;
		total_len += iov_len;
	}
	iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len);
	return 0;
}

static int iov_kern_bvec_size(const struct iovec *iov,
			      const struct io_mapped_ubuf *imu,
			      unsigned int *nr_seg)
{
	size_t offset = (size_t)(uintptr_t)iov->iov_base;
	const struct bio_vec *bvec = imu->bvec;
	int start = 0, i = 0;
	size_t off = 0;
	int ret;

	ret = validate_fixed_range(offset, iov->iov_len, imu);
	if (unlikely(ret))
		return ret;

	for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs;
			off += bvec[i].bv_len, i++) {
		if (offset >= off && offset < off + bvec[i].bv_len)
			start = i;
	}
	*nr_seg = i - start;
	return 0;
}

static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs,
			     struct io_mapped_ubuf *imu, unsigned *nr_segs)
{
	unsigned max_segs = 0;
	size_t total_len = 0;
	unsigned i;
	int ret;

	*nr_segs = 0;
	for (i = 0; i < nr_iovs; i++) {
		if (unlikely(!iov[i].iov_len))
			return -EFAULT;
		if (unlikely(check_add_overflow(total_len, iov[i].iov_len,
						&total_len)))
			return -EOVERFLOW;
		ret = iov_kern_bvec_size(&iov[i], imu, &max_segs);
		if (unlikely(ret))
			return ret;
		*nr_segs += max_segs;
	}
	if (total_len > MAX_RW_COUNT)
		return -EINVAL;
	return 0;
}
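/*
 * Note: for kernel-provided (is_kbuf) buffers the imu has ubuf == 0 (see
 * io_buffer_register_bvec() above), so iov_base in the incoming vector acts
 * as a byte offset into the request data rather than a user virtual address.
 * The helpers above size and fill the destination bvec array by walking
 * imu->bvec from that offset, since kernel bvecs are not guaranteed to be
 * uniformly folio sized.
 */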
int io_import_reg_vec(int ddir, struct iov_iter *iter,
		      struct io_kiocb *req, struct iou_vec *vec,
		      unsigned nr_iovs, unsigned issue_flags)
{
	struct io_rsrc_node *node;
	struct io_mapped_ubuf *imu;
	unsigned iovec_off;
	struct iovec *iov;
	unsigned nr_segs;

	node = io_find_buf_node(req, issue_flags);
	if (!node)
		return -EFAULT;
	imu = node->buf;
	if (!(imu->dir & (1 << ddir)))
		return -EFAULT;

	iovec_off = vec->nr - nr_iovs;
	iov = vec->iovec + iovec_off;

	if (imu->is_kbuf) {
		int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);

		if (unlikely(ret))
			return ret;
	} else {
		nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
	}

	if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
		size_t bvec_bytes;

		bvec_bytes = nr_segs * sizeof(struct bio_vec);
		nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
		nr_segs += nr_iovs;
	}

	if (nr_segs > vec->nr) {
		struct iou_vec tmp_vec = {};
		int ret;

		ret = io_vec_realloc(&tmp_vec, nr_segs);
		if (ret)
			return ret;

		iovec_off = tmp_vec.nr - nr_iovs;
		memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
		io_vec_free(vec);

		*vec = tmp_vec;
		iov = vec->iovec + iovec_off;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	if (imu->is_kbuf)
		return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec);

	return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
}

int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
		      const struct iovec __user *uvec, size_t uvec_segs)
{
	struct iovec *iov;
	int iovec_off, ret;
	void *res;

	if (uvec_segs > iv->nr) {
		ret = io_vec_realloc(iv, uvec_segs);
		if (ret)
			return ret;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	/* pad iovec to the right */
	iovec_off = iv->nr - uvec_segs;
	iov = iv->iovec + iovec_off;
	res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
			      io_is_compat(req->ctx));
	if (IS_ERR(res))
		return PTR_ERR(res);

	req->flags |= REQ_F_IMPORT_BUFFER;
	return 0;
}
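/*
 * Note (based on the helpers above; struct iou_vec itself is declared in
 * rsrc.h): io_prep_reg_iovec() parks the user iovecs at the tail of the
 * vector, and io_import_reg_vec() later writes the expanded bvecs from the
 * front of the same allocation, which is why the sizing step converts the
 * bvec byte count into iovec-sized units and adds nr_iovs before deciding
 * whether to reallocate.
 */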