// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->mem.pages[net_iov_idx(niov)];
}

static void io_release_dmabuf(struct io_zcrx_mem *mem)
{
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return;

	if (mem->sgt)
		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
						  DMA_FROM_DEVICE);
	if (mem->attach)
		dma_buf_detach(mem->dmabuf, mem->attach);
	if (mem->dmabuf)
		dma_buf_put(mem->dmabuf);

	mem->sgt = NULL;
	mem->attach = NULL;
	mem->dmabuf = NULL;
}

static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
			    struct io_zcrx_mem *mem,
			    struct io_uring_zcrx_area_reg *area_reg)
{
	unsigned long off = (unsigned long)area_reg->addr;
	unsigned long len = (unsigned long)area_reg->len;
	unsigned long total_size = 0;
	struct scatterlist *sg;
	int dmabuf_fd = area_reg->dmabuf_fd;
	int i, ret;

	if (WARN_ON_ONCE(!ifq->dev))
		return -EFAULT;
	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	mem->is_dmabuf = true;
	mem->dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(mem->dmabuf)) {
		ret = PTR_ERR(mem->dmabuf);
		mem->dmabuf = NULL;
		goto err;
	}

	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
	if (IS_ERR(mem->attach)) {
		ret = PTR_ERR(mem->attach);
		mem->attach = NULL;
		goto err;
	}

	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
	if (IS_ERR(mem->sgt)) {
		ret = PTR_ERR(mem->sgt);
		mem->sgt = NULL;
		goto err;
	}

	for_each_sgtable_dma_sg(mem->sgt, sg, i)
		total_size += sg_dma_len(sg);

	if (total_size < off + len) {
		ret = -EINVAL;
		goto err;
	}

	mem->dmabuf_offset = off;
	mem->size = len;
	return 0;
err:
	io_release_dmabuf(mem);
	return ret;
}

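/*
 * Walk the dma-buf scatterlist, skip the registered offset, and hand out a
 * DMA address to each PAGE_SIZE niov in the area. Returns the number of
 * niovs that were set up; the caller treats anything short of
 * area->nia.num_niovs as a mapping failure.
 */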
static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned long off = area->mem.dmabuf_offset;
	struct scatterlist *sg;
	unsigned i, niov_idx = 0;

	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
		return -EINVAL;

	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
		dma_addr_t dma = sg_dma_address(sg);
		unsigned long sg_len = sg_dma_len(sg);
		unsigned long sg_off = min(sg_len, off);

		off -= sg_off;
		sg_len -= sg_off;
		dma += sg_off;

		while (sg_len && niov_idx < area->nia.num_niovs) {
			struct net_iov *niov = &area->nia.niovs[niov_idx];

			if (net_mp_niov_set_dma_addr(niov, dma))
				return 0;
			sg_len -= PAGE_SIZE;
			dma += PAGE_SIZE;
			niov_idx++;
		}
	}
	return niov_idx;
}

static int io_import_umem(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	struct page **pages;
	int nr_pages;

	if (area_reg->dmabuf_fd)
		return -EINVAL;
	if (!area_reg->addr)
		return -EFAULT;
	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
			     &nr_pages);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	mem->pages = pages;
	mem->nr_folios = nr_pages;
	mem->size = area_reg->len;
	return 0;
}

static void io_release_area_mem(struct io_zcrx_mem *mem)
{
	if (mem->is_dmabuf) {
		io_release_dmabuf(mem);
		return;
	}
	if (mem->pages) {
		unpin_user_pages(mem->pages, mem->nr_folios);
		kvfree(mem->pages);
	}
}

static int io_import_area(struct io_zcrx_ifq *ifq,
			  struct io_zcrx_mem *mem,
			  struct io_uring_zcrx_area_reg *area_reg)
{
	int ret;

	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
	if (ret)
		return ret;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
		return io_import_dmabuf(ifq, mem, area_reg);
	return io_import_umem(ifq, mem, area_reg);
}

static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);

		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
	}
}

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	if (area->mem.is_dmabuf)
		io_release_dmabuf(&area->mem);
	else
		io_zcrx_unmap_umem(ifq, area, nr_mapped);

	for (i = 0; i < area->nia.num_niovs; i++)
		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}
	return i;
}

static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	unsigned nr;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	if (area->mem.is_dmabuf)
		nr = io_zcrx_map_area_dmabuf(ifq, area);
	else
		nr = io_zcrx_map_area_umem(ifq, area);

	if (nr != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, nr);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

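/*
 * Sync a niov's backing memory for device DMA before it is handed back to
 * the page pool. No-op when the device doesn't require syncing or DMA sync
 * support isn't compiled in.
 */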
static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES 32768

#define IO_SKBS_PER_CALL_LIMIT 20

struct io_zcrx_args {
	struct io_kiocb *req;
	struct io_zcrx_ifq *ifq;
	struct socket *sock;
	unsigned nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd,
				 u32 id)
{
	u64 mmap_offset;
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
	mmap_offset += id << IORING_OFF_PBUF_SHIFT;

	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	if (area->ifq)
		io_zcrx_unmap_area(area->ifq, area);
	io_release_area_mem(&area->mem);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	kfree(area);
}

#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)

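/*
 * Validate the area registration, import the backing memory (user pages or
 * a dma-buf) and set up the niovs, the freelist and the per-niov user
 * refcounts.
 */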
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	unsigned nr_iovs;
	int i, ret;

	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
		return -EINVAL;
	if (area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	ret = io_import_area(ifq, &area->mem, area_reg);
	if (ret)
		goto err;

	nr_iovs = area->mem.size >> PAGE_SHIFT;
	area->nia.num_niovs = nr_iovs;

	ret = -ENOMEM;
	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
		niov->type = NET_IOV_IOURING;
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
					    unsigned int id)
{
	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);

	lockdep_assert_held(&ctx->mmap_lock);

	return ifq ? &ifq->region : NULL;
}

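/*
 * Register a zero copy rx interface queue: allocate the refill ring region,
 * resolve the netdev and its parent device, create the buffer area and bind
 * the memory provider to the requested hardware rx queue.
 */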
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;
	u32 id;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;
	ifq->rq_entries = reg.rq_entries;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* preallocate id */
		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
		if (ret)
			goto ifq_free;
	}

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
	if (ret)
		goto err;

	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev) {
		ret = -ENODEV;
		goto err;
	}

	ifq->dev = ifq->netdev->dev.parent;
	if (!ifq->dev) {
		ret = -EOPNOTSUPP;
		goto err;
	}
	get_device(ifq->dev);

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);
	reg.zcrx_id = id;

	scoped_guard(mutex, &ctx->mmap_lock) {
		/* publish ifq */
		ret = -ENOMEM;
		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
			goto err;
	}

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	return 0;
err:
	scoped_guard(mutex, &ctx->mmap_lock)
		xa_erase(&ctx->zcrx_ctxs, id);
ifq_free:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	lockdep_assert_held(&ctx->uring_lock);

	while (1) {
		scoped_guard(mutex, &ctx->mmap_lock) {
			unsigned long id = 0;

			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
			if (ifq)
				xa_erase(&ctx->zcrx_ctxs, id);
		}
		if (!ifq)
			break;
		io_zcrx_ifq_free(ifq);
	}

	xa_destroy(&ctx->zcrx_ctxs);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

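/*
 * Called on ring shutdown: buffers still owned by user space are pulled
 * back by dropping their user references and returning them to the page
 * pool (or to the freelist for copy-fallback buffers).
 */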
static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;
	unsigned long index;

	lockdep_assert_held(&ctx->uring_lock);

	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
		io_zcrx_scrub(ifq);
		io_close_queue(ifq);
	}
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

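/*
 * Fast refill path: consume entries the user posted to the refill ring and
 * place the corresponding niovs straight into the page pool's alloc cache.
 * io_zcrx_refill_slow() below falls back to the area freelist instead.
 */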
static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

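/*
 * Page pool memory provider callbacks. The provider serves niovs from the
 * zcrx area instead of system pages: allocation pulls buffers returned via
 * the refill ring (or the freelist), and released buffers go back to the
 * area freelist rather than to the page allocator.
 */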
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems = io_pp_zc_alloc_netmems,
	.release_netmem = io_pp_zc_release_netmem,
	.init = io_pp_zc_init,
	.destroy = io_pp_zc_destroy,
	.nl_fill = io_pp_nl_fill,
	.uninstall = io_pp_uninstall,
};

static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

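/*
 * Copy fallback: used for data that can't be delivered zero-copy, e.g. the
 * skb linear part or frags not backed by this provider. Data is copied into
 * buffers taken from the area freelist and completed as regular zcrx CQEs.
 * Not supported for dma-buf backed areas.
 */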
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	if (area->mem.is_dmabuf)
		return -EFAULT;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

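/*
 * tcp_read_sock() callback: walk the skb's linear data, page frags and
 * frag_list, posting one CQE per zero-copy frag and copying everything
 * else. IO_SKBS_PER_CALL_LIMIT bounds the number of skbs handled in a
 * single invocation.
 */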
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it to retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}