// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/dma-map-ops.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>

#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/netlink.h>
#include <net/netdev_rx_queue.h>
#include <net/tcp.h>
#include <net/rps.h>

#include <trace/events/page_pool.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "memmap.h"
#include "zcrx.h"
#include "rsrc.h"

static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
	return pp->mp_priv;
}

#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
				 struct io_zcrx_area *area, int nr_mapped)
{
	int i;

	for (i = 0; i < nr_mapped; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
				     DMA_FROM_DEVICE, IO_DMA_ATTR);
		net_mp_niov_set_dma_addr(niov, 0);
	}
}

static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	guard(mutex)(&ifq->dma_lock);

	if (area->is_mapped)
		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
	area->is_mapped = false;
}

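/*
 * Map every pinned page of the area for DMA and record the address in the
 * matching net_iov. Serialised by ->dma_lock and idempotent: re-mapping an
 * already mapped area is a no-op. On partial failure, all mappings made so
 * far are unwound again before returning an error.
 */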
static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
	int i;

	guard(mutex)(&ifq->dma_lock);
	if (area->is_mapped)
		return 0;

	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		dma_addr_t dma;

		dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
					 DMA_FROM_DEVICE, IO_DMA_ATTR);
		if (dma_mapping_error(ifq->dev, dma))
			break;
		if (net_mp_niov_set_dma_addr(niov, dma)) {
			dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
					     DMA_FROM_DEVICE, IO_DMA_ATTR);
			break;
		}
	}

	if (i != area->nia.num_niovs) {
		__io_zcrx_unmap_area(ifq, area, i);
		return -EINVAL;
	}

	area->is_mapped = true;
	return 0;
}

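/*
 * Sync a buffer for the device before it is handed back to the page pool
 * for RX. Skipped entirely when the device does not require DMA syncs.
 */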
static void io_zcrx_sync_for_device(const struct page_pool *pool,
				    struct net_iov *niov)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr;

	if (!dma_dev_need_sync(pool->p.dev))
		return;

	dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     PAGE_SIZE, pool->p.dma_dir);
#endif
}

#define IO_RQ_MAX_ENTRIES 32768

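/*
 * Cap on the number of skbs walked in one io_zcrx_recv_skb() pass; once it
 * is exceeded the receive bails out and multishot requests get requeued.
 */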
#define IO_SKBS_PER_CALL_LIMIT 20

struct io_zcrx_args {
	struct io_kiocb *req;
	struct io_zcrx_ifq *ifq;
	struct socket *sock;
	unsigned nr_skbs;
};

static const struct memory_provider_ops io_uring_pp_zc_ops;

static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
{
	struct net_iov_area *owner = net_iov_owner(niov);

	return container_of(owner, struct io_zcrx_area, nia);
}

static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return &area->user_refs[net_iov_idx(niov)];
}

static bool io_zcrx_put_niov_uref(struct net_iov *niov)
{
	atomic_t *uref = io_get_user_counter(niov);

	if (unlikely(!atomic_read(uref)))
		return false;
	atomic_dec(uref);
	return true;
}

static void io_zcrx_get_niov_uref(struct net_iov *niov)
{
	atomic_inc(io_get_user_counter(niov));
}

static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	return area->pages[net_iov_idx(niov)];
}

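/*
 * Carve the refill ring out of the user-provided region: a struct io_uring
 * header (head/tail) followed by rq_entries struct io_uring_zcrx_rqe
 * entries, all mmap'able at IORING_MAP_OFF_ZCRX_REGION.
 */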
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
				 struct io_uring_zcrx_ifq_reg *reg,
				 struct io_uring_region_desc *rd)
{
	size_t off, size;
	void *ptr;
	int ret;

	off = sizeof(struct io_uring);
	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
	if (size > rd->size)
		return -EINVAL;

	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
					 IORING_MAP_OFF_ZCRX_REGION);
	if (ret < 0)
		return ret;

	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
	ifq->rq_ring = (struct io_uring *)ptr;
	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
	return 0;
}

static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
	ifq->rq_ring = NULL;
	ifq->rqes = NULL;
}

static void io_zcrx_free_area(struct io_zcrx_area *area)
{
	io_zcrx_unmap_area(area->ifq, area);

	kvfree(area->freelist);
	kvfree(area->nia.niovs);
	kvfree(area->user_refs);
	if (area->pages) {
		unpin_user_pages(area->pages, area->nr_folios);
		kvfree(area->pages);
	}
	kfree(area);
}

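/*
 * Build the single buffer area backing an interface queue: validate and pin
 * the user memory, then allocate the net_iov array plus the freelist and
 * per-niov user refcounts that track buffers handed out to user space.
 */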
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
			       struct io_zcrx_area **res,
			       struct io_uring_zcrx_area_reg *area_reg)
{
	struct io_zcrx_area *area;
	int i, ret, nr_pages, nr_iovs;
	struct iovec iov;

	if (area_reg->flags || area_reg->rq_area_token)
		return -EINVAL;
	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
		return -EINVAL;
	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
		return -EINVAL;

	iov.iov_base = u64_to_user_ptr(area_reg->addr);
	iov.iov_len = area_reg->len;
	ret = io_buffer_validate(&iov);
	if (ret)
		return ret;

	ret = -ENOMEM;
	area = kzalloc(sizeof(*area), GFP_KERNEL);
	if (!area)
		goto err;

	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
				   &nr_pages);
	if (IS_ERR(area->pages)) {
		ret = PTR_ERR(area->pages);
		area->pages = NULL;
		goto err;
	}
	area->nr_folios = nr_iovs = nr_pages;
	area->nia.num_niovs = nr_iovs;

	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->nia.niovs)
		goto err;

	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
					GFP_KERNEL | __GFP_ZERO);
	if (!area->freelist)
		goto err;

	for (i = 0; i < nr_iovs; i++)
		area->freelist[i] = i;

	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
					 GFP_KERNEL | __GFP_ZERO);
	if (!area->user_refs)
		goto err;

	for (i = 0; i < nr_iovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];

		niov->owner = &area->nia;
		area->freelist[i] = i;
		atomic_set(&area->user_refs[i], 0);
	}

	area->free_count = nr_iovs;
	area->ifq = ifq;
	/* we're only supporting one area per ifq for now */
	area->area_id = 0;
	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
	spin_lock_init(&area->freelist_lock);
	*res = area;
	return 0;
err:
	if (area)
		io_zcrx_free_area(area);
	return ret;
}

static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq;

	ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
	if (!ifq)
		return NULL;

	ifq->if_rxq = -1;
	ifq->ctx = ctx;
	spin_lock_init(&ifq->lock);
	spin_lock_init(&ifq->rq_lock);
	mutex_init(&ifq->dma_lock);
	return ifq;
}

static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq)
{
	spin_lock(&ifq->lock);
	if (ifq->netdev) {
		netdev_put(ifq->netdev, &ifq->netdev_tracker);
		ifq->netdev = NULL;
	}
	spin_unlock(&ifq->lock);
}

static void io_close_queue(struct io_zcrx_ifq *ifq)
{
	struct net_device *netdev;
	netdevice_tracker netdev_tracker;
	struct pp_memory_provider_params p = {
		.mp_ops = &io_uring_pp_zc_ops,
		.mp_priv = ifq,
	};

	if (ifq->if_rxq == -1)
		return;

	spin_lock(&ifq->lock);
	netdev = ifq->netdev;
	netdev_tracker = ifq->netdev_tracker;
	ifq->netdev = NULL;
	spin_unlock(&ifq->lock);

	if (netdev) {
		net_mp_close_rxq(netdev, ifq->if_rxq, &p);
		netdev_put(netdev, &netdev_tracker);
	}
	ifq->if_rxq = -1;
}

static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
{
	io_close_queue(ifq);
	io_zcrx_drop_netdev(ifq);

	if (ifq->area)
		io_zcrx_free_area(ifq->area);
	if (ifq->dev)
		put_device(ifq->dev);

	io_free_rbuf_ring(ifq);
	mutex_destroy(&ifq->dma_lock);
	kfree(ifq);
}

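/*
 * IORING_REGISTER_ZCRX_IFQ: validate the registration structs, allocate the
 * ifq with its refill ring and buffer area, resolve the target netdev, and
 * bind the rx queue to the io_uring memory provider via net_mp_open_rxq().
 * The updated registration struct is copied back so user space learns the
 * ring offsets and the area token.
 */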
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
			 struct io_uring_zcrx_ifq_reg __user *arg)
{
	struct pp_memory_provider_params mp_param = {};
	struct io_uring_zcrx_area_reg area;
	struct io_uring_zcrx_ifq_reg reg;
	struct io_uring_region_desc rd;
	struct io_zcrx_ifq *ifq;
	int ret;

	/*
	 * 1. Interface queue allocation.
	 * 2. It can observe data destined for sockets of other tasks.
	 */
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	/* mandatory io_uring features for zc rx */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
	      ctx->flags & IORING_SETUP_CQE32))
		return -EINVAL;
	if (ctx->ifq)
		return -EBUSY;
	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
	    reg.__resv2 || reg.zcrx_id)
		return -EINVAL;
	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
		return -EINVAL;
	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
		if (!(ctx->flags & IORING_SETUP_CLAMP))
			return -EINVAL;
		reg.rq_entries = IO_RQ_MAX_ENTRIES;
	}
	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);

	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
		return -EFAULT;

	ifq = io_zcrx_ifq_alloc(ctx);
	if (!ifq)
		return -ENOMEM;

	ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
	if (ret)
		goto err;

	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
	if (ret)
		goto err;

	ifq->rq_entries = reg.rq_entries;

	ret = -ENODEV;
	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
					  &ifq->netdev_tracker, GFP_KERNEL);
	if (!ifq->netdev)
		goto err;

	ifq->dev = ifq->netdev->dev.parent;
	ret = -EOPNOTSUPP;
	if (!ifq->dev)
		goto err;
	get_device(ifq->dev);

	mp_param.mp_ops = &io_uring_pp_zc_ops;
	mp_param.mp_priv = ifq;
	ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
	if (ret)
		goto err;
	ifq->if_rxq = reg.if_rxq;

	reg.offsets.rqes = sizeof(struct io_uring);
	reg.offsets.head = offsetof(struct io_uring, head);
	reg.offsets.tail = offsetof(struct io_uring, tail);

	if (copy_to_user(arg, &reg, sizeof(reg)) ||
	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
	    copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
		ret = -EFAULT;
		goto err;
	}
	ctx->ifq = ifq;
	return 0;
err:
	io_zcrx_ifq_free(ifq);
	return ret;
}

void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	struct io_zcrx_ifq *ifq = ctx->ifq;

	lockdep_assert_held(&ctx->uring_lock);

	if (!ifq)
		return;

	ctx->ifq = NULL;
	io_zcrx_ifq_free(ifq);
}

static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
{
	unsigned niov_idx;

	lockdep_assert_held(&area->freelist_lock);

	niov_idx = area->freelist[--area->free_count];
	return &area->nia.niovs[niov_idx];
}

static void io_zcrx_return_niov_freelist(struct net_iov *niov)
{
	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);

	spin_lock_bh(&area->freelist_lock);
	area->freelist[area->free_count++] = net_iov_idx(niov);
	spin_unlock_bh(&area->freelist_lock);
}

static void io_zcrx_return_niov(struct net_iov *niov)
{
	netmem_ref netmem = net_iov_to_netmem(niov);

	if (!niov->pp) {
		/* copy fallback allocated niovs */
		io_zcrx_return_niov_freelist(niov);
		return;
	}
	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
}

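/*
 * Reclaim all buffers still held by user space when the ring goes away:
 * zero the user refcounts and drop the matching page pool references so
 * the niovs can be released or returned to the freelist.
 */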
static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;
	int i;

	if (!area)
		return;

	/* Reclaim back all buffers given to the user space. */
	for (i = 0; i < area->nia.num_niovs; i++) {
		struct net_iov *niov = &area->nia.niovs[i];
		int nr;

		if (!atomic_read(io_get_user_counter(niov)))
			continue;
		nr = atomic_xchg(io_get_user_counter(niov), 0);
		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
			io_zcrx_return_niov(niov);
	}
}

void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
	lockdep_assert_held(&ctx->uring_lock);

	if (!ctx->ifq)
		return;
	io_zcrx_scrub(ctx->ifq);
	io_close_queue(ctx->ifq);
}

static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
{
	u32 entries;

	entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head;
	return min(entries, ifq->rq_entries);
}

static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq,
						 unsigned mask)
{
	unsigned int idx = ifq->cached_rq_head++ & mask;

	return &ifq->rqes[idx];
}

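/*
 * Fast refill path: consume entries the user posted to the refill ring,
 * validate the encoded area/niov index, drop the user and page pool
 * references, and place fully released niovs straight into the page pool's
 * allocation cache. Malformed or foreign entries are silently skipped.
 */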
static void io_zcrx_ring_refill(struct page_pool *pp,
				struct io_zcrx_ifq *ifq)
{
	unsigned int mask = ifq->rq_entries - 1;
	unsigned int entries;
	netmem_ref netmem;

	spin_lock_bh(&ifq->rq_lock);

	entries = io_zcrx_rqring_entries(ifq);
	entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count);
	if (unlikely(!entries)) {
		spin_unlock_bh(&ifq->rq_lock);
		return;
	}

	do {
		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask);
		struct io_zcrx_area *area;
		struct net_iov *niov;
		unsigned niov_idx, area_idx;

		area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT;
		niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT;

		if (unlikely(rqe->__pad || area_idx))
			continue;
		area = ifq->area;

		if (unlikely(niov_idx >= area->nia.num_niovs))
			continue;
		niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs);

		niov = &area->nia.niovs[niov_idx];
		if (!io_zcrx_put_niov_uref(niov))
			continue;

		netmem = net_iov_to_netmem(niov);
		if (page_pool_unref_netmem(netmem, 1) != 0)
			continue;

		if (unlikely(niov->pp != pp)) {
			io_zcrx_return_niov(niov);
			continue;
		}

		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	} while (--entries);

	smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head);
	spin_unlock_bh(&ifq->rq_lock);
}

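/*
 * Slow refill path: when the refill ring yields nothing, take niovs
 * directly off the area freelist and hand them to the page pool.
 */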
static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)
{
	struct io_zcrx_area *area = ifq->area;

	spin_lock_bh(&area->freelist_lock);
	while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) {
		struct net_iov *niov = __io_zcrx_get_free_niov(area);
		netmem_ref netmem = net_iov_to_netmem(niov);

		net_mp_niov_set_page_pool(pp, niov);
		io_zcrx_sync_for_device(pp, niov);
		net_mp_netmem_place_in_cache(pp, netmem);
	}
	spin_unlock_bh(&area->freelist_lock);
}

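/*
 * Memory provider hooks wired up in io_uring_pp_zc_ops below. Allocation
 * first tries the user refill ring and then the freelist; netmems the page
 * pool releases go back onto the freelist rather than to the allocator.
 */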
static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

	/* pp should already be ensuring that */
	if (unlikely(pp->alloc.count))
		goto out_return;

	io_zcrx_ring_refill(pp, ifq);
	if (likely(pp->alloc.count))
		goto out_return;

	io_zcrx_refill_slow(pp, ifq);
	if (!pp->alloc.count)
		return 0;
out_return:
	return pp->alloc.cache[--pp->alloc.count];
}

static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)
{
	struct net_iov *niov;

	if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
		return false;

	niov = netmem_to_net_iov(netmem);
	net_mp_niov_clear_page_pool(niov);
	io_zcrx_return_niov_freelist(niov);
	return false;
}

static int io_pp_zc_init(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	int ret;

	if (WARN_ON_ONCE(!ifq))
		return -EINVAL;
	if (WARN_ON_ONCE(ifq->dev != pp->p.dev))
		return -EINVAL;
	if (WARN_ON_ONCE(!pp->dma_map))
		return -EOPNOTSUPP;
	if (pp->p.order != 0)
		return -EOPNOTSUPP;
	if (pp->p.dma_dir != DMA_FROM_DEVICE)
		return -EOPNOTSUPP;

	ret = io_zcrx_map_area(ifq, ifq->area);
	if (ret)
		return ret;

	percpu_ref_get(&ifq->ctx->refs);
	return 0;
}

static void io_pp_zc_destroy(struct page_pool *pp)
{
	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
	struct io_zcrx_area *area = ifq->area;

	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
		return;
	percpu_ref_put(&ifq->ctx->refs);
}

static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
			 struct netdev_rx_queue *rxq)
{
	struct nlattr *nest;
	int type;

	type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING;
	nest = nla_nest_start(rsp, type);
	if (!nest)
		return -EMSGSIZE;
	nla_nest_end(rsp, nest);

	return 0;
}

static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
{
	struct pp_memory_provider_params *p = &rxq->mp_params;
	struct io_zcrx_ifq *ifq = mp_priv;

	io_zcrx_drop_netdev(ifq);
	if (ifq->area)
		io_zcrx_unmap_area(ifq, ifq->area);

	p->mp_ops = NULL;
	p->mp_priv = NULL;
}

static const struct memory_provider_ops io_uring_pp_zc_ops = {
	.alloc_netmems = io_pp_zc_alloc_netmems,
	.release_netmem = io_pp_zc_release_netmem,
	.init = io_pp_zc_init,
	.destroy = io_pp_zc_destroy,
	.nl_fill = io_pp_nl_fill,
	.uninstall = io_pp_uninstall,
};

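/*
 * Post a CQE for received data. zcrx relies on CQE32: the second half of
 * the entry carries a struct io_uring_zcrx_cqe with the offset into the
 * area (plus the area token) where the payload landed.
 */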
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
			      struct io_zcrx_ifq *ifq, int off, int len)
{
	struct io_uring_zcrx_cqe *rcqe;
	struct io_zcrx_area *area;
	struct io_uring_cqe *cqe;
	u64 offset;

	if (!io_defer_get_uncommited_cqe(req->ctx, &cqe))
		return false;

	cqe->user_data = req->cqe.user_data;
	cqe->res = len;
	cqe->flags = IORING_CQE_F_MORE;

	area = io_zcrx_iov_to_area(niov);
	offset = off + (net_iov_idx(niov) << PAGE_SHIFT);
	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
	rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT);
	rcqe->__pad = 0;
	return true;
}

static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area)
{
	struct net_iov *niov = NULL;

	spin_lock_bh(&area->freelist_lock);
	if (area->free_count)
		niov = __io_zcrx_get_free_niov(area);
	spin_unlock_bh(&area->freelist_lock);

	if (niov)
		page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
	return niov;
}

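/*
 * Copy fallback: used for skb linear data and for frags that aren't backed
 * by this ifq's net_iovs. Data is copied page by page into freelist niovs
 * and each chunk is completed with its own CQE, so user space consumes it
 * exactly like a zero-copy completion.
 */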
static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
				  void *src_base, struct page *src_page,
				  unsigned int src_offset, size_t len)
{
	struct io_zcrx_area *area = ifq->area;
	size_t copied = 0;
	int ret = 0;

	while (len) {
		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
		const int dst_off = 0;
		struct net_iov *niov;
		struct page *dst_page;
		void *dst_addr;

		niov = io_zcrx_alloc_fallback(area);
		if (!niov) {
			ret = -ENOMEM;
			break;
		}

		dst_page = io_zcrx_iov_page(niov);
		dst_addr = kmap_local_page(dst_page);
		if (src_page)
			src_base = kmap_local_page(src_page);

		memcpy(dst_addr, src_base + src_offset, copy_size);

		if (src_page)
			kunmap_local(src_base);
		kunmap_local(dst_addr);

		if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) {
			io_zcrx_return_niov(niov);
			ret = -ENOSPC;
			break;
		}

		io_zcrx_get_niov_uref(niov);
		src_offset += copy_size;
		len -= copy_size;
		copied += copy_size;
	}

	return copied ? copied : ret;
}

static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct page *page = skb_frag_page(frag);
	u32 p_off, p_len, t, copied = 0;
	int ret = 0;

	off += skb_frag_off(frag);

	skb_frag_foreach_page(frag, off, len,
			      page, p_off, p_len, t) {
		ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len);
		if (ret < 0)
			return copied ? copied : ret;
		copied += ret;
	}
	return copied;
}

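/*
 * Zero-copy path for a single frag: the frag must be a net_iov owned by
 * this ifq's page pool, otherwise it is copied instead. On success a CQE
 * is posted and both a page pool reference and a user reference are taken
 * so the buffer survives until user space returns it via the refill ring.
 */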
static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			     const skb_frag_t *frag, int off, int len)
{
	struct net_iov *niov;

	if (unlikely(!skb_frag_is_net_iov(frag)))
		return io_zcrx_copy_frag(req, ifq, frag, off, len);

	niov = netmem_to_net_iov(frag->netmem);
	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
	    io_pp_to_ifq(niov->pp) != ifq)
		return -EFAULT;

	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
		return -ENOSPC;

	/*
	 * Prevent it from being recycled while user is accessing it.
	 * It has to be done before grabbing a user reference.
	 */
	page_pool_ref_netmem(net_iov_to_netmem(niov));
	io_zcrx_get_niov_uref(niov);
	return len;
}

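/*
 * tcp_read_sock() callback: walk the skb's linear data (copy fallback),
 * its frags (zero-copy where possible) and any frag list, posting a CQE
 * per consumed chunk and shrinking desc->count by the amount consumed.
 */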
static int
io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
		 unsigned int offset, size_t len)
{
	struct io_zcrx_args *args = desc->arg.data;
	struct io_zcrx_ifq *ifq = args->ifq;
	struct io_kiocb *req = args->req;
	struct sk_buff *frag_iter;
	unsigned start, start_off = offset;
	int i, copy, end, off;
	int ret = 0;

	len = min_t(size_t, len, desc->count);
	/*
	 * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even
	 * if desc->count is already 0. This is caused by the if (offset + 1 !=
	 * skb->len) check. Return early in this case to break out of
	 * __tcp_read_sock().
	 */
	if (!len)
		return 0;
	if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT))
		return -EAGAIN;

	if (unlikely(offset < skb_headlen(skb))) {
		ssize_t copied;
		size_t to_copy;

		to_copy = min_t(size_t, skb_headlen(skb) - offset, len);
		copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL,
					    offset, to_copy);
		if (copied < 0) {
			ret = copied;
			goto out;
		}
		offset += copied;
		len -= copied;
		if (!len)
			goto out;
		if (offset != skb_headlen(skb))
			goto out;
	}

	start = skb_headlen(skb);

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		const skb_frag_t *frag;

		if (WARN_ON(start > offset + len))
			return -EFAULT;

		frag = &skb_shinfo(skb)->frags[i];
		end = start + skb_frag_size(frag);

		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_frag(req, ifq, frag, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

	skb_walk_frags(skb, frag_iter) {
		if (WARN_ON(start > offset + len))
			return -EFAULT;

		end = start + frag_iter->len;
		if (offset < end) {
			copy = end - offset;
			if (copy > len)
				copy = len;

			off = offset - start;
			ret = io_zcrx_recv_skb(desc, frag_iter, off, copy);
			if (ret < 0)
				goto out;

			offset += ret;
			len -= ret;
			if (len == 0 || ret != copy)
				goto out;
		}
		start = end;
	}

out:
	if (offset == start_off)
		return ret;
	desc->count -= (offset - start_off);
	return offset - start_off;
}

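/*
 * Drive the receive for one request: lock the socket, feed it through
 * tcp_read_sock() with io_zcrx_recv_skb(), and translate the outcome into
 * the usual io_uring return codes (-EAGAIN to retry, IOU_REQUEUE to keep a
 * multishot request going, socket errors otherwise).
 */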
static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
			       struct sock *sk, int flags,
			       unsigned issue_flags, unsigned int *outlen)
{
	unsigned int len = *outlen;
	struct io_zcrx_args args = {
		.req = req,
		.ifq = ifq,
		.sock = sk->sk_socket,
	};
	read_descriptor_t rd_desc = {
		.count = len ? len : UINT_MAX,
		.arg.data = &args,
	};
	int ret;

	lock_sock(sk);
	ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb);
	if (len && ret > 0)
		*outlen = len - ret;
	if (ret <= 0) {
		if (ret < 0 || sock_flag(sk, SOCK_DONE))
			goto out;
		if (sk->sk_err)
			ret = sock_error(sk);
		else if (sk->sk_shutdown & RCV_SHUTDOWN)
			goto out;
		else if (sk->sk_state == TCP_CLOSE)
			ret = -ENOTCONN;
		else
			ret = -EAGAIN;
	} else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) &&
		   (issue_flags & IO_URING_F_MULTISHOT)) {
		ret = IOU_REQUEUE;
	} else if (sock_flag(sk, SOCK_DONE)) {
		/* Make it to retry until it finally gets 0. */
		if (issue_flags & IO_URING_F_MULTISHOT)
			ret = IOU_REQUEUE;
		else
			ret = -EAGAIN;
	}
out:
	release_sock(sk);
	return ret;
}

int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
		 struct socket *sock, unsigned int flags,
		 unsigned issue_flags, unsigned int *len)
{
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot->recvmsg != tcp_recvmsg)
		return -EPROTONOSUPPORT;

	sock_rps_record_flow(sk);
	return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len);
}