.. SPDX-License-Identifier: GPL-2.0

=====================
io_uring zero copy Rx
=====================

Introduction
============

io_uring zero copy Rx (ZC Rx) is a feature that removes the kernel-to-user copy
on the network receive path, allowing packet data to be received directly into
userspace memory. This feature differs from TCP_ZEROCOPY_RECEIVE in that there
are no strict alignment requirements and no need to mmap()/munmap(). Compared
to kernel bypass solutions such as DPDK, the packet headers are still processed
by the kernel TCP stack as normal.

NIC HW Requirements
===================

Several NIC HW features are required for io_uring ZC Rx to work. For now, the
kernel API does not configure the NIC; this must be done by the user.

Header/data split
-----------------

Required to split packets at the L4 boundary into a header and a payload.
Headers are received into kernel memory as usual and processed by the TCP
stack as normal. Payloads are received directly into userspace memory.

Flow steering
-------------

Specific HW Rx queues are configured for this feature, but modern NICs
typically distribute flows across all HW Rx queues. Flow steering is required
to ensure that only desired flows are directed towards HW queues that are
configured for io_uring ZC Rx.

RSS
---

In addition to flow steering above, RSS is required to steer all other
non-zero-copy flows away from the queues that are configured for io_uring ZC Rx.

Usage
=====

Setup NIC
---------

Must be done out of band for now.

Ensure there are at least two queues::

  ethtool -L eth0 combined 2

Enable header/data split::

  ethtool -G eth0 tcp-data-split on

Carve out half of the HW Rx queues for zero copy using RSS; with the two queues
configured above, this restricts RSS to queue 0 and leaves queue 1 for zero
copy::

  ethtool -X eth0 equal 1

Set up flow steering, bearing in mind that queues are 0-indexed::

  ethtool -N eth0 flow-type tcp6 ... action 1

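For example, to steer an IPv6 TCP flow by destination port to queue 1 (the
port number here is purely illustrative)::

  ethtool -N eth0 flow-type tcp6 dst-port 5201 action 1
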
Setup io_uring
--------------

This section describes the low-level io_uring kernel API. Please refer to the
liburing documentation for how to use the higher-level API.

Create an io_uring instance with the following required setup flags::

  IORING_SETUP_SINGLE_ISSUER
  IORING_SETUP_DEFER_TASKRUN
  IORING_SETUP_CQE32

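With liburing, a ring with these flags can be created along the following lines
(a minimal sketch; the queue depth of 32 is arbitrary, and the ``ring`` used by
later snippets is a pointer to this instance)::

  struct io_uring ring;
  struct io_uring_params params = {
    .flags = IORING_SETUP_SINGLE_ISSUER |
             IORING_SETUP_DEFER_TASKRUN |
             IORING_SETUP_CQE32,
  };
  int ret;

  ret = io_uring_queue_init_params(32, &ring, &params);
  /* ret < 0 means setup failed, e.g. the kernel does not support the flags */
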
Create memory area
------------------

Allocate userspace memory area for receiving zero copy data::

  void *area_ptr = mmap(NULL, area_size,
                        PROT_READ | PROT_WRITE,
                        MAP_ANONYMOUS | MAP_PRIVATE,
                        -1, 0);

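``area_size`` is chosen by the application. As an illustration only, a fixed
budget kept page aligned might look like::

  /* 64 MiB is an arbitrary example size for the zero copy area */
  size_t page_size = sysconf(_SC_PAGESIZE);
  size_t area_size = (64UL * 1024 * 1024 + page_size - 1) & ~(page_size - 1);
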
Create refill ring
------------------

Allocate memory for a shared ringbuf used for returning consumed buffers::

  void *ring_ptr = mmap(NULL, ring_size,
                        PROT_READ | PROT_WRITE,
                        MAP_ANONYMOUS | MAP_PRIVATE,
                        -1, 0);

The refill ring consists of some space for the header, followed by an array of
``struct io_uring_zcrx_rqe``; ``ring_size`` passed to ``mmap()`` above is
computed accordingly::

  size_t rq_entries = 4096;
  size_t ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe) + PAGE_SIZE;
  /* align to page size */
  ring_size = (ring_size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);

Register ZC Rx
--------------

Fill in registration structs::

  struct io_uring_zcrx_area_reg area_reg = {
    .addr = (__u64)(unsigned long)area_ptr,
    .len = area_size,
    .flags = 0,
  };

  struct io_uring_region_desc region_reg = {
    .user_addr = (__u64)(unsigned long)ring_ptr,
    .size = ring_size,
    .flags = IORING_MEM_REGION_TYPE_USER,
  };

  struct io_uring_zcrx_ifq_reg reg = {
    .if_idx = if_nametoindex("eth0"),
    /* this is the HW queue with desired flow steered into it */
    .if_rxq = 1,
    .rq_entries = rq_entries,
    .area_ptr = (__u64)(unsigned long)&area_reg,
    .region_ptr = (__u64)(unsigned long)&region_reg,
  };

Register with kernel::

  io_uring_register_ifq(ring, &reg);

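A minimal sketch of checking the result, assuming the usual liburing convention
of returning 0 on success and a negative errno on failure::

  int ret = io_uring_register_ifq(ring, &reg);

  if (ret < 0) {
    /* registration failed, e.g. the kernel or NIC lacks support */
  }

  /* on success the kernel has filled in reg.offsets, which describes the
   * refill ring layout, and area_reg.rq_area_token, which is needed when
   * recycling buffers */
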
Map refill ring
---------------

The kernel fills in fields for the refill ring in the registration ``struct
io_uring_zcrx_ifq_reg``. Map it into userspace::

  struct io_uring_zcrx_rq refill_ring;

  refill_ring.khead = (unsigned *)((char *)ring_ptr + reg.offsets.head);
  refill_ring.ktail = (unsigned *)((char *)ring_ptr + reg.offsets.tail);
  refill_ring.rqes =
    (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes);
  /* entry count as accepted by the kernel and written back into reg */
  refill_ring.ring_entries = reg.rq_entries;
  refill_ring.rq_tail = 0;
  refill_ring.ring_ptr = ring_ptr;

Receiving data
--------------

Prepare a zero copy recv request::

  struct io_uring_sqe *sqe;

  sqe = io_uring_get_sqe(ring);
  /* fd is the connected TCP socket whose flow was steered to the ZC Rx queue */
  io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, fd, NULL, 0, 0);
  sqe->ioprio |= IORING_RECV_MULTISHOT;

Now, submit and wait::

  io_uring_submit_and_wait(ring, 1);

Finally, process completions::

  struct io_uring_cqe *cqe;
  unsigned int count = 0;
  unsigned int head;

  io_uring_for_each_cqe(ring, head, cqe) {
    /* the zero copy completion sits in the second half of the 32-byte CQE */
    struct io_uring_zcrx_cqe *rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);

    /* the low bits of rcqe->off are the offset of the data within the area */
    unsigned long mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1;
    unsigned char *data = (unsigned char *)area_ptr + (rcqe->off & mask);
    /* do something with the data */

    count++;
  }
  io_uring_cq_advance(ring, count);

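As with other multishot requests, completions keep arriving for the same
request while ``IORING_CQE_F_MORE`` is set in ``cqe->flags``; a final
completion without that flag means the request has terminated (end of stream
or an error in ``cqe->res``) and needs to be re-armed. A sketch of the check,
inside the loop above::

  if (!(cqe->flags & IORING_CQE_F_MORE)) {
    /* cqe->res <= 0 here; re-issue IORING_OP_RECV_ZC to keep receiving,
     * or tear down the connection */
  }
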
Recycling buffers
-----------------

Return buffers back to the kernel so they can be used again. Since ``rcqe`` and
``cqe`` below refer to the completion currently being processed, this is
typically done for each completion inside the loop above::

  struct io_uring_zcrx_rqe *rqe;
  unsigned mask = refill_ring.ring_entries - 1;
  rqe = &refill_ring.rqes[refill_ring.rq_tail & mask];

  unsigned long area_offset = rcqe->off & ~IORING_ZCRX_AREA_MASK;
  rqe->off = area_offset | area_reg.rq_area_token;
  rqe->len = cqe->res;
  /* publish the new tail so the kernel can see the returned buffer */
  IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail);

Testing
=======

See ``tools/testing/selftests/drivers/net/hw/iou-zcrx.c``