1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2012 Chelsio Communications, Inc.
5 * All rights reserved.
6 * Written by: Navdeep Parhar <np@FreeBSD.org>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32
33 #include <sys/param.h>
34 #include <sys/aio.h>
35 #include <sys/bio.h>
36 #include <sys/file.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/module.h>
41 #include <sys/protosw.h>
42 #include <sys/proc.h>
43 #include <sys/domain.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/taskqueue.h>
47 #include <sys/uio.h>
48 #include <netinet/in.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/ip.h>
51 #include <netinet/tcp_var.h>
52 #define TCPSTATES
53 #include <netinet/tcp_fsm.h>
54 #include <netinet/toecore.h>
55
56 #include <vm/vm.h>
57 #include <vm/vm_extern.h>
58 #include <vm/vm_param.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_map.h>
61 #include <vm/vm_page.h>
62 #include <vm/vm_object.h>
63
64 #include <cam/scsi/scsi_all.h>
65 #include <cam/ctl/ctl_io.h>
66
67 #ifdef TCP_OFFLOAD
68 #include "common/common.h"
69 #include "common/t4_msg.h"
70 #include "common/t4_regs.h"
71 #include "common/t4_tcb.h"
72 #include "tom/t4_tom.h"
73
74 /*
75 * Use the 'backend3' field in AIO jobs to store the amount of data
76 * received by the AIO job so far.
77 */
78 #define aio_received backend3
79
80 static void aio_ddp_requeue_task(void *context, int pending);
81 static void ddp_complete_all(struct toepcb *toep, int error);
82 static void t4_aio_cancel_active(struct kaiocb *job);
83 static void t4_aio_cancel_queued(struct kaiocb *job);
84 static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
85 struct ddp_rcv_buffer *drb);
86 static int t4_write_page_pods_for_rcvbuf(struct adapter *sc,
87 struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb);
88
89 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
90 static struct mtx ddp_orphan_pagesets_lock;
91 static struct task ddp_orphan_task;
92
93 #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN)
94
95 /*
96 * A page set holds information about a user buffer used for AIO DDP.
97 * The page set holds resources such as the VM pages backing the
98 * buffer (either held or wired) and the page pods associated with the
99 * buffer. Recently used page sets are cached to allow for efficient
100 * reuse of buffers (avoiding the need to re-fault in pages, hold
101 * them, etc.). Note that cached page sets keep the backing pages
102 * wired. The number of wired pages is capped by only allowing for
103 * two wired pagesets per connection. This is not a perfect cap, but
104 * is a trade-off for performance.
105 *
106 * If an application ping-pongs two buffers for a connection via
107 * aio_read(2) then those buffers should remain wired and expensive VM
108 * fault lookups should be avoided after each buffer has been used
109 * once. If an application uses more than two buffers then this will
110 * fall back to doing expensive VM fault lookups for each operation.
111 */
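/*
 * Added note (not in the original source): tear down a pageset by
 * releasing its page pods, unwiring the backing pages, and handing the
 * pageset to the orphan task, which drops the vmspace reference and
 * frees the memory from taskqueue context.
 */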
112 static void
113 free_pageset(struct tom_data *td, struct pageset *ps)
114 {
115 vm_page_t p;
116 int i;
117
118 if (ps->prsv.prsv_nppods > 0)
119 t4_free_page_pods(&ps->prsv);
120
121 for (i = 0; i < ps->npages; i++) {
122 p = ps->pages[i];
123 vm_page_unwire(p, PQ_INACTIVE);
124 }
125 mtx_lock(&ddp_orphan_pagesets_lock);
126 TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
127 taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
128 mtx_unlock(&ddp_orphan_pagesets_lock);
129 }
130
131 static void
132 ddp_free_orphan_pagesets(void *context, int pending)
133 {
134 struct pageset *ps;
135
136 mtx_lock(&ddp_orphan_pagesets_lock);
137 while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
138 ps = TAILQ_FIRST(&ddp_orphan_pagesets);
139 TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
140 mtx_unlock(&ddp_orphan_pagesets_lock);
141 if (ps->vm)
142 vmspace_free(ps->vm);
143 free(ps, M_CXGBE);
144 mtx_lock(&ddp_orphan_pagesets_lock);
145 }
146 mtx_unlock(&ddp_orphan_pagesets_lock);
147 }
148
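/*
 * Added note (not in the original source): keep a just-used pageset
 * (and its wired pages) on the connection's cache for reuse, or free
 * it if the connection is dead.
 */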
149 static void
150 recycle_pageset(struct toepcb *toep, struct pageset *ps)
151 {
152
153 DDP_ASSERT_LOCKED(toep);
154 if (!(toep->ddp.flags & DDP_DEAD)) {
155 KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
156 nitems(toep->ddp.db), ("too many wired pagesets"));
157 TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
158 toep->ddp.cached_count++;
159 } else
160 free_pageset(toep->td, ps);
161 }
162
163 static void
164 ddp_complete_one(struct kaiocb *job, int error)
165 {
166 long copied;
167
168 /*
169 * If this job had copied data out of the socket buffer before
170 * it was cancelled, report it as a short read rather than an
171 * error.
172 */
173 copied = job->aio_received;
174 if (copied != 0 || error == 0)
175 aio_complete(job, copied, 0);
176 else
177 aio_complete(job, -1, error);
178 }
179
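/*
 * Added note (not in the original source): free a DDP receive buffer
 * along with its page pods and drop the toepcb reference taken when
 * the buffer was allocated.
 */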
180 static void
181 free_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
182 {
183 t4_free_page_pods(&drb->prsv);
184 free(drb->buf, M_CXGBE);
185 free(drb, M_CXGBE);
186 counter_u64_add(toep->ofld_rxq->ddp_buffer_free, 1);
187 free_toepcb(toep);
188 }
189
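/*
 * Added note (not in the original source): return a receive buffer to
 * the connection's cache if there is room, otherwise free it.
 */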
190 static void
191 recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
192 {
193 DDP_CACHE_LOCK(toep);
194 if (!(toep->ddp.flags & DDP_DEAD) &&
195 toep->ddp.cached_count < t4_ddp_rcvbuf_cache) {
196 TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link);
197 toep->ddp.cached_count++;
198 DDP_CACHE_UNLOCK(toep);
199 } else {
200 DDP_CACHE_UNLOCK(toep);
201 free_ddp_rcv_buffer(toep, drb);
202 }
203 }
204
205 static struct ddp_rcv_buffer *
206 alloc_cached_ddp_rcv_buffer(struct toepcb *toep)
207 {
208 struct ddp_rcv_buffer *drb;
209
210 DDP_CACHE_LOCK(toep);
211 if (!TAILQ_EMPTY(&toep->ddp.cached_buffers)) {
212 drb = TAILQ_FIRST(&toep->ddp.cached_buffers);
213 TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
214 toep->ddp.cached_count--;
215 counter_u64_add(toep->ofld_rxq->ddp_buffer_reuse, 1);
216 } else
217 drb = NULL;
218 DDP_CACHE_UNLOCK(toep);
219 return (drb);
220 }
221
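/*
 * Added note (not in the original source): allocate a new physically
 * contiguous receive buffer, allocate and write page pods describing
 * it, and take a reference on the toepcb that is released when the
 * buffer is eventually freed.
 */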
222 static struct ddp_rcv_buffer *
223 alloc_ddp_rcv_buffer(struct toepcb *toep, int how)
224 {
225 struct tom_data *td = toep->td;
226 struct adapter *sc = td_adapter(td);
227 struct ddp_rcv_buffer *drb;
228 int error;
229
230 drb = malloc(sizeof(*drb), M_CXGBE, how | M_ZERO);
231 if (drb == NULL)
232 return (NULL);
233
234 drb->buf = contigmalloc(t4_ddp_rcvbuf_len, M_CXGBE, how, 0, ~0,
235 t4_ddp_rcvbuf_len, 0);
236 if (drb->buf == NULL) {
237 free(drb, M_CXGBE);
238 return (NULL);
239 }
240 drb->len = t4_ddp_rcvbuf_len;
241 drb->refs = 1;
242
243 error = t4_alloc_page_pods_for_rcvbuf(&td->pr, drb);
244 if (error != 0) {
245 free(drb->buf, M_CXGBE);
246 free(drb, M_CXGBE);
247 return (NULL);
248 }
249
250 error = t4_write_page_pods_for_rcvbuf(sc, toep->ctrlq, toep->tid, drb);
251 if (error != 0) {
252 t4_free_page_pods(&drb->prsv);
253 free(drb->buf, M_CXGBE);
254 free(drb, M_CXGBE);
255 return (NULL);
256 }
257
258 hold_toepcb(toep);
259 counter_u64_add(toep->ofld_rxq->ddp_buffer_alloc, 1);
260 return (drb);
261 }
262
263 static void
264 free_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db)
265 {
266 if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
267 if (db->drb != NULL)
268 free_ddp_rcv_buffer(toep, db->drb);
269 #ifdef INVARIANTS
270 db->drb = NULL;
271 #endif
272 return;
273 }
274
275 if (db->job) {
276 /*
277 * XXX: If we are un-offloading the socket then we
278 * should requeue these on the socket somehow. If we
279 * got a FIN from the remote end, then this completes
280 * any remaining requests with an EOF read.
281 */
282 if (!aio_clear_cancel_function(db->job))
283 ddp_complete_one(db->job, 0);
284 #ifdef INVARIANTS
285 db->job = NULL;
286 #endif
287 }
288
289 if (db->ps) {
290 free_pageset(toep->td, db->ps);
291 #ifdef INVARIANTS
292 db->ps = NULL;
293 #endif
294 }
295 }
296
297 static void
298 ddp_init_toep(struct toepcb *toep)
299 {
300
301 toep->ddp.flags = DDP_OK;
302 toep->ddp.active_id = -1;
303 mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
304 mtx_init(&toep->ddp.cache_lock, "t4 ddp cache", NULL, MTX_DEF);
305 }
306
307 void
308 ddp_uninit_toep(struct toepcb *toep)
309 {
310
311 mtx_destroy(&toep->ddp.lock);
312 mtx_destroy(&toep->ddp.cache_lock);
313 }
314
315 void
316 release_ddp_resources(struct toepcb *toep)
317 {
318 struct ddp_rcv_buffer *drb;
319 struct pageset *ps;
320 int i;
321
322 DDP_LOCK(toep);
323 DDP_CACHE_LOCK(toep);
324 toep->ddp.flags |= DDP_DEAD;
325 DDP_CACHE_UNLOCK(toep);
326 for (i = 0; i < nitems(toep->ddp.db); i++) {
327 free_ddp_buffer(toep, &toep->ddp.db[i]);
328 }
329 if ((toep->ddp.flags & DDP_AIO) != 0) {
330 while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
331 TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
332 free_pageset(toep->td, ps);
333 }
334 ddp_complete_all(toep, 0);
335 }
336 if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
337 DDP_CACHE_LOCK(toep);
338 while ((drb = TAILQ_FIRST(&toep->ddp.cached_buffers)) != NULL) {
339 TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
340 free_ddp_rcv_buffer(toep, drb);
341 }
342 DDP_CACHE_UNLOCK(toep);
343 }
344 DDP_UNLOCK(toep);
345 }
346
347 #ifdef INVARIANTS
348 void
349 ddp_assert_empty(struct toepcb *toep)
350 {
351 int i;
352
353 MPASS((toep->ddp.flags & (DDP_TASK_ACTIVE | DDP_DEAD)) != DDP_TASK_ACTIVE);
354 for (i = 0; i < nitems(toep->ddp.db); i++) {
355 if ((toep->ddp.flags & DDP_AIO) != 0) {
356 MPASS(toep->ddp.db[i].job == NULL);
357 MPASS(toep->ddp.db[i].ps == NULL);
358 } else
359 MPASS(toep->ddp.db[i].drb == NULL);
360 }
361 if ((toep->ddp.flags & DDP_AIO) != 0) {
362 MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
363 MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
364 }
365 if ((toep->ddp.flags & DDP_RCVBUF) != 0)
366 MPASS(TAILQ_EMPTY(&toep->ddp.cached_buffers));
367 }
368 #endif
369
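/*
 * Added note (not in the original source): retire a hardware DDP
 * buffer slot once the chip has invalidated it.  Fix up the active
 * buffer accounting and either recycle the pageset (AIO) or drop the
 * receive buffer reference (RCVBUF).
 */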
370 static void
371 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
372 unsigned int db_idx)
373 {
374 struct ddp_rcv_buffer *drb;
375 unsigned int db_flag;
376
377 toep->ddp.active_count--;
378 if (toep->ddp.active_id == db_idx) {
379 if (toep->ddp.active_count == 0) {
380 if ((toep->ddp.flags & DDP_AIO) != 0)
381 KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
382 ("%s: active_count mismatch", __func__));
383 else
384 KASSERT(toep->ddp.db[db_idx ^ 1].drb == NULL,
385 ("%s: active_count mismatch", __func__));
386 toep->ddp.active_id = -1;
387 } else
388 toep->ddp.active_id ^= 1;
389 #ifdef VERBOSE_TRACES
390 CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__,
391 toep->tid, toep->ddp.active_id);
392 #endif
393 } else {
394 KASSERT(toep->ddp.active_count != 0 &&
395 toep->ddp.active_id != -1,
396 ("%s: active count mismatch", __func__));
397 }
398
399 if ((toep->ddp.flags & DDP_AIO) != 0) {
400 db->cancel_pending = 0;
401 db->job = NULL;
402 recycle_pageset(toep, db->ps);
403 db->ps = NULL;
404 } else {
405 drb = db->drb;
406 if (atomic_fetchadd_int(&drb->refs, -1) == 1)
407 recycle_ddp_rcv_buffer(toep, drb);
408 db->drb = NULL;
409 db->placed = 0;
410 }
411
412 db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
413 KASSERT(toep->ddp.flags & db_flag,
414 ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
415 __func__, toep, toep->ddp.flags));
416 toep->ddp.flags &= ~db_flag;
417 }
418
419 /* Called when m_free drops the last reference. */
420 static void
421 ddp_rcv_mbuf_done(struct mbuf *m)
422 {
423 struct toepcb *toep = m->m_ext.ext_arg1;
424 struct ddp_rcv_buffer *drb = m->m_ext.ext_arg2;
425
426 recycle_ddp_rcv_buffer(toep, drb);
427 }
428
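/*
 * Added note (not in the original source): wrap 'len' newly placed
 * bytes of the given DDP buffer in a zero-copy external mbuf and
 * append it to the socket buffer.  The mbuf holds a reference on the
 * buffer that is released by ddp_rcv_mbuf_done() when it is freed.
 */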
429 static void
430 queue_ddp_rcvbuf_mbuf(struct toepcb *toep, u_int db_idx, u_int len)
431 {
432 struct inpcb *inp = toep->inp;
433 struct sockbuf *sb;
434 struct ddp_buffer *db;
435 struct ddp_rcv_buffer *drb;
436 struct mbuf *m;
437
438 m = m_gethdr(M_NOWAIT, MT_DATA);
439 if (m == NULL) {
440 printf("%s: failed to allocate mbuf\n", __func__);
441 return;
442 }
443 m->m_pkthdr.rcvif = toep->vi->ifp;
444
445 db = &toep->ddp.db[db_idx];
446 drb = db->drb;
447 m_extaddref(m, (char *)drb->buf + db->placed, len, &drb->refs,
448 ddp_rcv_mbuf_done, toep, drb);
449 m->m_pkthdr.len = len;
450 m->m_len = len;
451
452 sb = &inp->inp_socket->so_rcv;
453 SOCKBUF_LOCK_ASSERT(sb);
454 sbappendstream_locked(sb, m, 0);
455
456 db->placed += len;
457 toep->ofld_rxq->rx_toe_ddp_octets += len;
458 }
459
460 /* XXX: handle_ddp_data code duplication */
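/*
 * Added note (not in the original source): account for 'n' bytes that
 * were placed into the active DDP buffers before the connection fell
 * out of DDP, completing or updating the associated requests (AIO) or
 * queueing the data as mbufs (RCVBUF).
 */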
461 void
462 insert_ddp_data(struct toepcb *toep, uint32_t n)
463 {
464 struct inpcb *inp = toep->inp;
465 struct tcpcb *tp = intotcpcb(inp);
466 struct ddp_buffer *db;
467 struct kaiocb *job;
468 size_t placed;
469 long copied;
470 unsigned int db_idx;
471 #ifdef INVARIANTS
472 unsigned int db_flag;
473 #endif
474 bool ddp_rcvbuf;
475
476 INP_WLOCK_ASSERT(inp);
477 DDP_ASSERT_LOCKED(toep);
478
479 ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
480 tp->rcv_nxt += n;
481 #ifndef USE_DDP_RX_FLOW_CONTROL
482 KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
483 tp->rcv_wnd -= n;
484 #endif
485 CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
486 __func__, n);
487 while (toep->ddp.active_count > 0) {
488 MPASS(toep->ddp.active_id != -1);
489 db_idx = toep->ddp.active_id;
490 #ifdef INVARIANTS
491 db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
492 #endif
493 MPASS((toep->ddp.flags & db_flag) != 0);
494 db = &toep->ddp.db[db_idx];
495 if (ddp_rcvbuf) {
496 placed = n;
497 if (placed > db->drb->len - db->placed)
498 placed = db->drb->len - db->placed;
499 if (placed != 0)
500 queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
501 complete_ddp_buffer(toep, db, db_idx);
502 n -= placed;
503 continue;
504 }
505 job = db->job;
506 copied = job->aio_received;
507 placed = n;
508 if (placed > job->uaiocb.aio_nbytes - copied)
509 placed = job->uaiocb.aio_nbytes - copied;
510 if (placed > 0) {
511 job->msgrcv = 1;
512 toep->ofld_rxq->rx_aio_ddp_jobs++;
513 }
514 toep->ofld_rxq->rx_aio_ddp_octets += placed;
515 if (!aio_clear_cancel_function(job)) {
516 /*
517 * Update the copied length for when
518 * t4_aio_cancel_active() completes this
519 * request.
520 */
521 job->aio_received += placed;
522 } else if (copied + placed != 0) {
523 CTR4(KTR_CXGBE,
524 "%s: completing %p (copied %ld, placed %lu)",
525 __func__, job, copied, placed);
526 /* XXX: This always completes if there is some data. */
527 aio_complete(job, copied + placed, 0);
528 } else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
529 TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
530 toep->ddp.waiting_count++;
531 } else
532 aio_cancel(job);
533 n -= placed;
534 complete_ddp_buffer(toep, db, db_idx);
535 }
536
537 MPASS(n == 0);
538 }
539
540 /* SET_TCB_FIELD sent as a ULP command looks like this */
541 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
542 sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
543
544 /* RX_DATA_ACK sent as a ULP command looks like this */
545 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
546 sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
547
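/*
 * Added note (not in the original source): construct an RX_DATA_ACK
 * (with RX_MODULATE) as a ULP_TX_PKT sub-command, padding with a NOOP
 * so that the next command starts on a 16B boundary.
 */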
548 static inline void *
549 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
550 {
551 struct ulptx_idata *ulpsc;
552 struct cpl_rx_data_ack_core *req;
553
554 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
555 ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
556
557 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
558 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
559 ulpsc->len = htobe32(sizeof(*req));
560
561 req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
562 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
563 req->credit_dack = htobe32(F_RX_MODULATE_RX);
564
565 ulpsc = (struct ulptx_idata *)(req + 1);
566 if (LEN__RX_DATA_ACK_ULP % 16) {
567 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
568 ulpsc->len = htobe32(0);
569 return (ulpsc + 1);
570 }
571 return (ulpsc);
572 }
573
574 static struct wrqe *
575 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
576 struct ppod_reservation *prsv, int offset, uint32_t len,
577 uint64_t ddp_flags, uint64_t ddp_flags_mask)
578 {
579 struct wrqe *wr;
580 struct work_request_hdr *wrh;
581 struct ulp_txpkt *ulpmc;
582 int wrlen;
583
584 KASSERT(db_idx == 0 || db_idx == 1,
585 ("%s: bad DDP buffer index %d", __func__, db_idx));
586
587 /*
588 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
589 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
590 *
591 * The work request header is 16B and always ends at a 16B boundary.
592 * The ULPTX master commands that follow must all end at 16B boundaries
593 * too so we round up the size to 16.
594 */
595 wrlen = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
596 roundup2(LEN__RX_DATA_ACK_ULP, 16);
597
598 wr = alloc_wrqe(wrlen, toep->ctrlq);
599 if (wr == NULL)
600 return (NULL);
601 wrh = wrtod(wr);
602 INIT_ULPTX_WRH(wrh, wrlen, 1, 0); /* atomic */
603 ulpmc = (struct ulp_txpkt *)(wrh + 1);
604
605 /* Write the buffer's tag */
606 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
607 W_TCB_RX_DDP_BUF0_TAG + db_idx,
608 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
609 V_TCB_RX_DDP_BUF0_TAG(prsv->prsv_tag));
610
611 /* Update the current offset in the DDP buffer and its total length */
612 if (db_idx == 0)
613 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
614 W_TCB_RX_DDP_BUF0_OFFSET,
615 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
616 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
617 V_TCB_RX_DDP_BUF0_OFFSET(offset) |
618 V_TCB_RX_DDP_BUF0_LEN(len));
619 else
620 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
621 W_TCB_RX_DDP_BUF1_OFFSET,
622 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
623 V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
624 V_TCB_RX_DDP_BUF1_OFFSET(offset) |
625 V_TCB_RX_DDP_BUF1_LEN((u64)len << 32));
626
627 /* Update DDP flags */
628 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_RX_DDP_FLAGS,
629 ddp_flags_mask, ddp_flags);
630
631 /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
632 ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
633
634 return (wr);
635 }
636
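/*
 * Added note (not in the original source): process a DDP completion
 * (RX_DATA_DDP or RX_DDP_COMPLETE) for a connection using AIO DDP:
 * advance rcv_nxt, autosize the receive buffer if needed, and complete
 * (or update) the aio_read(2) request associated with the buffer.
 */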
637 static int
638 handle_ddp_data_aio(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
639 int len)
640 {
641 uint32_t report = be32toh(ddp_report);
642 unsigned int db_idx;
643 struct inpcb *inp = toep->inp;
644 struct ddp_buffer *db;
645 struct tcpcb *tp;
646 struct socket *so;
647 struct sockbuf *sb;
648 struct kaiocb *job;
649 long copied;
650
651 db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
652
653 if (__predict_false(!(report & F_DDP_INV)))
654 CXGBE_UNIMPLEMENTED("DDP buffer still valid");
655
656 INP_WLOCK(inp);
657 so = inp_inpcbtosocket(inp);
658 sb = &so->so_rcv;
659 DDP_LOCK(toep);
660
661 KASSERT(toep->ddp.active_id == db_idx,
662 ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
663 toep->ddp.active_id, toep->tid));
664 db = &toep->ddp.db[db_idx];
665 job = db->job;
666
667 if (__predict_false(inp->inp_flags & INP_DROPPED)) {
668 /*
669 * This can happen due to an administrative tcpdrop(8).
670 * Just fail the request with ECONNRESET.
671 */
672 CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
673 __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
674 if (aio_clear_cancel_function(job))
675 ddp_complete_one(job, ECONNRESET);
676 goto completed;
677 }
678
679 tp = intotcpcb(inp);
680
681 /*
682 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
683 * sequence number of the next byte to receive. The length of
684 * the data received for this message must be computed by
685 * comparing the new and old values of rcv_nxt.
686 *
687 * For RX_DATA_DDP, len might be non-zero, but it is only the
688 * length of the most recent DMA. It does not include the
689 * total length of the data received since the previous update
690 * for this DDP buffer. rcv_nxt is the sequence number of the
691 * first received byte from the most recent DMA.
692 */
693 len += be32toh(rcv_nxt) - tp->rcv_nxt;
694 tp->rcv_nxt += len;
695 tp->t_rcvtime = ticks;
696 #ifndef USE_DDP_RX_FLOW_CONTROL
697 KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
698 tp->rcv_wnd -= len;
699 #endif
700 #ifdef VERBOSE_TRACES
701 CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
702 toep->tid, db_idx, len, report);
703 #endif
704
705 /* receive buffer autosize */
706 MPASS(toep->vnet == so->so_vnet);
707 CURVNET_SET(toep->vnet);
708 SOCKBUF_LOCK(sb);
709 if (sb->sb_flags & SB_AUTOSIZE &&
710 V_tcp_do_autorcvbuf &&
711 sb->sb_hiwat < V_tcp_autorcvbuf_max &&
712 len > (sbspace(sb) / 8 * 7)) {
713 struct adapter *sc = td_adapter(toep->td);
714 unsigned int hiwat = sb->sb_hiwat;
715 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
716 V_tcp_autorcvbuf_max);
717
718 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
719 sb->sb_flags &= ~SB_AUTOSIZE;
720 }
721 SOCKBUF_UNLOCK(sb);
722 CURVNET_RESTORE();
723
724 job->msgrcv = 1;
725 toep->ofld_rxq->rx_aio_ddp_jobs++;
726 toep->ofld_rxq->rx_aio_ddp_octets += len;
727 if (db->cancel_pending) {
728 /*
729 * Update the job's length but defer completion to the
730 * TCB_RPL callback.
731 */
732 job->aio_received += len;
733 goto out;
734 } else if (!aio_clear_cancel_function(job)) {
735 /*
736 * Update the copied length for when
737 * t4_aio_cancel_active() completes this request.
738 */
739 job->aio_received += len;
740 } else {
741 copied = job->aio_received;
742 #ifdef VERBOSE_TRACES
743 CTR5(KTR_CXGBE,
744 "%s: tid %u, completing %p (copied %ld, placed %d)",
745 __func__, toep->tid, job, copied, len);
746 #endif
747 aio_complete(job, copied + len, 0);
748 t4_rcvd(&toep->td->tod, tp);
749 }
750
751 completed:
752 complete_ddp_buffer(toep, db, db_idx);
753 if (toep->ddp.waiting_count > 0)
754 ddp_queue_toep(toep);
755 out:
756 DDP_UNLOCK(toep);
757 INP_WUNLOCK(inp);
758
759 return (0);
760 }
761
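/*
 * Added note (not in the original source): program a free hardware DDP
 * buffer slot with the given receive buffer via a compound
 * SET_TCB_FIELD work request.  Returns false if the work request could
 * not be allocated.
 */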
762 static bool
763 queue_ddp_rcvbuf(struct toepcb *toep, struct ddp_rcv_buffer *drb)
764 {
765 struct adapter *sc = td_adapter(toep->td);
766 struct ddp_buffer *db;
767 struct wrqe *wr;
768 uint64_t ddp_flags, ddp_flags_mask;
769 int buf_flag, db_idx;
770
771 DDP_ASSERT_LOCKED(toep);
772
773 KASSERT((toep->ddp.flags & DDP_DEAD) == 0, ("%s: DDP_DEAD", __func__));
774 KASSERT(toep->ddp.active_count < nitems(toep->ddp.db),
775 ("%s: no empty DDP buffer slot", __func__));
776
777 /* Determine which DDP buffer to use. */
778 if (toep->ddp.db[0].drb == NULL) {
779 db_idx = 0;
780 } else {
781 MPASS(toep->ddp.db[1].drb == NULL);
782 db_idx = 1;
783 }
784
785 /*
786 * Permit PSH to trigger a partial completion without
787 * invalidating the rest of the buffer, but disable the PUSH
788 * timer.
789 */
790 ddp_flags = 0;
791 ddp_flags_mask = 0;
792 if (db_idx == 0) {
793 ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
794 V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_PSHF_ENABLE_0(1) |
795 V_TF_DDP_BUF0_VALID(1);
796 ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
797 V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
798 V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
799 buf_flag = DDP_BUF0_ACTIVE;
800 } else {
801 ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
802 V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_PSHF_ENABLE_1(1) |
803 V_TF_DDP_BUF1_VALID(1);
804 ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
805 V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
806 V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
807 buf_flag = DDP_BUF1_ACTIVE;
808 }
809 MPASS((toep->ddp.flags & buf_flag) == 0);
810 if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
811 MPASS(db_idx == 0);
812 MPASS(toep->ddp.active_id == -1);
813 MPASS(toep->ddp.active_count == 0);
814 ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
815 }
816
817 /*
818 * The TID for this connection should still be valid. If
819 * DDP_DEAD is set, SBS_CANTRCVMORE should be set, so we
820 * shouldn't be this far anyway.
821 */
822 wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &drb->prsv, 0, drb->len,
823 ddp_flags, ddp_flags_mask);
824 if (wr == NULL) {
825 recycle_ddp_rcv_buffer(toep, drb);
826 printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
827 return (false);
828 }
829
830 #ifdef VERBOSE_TRACES
831 CTR(KTR_CXGBE,
832 "%s: tid %u, scheduling DDP[%d] (flags %#lx/%#lx)", __func__,
833 toep->tid, db_idx, ddp_flags, ddp_flags_mask);
834 #endif
835 /*
836 * Hold a reference on scheduled buffers that is dropped in
837 * complete_ddp_buffer.
838 */
839 drb->refs = 1;
840
841 /* Give the chip the go-ahead. */
842 t4_wrq_tx(sc, wr);
843 db = &toep->ddp.db[db_idx];
844 db->drb = drb;
845 toep->ddp.flags |= buf_flag;
846 toep->ddp.active_count++;
847 if (toep->ddp.active_count == 1) {
848 MPASS(toep->ddp.active_id == -1);
849 toep->ddp.active_id = db_idx;
850 CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
851 toep->ddp.active_id);
852 }
853 return (true);
854 }
855
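/*
 * Added note (not in the original source): same as
 * handle_ddp_data_aio() but for the receive-buffer flavor of DDP:
 * placed data is appended to the socket buffer as zero-copy mbufs and
 * a replacement buffer is queued to the hardware.
 */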
856 static int
857 handle_ddp_data_rcvbuf(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
858 int len)
859 {
860 uint32_t report = be32toh(ddp_report);
861 struct inpcb *inp = toep->inp;
862 struct tcpcb *tp;
863 struct socket *so;
864 struct sockbuf *sb;
865 struct ddp_buffer *db;
866 struct ddp_rcv_buffer *drb;
867 unsigned int db_idx;
868 bool invalidated;
869
870 db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
871
872 invalidated = (report & F_DDP_INV) != 0;
873
874 INP_WLOCK(inp);
875 so = inp_inpcbtosocket(inp);
876 sb = &so->so_rcv;
877 DDP_LOCK(toep);
878
879 KASSERT(toep->ddp.active_id == db_idx,
880 ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
881 toep->ddp.active_id, toep->tid));
882 db = &toep->ddp.db[db_idx];
883
884 if (__predict_false(inp->inp_flags & INP_DROPPED)) {
885 /*
886 * This can happen due to an administrative tcpdrop(8).
887 * Just ignore the received data.
888 */
889 CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
890 __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
891 if (invalidated)
892 complete_ddp_buffer(toep, db, db_idx);
893 goto out;
894 }
895
896 tp = intotcpcb(inp);
897
898 /*
899 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
900 * sequence number of the next byte to receive. The length of
901 * the data received for this message must be computed by
902 * comparing the new and old values of rcv_nxt.
903 *
904 * For RX_DATA_DDP, len might be non-zero, but it is only the
905 * length of the most recent DMA. It does not include the
906 * total length of the data received since the previous update
907 * for this DDP buffer. rcv_nxt is the sequence number of the
908 * first received byte from the most recent DMA.
909 */
910 len += be32toh(rcv_nxt) - tp->rcv_nxt;
911 tp->rcv_nxt += len;
912 tp->t_rcvtime = ticks;
913 #ifndef USE_DDP_RX_FLOW_CONTROL
914 KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
915 tp->rcv_wnd -= len;
916 #endif
917 #ifdef VERBOSE_TRACES
918 CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
919 toep->tid, db_idx, len, report);
920 #endif
921
922 /* receive buffer autosize */
923 MPASS(toep->vnet == so->so_vnet);
924 CURVNET_SET(toep->vnet);
925 SOCKBUF_LOCK(sb);
926 if (sb->sb_flags & SB_AUTOSIZE &&
927 V_tcp_do_autorcvbuf &&
928 sb->sb_hiwat < V_tcp_autorcvbuf_max &&
929 len > (sbspace(sb) / 8 * 7)) {
930 struct adapter *sc = td_adapter(toep->td);
931 unsigned int hiwat = sb->sb_hiwat;
932 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
933 V_tcp_autorcvbuf_max);
934
935 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
936 sb->sb_flags &= ~SB_AUTOSIZE;
937 }
938
939 if (len > 0) {
940 queue_ddp_rcvbuf_mbuf(toep, db_idx, len);
941 t4_rcvd_locked(&toep->td->tod, tp);
942 }
943 sorwakeup_locked(so);
944 SOCKBUF_UNLOCK_ASSERT(sb);
945 CURVNET_RESTORE();
946
947 if (invalidated)
948 complete_ddp_buffer(toep, db, db_idx);
949 else
950 KASSERT(db->placed < db->drb->len,
951 ("%s: full DDP buffer not invalidated", __func__));
952
953 if (toep->ddp.active_count != nitems(toep->ddp.db)) {
954 drb = alloc_cached_ddp_rcv_buffer(toep);
955 if (drb == NULL)
956 drb = alloc_ddp_rcv_buffer(toep, M_NOWAIT);
957 if (drb == NULL)
958 ddp_queue_toep(toep);
959 else {
960 if (!queue_ddp_rcvbuf(toep, drb)) {
961 ddp_queue_toep(toep);
962 }
963 }
964 }
965 out:
966 DDP_UNLOCK(toep);
967 INP_WUNLOCK(inp);
968
969 return (0);
970 }
971
972 static int
973 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
974 {
975 if ((toep->ddp.flags & DDP_RCVBUF) != 0)
976 return (handle_ddp_data_rcvbuf(toep, ddp_report, rcv_nxt, len));
977 else
978 return (handle_ddp_data_aio(toep, ddp_report, rcv_nxt, len));
979 }
980
981 void
982 handle_ddp_indicate(struct toepcb *toep)
983 {
984
985 DDP_ASSERT_LOCKED(toep);
986 if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
987 /*
988 * Indicates are not meaningful for RCVBUF since
989 * buffers are activated when the socket option is
990 * set.
991 */
992 return;
993 }
994
995 MPASS(toep->ddp.active_count == 0);
996 MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
997 if (toep->ddp.waiting_count == 0) {
998 /*
999 * The pending requests that triggered the request for an
1000 * indicate were cancelled. Those cancels should have
1001 * already disabled DDP. Just ignore this as the data is
1002 * going into the socket buffer anyway.
1003 */
1004 return;
1005 }
1006 CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
1007 toep->tid, toep->ddp.waiting_count);
1008 ddp_queue_toep(toep);
1009 }
1010
1011 CTASSERT(CPL_COOKIE_DDP0 + 1 == CPL_COOKIE_DDP1);
1012
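/*
 * Added note (not in the original source): reply to a SET_TCB_FIELD
 * sent with a DDP cookie.  Used to finish cancellation of an active
 * AIO DDP buffer once the hardware has invalidated it.
 */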
1013 static int
1014 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1015 {
1016 struct adapter *sc = iq->adapter;
1017 const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
1018 unsigned int tid = GET_TID(cpl);
1019 unsigned int db_idx;
1020 struct toepcb *toep;
1021 struct inpcb *inp;
1022 struct ddp_buffer *db;
1023 struct kaiocb *job;
1024 long copied;
1025
1026 if (cpl->status != CPL_ERR_NONE)
1027 panic("XXX: tcp_rpl failed: %d", cpl->status);
1028
1029 toep = lookup_tid(sc, tid);
1030 inp = toep->inp;
1031 switch (cpl->cookie) {
1032 case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP0):
1033 case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP1):
1034 /*
1035 * XXX: This duplicates a lot of code with handle_ddp_data().
1036 */
1037 KASSERT((toep->ddp.flags & DDP_AIO) != 0,
1038 ("%s: DDP_RCVBUF", __func__));
1039 db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0;
1040 MPASS(db_idx < nitems(toep->ddp.db));
1041 INP_WLOCK(inp);
1042 DDP_LOCK(toep);
1043 db = &toep->ddp.db[db_idx];
1044
1045 /*
1046 * handle_ddp_data() should leave the job around until
1047 * this callback runs once a cancel is pending.
1048 */
1049 MPASS(db != NULL);
1050 MPASS(db->job != NULL);
1051 MPASS(db->cancel_pending);
1052
1053 /*
1054 * XXX: It's not clear what happens if there is data
1055 * placed when the buffer is invalidated. I suspect we
1056 * need to read the TCB to see how much data was placed.
1057 *
1058 * For now this just pretends like nothing was placed.
1059 *
1060 * XXX: Note that if we did check the PCB we would need to
1061 * also take care of updating the tp, etc.
1062 */
1063 job = db->job;
1064 copied = job->aio_received;
1065 if (copied == 0) {
1066 CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
1067 aio_cancel(job);
1068 } else {
1069 CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
1070 __func__, job, copied);
1071 aio_complete(job, copied, 0);
1072 t4_rcvd(&toep->td->tod, intotcpcb(inp));
1073 }
1074
1075 complete_ddp_buffer(toep, db, db_idx);
1076 if (toep->ddp.waiting_count > 0)
1077 ddp_queue_toep(toep);
1078 DDP_UNLOCK(toep);
1079 INP_WUNLOCK(inp);
1080 break;
1081 default:
1082 panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
1083 G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
1084 }
1085
1086 return (0);
1087 }
1088
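/*
 * Added note (not in the original source): the peer has closed the
 * connection.  Drain any data placed in the active DDP buffers and
 * then, for AIO DDP, complete the remaining queued requests (a request
 * with no data completes as an EOF read).
 */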
1089 void
1090 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
1091 {
1092 struct socket *so = toep->inp->inp_socket;
1093 struct sockbuf *sb = &so->so_rcv;
1094 struct ddp_buffer *db;
1095 struct kaiocb *job;
1096 long copied;
1097 unsigned int db_idx;
1098 #ifdef INVARIANTS
1099 unsigned int db_flag;
1100 #endif
1101 int len, placed;
1102 bool ddp_rcvbuf;
1103
1104 INP_WLOCK_ASSERT(toep->inp);
1105 DDP_ASSERT_LOCKED(toep);
1106
1107 ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
1108
1109 /* - 1 is to ignore the byte for FIN */
1110 len = be32toh(rcv_nxt) - tp->rcv_nxt - 1;
1111 tp->rcv_nxt += len;
1112
1113 CTR(KTR_CXGBE, "%s: tid %d placed %u bytes before FIN", __func__,
1114 toep->tid, len);
1115 while (toep->ddp.active_count > 0) {
1116 MPASS(toep->ddp.active_id != -1);
1117 db_idx = toep->ddp.active_id;
1118 #ifdef INVARIANTS
1119 db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
1120 #endif
1121 MPASS((toep->ddp.flags & db_flag) != 0);
1122 db = &toep->ddp.db[db_idx];
1123 if (ddp_rcvbuf) {
1124 placed = len;
1125 if (placed > db->drb->len - db->placed)
1126 placed = db->drb->len - db->placed;
1127 if (placed != 0) {
1128 SOCKBUF_LOCK(sb);
1129 queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
1130 sorwakeup_locked(so);
1131 SOCKBUF_UNLOCK_ASSERT(sb);
1132 }
1133 complete_ddp_buffer(toep, db, db_idx);
1134 len -= placed;
1135 continue;
1136 }
1137 job = db->job;
1138 copied = job->aio_received;
1139 placed = len;
1140 if (placed > job->uaiocb.aio_nbytes - copied)
1141 placed = job->uaiocb.aio_nbytes - copied;
1142 if (placed > 0) {
1143 job->msgrcv = 1;
1144 toep->ofld_rxq->rx_aio_ddp_jobs++;
1145 }
1146 toep->ofld_rxq->rx_aio_ddp_octets += placed;
1147 if (!aio_clear_cancel_function(job)) {
1148 /*
1149 * Update the copied length for when
1150 * t4_aio_cancel_active() completes this
1151 * request.
1152 */
1153 job->aio_received += placed;
1154 } else {
1155 CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
1156 __func__, toep->tid, db_idx, placed);
1157 aio_complete(job, copied + placed, 0);
1158 }
1159 len -= placed;
1160 complete_ddp_buffer(toep, db, db_idx);
1161 }
1162
1163 MPASS(len == 0);
1164 if ((toep->ddp.flags & DDP_AIO) != 0)
1165 ddp_complete_all(toep, 0);
1166 }
1167
1168 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
1169 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
1170 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
1171 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
1172
1173 extern cpl_handler_t t4_cpl_handler[];
1174
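/*
 * Added note (not in the original source): CPL_RX_DATA_DDP means the
 * chip placed payload directly into a DDP buffer.  iSCSI connections
 * hand this off to the iSCSI DDP handler.
 */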
1175 static int
1176 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1177 {
1178 struct adapter *sc = iq->adapter;
1179 const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
1180 unsigned int tid = GET_TID(cpl);
1181 uint32_t vld;
1182 struct toepcb *toep = lookup_tid(sc, tid);
1183
1184 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1185 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
1186 KASSERT(!(toep->flags & TPF_SYNQE),
1187 ("%s: toep %p claims to be a synq entry", __func__, toep));
1188
1189 vld = be32toh(cpl->ddpvld);
1190 if (__predict_false(vld & DDP_ERR)) {
1191 panic("%s: DDP error 0x%x (tid %d, toep %p)",
1192 __func__, vld, tid, toep);
1193 }
1194
1195 if (ulp_mode(toep) == ULP_MODE_ISCSI) {
1196 t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
1197 return (0);
1198 }
1199
1200 handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
1201
1202 return (0);
1203 }
1204
1205 static int
1206 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
1207 struct mbuf *m)
1208 {
1209 struct adapter *sc = iq->adapter;
1210 const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
1211 unsigned int tid = GET_TID(cpl);
1212 struct toepcb *toep = lookup_tid(sc, tid);
1213
1214 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1215 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
1216 KASSERT(!(toep->flags & TPF_SYNQE),
1217 ("%s: toep %p claims to be a synq entry", __func__, toep));
1218
1219 handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
1220
1221 return (0);
1222 }
1223
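/*
 * Added note (not in the original source): switch an established
 * connection to ULP_MODE_TCPDDP by rewriting its TCB: clear the DDP
 * overlay region (leaving DDP_OFF set), set the ULP type, and
 * optionally enable DDP receive flow control.
 */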
1224 static bool
1225 set_ddp_ulp_mode(struct toepcb *toep)
1226 {
1227 struct adapter *sc = toep->vi->adapter;
1228 struct wrqe *wr;
1229 struct work_request_hdr *wrh;
1230 struct ulp_txpkt *ulpmc;
1231 int fields, len;
1232
1233 if (!sc->tt.ddp)
1234 return (false);
1235
1236 fields = 0;
1237
1238 /* Overlay region including W_TCB_RX_DDP_FLAGS */
1239 fields += 3;
1240
1241 /* W_TCB_ULP_TYPE */
1242 fields++;
1243
1244 #ifdef USE_DDP_RX_FLOW_CONTROL
1245 /* W_TCB_T_FLAGS */
1246 fields++;
1247 #endif
1248
1249 len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
1250 KASSERT(len <= SGE_MAX_WR_LEN,
1251 ("%s: WR with %d TCB field updates too large", __func__, fields));
1252
1253 wr = alloc_wrqe(len, toep->ctrlq);
1254 if (wr == NULL)
1255 return (false);
1256
1257 CTR(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
1258
1259 wrh = wrtod(wr);
1260 INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */
1261 ulpmc = (struct ulp_txpkt *)(wrh + 1);
1262
1263 /*
1264 * Words 26/27 are zero except for the DDP_OFF flag in
1265 * W_TCB_RX_DDP_FLAGS (27).
1266 */
1267 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26,
1268 0xffffffffffffffff, (uint64_t)V_TF_DDP_OFF(1) << 32);
1269
1270 /* Words 28/29 are zero. */
1271 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 28,
1272 0xffffffffffffffff, 0);
1273
1274 /* Words 30/31 are zero. */
1275 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30,
1276 0xffffffffffffffff, 0);
1277
1278 /* Set the ULP mode to ULP_MODE_TCPDDP. */
1279 toep->params.ulp_mode = ULP_MODE_TCPDDP;
1280 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_TYPE,
1281 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_TCPDDP));
1282
1283 #ifdef USE_DDP_RX_FLOW_CONTROL
1284 /* Set TF_RX_FLOW_CONTROL_DDP. */
1285 ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_FLAGS,
1286 V_TF_RX_FLOW_CONTROL_DDP(1), V_TF_RX_FLOW_CONTROL_DDP(1));
1287 #endif
1288
1289 ddp_init_toep(toep);
1290
1291 t4_wrq_tx(sc, wr);
1292 return (true);
1293 }
1294
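/*
 * Added note (not in the original source): turn DDP on for a
 * connection by clearing DDP_OFF and related flags in the TCB and
 * disabling receive coalescing.  For AIO DDP the buffer indicate bits
 * are set so the hardware signals newly arriving data.
 */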
1295 static void
1296 enable_ddp(struct adapter *sc, struct toepcb *toep)
1297 {
1298 uint64_t ddp_flags;
1299
1300 KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
1301 ("%s: toep %p has bad ddp_flags 0x%x",
1302 __func__, toep, toep->ddp.flags));
1303
1304 CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
1305 __func__, toep->tid, time_uptime);
1306
1307 ddp_flags = 0;
1308 if ((toep->ddp.flags & DDP_AIO) != 0)
1309 ddp_flags |= V_TF_DDP_BUF0_INDICATE(1) |
1310 V_TF_DDP_BUF1_INDICATE(1);
1311 DDP_ASSERT_LOCKED(toep);
1312 toep->ddp.flags |= DDP_SC_REQ;
1313 t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
1314 V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
1315 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
1316 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), ddp_flags, 0, 0);
1317 t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
1318 V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
1319 }
1320
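/*
 * Added note (not in the original source): highest common factor (GCD)
 * of two lengths, via Euclid's algorithm.  Used to find the largest
 * DDP page size that evenly divides every physically contiguous
 * segment of a buffer.
 */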
1321 static int
1322 calculate_hcf(int n1, int n2)
1323 {
1324 int a, b, t;
1325
1326 if (n1 <= n2) {
1327 a = n1;
1328 b = n2;
1329 } else {
1330 a = n2;
1331 b = n1;
1332 }
1333
1334 while (a != 0) {
1335 t = a;
1336 a = b % a;
1337 b = t;
1338 }
1339
1340 return (b);
1341 }
1342
1343 static inline int
1344 pages_to_nppods(int npages, int ddp_page_shift)
1345 {
1346
1347 MPASS(ddp_page_shift >= PAGE_SHIFT);
1348
1349 return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
1350 }
1351
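/*
 * Added note (not in the original source): reserve 'nppods' page pods
 * from the region's vmem arena.  The resulting tag encodes both the
 * DDP page size index and the arena offset of the reservation.
 */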
1352 static int
1353 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
1354 struct ppod_reservation *prsv)
1355 {
1356 vmem_addr_t addr; /* relative to start of region */
1357
1358 if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
1359 &addr) != 0)
1360 return (ENOMEM);
1361
1362 #ifdef VERBOSE_TRACES
1363 CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
1364 __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
1365 nppods, 1 << pr->pr_page_shift[pgsz_idx]);
1366 #endif
1367
1368 /*
1369 * The hardware tagmask includes an extra invalid bit but the arena was
1370 * seeded with valid values only. An allocation out of this arena will
1371 * fit inside the tagmask but won't have the invalid bit set.
1372 */
1373 MPASS((addr & pr->pr_tag_mask) == addr);
1374 MPASS((addr & pr->pr_invalid_bit) == 0);
1375
1376 prsv->prsv_pr = pr;
1377 prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
1378 prsv->prsv_nppods = nppods;
1379
1380 return (0);
1381 }
1382
1383 static int
1384 t4_alloc_page_pods_for_vmpages(struct ppod_region *pr, vm_page_t *pages,
1385 int npages, struct ppod_reservation *prsv)
1386 {
1387 int i, hcf, seglen, idx, nppods;
1388
1389 /*
1390 * The DDP page size is unrelated to the VM page size. We combine
1391 * contiguous physical pages into larger segments to get the best DDP
1392 * page size possible. This is the largest of the four sizes in
1393 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
1394 * the page list.
1395 */
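/*
 * Added illustration (not in the original source): for example, if the
 * pages form physically contiguous runs of 64KB, 128KB and 32KB, the
 * HCF is 32KB and the largest configured DDP page size that divides
 * 32KB is selected.
 */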
1396 hcf = 0;
1397 for (i = 0; i < npages; i++) {
1398 seglen = PAGE_SIZE;
1399 while (i < npages - 1 &&
1400 VM_PAGE_TO_PHYS(pages[i]) + PAGE_SIZE ==
1401 VM_PAGE_TO_PHYS(pages[i + 1])) {
1402 seglen += PAGE_SIZE;
1403 i++;
1404 }
1405
1406 hcf = calculate_hcf(hcf, seglen);
1407 if (hcf < (1 << pr->pr_page_shift[1])) {
1408 idx = 0;
1409 goto have_pgsz; /* give up, short circuit */
1410 }
1411 }
1412
1413 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1414 MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1415 for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1416 if ((hcf & PR_PAGE_MASK(idx)) == 0)
1417 break;
1418 }
1419 #undef PR_PAGE_MASK
1420
1421 have_pgsz:
1422 MPASS(idx <= M_PPOD_PGSZ);
1423
1424 nppods = pages_to_nppods(npages, pr->pr_page_shift[idx]);
1425 if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1426 return (ENOMEM);
1427 MPASS(prsv->prsv_nppods > 0);
1428
1429 return (0);
1430 }
1431
1432 int
1433 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
1434 {
1435 struct ppod_reservation *prsv = &ps->prsv;
1436
1437 KASSERT(prsv->prsv_nppods == 0,
1438 ("%s: page pods already allocated", __func__));
1439
1440 return (t4_alloc_page_pods_for_vmpages(pr, ps->pages, ps->npages,
1441 prsv));
1442 }
1443
1444 int
1445 t4_alloc_page_pods_for_bio(struct ppod_region *pr, struct bio *bp,
1446 struct ppod_reservation *prsv)
1447 {
1448
1449 MPASS(bp->bio_flags & BIO_UNMAPPED);
1450
1451 return (t4_alloc_page_pods_for_vmpages(pr, bp->bio_ma, bp->bio_ma_n,
1452 prsv));
1453 }
1454
1455 int
1456 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
1457 struct ppod_reservation *prsv)
1458 {
1459 int hcf, seglen, idx, npages, nppods;
1460 uintptr_t start_pva, end_pva, pva, p1;
1461
1462 MPASS(buf > 0);
1463 MPASS(len > 0);
1464
1465 /*
1466 * The DDP page size is unrelated to the VM page size. We combine
1467 * contiguous physical pages into larger segments to get the best DDP
1468 * page size possible. This is the largest of the four sizes in
1469 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
1470 * in the page list.
1471 */
1472 hcf = 0;
1473 start_pva = trunc_page(buf);
1474 end_pva = trunc_page(buf + len - 1);
1475 pva = start_pva;
1476 while (pva <= end_pva) {
1477 seglen = PAGE_SIZE;
1478 p1 = pmap_kextract(pva);
1479 pva += PAGE_SIZE;
1480 while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
1481 seglen += PAGE_SIZE;
1482 pva += PAGE_SIZE;
1483 }
1484
1485 hcf = calculate_hcf(hcf, seglen);
1486 if (hcf < (1 << pr->pr_page_shift[1])) {
1487 idx = 0;
1488 goto have_pgsz; /* give up, short circuit */
1489 }
1490 }
1491
1492 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1493 MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1494 for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1495 if ((hcf & PR_PAGE_MASK(idx)) == 0)
1496 break;
1497 }
1498 #undef PR_PAGE_MASK
1499
1500 have_pgsz:
1501 MPASS(idx <= M_PPOD_PGSZ);
1502
1503 npages = 1;
1504 npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
1505 nppods = howmany(npages, PPOD_PAGES);
1506 if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1507 return (ENOMEM);
1508 MPASS(prsv->prsv_nppods > 0);
1509
1510 return (0);
1511 }
1512
1513 static int
1514 t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
1515 struct ddp_rcv_buffer *drb)
1516 {
1517 struct ppod_reservation *prsv = &drb->prsv;
1518
1519 KASSERT(prsv->prsv_nppods == 0,
1520 ("%s: page pods already allocated", __func__));
1521
1522 return (t4_alloc_page_pods_for_buf(pr, (vm_offset_t)drb->buf, drb->len,
1523 prsv));
1524 }
1525
1526 int
1527 t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl,
1528 int entries, struct ppod_reservation *prsv)
1529 {
1530 int hcf, seglen, idx = 0, npages, nppods, i, len;
1531 uintptr_t start_pva, end_pva, pva, p1;
1532 vm_offset_t buf;
1533 struct ctl_sg_entry *sge;
1534
1535 MPASS(entries > 0);
1536 MPASS(sgl);
1537
1538 /*
1539 * The DDP page size is unrelated to the VM page size. We combine
1540 * contiguous physical pages into larger segments to get the best DDP
1541 * page size possible. This is the largest of the four sizes in
1542 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
1543 * in the page list.
1544 */
1545 hcf = 0;
1546 for (i = entries - 1; i >= 0; i--) {
1547 sge = sgl + i;
1548 buf = (vm_offset_t)sge->addr;
1549 len = sge->len;
1550 start_pva = trunc_page(buf);
1551 end_pva = trunc_page(buf + len - 1);
1552 pva = start_pva;
1553 while (pva <= end_pva) {
1554 seglen = PAGE_SIZE;
1555 p1 = pmap_kextract(pva);
1556 pva += PAGE_SIZE;
1557 while (pva <= end_pva && p1 + seglen ==
1558 pmap_kextract(pva)) {
1559 seglen += PAGE_SIZE;
1560 pva += PAGE_SIZE;
1561 }
1562
1563 hcf = calculate_hcf(hcf, seglen);
1564 if (hcf < (1 << pr->pr_page_shift[1])) {
1565 idx = 0;
1566 goto have_pgsz; /* give up, short circuit */
1567 }
1568 }
1569 }
1570 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1571 MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1572 for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1573 if ((hcf & PR_PAGE_MASK(idx)) == 0)
1574 break;
1575 }
1576 #undef PR_PAGE_MASK
1577
1578 have_pgsz:
1579 MPASS(idx <= M_PPOD_PGSZ);
1580
1581 npages = 0;
1582 while (entries--) {
1583 npages++;
1584 start_pva = trunc_page((vm_offset_t)sgl->addr);
1585 end_pva = trunc_page((vm_offset_t)sgl->addr + sgl->len - 1);
1586 npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
1587 sgl = sgl + 1;
1588 }
1589 nppods = howmany(npages, PPOD_PAGES);
1590 if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1591 return (ENOMEM);
1592 MPASS(prsv->prsv_nppods > 0);
1593 return (0);
1594 }
1595
1596 void
1597 t4_free_page_pods(struct ppod_reservation *prsv)
1598 {
1599 struct ppod_region *pr = prsv->prsv_pr;
1600 vmem_addr_t addr;
1601
1602 MPASS(prsv != NULL);
1603 MPASS(prsv->prsv_nppods != 0);
1604
1605 addr = prsv->prsv_tag & pr->pr_tag_mask;
1606 MPASS((addr & pr->pr_invalid_bit) == 0);
1607
1608 #ifdef VERBOSE_TRACES
1609 CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
1610 pr->pr_arena, addr, prsv->prsv_nppods);
1611 #endif
1612
1613 vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
1614 prsv->prsv_nppods = 0;
1615 }
1616
1617 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
1618
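/*
 * Added note (not in the original source): write the page pods for a
 * pageset into adapter memory using ULP_TX_MEM_WRITE work requests,
 * NUM_ULP_TX_SC_IMM_PPODS pods per request.
 */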
1619 int
1620 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
1621 struct pageset *ps)
1622 {
1623 struct wrqe *wr;
1624 struct ulp_mem_io *ulpmc;
1625 struct ulptx_idata *ulpsc;
1626 struct pagepod *ppod;
1627 int i, j, k, n, chunk, len, ddp_pgsz, idx;
1628 u_int ppod_addr;
1629 uint32_t cmd;
1630 struct ppod_reservation *prsv = &ps->prsv;
1631 struct ppod_region *pr = prsv->prsv_pr;
1632 vm_paddr_t pa;
1633
1634 KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
1635 ("%s: page pods already written", __func__));
1636 MPASS(prsv->prsv_nppods > 0);
1637
1638 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1639 if (is_t4(sc))
1640 cmd |= htobe32(F_ULP_MEMIO_ORDER);
1641 else
1642 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1643 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1644 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1645 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1646 /* How many page pods are we writing in this cycle */
1647 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1648 chunk = PPOD_SZ(n);
1649 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1650
1651 wr = alloc_wrqe(len, wrq);
1652 if (wr == NULL)
1653 return (ENOMEM); /* ok to just bail out */
1654 ulpmc = wrtod(wr);
1655
1656 INIT_ULPTX_WR(ulpmc, len, 0, 0);
1657 ulpmc->cmd = cmd;
1658 if (chip_id(sc) >= CHELSIO_T7)
1659 ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
1660 else
1661 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
1662 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1663 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1664
1665 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1666 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1667 ulpsc->len = htobe32(chunk);
1668
1669 ppod = (struct pagepod *)(ulpsc + 1);
1670 for (j = 0; j < n; i++, j++, ppod++) {
1671 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1672 V_PPOD_TID(tid) | prsv->prsv_tag);
1673 ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
1674 V_PPOD_OFST(ps->offset));
1675 ppod->rsvd = 0;
1676 idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
1677 for (k = 0; k < nitems(ppod->addr); k++) {
1678 if (idx < ps->npages) {
1679 pa = VM_PAGE_TO_PHYS(ps->pages[idx]);
1680 ppod->addr[k] = htobe64(pa);
1681 idx += ddp_pgsz / PAGE_SIZE;
1682 } else
1683 ppod->addr[k] = 0;
1684 #if 0
1685 CTR5(KTR_CXGBE,
1686 "%s: tid %d ppod[%d]->addr[%d] = %p",
1687 __func__, tid, i, k,
1688 be64toh(ppod->addr[k]));
1689 #endif
1690 }
1691
1692 }
1693
1694 t4_wrq_tx(sc, wr);
1695 }
1696 ps->flags |= PS_PPODS_WRITTEN;
1697
1698 return (0);
1699 }
1700
1701 static int
1702 t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid,
1703 struct ddp_rcv_buffer *drb)
1704 {
1705 struct wrqe *wr;
1706 struct ulp_mem_io *ulpmc;
1707 struct ulptx_idata *ulpsc;
1708 struct pagepod *ppod;
1709 int i, j, k, n, chunk, len, ddp_pgsz;
1710 u_int ppod_addr, offset;
1711 uint32_t cmd;
1712 struct ppod_reservation *prsv = &drb->prsv;
1713 struct ppod_region *pr = prsv->prsv_pr;
1714 uintptr_t end_pva, pva;
1715 vm_paddr_t pa;
1716
1717 MPASS(prsv->prsv_nppods > 0);
1718
1719 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1720 if (is_t4(sc))
1721 cmd |= htobe32(F_ULP_MEMIO_ORDER);
1722 else
1723 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1724 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1725 offset = (uintptr_t)drb->buf & PAGE_MASK;
1726 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1727 pva = trunc_page((uintptr_t)drb->buf);
1728 end_pva = trunc_page((uintptr_t)drb->buf + drb->len - 1);
1729 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1730 /* How many page pods are we writing in this cycle */
1731 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1732 MPASS(n > 0);
1733 chunk = PPOD_SZ(n);
1734 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1735
1736 wr = alloc_wrqe(len, wrq);
1737 if (wr == NULL)
1738 return (ENOMEM); /* ok to just bail out */
1739 ulpmc = wrtod(wr);
1740
1741 INIT_ULPTX_WR(ulpmc, len, 0, 0);
1742 ulpmc->cmd = cmd;
1743 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
1744 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1745 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1746
1747 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1748 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1749 ulpsc->len = htobe32(chunk);
1750
1751 ppod = (struct pagepod *)(ulpsc + 1);
1752 for (j = 0; j < n; i++, j++, ppod++) {
1753 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1754 V_PPOD_TID(tid) | prsv->prsv_tag);
1755 ppod->len_offset = htobe64(V_PPOD_LEN(drb->len) |
1756 V_PPOD_OFST(offset));
1757 ppod->rsvd = 0;
1758
1759 for (k = 0; k < nitems(ppod->addr); k++) {
1760 if (pva > end_pva)
1761 ppod->addr[k] = 0;
1762 else {
1763 pa = pmap_kextract(pva);
1764 ppod->addr[k] = htobe64(pa);
1765 pva += ddp_pgsz;
1766 }
1767 #if 0
1768 CTR5(KTR_CXGBE,
1769 "%s: tid %d ppod[%d]->addr[%d] = %p",
1770 __func__, tid, i, k,
1771 be64toh(ppod->addr[k]));
1772 #endif
1773 }
1774
1775 /*
1776 * Walk back 1 segment so that the first address in the
1777 * next pod is the same as the last one in the current
1778 * pod.
1779 */
1780 pva -= ddp_pgsz;
1781 }
1782
1783 t4_wrq_tx(sc, wr);
1784 }
1785
1786 MPASS(pva <= end_pva);
1787
1788 return (0);
1789 }
1790
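/*
 * Allocate an mbuf large enough to hold a raw work request of 'len'
 * bytes.  Requests larger than a single cluster (MCLBYTES) are not
 * supported and yield NULL.
 */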
1791 struct mbuf *
1792 alloc_raw_wr_mbuf(int len)
1793 {
1794 struct mbuf *m;
1795
1796 if (len <= MHLEN)
1797 m = m_gethdr(M_NOWAIT, MT_DATA);
1798 else if (len <= MCLBYTES)
1799 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1800 else
1801 m = NULL;
1802 if (m == NULL)
1803 return (NULL);
1804 m->m_pkthdr.len = len;
1805 m->m_len = len;
1806 set_mbuf_raw_wr(m, true);
1807 return (m);
1808 }
1809
1810 int
1811 t4_write_page_pods_for_bio(struct adapter *sc, struct toepcb *toep,
1812 struct ppod_reservation *prsv, struct bio *bp, struct mbufq *wrq)
1813 {
1814 struct ulp_mem_io *ulpmc;
1815 struct ulptx_idata *ulpsc;
1816 struct pagepod *ppod;
1817 int i, j, k, n, chunk, len, ddp_pgsz, idx;
1818 u_int ppod_addr;
1819 uint32_t cmd;
1820 struct ppod_region *pr = prsv->prsv_pr;
1821 vm_paddr_t pa;
1822 struct mbuf *m;
1823
1824 MPASS(bp->bio_flags & BIO_UNMAPPED);
1825
1826 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1827 if (is_t4(sc))
1828 cmd |= htobe32(F_ULP_MEMIO_ORDER);
1829 else
1830 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1831 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1832 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
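/*
 * 'idx' indexes bp->bio_ma in units of system pages.  Each page pod
 * address covers one DDP page, so the index advances by
 * ddp_pgsz / PAGE_SIZE per entry; for example, with 4KB system pages
 * and a 64KB DDP page size every 16th vm_page is recorded.
 */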
1833 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1834
1835 /* How many page pods are we writing in this cycle */
1836 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1837 MPASS(n > 0);
1838 chunk = PPOD_SZ(n);
1839 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1840
1841 m = alloc_raw_wr_mbuf(len);
1842 if (m == NULL)
1843 return (ENOMEM);
1844
1845 ulpmc = mtod(m, struct ulp_mem_io *);
1846 INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
1847 ulpmc->cmd = cmd;
1848 if (chip_id(sc) >= CHELSIO_T7)
1849 ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
1850 else
1851 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
1852 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1853 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1854
1855 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1856 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1857 ulpsc->len = htobe32(chunk);
1858
1859 ppod = (struct pagepod *)(ulpsc + 1);
1860 for (j = 0; j < n; i++, j++, ppod++) {
1861 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1862 V_PPOD_TID(toep->tid) |
1863 (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
1864 ppod->len_offset = htobe64(V_PPOD_LEN(bp->bio_bcount) |
1865 V_PPOD_OFST(bp->bio_ma_offset));
1866 ppod->rsvd = 0;
1867 idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
1868 for (k = 0; k < nitems(ppod->addr); k++) {
1869 if (idx < bp->bio_ma_n) {
1870 pa = VM_PAGE_TO_PHYS(bp->bio_ma[idx]);
1871 ppod->addr[k] = htobe64(pa);
1872 idx += ddp_pgsz / PAGE_SIZE;
1873 } else
1874 ppod->addr[k] = 0;
1875 #if 0
1876 CTR5(KTR_CXGBE,
1877 "%s: tid %d ppod[%d]->addr[%d] = %p",
1878 __func__, toep->tid, i, k,
1879 be64toh(ppod->addr[k]));
1880 #endif
1881 }
1882 }
1883
1884 mbufq_enqueue(wrq, m);
1885 }
1886
1887 return (0);
1888 }
1889
1890 int
1891 t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
1892 struct ppod_reservation *prsv, vm_offset_t buf, int buflen,
1893 struct mbufq *wrq)
1894 {
1895 struct ulp_mem_io *ulpmc;
1896 struct ulptx_idata *ulpsc;
1897 struct pagepod *ppod;
1898 int i, j, k, n, chunk, len, ddp_pgsz;
1899 u_int ppod_addr, offset;
1900 uint32_t cmd;
1901 struct ppod_region *pr = prsv->prsv_pr;
1902 uintptr_t end_pva, pva;
1903 vm_paddr_t pa;
1904 struct mbuf *m;
1905
1906 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1907 if (is_t4(sc))
1908 cmd |= htobe32(F_ULP_MEMIO_ORDER);
1909 else
1910 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1911 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1912 offset = buf & PAGE_MASK;
1913 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1914 pva = trunc_page(buf);
1915 end_pva = trunc_page(buf + buflen - 1);
1916 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1917
1918 /* How many page pods are we writing in this cycle */
1919 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1920 MPASS(n > 0);
1921 chunk = PPOD_SZ(n);
1922 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1923
1924 m = alloc_raw_wr_mbuf(len);
1925 if (m == NULL)
1926 return (ENOMEM);
1927 ulpmc = mtod(m, struct ulp_mem_io *);
1928
1929 INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
1930 ulpmc->cmd = cmd;
1931 if (chip_id(sc) >= CHELSIO_T7)
1932 ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
1933 else
1934 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
1935 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1936 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1937
1938 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1939 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1940 ulpsc->len = htobe32(chunk);
1941
1942 ppod = (struct pagepod *)(ulpsc + 1);
1943 for (j = 0; j < n; i++, j++, ppod++) {
1944 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1945 V_PPOD_TID(toep->tid) |
1946 (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
1947 ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
1948 V_PPOD_OFST(offset));
1949 ppod->rsvd = 0;
1950
1951 for (k = 0; k < nitems(ppod->addr); k++) {
1952 if (pva > end_pva)
1953 ppod->addr[k] = 0;
1954 else {
1955 pa = pmap_kextract(pva);
1956 ppod->addr[k] = htobe64(pa);
1957 pva += ddp_pgsz;
1958 }
1959 #if 0
1960 CTR5(KTR_CXGBE,
1961 "%s: tid %d ppod[%d]->addr[%d] = %p",
1962 __func__, toep->tid, i, k,
1963 be64toh(ppod->addr[k]));
1964 #endif
1965 }
1966
1967 /*
1968 * Walk back 1 segment so that the first address in the
1969 * next pod is the same as the last one in the current
1970 * pod.
1971 */
1972 pva -= ddp_pgsz;
1973 }
1974
1975 mbufq_enqueue(wrq, m);
1976 }
1977
1978 MPASS(pva <= end_pva);
1979
1980 return (0);
1981 }
1982
1983 int
1984 t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
1985 struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries,
1986 int xferlen, struct mbufq *wrq)
1987 {
1988 struct ulp_mem_io *ulpmc;
1989 struct ulptx_idata *ulpsc;
1990 struct pagepod *ppod;
1991 int i, j, k, n, chunk, len, ddp_pgsz;
1992 u_int ppod_addr, offset, sg_offset = 0;
1993 uint32_t cmd;
1994 struct ppod_region *pr = prsv->prsv_pr;
1995 uintptr_t pva;
1996 vm_paddr_t pa;
1997 struct mbuf *m;
1998
1999 MPASS(sgl != NULL);
2000 MPASS(entries > 0);
2001 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
2002 if (is_t4(sc))
2003 cmd |= htobe32(F_ULP_MEMIO_ORDER);
2004 else
2005 cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
2006 ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
2007 offset = (vm_offset_t)sgl->addr & PAGE_MASK;
2008 ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
2009 pva = trunc_page((vm_offset_t)sgl->addr);
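/*
 * Walk the scatter/gather list one DDP page at a time.  'sg_offset'
 * tracks the position within the current entry and the next entry is
 * started once sg_offset reaches sgl->len.  As with the other
 * variants, the last address in a pod is repeated as the first
 * address of the next pod.
 */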
2010 for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
2011
2012 /* How many page pods are we writing in this cycle */
2013 n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
2014 MPASS(n > 0);
2015 chunk = PPOD_SZ(n);
2016 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
2017
2018 m = alloc_raw_wr_mbuf(len);
2019 if (m == NULL)
2020 return (ENOMEM);
2021 ulpmc = mtod(m, struct ulp_mem_io *);
2022
2023 INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
2024 ulpmc->cmd = cmd;
2025 if (chip_id(sc) >= CHELSIO_T7)
2026 ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
2027 else
2028 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
2029 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
2030 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
2031
2032 ulpsc = (struct ulptx_idata *)(ulpmc + 1);
2033 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
2034 ulpsc->len = htobe32(chunk);
2035
2036 ppod = (struct pagepod *)(ulpsc + 1);
2037 for (j = 0; j < n; i++, j++, ppod++) {
2038 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
2039 V_PPOD_TID(toep->tid) |
2040 (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
2041 ppod->len_offset = htobe64(V_PPOD_LEN(xferlen) |
2042 V_PPOD_OFST(offset));
2043 ppod->rsvd = 0;
2044
2045 for (k = 0; k < nitems(ppod->addr); k++) {
2046 if (entries != 0) {
2047 pa = pmap_kextract(pva + sg_offset);
2048 ppod->addr[k] = htobe64(pa);
2049 } else
2050 ppod->addr[k] = 0;
2051
2052 #if 0
2053 CTR5(KTR_CXGBE,
2054 "%s: tid %d ppod[%d]->addr[%d] = %p",
2055 __func__, toep->tid, i, k,
2056 be64toh(ppod->addr[k]));
2057 #endif
2058
2059 /*
2060 * If this is the last entry in a pod,
2061 * reuse the same entry for first address
2062 * in the next pod.
2063 */
2064 if (k + 1 == nitems(ppod->addr))
2065 break;
2066
2067 /*
2068 * Don't move to the next DDP page if the
2069 * sgl is already finished.
2070 */
2071 if (entries == 0)
2072 continue;
2073
2074 sg_offset += ddp_pgsz;
2075 if (sg_offset == sgl->len) {
2076 /*
2077 * This sgl entry is done. Go
2078 * to the next.
2079 */
2080 entries--;
2081 sgl++;
2082 sg_offset = 0;
2083 if (entries != 0)
2084 pva = trunc_page(
2085 (vm_offset_t)sgl->addr);
2086 }
2087 }
2088 }
2089
2090 mbufq_enqueue(wrq, m);
2091 }
2092
2093 return (0);
2094 }
2095
2096 /*
2097  * Prepare a pageset for DDP.  This sets up page pods.  Returns 1 on success and 0 if the page pods could not be allocated or written.
2098 */
2099 static int
2100 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
2101 {
2102 struct tom_data *td = sc->tom_softc;
2103
2104 if (ps->prsv.prsv_nppods == 0 &&
2105 t4_alloc_page_pods_for_ps(&td->pr, ps) != 0) {
2106 return (0);
2107 }
2108 if (!(ps->flags & PS_PPODS_WRITTEN) &&
2109 t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
2110 return (0);
2111 }
2112
2113 return (1);
2114 }
2115
2116 int
2117 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
2118 const char *name)
2119 {
2120 int i;
2121
2122 MPASS(pr != NULL);
2123 MPASS(r->size > 0);
2124
2125 pr->pr_start = r->start;
2126 pr->pr_len = r->size;
2127 pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
2128 pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
2129 pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
2130 pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
2131
2132 /* The SGL -> page pod algorithm requires the sizes to be in order. */
2133 for (i = 1; i < nitems(pr->pr_page_shift); i++) {
2134 if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
2135 return (ENXIO);
2136 }
2137
2138 pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
2139 pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
2140 if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
2141 return (ENXIO);
2142 pr->pr_alias_shift = fls(pr->pr_tag_mask);
2143 pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
2144
2145 pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
2146 M_FIRSTFIT | M_NOWAIT);
2147 if (pr->pr_arena == NULL)
2148 return (ENOMEM);
2149
2150 return (0);
2151 }
2152
2153 void
2154 t4_free_ppod_region(struct ppod_region *pr)
2155 {
2156
2157 MPASS(pr != NULL);
2158
2159 if (pr->pr_arena)
2160 vmem_destroy(pr->pr_arena);
2161 bzero(pr, sizeof(*pr));
2162 }
2163
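/*
 * Returns 0 if the cached pageset matches a request for the same
 * range in the same vmspace and the address space has not changed
 * since the pages were wired; non-zero otherwise.
 */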
2164 static int
2165 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
2166 int pgoff, int len)
2167 {
2168
2169 if (ps->start != start || ps->npages != npages ||
2170 ps->offset != pgoff || ps->len != len)
2171 return (1);
2172
2173 return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
2174 }
2175
2176 static int
2177 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
2178 {
2179 struct vmspace *vm;
2180 vm_map_t map;
2181 vm_offset_t start, end, pgoff;
2182 struct pageset *ps;
2183 int n;
2184
2185 DDP_ASSERT_LOCKED(toep);
2186
2187 /*
2188 * The AIO subsystem will cancel and drain all requests before
2189 * permitting a process to exit or exec, so p_vmspace should
2190 * be stable here.
2191 */
2192 vm = job->userproc->p_vmspace;
2193 map = &vm->vm_map;
2194 start = (uintptr_t)job->uaiocb.aio_buf;
2195 pgoff = start & PAGE_MASK;
2196 end = round_page(start + job->uaiocb.aio_nbytes);
2197 start = trunc_page(start);
2198
2199 if (end - start > MAX_DDP_BUFFER_SIZE) {
2200 /*
2201 * Truncate the request to a short read.
2202 * Alternatively, we could DDP in chunks to the larger
2203 * buffer, but that would be quite a bit more work.
2204 *
2205 * When truncating, round the request down to avoid
2206 * crossing a cache line on the final transaction.
2207 */
2208 end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
2209 #ifdef VERBOSE_TRACES
2210 CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
2211 __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
2212 (unsigned long)(end - (start + pgoff)));
2213 #endif
2214 job->uaiocb.aio_nbytes = end - (start + pgoff);
2215 end = round_page(end);
2216 }
2217
2218 n = atop(end - start);
2219
2220 /*
2221 * Try to reuse a cached pageset.
2222 */
2223 TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
2224 if (pscmp(ps, vm, start, n, pgoff,
2225 job->uaiocb.aio_nbytes) == 0) {
2226 TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
2227 toep->ddp.cached_count--;
2228 *pps = ps;
2229 return (0);
2230 }
2231 }
2232
2233 /*
2234 * If there are too many cached pagesets to create a new one,
2235 * free a pageset before creating a new one.
2236 */
2237 KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
2238 nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
2239 if (toep->ddp.active_count + toep->ddp.cached_count ==
2240 nitems(toep->ddp.db)) {
2241 KASSERT(toep->ddp.cached_count > 0,
2242 ("no cached pageset to free"));
2243 ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
2244 TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
2245 toep->ddp.cached_count--;
2246 free_pageset(toep->td, ps);
2247 }
2248 DDP_UNLOCK(toep);
2249
2250 /* Create a new pageset. */
2251 ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
2252 M_ZERO);
2253 ps->pages = (vm_page_t *)(ps + 1);
2254 ps->vm_timestamp = map->timestamp;
2255 ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
2256 VM_PROT_WRITE, ps->pages, n);
2257
2258 DDP_LOCK(toep);
2259 if (ps->npages < 0) {
2260 free(ps, M_CXGBE);
2261 return (EFAULT);
2262 }
2263
2264 KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
2265 ps->npages, n));
2266
2267 ps->offset = pgoff;
2268 ps->len = job->uaiocb.aio_nbytes;
2269 refcount_acquire(&vm->vm_refcnt);
2270 ps->vm = vm;
2271 ps->start = start;
2272
2273 CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
2274 __func__, toep->tid, ps, job, ps->npages);
2275 *pps = ps;
2276 return (0);
2277 }
2278
2279 static void
2280 ddp_complete_all(struct toepcb *toep, int error)
2281 {
2282 struct kaiocb *job;
2283
2284 DDP_ASSERT_LOCKED(toep);
2285 KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__));
2286 while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
2287 job = TAILQ_FIRST(&toep->ddp.aiojobq);
2288 TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2289 toep->ddp.waiting_count--;
2290 if (aio_clear_cancel_function(job))
2291 ddp_complete_one(job, error);
2292 }
2293 }
2294
2295 static void
2296 aio_ddp_cancel_one(struct kaiocb *job)
2297 {
2298 long copied;
2299
2300 /*
2301 * If this job had copied data out of the socket buffer before
2302 * it was cancelled, report it as a short read rather than an
2303 * error.
2304 */
2305 copied = job->aio_received;
2306 if (copied != 0)
2307 aio_complete(job, copied, 0);
2308 else
2309 aio_cancel(job);
2310 }
2311
2312 /*
2313 * Called when the main loop wants to requeue a job to retry it later.
2314 * Deals with the race of the job being cancelled while it was being
2315 * examined.
2316 */
2317 static void
2318 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
2319 {
2320
2321 DDP_ASSERT_LOCKED(toep);
2322 if (!(toep->ddp.flags & DDP_DEAD) &&
2323 aio_set_cancel_function(job, t4_aio_cancel_queued)) {
2324 TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
2325 toep->ddp.waiting_count++;
2326 } else
2327 aio_ddp_cancel_one(job);
2328 }
2329
2330 static void
2331 aio_ddp_requeue(struct toepcb *toep)
2332 {
2333 struct adapter *sc = td_adapter(toep->td);
2334 struct socket *so;
2335 struct sockbuf *sb;
2336 struct inpcb *inp;
2337 struct kaiocb *job;
2338 struct ddp_buffer *db;
2339 size_t copied, offset, resid;
2340 struct pageset *ps;
2341 struct mbuf *m;
2342 uint64_t ddp_flags, ddp_flags_mask;
2343 struct wrqe *wr;
2344 int buf_flag, db_idx, error;
2345
2346 DDP_ASSERT_LOCKED(toep);
2347
2348 restart:
2349 if (toep->ddp.flags & DDP_DEAD) {
2350 MPASS(toep->ddp.waiting_count == 0);
2351 MPASS(toep->ddp.active_count == 0);
2352 return;
2353 }
2354
2355 if (toep->ddp.waiting_count == 0 ||
2356 toep->ddp.active_count == nitems(toep->ddp.db)) {
2357 return;
2358 }
2359
2360 job = TAILQ_FIRST(&toep->ddp.aiojobq);
2361 so = job->fd_file->f_data;
2362 sb = &so->so_rcv;
2363 SOCKBUF_LOCK(sb);
2364
2365 /* We will never get anything unless we are or were connected. */
2366 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2367 SOCKBUF_UNLOCK(sb);
2368 ddp_complete_all(toep, ENOTCONN);
2369 return;
2370 }
2371
2372 KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
2373 ("%s: pending sockbuf data and DDP is active", __func__));
2374
2375 /* Abort if socket has reported problems. */
2376 /* XXX: Wait for any queued DDP's to finish and/or flush them? */
2377 if (so->so_error && sbavail(sb) == 0) {
2378 toep->ddp.waiting_count--;
2379 TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2380 if (!aio_clear_cancel_function(job)) {
2381 SOCKBUF_UNLOCK(sb);
2382 goto restart;
2383 }
2384
2385 /*
2386 * If this job has previously copied some data, report
2387 * a short read and leave the error to be reported by
2388 * a future request.
2389 */
2390 copied = job->aio_received;
2391 if (copied != 0) {
2392 SOCKBUF_UNLOCK(sb);
2393 aio_complete(job, copied, 0);
2394 goto restart;
2395 }
2396 error = so->so_error;
2397 so->so_error = 0;
2398 SOCKBUF_UNLOCK(sb);
2399 aio_complete(job, -1, error);
2400 goto restart;
2401 }
2402
2403 /*
2404 * Door is closed. If there is pending data in the socket buffer,
2405 * deliver it. If there are pending DDP requests, wait for those
2406 * to complete. Once they have completed, return EOF reads.
2407 */
2408 if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
2409 SOCKBUF_UNLOCK(sb);
2410 if (toep->ddp.active_count != 0)
2411 return;
2412 ddp_complete_all(toep, 0);
2413 return;
2414 }
2415
2416 /*
2417 * If DDP is not enabled and there is no pending socket buffer
2418 * data, try to enable DDP.
2419 */
2420 if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
2421 SOCKBUF_UNLOCK(sb);
2422
2423 /*
2424 * Wait for the card to ACK that DDP is enabled before
2425 * queueing any buffers. Currently this waits for an
2426 * indicate to arrive. This could use a TCB_SET_FIELD_RPL
2427 * message to know that DDP was enabled instead of waiting
2428 * for the indicate which would avoid copying the indicate
2429 * if no data is pending.
2430 *
2431 * XXX: Might want to limit the indicate size to the size
2432 * of the first queued request.
2433 */
2434 if ((toep->ddp.flags & DDP_SC_REQ) == 0)
2435 enable_ddp(sc, toep);
2436 return;
2437 }
2438 SOCKBUF_UNLOCK(sb);
2439
2440 /*
2441 * If another thread is queueing a buffer for DDP, let it
2442 * drain any work and return.
2443 */
2444 if (toep->ddp.queueing != NULL)
2445 return;
2446
2447 /* Take the next job to prep it for DDP. */
2448 toep->ddp.waiting_count--;
2449 TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2450 if (!aio_clear_cancel_function(job))
2451 goto restart;
2452 toep->ddp.queueing = job;
2453
2454 /* NB: This drops DDP_LOCK while it holds the backing VM pages. */
2455 error = hold_aio(toep, job, &ps);
2456 if (error != 0) {
2457 ddp_complete_one(job, error);
2458 toep->ddp.queueing = NULL;
2459 goto restart;
2460 }
2461
2462 SOCKBUF_LOCK(sb);
2463 if (so->so_error && sbavail(sb) == 0) {
2464 copied = job->aio_received;
2465 if (copied != 0) {
2466 SOCKBUF_UNLOCK(sb);
2467 recycle_pageset(toep, ps);
2468 aio_complete(job, copied, 0);
2469 toep->ddp.queueing = NULL;
2470 goto restart;
2471 }
2472
2473 error = so->so_error;
2474 so->so_error = 0;
2475 SOCKBUF_UNLOCK(sb);
2476 recycle_pageset(toep, ps);
2477 aio_complete(job, -1, error);
2478 toep->ddp.queueing = NULL;
2479 goto restart;
2480 }
2481
2482 if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
2483 SOCKBUF_UNLOCK(sb);
2484 recycle_pageset(toep, ps);
2485 if (toep->ddp.active_count != 0) {
2486 /*
2487 * The door is closed, but there are still pending
2488 * DDP buffers. Requeue. These jobs will all be
2489 * completed once those buffers drain.
2490 */
2491 aio_ddp_requeue_one(toep, job);
2492 toep->ddp.queueing = NULL;
2493 return;
2494 }
2495 ddp_complete_one(job, 0);
2496 ddp_complete_all(toep, 0);
2497 toep->ddp.queueing = NULL;
2498 return;
2499 }
2500
2501 sbcopy:
2502 /*
2503 * If the toep is dead, there shouldn't be any data in the socket
2504 * buffer, so the above case should have handled this.
2505 */
2506 MPASS(!(toep->ddp.flags & DDP_DEAD));
2507
2508 /*
2509 * If there is pending data in the socket buffer (either
2510 * from before the requests were queued or a DDP indicate),
2511 * copy those mbufs out directly.
2512 */
2513 copied = 0;
2514 offset = ps->offset + job->aio_received;
2515 MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
2516 resid = job->uaiocb.aio_nbytes - job->aio_received;
2517 m = sb->sb_mb;
2518 KASSERT(m == NULL || toep->ddp.active_count == 0,
2519 ("%s: sockbuf data with active DDP", __func__));
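/*
 * Copy each mbuf into the wired user pages using a one-segment
 * kernel uio; uiomove_fromphys() writes directly into the physical
 * pages held by the pageset.
 */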
2520 while (m != NULL && resid > 0) {
2521 struct iovec iov[1];
2522 struct uio uio;
2523 #ifdef INVARIANTS
2524 int error;
2525 #endif
2526
2527 iov[0].iov_base = mtod(m, void *);
2528 iov[0].iov_len = m->m_len;
2529 if (iov[0].iov_len > resid)
2530 iov[0].iov_len = resid;
2531 uio.uio_iov = iov;
2532 uio.uio_iovcnt = 1;
2533 uio.uio_offset = 0;
2534 uio.uio_resid = iov[0].iov_len;
2535 uio.uio_segflg = UIO_SYSSPACE;
2536 uio.uio_rw = UIO_WRITE;
2537 #ifdef INVARIANTS
2538 error = uiomove_fromphys(ps->pages, offset + copied,
2539 uio.uio_resid, &uio);
2540 #else
2541 uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio);
2542 #endif
2543 MPASS(error == 0 && uio.uio_resid == 0);
2544 copied += uio.uio_offset;
2545 resid -= uio.uio_offset;
2546 m = m->m_next;
2547 }
2548 if (copied != 0) {
2549 sbdrop_locked(sb, copied);
2550 job->aio_received += copied;
2551 job->msgrcv = 1;
2552 copied = job->aio_received;
2553 inp = sotoinpcb(so);
2554 if (!INP_TRY_WLOCK(inp)) {
2555 /*
2556 * The reference on the socket file descriptor in
2557 * the AIO job should keep 'sb' and 'inp' stable.
2558 * Our caller has a reference on the 'toep' that
2559 * keeps it stable.
2560 */
2561 SOCKBUF_UNLOCK(sb);
2562 DDP_UNLOCK(toep);
2563 INP_WLOCK(inp);
2564 DDP_LOCK(toep);
2565 SOCKBUF_LOCK(sb);
2566
2567 /*
2568 * If the socket has been closed, we should detect
2569 * that and complete this request if needed on
2570 * the next trip around the loop.
2571 */
2572 }
2573 t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
2574 INP_WUNLOCK(inp);
2575 if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
2576 /*
2577 * We filled the entire buffer with socket
2578 * data, DDP is not being used, or the socket
2579 * is being shut down, so complete the
2580 * request.
2581 */
2582 SOCKBUF_UNLOCK(sb);
2583 recycle_pageset(toep, ps);
2584 aio_complete(job, copied, 0);
2585 toep->ddp.queueing = NULL;
2586 goto restart;
2587 }
2588
2589 /*
2590 * If DDP is not enabled, requeue this request and restart.
2591 * This will either enable DDP or wait for more data to
2592 * arrive on the socket buffer.
2593 */
2594 if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
2595 SOCKBUF_UNLOCK(sb);
2596 recycle_pageset(toep, ps);
2597 aio_ddp_requeue_one(toep, job);
2598 toep->ddp.queueing = NULL;
2599 goto restart;
2600 }
2601
2602 /*
2603 * An indicate might have arrived and been added to
2604 * the socket buffer while it was unlocked after the
2605 * copy to lock the INP. If so, restart the copy.
2606 */
2607 if (sbavail(sb) != 0)
2608 goto sbcopy;
2609 }
2610 SOCKBUF_UNLOCK(sb);
2611
2612 if (prep_pageset(sc, toep, ps) == 0) {
2613 recycle_pageset(toep, ps);
2614 aio_ddp_requeue_one(toep, job);
2615 toep->ddp.queueing = NULL;
2616
2617 /*
2618 * XXX: Need to retry this later. Mostly need a trigger
2619 * when page pods are freed up.
2620 */
2621 printf("%s: prep_pageset failed\n", __func__);
2622 return;
2623 }
2624
2625 /* Determine which DDP buffer to use. */
2626 if (toep->ddp.db[0].job == NULL) {
2627 db_idx = 0;
2628 } else {
2629 MPASS(toep->ddp.db[1].job == NULL);
2630 db_idx = 1;
2631 }
2632
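/*
 * Build the value/mask pair passed to mk_update_tcb_for_ddp():
 * 'ddp_flags' marks the chosen buffer valid (and requests a flush
 * for non-blocking sockets), while 'ddp_flags_mask' selects which
 * DDP flag bits in the TCB the update may change.
 */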
2633 ddp_flags = 0;
2634 ddp_flags_mask = 0;
2635 if (db_idx == 0) {
2636 ddp_flags |= V_TF_DDP_BUF0_VALID(1);
2637 if (so->so_state & SS_NBIO)
2638 ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
2639 ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
2640 V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
2641 V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
2642 buf_flag = DDP_BUF0_ACTIVE;
2643 } else {
2644 ddp_flags |= V_TF_DDP_BUF1_VALID(1);
2645 if (so->so_state & SS_NBIO)
2646 ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
2647 ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
2648 V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
2649 V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
2650 buf_flag = DDP_BUF1_ACTIVE;
2651 }
2652 MPASS((toep->ddp.flags & buf_flag) == 0);
2653 if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
2654 MPASS(db_idx == 0);
2655 MPASS(toep->ddp.active_id == -1);
2656 MPASS(toep->ddp.active_count == 0);
2657 ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
2658 }
2659
2660 /*
2661 * The TID for this connection should still be valid. If DDP_DEAD
2662 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
2663 * this far anyway. Even if the socket is closing on the other
2664 * end, the AIO job holds a reference on this end of the socket
2665 * which will keep it open and keep the TCP PCB attached until
2666 * after the job is completed.
2667 */
2668 wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &ps->prsv,
2669 job->aio_received, ps->len, ddp_flags, ddp_flags_mask);
2670 if (wr == NULL) {
2671 recycle_pageset(toep, ps);
2672 aio_ddp_requeue_one(toep, job);
2673 toep->ddp.queueing = NULL;
2674
2675 /*
2676 * XXX: Need a way to kick a retry here.
2677 *
2678 * XXX: We know the fixed size needed and could
2679 * preallocate this using a blocking request at the
2680 * start of the task to avoid having to handle this
2681 * edge case.
2682 */
2683 printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
2684 return;
2685 }
2686
2687 if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
2688 free_wrqe(wr);
2689 recycle_pageset(toep, ps);
2690 aio_ddp_cancel_one(job);
2691 toep->ddp.queueing = NULL;
2692 goto restart;
2693 }
2694
2695 #ifdef VERBOSE_TRACES
2696 CTR6(KTR_CXGBE,
2697 "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__,
2698 toep->tid, job, db_idx, ddp_flags, ddp_flags_mask);
2699 #endif
2700 /* Give the chip the go-ahead. */
2701 t4_wrq_tx(sc, wr);
2702 db = &toep->ddp.db[db_idx];
2703 db->cancel_pending = 0;
2704 db->job = job;
2705 db->ps = ps;
2706 toep->ddp.queueing = NULL;
2707 toep->ddp.flags |= buf_flag;
2708 toep->ddp.active_count++;
2709 if (toep->ddp.active_count == 1) {
2710 MPASS(toep->ddp.active_id == -1);
2711 toep->ddp.active_id = db_idx;
2712 CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
2713 toep->ddp.active_id);
2714 }
2715 goto restart;
2716 }
2717
2718 void
2719 ddp_queue_toep(struct toepcb *toep)
2720 {
2721
2722 DDP_ASSERT_LOCKED(toep);
2723 if (toep->ddp.flags & DDP_TASK_ACTIVE)
2724 return;
2725 toep->ddp.flags |= DDP_TASK_ACTIVE;
2726 hold_toepcb(toep);
2727 soaio_enqueue(&toep->ddp.requeue_task);
2728 }
2729
2730 static void
2731 aio_ddp_requeue_task(void *context, int pending)
2732 {
2733 struct toepcb *toep = context;
2734
2735 DDP_LOCK(toep);
2736 aio_ddp_requeue(toep);
2737 toep->ddp.flags &= ~DDP_TASK_ACTIVE;
2738 DDP_UNLOCK(toep);
2739
2740 free_toepcb(toep);
2741 }
2742
2743 static void
2744 t4_aio_cancel_active(struct kaiocb *job)
2745 {
2746 struct socket *so = job->fd_file->f_data;
2747 struct tcpcb *tp = sototcpcb(so);
2748 struct toepcb *toep = tp->t_toe;
2749 struct adapter *sc = td_adapter(toep->td);
2750 uint64_t valid_flag;
2751 int i;
2752
2753 DDP_LOCK(toep);
2754 if (aio_cancel_cleared(job)) {
2755 DDP_UNLOCK(toep);
2756 aio_ddp_cancel_one(job);
2757 return;
2758 }
2759
2760 for (i = 0; i < nitems(toep->ddp.db); i++) {
2761 if (toep->ddp.db[i].job == job) {
2762 /* Should only ever get one cancel request for a job. */
2763 MPASS(toep->ddp.db[i].cancel_pending == 0);
2764
2765 /*
2766 * Invalidate this buffer. It will be
2767 * cancelled or partially completed once the
2768 * card ACKs the invalidate.
2769 */
2770 valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
2771 V_TF_DDP_BUF1_VALID(1);
2772 t4_set_tcb_field(sc, toep->ctrlq, toep,
2773 W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
2774 CPL_COOKIE_DDP0 + i);
2775 toep->ddp.db[i].cancel_pending = 1;
2776 CTR2(KTR_CXGBE, "%s: request %p marked pending",
2777 __func__, job);
2778 break;
2779 }
2780 }
2781 DDP_UNLOCK(toep);
2782 }
2783
2784 static void
2785 t4_aio_cancel_queued(struct kaiocb *job)
2786 {
2787 struct socket *so = job->fd_file->f_data;
2788 struct tcpcb *tp = sototcpcb(so);
2789 struct toepcb *toep = tp->t_toe;
2790
2791 DDP_LOCK(toep);
2792 if (!aio_cancel_cleared(job)) {
2793 TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2794 toep->ddp.waiting_count--;
2795 if (toep->ddp.waiting_count == 0)
2796 ddp_queue_toep(toep);
2797 }
2798 CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
2799 DDP_UNLOCK(toep);
2800
2801 aio_ddp_cancel_one(job);
2802 }
2803
2804 int
2805 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
2806 {
2807 struct inpcb *inp = sotoinpcb(so);
2808 struct tcpcb *tp = intotcpcb(inp);
2809 struct toepcb *toep = tp->t_toe;
2810
2811 /* Ignore writes. */
2812 if (job->uaiocb.aio_lio_opcode != LIO_READ)
2813 return (EOPNOTSUPP);
2814
2815 INP_WLOCK(inp);
2816 if (__predict_false(ulp_mode(toep) == ULP_MODE_NONE)) {
2817 if (!set_ddp_ulp_mode(toep)) {
2818 INP_WUNLOCK(inp);
2819 return (EOPNOTSUPP);
2820 }
2821 }
2822 INP_WUNLOCK(inp);
2823
2824 DDP_LOCK(toep);
2825
2826 /*
2827 * If DDP is being used for all normal receive, don't use it
2828 * for AIO.
2829 */
2830 if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
2831 DDP_UNLOCK(toep);
2832 return (EOPNOTSUPP);
2833 }
2834
2835 if ((toep->ddp.flags & DDP_AIO) == 0) {
2836 toep->ddp.flags |= DDP_AIO;
2837 TAILQ_INIT(&toep->ddp.cached_pagesets);
2838 TAILQ_INIT(&toep->ddp.aiojobq);
2839 TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task,
2840 toep);
2841 }
2842
2843 /*
2844 * XXX: Think about possibly returning errors for ENOTCONN,
2845 * etc. Perhaps the caller would only queue the request
2846 * if it failed with EOPNOTSUPP?
2847 */
2848
2849 #ifdef VERBOSE_TRACES
2850 CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
2851 #endif
2852 if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
2853 panic("new job was cancelled");
2854 TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
2855 toep->ddp.waiting_count++;
2856
2857 /*
2858 * Try to handle this request synchronously. If this has
2859 * to block because the task is running, it will just bail
2860 * and let the task handle it instead.
2861 */
2862 aio_ddp_requeue(toep);
2863 DDP_UNLOCK(toep);
2864 return (0);
2865 }
2866
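/*
 * Try to keep both hardware DDP buffer slots populated with receive
 * buffers while the connection can still receive data.
 */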
2867 static void
2868 ddp_rcvbuf_requeue(struct toepcb *toep)
2869 {
2870 struct socket *so;
2871 struct sockbuf *sb;
2872 struct inpcb *inp;
2873 struct ddp_rcv_buffer *drb;
2874
2875 DDP_ASSERT_LOCKED(toep);
2876 restart:
2877 if ((toep->ddp.flags & DDP_DEAD) != 0) {
2878 MPASS(toep->ddp.active_count == 0);
2879 return;
2880 }
2881
2882 /* If both buffers are active, nothing to do. */
2883 if (toep->ddp.active_count == nitems(toep->ddp.db)) {
2884 return;
2885 }
2886
2887 inp = toep->inp;
2888 so = inp->inp_socket;
2889 sb = &so->so_rcv;
2890
2891 drb = alloc_cached_ddp_rcv_buffer(toep);
2892 DDP_UNLOCK(toep);
2893
2894 if (drb == NULL) {
2895 drb = alloc_ddp_rcv_buffer(toep, M_WAITOK);
2896 if (drb == NULL) {
2897 printf("%s: failed to allocate buffer\n", __func__);
2898 DDP_LOCK(toep);
2899 return;
2900 }
2901 }
2902
2903 DDP_LOCK(toep);
2904 if ((toep->ddp.flags & DDP_DEAD) != 0 ||
2905 toep->ddp.active_count == nitems(toep->ddp.db)) {
2906 recycle_ddp_rcv_buffer(toep, drb);
2907 return;
2908 }
2909
2910 /* We will never get anything unless we are or were connected. */
2911 SOCKBUF_LOCK(sb);
2912 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2913 SOCKBUF_UNLOCK(sb);
2914 recycle_ddp_rcv_buffer(toep, drb);
2915 return;
2916 }
2917
2918 /* Abort if socket has reported problems or is closed. */
2919 if (so->so_error != 0 || (sb->sb_state & SBS_CANTRCVMORE) != 0) {
2920 SOCKBUF_UNLOCK(sb);
2921 recycle_ddp_rcv_buffer(toep, drb);
2922 return;
2923 }
2924 SOCKBUF_UNLOCK(sb);
2925
2926 if (!queue_ddp_rcvbuf(toep, drb)) {
2927 /*
2928 * XXX: Need a way to kick a retry here.
2929 *
2930 * XXX: We know the fixed size needed and could
2931 * preallocate the work request using a blocking
2932 * request at the start of the task to avoid having to
2933 * handle this edge case.
2934 */
2935 return;
2936 }
2937 goto restart;
2938 }
2939
2940 static void
2941 ddp_rcvbuf_requeue_task(void *context, int pending)
2942 {
2943 struct toepcb *toep = context;
2944
2945 DDP_LOCK(toep);
2946 ddp_rcvbuf_requeue(toep);
2947 toep->ddp.flags &= ~DDP_TASK_ACTIVE;
2948 DDP_UNLOCK(toep);
2949
2950 free_toepcb(toep);
2951 }
2952
2953 int
2954 t4_enable_ddp_rcv(struct socket *so, struct toepcb *toep)
2955 {
2956 struct inpcb *inp = sotoinpcb(so);
2957 struct adapter *sc = td_adapter(toep->td);
2958
2959 INP_WLOCK(inp);
2960 switch (ulp_mode(toep)) {
2961 case ULP_MODE_TCPDDP:
2962 break;
2963 case ULP_MODE_NONE:
2964 if (set_ddp_ulp_mode(toep))
2965 break;
2966 /* FALLTHROUGH */
2967 default:
2968 INP_WUNLOCK(inp);
2969 return (EOPNOTSUPP);
2970 }
2971 INP_WUNLOCK(inp);
2972
2973 DDP_LOCK(toep);
2974
2975 /*
2976 * If DDP is being used for AIO already, don't use it for
2977 * normal receive.
2978 */
2979 if ((toep->ddp.flags & DDP_AIO) != 0) {
2980 DDP_UNLOCK(toep);
2981 return (EOPNOTSUPP);
2982 }
2983
2984 if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
2985 DDP_UNLOCK(toep);
2986 return (EBUSY);
2987 }
2988
2989 toep->ddp.flags |= DDP_RCVBUF;
2990 TAILQ_INIT(&toep->ddp.cached_buffers);
2991 enable_ddp(sc, toep);
2992 TASK_INIT(&toep->ddp.requeue_task, 0, ddp_rcvbuf_requeue_task, toep);
2993 ddp_queue_toep(toep);
2994 DDP_UNLOCK(toep);
2995 return (0);
2996 }
2997
2998 void
2999 t4_ddp_mod_load(void)
3000 {
3001 if (t4_ddp_rcvbuf_len < PAGE_SIZE)
3002 t4_ddp_rcvbuf_len = PAGE_SIZE;
3003 if (t4_ddp_rcvbuf_len > MAX_DDP_BUFFER_SIZE)
3004 t4_ddp_rcvbuf_len = MAX_DDP_BUFFER_SIZE;
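/* Round a size that is not a power of 2 up to the next power of 2. */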
3005 if (!powerof2(t4_ddp_rcvbuf_len))
3006 t4_ddp_rcvbuf_len = 1 << fls(t4_ddp_rcvbuf_len);
3007
3008 t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
3009 CPL_COOKIE_DDP0);
3010 t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
3011 CPL_COOKIE_DDP1);
3012 t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
3013 t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
3014 TAILQ_INIT(&ddp_orphan_pagesets);
3015 mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
3016 TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
3017 }
3018
3019 void
3020 t4_ddp_mod_unload(void)
3021 {
3022
3023 taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
3024 MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
3025 mtx_destroy(&ddp_orphan_pagesets_lock);
3026 t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
3027 t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
3028 t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
3029 t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
3030 }
3031 #endif
3032