/*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Chelsio Communications, Inc. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include #include #include #include #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" /* Status code values in CPL_NVMT_CMP. */ #define CMP_STATUS_ERROR_MASK 0x7f #define CMP_STATUS_NO_ERROR 0 #define CMP_STATUS_HEADER_DIGEST 1 #define CMP_STATUS_DIRECTION_MISMATCH 2 #define CMP_STATUS_DIGEST_FLAG_MISMATCH 3 #define CMP_STATUS_SUCCESS_NOT_LAST 4 #define CMP_STATUS_BAD_DATA_LENGTH 5 #define CMP_STATUS_USER_MODE_UNALLOCATED 6 #define CMP_STATUS_RQT_LIMIT 7 #define CMP_STATUS_RQT_WRAP 8 #define CMP_STATUS_RQT_BOUND 9 #define CMP_STATUS_TPT_LIMIT 16 #define CMP_STATUS_TPT_INVALID 17 #define CMP_STATUS_TPT_COLOUR_MISMATCH 18 #define CMP_STATUS_TPT_MISC 19 #define CMP_STATUS_TPT_WRAP 20 #define CMP_STATUS_TPT_BOUND 21 #define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22 #define CMP_STATUS_PBL_LIMIT 24 #define CMP_STATUS_DATA_DIGEST 25 #define CMP_STATUS_DDP 0x80 /* * Transfer tags and CIDs with the MSB set are "unallocated" tags that * pass data through to the freelist without using DDP. */ #define CHE_FL_TAG_MASK 0x8000 #define CHE_MAX_FL_TAG 0x7fff #define CHE_NUM_FL_TAGS (CHE_MAX_FL_TAG + 1) #define CHE_TAG_IS_FL(ttag) (((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK) #define CHE_RAW_FL_TAG(ttag) ((ttag) & ~CHE_FL_TAG_MASK) #define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color)) #define CHE_STAG_COLOR(stag) ((stag) & 0xf) #define CHE_STAG_IDX(stag) ((stag) >> 4) #define CHE_DDP_MAX_COLOR 0xf #define CHE_DDP_NO_TAG 0xffff /* * A bitmap of non-DDP CIDs in use on the host. Since there is no * _BIT_FFC (find first clear), the bitset is inverted so that a clear * bit indicates an in-use CID. 
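 *
 * A minimal usage sketch (the underlying __BIT_FFS_AT() is 1-based, with 0
 * meaning "no free bit at or after start"), mirroring what
 * che_alloc_fl_cid() does below:
 *
 *	cid = FL_CID_FINDFREE_AT(set, start);
 *	if (cid == 0)
 *		cid = FL_CID_FINDFREE_AT(set, 0);
 *	cid--;
 *	FL_CID_BUSY(cid, set);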
*/ BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS); #define FL_CID_INIT(p) __BIT_FILL(CHE_NUM_FL_TAGS, p) #define FL_CID_BUSY(n, p) __BIT_CLR(CHE_NUM_FL_TAGS, n, p) #define FL_CID_ISACTIVE(n, p) !__BIT_ISSET(CHE_NUM_FL_TAGS, n, p) #define FL_CID_FREE(n, p) __BIT_SET(CHE_NUM_FL_TAGS, n, p) #define FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start) /* * The TCP sequence number of both CPL_NVMT_DATA and CPL_NVMT_CMP * mbufs are saved here while the mbuf is in qp->rx_data and qp->rx_pdus. */ #define nvmf_tcp_seq PH_loc.thirtytwo[0] /* * The CPL status of CPL_NVMT_CMP mbufs are saved here while the mbuf * is in qp->rx_pdus. */ #define nvmf_cpl_status PH_loc.eight[4] struct nvmf_che_capsule; struct nvmf_che_qpair; struct nvmf_che_adapter { struct adapter *sc; u_int ddp_threshold; u_int max_transmit_pdu; u_int max_receive_pdu; bool nvmt_data_iqe; struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */ }; struct nvmf_che_command_buffer { struct nvmf_che_qpair *qp; struct nvmf_io_request io; size_t data_len; size_t data_xfered; uint32_t data_offset; u_int refs; int error; bool ddp_ok; uint16_t cid; uint16_t ttag; uint16_t original_cid; /* Host only */ TAILQ_ENTRY(nvmf_che_command_buffer) link; /* Fields used for DDP. */ struct fw_ri_tpte tpte; uint64_t *pbl; uint32_t pbl_addr; uint32_t pbl_len; /* Controller only */ struct nvmf_che_capsule *cc; }; struct nvmf_che_command_buffer_list { TAILQ_HEAD(, nvmf_che_command_buffer) head; struct mtx lock; }; struct nvmf_che_qpair { struct nvmf_qpair qp; struct socket *so; struct toepcb *toep; struct nvmf_che_adapter *nca; volatile u_int refs; /* Every allocated capsule holds a reference */ uint8_t txpda; uint8_t rxpda; bool header_digests; bool data_digests; uint32_t maxr2t; uint32_t maxh2cdata; /* Controller only */ uint32_t max_rx_data; uint32_t max_tx_data; uint32_t max_icd; /* Host only */ uint32_t max_ioccsz; /* Controller only */ union { uint16_t next_fl_ttag; /* Controller only */ uint16_t next_cid; /* Host only */ }; uint16_t next_ddp_tag; u_int num_fl_ttags; /* Controller only */ u_int active_fl_ttags; /* Controller only */ u_int num_ddp_tags; u_int active_ddp_tags; bool send_success; /* Controller only */ uint8_t ddp_color; uint32_t tpt_offset; /* Receive state. */ struct thread *rx_thread; struct cv rx_cv; bool rx_shutdown; int rx_error; struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */ struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */ /* Transmit state. */ struct thread *tx_thread; struct cv tx_cv; bool tx_shutdown; STAILQ_HEAD(, nvmf_che_capsule) tx_capsules; struct nvmf_che_command_buffer_list tx_buffers; struct nvmf_che_command_buffer_list rx_buffers; /* * For the controller, an RX command buffer can be in one of * three locations, all protected by the rx_buffers.lock. If * a receive request is waiting for either an R2T slot for its * command (due to exceeding MAXR2T), or a transfer tag it is * placed on the rx_buffers list. When a request is allocated * an active transfer tag, it moves to either the * open_ddp_tags[] or open_fl_ttags[] array (indexed by the * tag) until it completes. * * For the host, an RX command buffer using DDP is in * open_ddp_tags[], otherwise it is in rx_buffers. */ struct nvmf_che_command_buffer **open_ddp_tags; struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */ /* * For the host, CIDs submitted by nvmf(4) must be rewritten * to either use DDP or not use DDP. The CID in response * capsules must be restored to their original value. 
For * DDP, the original CID is stored in the command buffer. * These variables manage non-DDP CIDs. */ uint16_t *fl_cids; /* Host only */ struct fl_cid_set *fl_cid_set; /* Host only */ struct mtx fl_cid_lock; /* Host only */ }; struct nvmf_che_rxpdu { struct mbuf *m; const struct nvme_tcp_common_pdu_hdr *hdr; uint32_t data_len; bool data_digest_mismatch; bool ddp; }; struct nvmf_che_capsule { struct nvmf_capsule nc; volatile u_int refs; struct nvmf_che_rxpdu rx_pdu; uint32_t active_r2ts; /* Controller only */ #ifdef INVARIANTS uint32_t tx_data_offset; /* Controller only */ u_int pending_r2ts; /* Controller only */ #endif STAILQ_ENTRY(nvmf_che_capsule) link; }; #define CCAP(nc) ((struct nvmf_che_capsule *)(nc)) #define CQP(qp) ((struct nvmf_che_qpair *)(qp)) static void che_release_capsule(struct nvmf_che_capsule *cc); static void che_free_qpair(struct nvmf_qpair *nq); SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Chelsio TCP offload transport"); static u_int che_max_transmit_pdu = 32 * 1024; SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN, &che_max_transmit_pdu, 0, "Maximum size of a transmitted PDU"); static u_int che_max_receive_pdu = 32 * 1024; SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN, &che_max_receive_pdu, 0, "Maximum size of a received PDU"); static int use_dsgl = 1; SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0, "Use DSGL for PBL/FastReg (default=1)"); static int inline_threshold = 256; SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN, &inline_threshold, 0, "inline vs dsgl threshold (default=256)"); static int ddp_tags_per_qp = 128; SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN, &ddp_tags_per_qp, 0, "Number of DDP tags to reserve for each queue pair"); static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload"); /* * PBL regions consist of N full-sized pages. TPT entries support an * initial offset into the first page (FBO) and can handle a partial * length on the last page. */ static bool che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io) { const struct memdesc *mem = &io->io_mem; struct bus_dma_segment *ds; int i; if (io->io_len < qp->nca->ddp_threshold) { return (false); } switch (mem->md_type) { case MEMDESC_VADDR: case MEMDESC_PADDR: case MEMDESC_VMPAGES: return (true); case MEMDESC_VLIST: case MEMDESC_PLIST: /* * Require all but the first segment to start on a * page boundary. Require all but the last segment to * end on a page boundary. */ ds = mem->u.md_list; for (i = 0; i < mem->md_nseg; i++, ds++) { if (i != 0 && ds->ds_addr % PAGE_SIZE != 0) return (false); if (i != mem->md_nseg - 1 && (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0) return (false); } return (true); default: /* * Other types could be validated with more work, but * they aren't used currently by nvmf(4) or nvmft(4). 
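		 * For example, an I/O request described by MEMDESC_UIO or
		 * MEMDESC_MBUF (if one ever showed up here) would simply
		 * fail this check and fall back to the non-DDP freelist
		 * path rather than being rejected outright.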
*/ return (false); } } static u_int che_fbo(struct nvmf_che_command_buffer *cb) { struct memdesc *mem = &cb->io.io_mem; switch (mem->md_type) { case MEMDESC_VADDR: return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK); case MEMDESC_PADDR: return (mem->u.md_paddr & PAGE_MASK); case MEMDESC_VMPAGES: return (mem->md_offset); case MEMDESC_VLIST: case MEMDESC_PLIST: return (mem->u.md_list[0].ds_addr & PAGE_MASK); default: __assert_unreachable(); } } static u_int che_npages(struct nvmf_che_command_buffer *cb) { return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE)); } static struct nvmf_che_command_buffer * che_alloc_command_buffer(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len, uint16_t cid) { struct nvmf_che_command_buffer *cb; cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK); cb->qp = qp; cb->io = *io; cb->data_offset = data_offset; cb->data_len = data_len; cb->data_xfered = 0; refcount_init(&cb->refs, 1); cb->error = 0; cb->ddp_ok = che_ddp_io_check(qp, io); cb->cid = cid; cb->ttag = 0; cb->original_cid = 0; cb->cc = NULL; cb->pbl = NULL; return (cb); } static void che_hold_command_buffer(struct nvmf_che_command_buffer *cb) { refcount_acquire(&cb->refs); } static void che_free_command_buffer(struct nvmf_che_command_buffer *cb) { nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error); if (cb->cc != NULL) che_release_capsule(cb->cc); MPASS(cb->pbl == NULL); free(cb, M_NVMF_CHE); } static void che_release_command_buffer(struct nvmf_che_command_buffer *cb) { if (refcount_release(&cb->refs)) che_free_command_buffer(cb); } static void che_add_command_buffer(struct nvmf_che_command_buffer_list *list, struct nvmf_che_command_buffer *cb) { mtx_assert(&list->lock, MA_OWNED); TAILQ_INSERT_HEAD(&list->head, cb, link); } static struct nvmf_che_command_buffer * che_find_command_buffer(struct nvmf_che_command_buffer_list *list, uint16_t cid) { struct nvmf_che_command_buffer *cb; mtx_assert(&list->lock, MA_OWNED); TAILQ_FOREACH(cb, &list->head, link) { if (cb->cid == cid) return (cb); } return (NULL); } static void che_remove_command_buffer(struct nvmf_che_command_buffer_list *list, struct nvmf_che_command_buffer *cb) { mtx_assert(&list->lock, MA_OWNED); TAILQ_REMOVE(&list->head, cb, link); } static void che_purge_command_buffer(struct nvmf_che_command_buffer_list *list, uint16_t cid) { struct nvmf_che_command_buffer *cb; mtx_lock(&list->lock); cb = che_find_command_buffer(list, cid); if (cb != NULL) { che_remove_command_buffer(list, cb); mtx_unlock(&list->lock); che_release_command_buffer(cb); } else mtx_unlock(&list->lock); } static int che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr, uint32_t len, void *data, struct mbufq *wrq) { struct mbuf *m; char *cp; int copy_len, i, num_wqe, wr_len; #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len); #endif num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE); cp = data; for (i = 0; i < num_wqe; i++) { copy_len = min(len, T4_MAX_INLINE_SIZE); wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len); m = alloc_raw_wr_mbuf(wr_len); if (m == NULL) return (ENOMEM); t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid, addr, copy_len, cp, 0); if (cp != NULL) cp += T4_MAX_INLINE_SIZE; addr += T4_MAX_INLINE_SIZE >> 5; len -= T4_MAX_INLINE_SIZE; mbufq_enqueue(wrq, m); } return (0); } static int che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep, uint32_t addr, uint32_t len, void *data, struct mbufq *wrq) { struct mbuf *m; vm_offset_t va; u_int todo; int 
wr_len; /* First page. */ va = (vm_offset_t)data; todo = min(PAGE_SIZE - (va % PAGE_SIZE), len); wr_len = T4_WRITE_MEM_DMA_LEN; m = alloc_raw_wr_mbuf(wr_len); if (m == NULL) return (ENOMEM); t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr, todo, pmap_kextract(va), 0); mbufq_enqueue(wrq, m); len -= todo; addr += todo >> 5; va += todo; while (len > 0) { MPASS(va == trunc_page(va)); todo = min(PAGE_SIZE, len); m = alloc_raw_wr_mbuf(wr_len); if (m == NULL) return (ENOMEM); t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr, todo, pmap_kextract(va), 0); mbufq_enqueue(wrq, m); len -= todo; addr += todo >> 5; va += todo; } return (0); } static int che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len, void *data) { struct adapter *sc = qp->nca->sc; struct toepcb *toep = qp->toep; struct socket *so = qp->so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct mbufq mq; int error; mbufq_init(&mq, INT_MAX); if (!use_dsgl || len < inline_threshold || data == NULL) error = che_write_mem_inline(sc, toep, addr, len, data, &mq); else error = che_write_mem_dma_aligned(sc, toep, addr, len, data, &mq); if (__predict_false(error != 0)) goto error; INP_WLOCK(inp); if ((tp->t_flags & TF_DISCONNECTED) != 0) { INP_WUNLOCK(inp); error = ECONNRESET; goto error; } mbufq_concat(&toep->ulp_pduq, &mq); INP_WUNLOCK(inp); return (0); error: mbufq_drain(&mq); return (error); } static bool che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { struct adapter *sc = qp->nca->sc; struct memdesc *mem = &cb->io.io_mem; uint64_t *pbl; uint32_t addr, len; u_int i, npages; int error; MPASS(cb->pbl == NULL); MPASS(cb->ddp_ok); /* Hardware limit? iWARP only enforces this for T5. */ if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL)) return (false); npages = che_npages(cb); len = roundup2(npages, 4) * sizeof(*cb->pbl); addr = t4_pblpool_alloc(sc, len); if (addr == 0) return (false); pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO); if (pbl == NULL) { t4_pblpool_free(sc, addr, len); return (false); } switch (mem->md_type) { case MEMDESC_VADDR: { vm_offset_t va; va = trunc_page((uintptr_t)mem->u.md_vaddr); for (i = 0; i < npages; i++) pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE)); break; } case MEMDESC_PADDR: { vm_paddr_t pa; pa = trunc_page(mem->u.md_paddr); for (i = 0; i < npages; i++) pbl[i] = htobe64(pa + i * PAGE_SIZE); break; } case MEMDESC_VMPAGES: for (i = 0; i < npages; i++) pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i])); break; case MEMDESC_VLIST: { struct bus_dma_segment *ds; vm_offset_t va; vm_size_t len; u_int j, k; i = 0; ds = mem->u.md_list; for (j = 0; j < mem->md_nseg; j++, ds++) { va = trunc_page((uintptr_t)ds->ds_addr); len = ds->ds_len; if (ds->ds_addr % PAGE_SIZE != 0) len += ds->ds_addr % PAGE_SIZE; for (k = 0; k < howmany(len, PAGE_SIZE); k++) { pbl[i] = htobe64(pmap_kextract(va + k * PAGE_SIZE)); i++; } } MPASS(i == npages); break; } case MEMDESC_PLIST: { struct bus_dma_segment *ds; vm_paddr_t pa; vm_size_t len; u_int j, k; i = 0; ds = mem->u.md_list; for (j = 0; j < mem->md_nseg; j++, ds++) { pa = trunc_page((vm_paddr_t)ds->ds_addr); len = ds->ds_len; if (ds->ds_addr % PAGE_SIZE != 0) len += ds->ds_addr % PAGE_SIZE; for (k = 0; k < howmany(len, PAGE_SIZE); k++) { pbl[i] = htobe64(pa + k * PAGE_SIZE); i++; } } MPASS(i == npages); break; } default: __assert_unreachable(); } error = che_write_adapter_mem(qp, addr >> 5, len, pbl); if (error != 0) { t4_pblpool_free(sc, addr, len); free(pbl, M_NVMF_CHE); 
return (false); } cb->pbl = pbl; cb->pbl_addr = addr; cb->pbl_len = len; return (true); } static void che_free_pbl(struct nvmf_che_command_buffer *cb) { free(cb->pbl, M_NVMF_CHE); t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len); cb->pbl = NULL; cb->pbl_addr = 0; cb->pbl_len = 0; } static bool che_write_tpt_entry(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, uint16_t stag) { uint32_t tpt_addr; int error; cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID | V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) | F_FW_RI_TPTE_STAGSTATE | V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) | V_FW_RI_TPTE_PDID(0)); cb->tpte.locread_to_qpid = htobe32( V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) | V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) | V_FW_RI_TPTE_PS(PAGE_SIZE) | V_FW_RI_TPTE_QPID(qp->toep->tid)); #define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start) cb->tpte.nosnoop_pbladdr = htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3)); cb->tpte.len_lo = htobe32(cb->data_len); cb->tpte.va_hi = 0; cb->tpte.va_lo_fbo = htobe32(che_fbo(cb)); cb->tpte.dca_mwbcnt_pstag = 0; cb->tpte.len_hi = htobe32(cb->data_offset); tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) + (qp->nca->sc->vres.stag.start >> 5); error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte), &cb->tpte); return (error == 0); } static void che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag) { uint32_t tpt_addr; tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) + (qp->nca->sc->vres.stag.start >> 5); (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte), NULL); } static uint16_t che_alloc_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { uint16_t stag_idx; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); MPASS(cb->ddp_ok); if (qp->active_ddp_tags == qp->num_ddp_tags) return (CHE_DDP_NO_TAG); MPASS(qp->num_ddp_tags != 0); stag_idx = qp->next_ddp_tag; for (;;) { if (qp->open_ddp_tags[stag_idx] == NULL) break; if (stag_idx == qp->num_ddp_tags - 1) { stag_idx = 0; if (qp->ddp_color == CHE_DDP_MAX_COLOR) qp->ddp_color = 0; else qp->ddp_color++; } else stag_idx++; MPASS(stag_idx != qp->next_ddp_tag); } if (stag_idx == qp->num_ddp_tags - 1) qp->next_ddp_tag = 0; else qp->next_ddp_tag = stag_idx + 1; qp->active_ddp_tags++; qp->open_ddp_tags[stag_idx] = cb; return (CHE_DDP_TAG(stag_idx, qp->ddp_color)); } static void che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, uint16_t stag) { MPASS(!CHE_TAG_IS_FL(stag)); mtx_assert(&qp->rx_buffers.lock, MA_OWNED); MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb); qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL; qp->active_ddp_tags--; } static uint16_t che_alloc_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { uint16_t stag; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); if (!cb->ddp_ok) return (CHE_DDP_NO_TAG); stag = che_alloc_ddp_stag(qp, cb); if (stag == CHE_DDP_NO_TAG) { counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag, 1); return (CHE_DDP_NO_TAG); } if (!che_alloc_pbl(qp, cb)) { che_free_ddp_stag(qp, cb, stag); counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1); return (CHE_DDP_NO_TAG); } if (!che_write_tpt_entry(qp, cb, stag)) { che_free_pbl(cb); che_free_ddp_stag(qp, cb, stag); counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1); return (CHE_DDP_NO_TAG); } counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1); return (stag); } static void che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb, uint16_t stag) { 
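	/*
	 * Tear down the adapter-side state backing a DDP tag:
	 * invalidate the TPT entry, release the PBL, and return the
	 * STAG index to this queue pair's pool.
	 */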
	MPASS(!CHE_TAG_IS_FL(stag));
	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
	MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);

	che_clear_tpt_entry(qp, stag);
	che_free_pbl(cb);
	che_free_ddp_stag(qp, cb, stag);
}

static void
nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
{
	struct epoch_tracker et;
	struct socket *so = qp->so;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = qp->toep;

	CURVNET_SET(so->so_vnet);
	NET_EPOCH_ENTER(et);
	INP_WLOCK(inp);
	if (__predict_false(tp->t_flags & TF_DISCONNECTED) ||
	    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
		m_freem(m);
	} else {
		mbufq_enqueue(&toep->ulp_pduq, m);
		t4_push_pdus(toep->vi->adapter, toep, 0);
	}
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

static void
nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
    struct mbuf *rx_pdu, u_int hlen)
{
	struct nvme_tcp_term_req_hdr *hdr;
	struct mbuf *m;

	if (hlen != 0) {
		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
		hlen = min(hlen, m_length(rx_pdu, NULL));
	}

	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
	m->m_len = sizeof(*hdr) + hlen;
	m->m_pkthdr.len = m->m_len;
	hdr = mtod(m, void *);
	memset(hdr, 0, sizeof(*hdr));
	hdr->common.pdu_type = qp->qp.nq_controller ?
	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
	hdr->common.hlen = sizeof(*hdr);
	hdr->common.plen = sizeof(*hdr) + hlen;
	hdr->fes = htole16(fes);
	le32enc(hdr->fei, fei);
	if (hlen != 0)
		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));

	nvmf_che_write_pdu(qp, m);
}

static int
nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
{
	const struct nvme_tcp_common_pdu_hdr *ch;
	struct mbuf *m = pdu->m;
	uint32_t data_len, fei, plen, rx_digest;
	u_int hlen, cpl_error;
	int error;
	uint16_t fes;

	/* Determine how large of a PDU header to return for errors.
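	 * If the advertised header length is nonsensical (smaller than
	 * the common header or larger than the PDU), fall back to
	 * echoing only the common header in any termination request.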
*/ ch = pdu->hdr; hlen = ch->hlen; plen = le32toh(ch->plen); if (hlen < sizeof(*ch) || hlen > plen) hlen = sizeof(*ch); cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK; switch (cpl_error) { case CMP_STATUS_NO_ERROR: break; case CMP_STATUS_HEADER_DIGEST: counter_u64_add( qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1); printf("NVMe/TCP: Header digest mismatch\n"); rx_digest = le32dec(mtodo(m, ch->hlen)); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m, hlen); return (EBADMSG); case CMP_STATUS_DIRECTION_MISMATCH: counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m, hlen); return (EBADMSG); case CMP_STATUS_SUCCESS_NOT_LAST: case CMP_STATUS_DIGEST_FLAG_MISMATCH: counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen); return (EBADMSG); case CMP_STATUS_BAD_DATA_LENGTH: counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); printf("NVMe/TCP: Invalid PDU length %u\n", plen); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen); return (EBADMSG); case CMP_STATUS_USER_MODE_UNALLOCATED: case CMP_STATUS_RQT_LIMIT: case CMP_STATUS_RQT_WRAP: case CMP_STATUS_RQT_BOUND: device_printf(qp->nca->sc->dev, "received invalid NVMET error %u\n", cpl_error); return (ECONNRESET); case CMP_STATUS_TPT_LIMIT: case CMP_STATUS_TPT_INVALID: case CMP_STATUS_TPT_COLOUR_MISMATCH: case CMP_STATUS_TPT_MISC: case CMP_STATUS_TPT_WRAP: case CMP_STATUS_TPT_BOUND: counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); switch (ch->pdu_type) { case NVME_TCP_PDU_TYPE_H2C_DATA: nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, pdu->hdr->hlen); return (EBADMSG); case NVME_TCP_PDU_TYPE_C2H_DATA: nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m, hlen); return (EBADMSG); default: device_printf(qp->nca->sc->dev, "received DDP NVMET error %u for PDU %u\n", cpl_error, ch->pdu_type); return (ECONNRESET); } case CMP_STATUS_TPT_LAST_PDU_UNALIGNED: counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen); return (EBADMSG); case CMP_STATUS_PBL_LIMIT: counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m, hlen); return (EBADMSG); case CMP_STATUS_DATA_DIGEST: /* Handled below. */ break; default: device_printf(qp->nca->sc->dev, "received unknown NVMET error %u\n", cpl_error); return (ECONNRESET); } error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller, qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes, &fei); if (error != 0) { if (error != ECONNRESET) nvmf_che_report_error(qp, fes, fei, m, hlen); return (error); } /* Check data digest if present. 
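	 * The adapter has already verified the CRC32C digest and
	 * flagged any mismatch via CMP_STATUS_DATA_DIGEST, so the
	 * driver only records the failure here instead of recomputing
	 * the digest over the payload.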
*/ pdu->data_digest_mismatch = false; if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) { if (cpl_error == CMP_STATUS_DATA_DIGEST) { printf("NVMe/TCP: Data digest mismatch\n"); pdu->data_digest_mismatch = true; counter_u64_add( qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1); } } pdu->data_len = data_len; return (0); } static void nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu) { m_freem(pdu->m); pdu->m = NULL; pdu->hdr = NULL; } static int nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu) { const struct nvme_tcp_term_req_hdr *hdr; hdr = (const void *)pdu->hdr; printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n", le16toh(hdr->fes), le32dec(hdr->fei)); nvmf_che_free_pdu(pdu); return (ECONNRESET); } static int nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { const struct nvme_tcp_cmd *cmd; struct nvmf_capsule *nc; struct nvmf_che_capsule *cc; cmd = (const void *)pdu->hdr; nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK); cc = CCAP(nc); cc->rx_pdu = *pdu; nvmf_capsule_received(&qp->qp, nc); return (0); } static int nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { const struct nvme_tcp_rsp *rsp; struct nvme_completion cpl; struct nvmf_capsule *nc; struct nvmf_che_capsule *cc; uint16_t cid; rsp = (const void *)pdu->hdr; /* * Restore the original CID and ensure any command buffers * associated with this CID have been released. Once the CQE * has been received, no further transfers to the command * buffer for the associated CID can occur. */ cpl = rsp->rccqe; cid = le16toh(cpl.cid); if (CHE_TAG_IS_FL(cid)) { cid = CHE_RAW_FL_TAG(cid); mtx_lock(&qp->fl_cid_lock); MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set)); cpl.cid = qp->fl_cids[cid]; FL_CID_FREE(cid, qp->fl_cid_set); mtx_unlock(&qp->fl_cid_lock); che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid); che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid); } else { struct nvmf_che_command_buffer *cb; mtx_lock(&qp->rx_buffers.lock); cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)]; MPASS(cb != NULL); MPASS(cb->cid == rsp->rccqe.cid); cpl.cid = cb->original_cid; che_free_ddp_tag(qp, cb, cid); mtx_unlock(&qp->rx_buffers.lock); che_release_command_buffer(cb); } #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__, qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid); #endif nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK); nc->nc_sqhd_valid = true; cc = CCAP(nc); cc->rx_pdu = *pdu; nvmf_capsule_received(&qp->qp, nc); return (0); } /* * Construct a PDU that contains an optional data payload. This * includes dealing with the length fields in the common header. The * adapter inserts digests and padding when the PDU is transmitted. 
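 *
 * Only the length accounting happens in software.  For example, with
 * header digests enabled, a 24-byte H2C_DATA header, and a TXPDA of 8,
 * PDO = roundup(24 + 4, 8) = 32 and PLEN = 32 + DATAL (plus 4 more if
 * data digests are enabled), even though the digest and pad bytes are
 * never placed in the mbuf chain.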
*/ static struct mbuf * nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen, struct mbuf *data, uint32_t data_len) { struct nvme_tcp_common_pdu_hdr *ch; struct mbuf *top; uint32_t pdo, plen; uint8_t ulp_submode; plen = hlen; if (qp->header_digests) plen += sizeof(uint32_t); if (data_len != 0) { KASSERT(m_length(data, NULL) == data_len, ("length mismatch")); pdo = roundup(plen, qp->txpda); plen = pdo + data_len; if (qp->data_digests) plen += sizeof(uint32_t); } else { KASSERT(data == NULL, ("payload mbuf with zero length")); pdo = 0; } top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR); top->m_len = hlen; top->m_pkthdr.len = hlen; ch = mtod(top, void *); memcpy(ch, hdr, hlen); ch->hlen = hlen; ulp_submode = 0; if (qp->header_digests) { ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF; ulp_submode |= ULP_CRC_HEADER; } if (qp->data_digests && data_len != 0) { ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF; ulp_submode |= ULP_CRC_DATA; } ch->pdo = pdo; ch->plen = htole32(plen); set_mbuf_ulp_submode(top, ulp_submode); if (data_len != 0) { top->m_pkthdr.len += data_len; top->m_next = data; } return (top); } /* Allocate the next free freelist transfer tag. */ static bool nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { uint16_t ttag; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); if (qp->active_fl_ttags == qp->num_fl_ttags) return (false); ttag = qp->next_fl_ttag; for (;;) { if (qp->open_fl_ttags[ttag] == NULL) break; if (ttag == qp->num_fl_ttags - 1) ttag = 0; else ttag++; MPASS(ttag != qp->next_fl_ttag); } if (ttag == qp->num_fl_ttags - 1) qp->next_fl_ttag = 0; else qp->next_fl_ttag = ttag + 1; qp->active_fl_ttags++; qp->open_fl_ttags[ttag] = cb; cb->ttag = ttag | CHE_FL_TAG_MASK; return (true); } /* Attempt to allocate a free transfer tag and assign it to cb. */ static bool nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { uint16_t stag; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); stag = che_alloc_ddp_tag(qp, cb); if (stag == CHE_DDP_NO_TAG) { if (!nvmf_che_allocate_fl_ttag(qp, cb)) return (false); } else { cb->ttag = stag; } #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__, qp->toep->tid, cb->ttag); #endif cb->cc->active_r2ts++; return (true); } /* Find the next command buffer eligible to schedule for R2T. */ static struct nvmf_che_command_buffer * nvmf_che_next_r2t(struct nvmf_che_qpair *qp) { struct nvmf_che_command_buffer *cb; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) { /* NB: maxr2t is 0's based. */ if (cb->cc->active_r2ts > qp->maxr2t) continue; if (!nvmf_che_allocate_ttag(qp, cb)) return (NULL); #ifdef INVARIANTS cb->cc->pending_r2ts--; #endif TAILQ_REMOVE(&qp->rx_buffers.head, cb, link); return (cb); } return (NULL); } /* NB: cid and is little-endian already. */ static void che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, uint32_t data_len) { struct nvme_tcp_r2t_hdr r2t; struct mbuf *m; memset(&r2t, 0, sizeof(r2t)); r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T; r2t.cccid = cid; r2t.ttag = htole16(ttag); r2t.r2to = htole32(data_offset); r2t.r2tl = htole32(data_len); m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0); nvmf_che_write_pdu(qp, m); } /* * Release a transfer tag and schedule another R2T. * * NB: This drops the rx_buffers.lock mutex. 
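 * The lock is dropped before calling che_send_r2t(), which allocates
 * the R2T PDU with M_WAITOK and takes the connection locks in
 * nvmf_che_write_pdu().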
*/ static void nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { struct nvmf_che_command_buffer *ncb; mtx_assert(&qp->rx_buffers.lock, MA_OWNED); #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid, cb->ttag); #endif if (CHE_TAG_IS_FL(cb->ttag)) { uint16_t ttag; ttag = CHE_RAW_FL_TAG(cb->ttag); MPASS(qp->open_fl_ttags[ttag] == cb); /* Release this transfer tag. */ qp->open_fl_ttags[ttag] = NULL; qp->active_fl_ttags--; } else che_free_ddp_tag(qp, cb, cb->ttag); cb->cc->active_r2ts--; /* Schedule another R2T. */ ncb = nvmf_che_next_r2t(qp); mtx_unlock(&qp->rx_buffers.lock); if (ncb != NULL) che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset, ncb->data_len); } /* * Copy len bytes starting at offset skip from an mbuf chain into an * I/O buffer at destination offset io_offset. */ static void mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len, struct nvmf_io_request *io, u_int io_offset) { u_int todo; while (m->m_len <= skip) { skip -= m->m_len; m = m->m_next; } while (len != 0) { MPASS((m->m_flags & M_EXTPG) == 0); todo = min(m->m_len - skip, len); memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip)); skip = 0; io_offset += todo; len -= todo; m = m->m_next; } } static int nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { const struct nvme_tcp_h2c_data_hdr *h2c; struct nvmf_che_command_buffer *cb; uint32_t data_len, data_offset; uint16_t ttag, fl_ttag; h2c = (const void *)pdu->hdr; if (le32toh(h2c->datal) > qp->maxh2cdata) { nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } ttag = le16toh(h2c->ttag); if (CHE_TAG_IS_FL(ttag)) { fl_ttag = CHE_RAW_FL_TAG(ttag); if (fl_ttag >= qp->num_fl_ttags) { nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } mtx_lock(&qp->rx_buffers.lock); cb = qp->open_fl_ttags[fl_ttag]; } else { if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) { nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } mtx_lock(&qp->rx_buffers.lock); cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)]; } if (cb == NULL) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } MPASS(cb->ttag == ttag); /* For a data digest mismatch, fail the I/O request. 
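	 * The transfer tag is recycled first (via
	 * nvmf_che_send_next_r2t) so the queue keeps making progress;
	 * the received payload is discarded and the request completes
	 * with EINTEGRITY.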
*/ if (pdu->data_digest_mismatch) { nvmf_che_send_next_r2t(qp, cb); cb->error = EINTEGRITY; che_release_command_buffer(cb); nvmf_che_free_pdu(pdu); return (0); } data_len = le32toh(h2c->datal); if (data_len != pdu->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(h2c->datao); if (data_offset < cb->data_offset || data_offset + data_len > cb->data_offset + cb->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } if (data_offset != cb->data_offset + cb->data_xfered) { if (CHE_TAG_IS_FL(ttag)) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } else { uint32_t ddp_bytes; /* Account for PDUs silently received via DDP. */ ddp_bytes = data_offset - (cb->data_offset + cb->data_xfered); cb->data_xfered += ddp_bytes; #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u", __func__, qp->toep->tid, ddp_bytes); #endif counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, ddp_bytes); } } if ((cb->data_xfered + data_len == cb->data_len) != ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; data_offset -= cb->data_offset; if (cb->data_xfered == cb->data_len) { nvmf_che_send_next_r2t(qp, cb); } else { che_hold_command_buffer(cb); mtx_unlock(&qp->rx_buffers.lock); } if (CHE_TAG_IS_FL(ttag)) mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io, data_offset); che_release_command_buffer(cb); nvmf_che_free_pdu(pdu); return (0); } static int nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { const struct nvme_tcp_c2h_data_hdr *c2h; struct nvmf_che_command_buffer *cb; uint32_t data_len, data_offset; uint16_t cid, original_cid; /* * Unlike freelist command buffers, DDP command buffers are * not released until the response capsule is received to keep * the STAG allocated until the command has completed. */ c2h = (const void *)pdu->hdr; cid = le16toh(c2h->cccid); if (CHE_TAG_IS_FL(cid)) { mtx_lock(&qp->rx_buffers.lock); cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid); } else { if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) { nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } mtx_lock(&qp->rx_buffers.lock); cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)]; } if (cb == NULL) { mtx_unlock(&qp->rx_buffers.lock); /* * XXX: Could be PDU sequence error if cccid is for a * command that doesn't use a command buffer. */ nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } /* For a data digest mismatch, fail the I/O request. 
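	 * On the host side there is no R2T slot to recycle; a freelist
	 * buffer is unlinked and released right away, while a DDP
	 * buffer stays in open_ddp_tags[] until the response capsule
	 * arrives.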
*/ if (pdu->data_digest_mismatch) { cb->error = EINTEGRITY; if (CHE_TAG_IS_FL(cid)) { che_remove_command_buffer(&qp->rx_buffers, cb); mtx_unlock(&qp->rx_buffers.lock); che_release_command_buffer(cb); } else mtx_unlock(&qp->rx_buffers.lock); nvmf_che_free_pdu(pdu); return (0); } data_len = le32toh(c2h->datal); if (data_len != pdu->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(c2h->datao); if (data_offset < cb->data_offset || data_offset + data_len > cb->data_offset + cb->data_len) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } if (data_offset != cb->data_offset + cb->data_xfered) { if (CHE_TAG_IS_FL(cid)) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } else { uint32_t ddp_bytes; /* Account for PDUs silently received via DDP. */ ddp_bytes = data_offset - (cb->data_offset + cb->data_xfered); cb->data_xfered += ddp_bytes; #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u", __func__, qp->toep->tid, ddp_bytes); #endif counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, ddp_bytes); } } if ((cb->data_xfered + data_len == cb->data_len) != ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) { mtx_unlock(&qp->rx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; original_cid = cb->original_cid; if (CHE_TAG_IS_FL(cid)) { data_offset -= cb->data_offset; if (cb->data_xfered == cb->data_len) che_remove_command_buffer(&qp->rx_buffers, cb); else che_hold_command_buffer(cb); mtx_unlock(&qp->rx_buffers.lock); if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { /* * Free the CID as the command has now been * completed. */ cid = CHE_RAW_FL_TAG(cid); mtx_lock(&qp->fl_cid_lock); MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set)); MPASS(original_cid == qp->fl_cids[cid]); FL_CID_FREE(cid, qp->fl_cid_set); mtx_unlock(&qp->fl_cid_lock); } mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io, data_offset); che_release_command_buffer(cb); } else { if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { /* * Free the command buffer and STAG as the * command has now been completed. */ che_free_ddp_tag(qp, cb, cid); mtx_unlock(&qp->rx_buffers.lock); che_release_command_buffer(cb); } else mtx_unlock(&qp->rx_buffers.lock); } if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) { struct nvme_completion cqe; struct nvmf_capsule *nc; memset(&cqe, 0, sizeof(cqe)); cqe.cid = original_cid; nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK); nc->nc_sqhd_valid = false; nvmf_capsule_received(&qp->qp, nc); } nvmf_che_free_pdu(pdu); return (0); } /* Called when m_free drops refcount to 0. 
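 * This is the ext_free callback registered by m_extaddref() in
 * nvmf_che_mbuf(); it keeps the command buffer alive until every
 * transmitted mbuf referencing its data has been freed.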
*/ static void nvmf_che_mbuf_done(struct mbuf *m) { struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1; che_free_command_buffer(cb); } static struct mbuf * nvmf_che_mbuf(void *arg, int how, void *data, size_t len) { struct nvmf_che_command_buffer *cb = arg; struct mbuf *m; m = m_get(how, MT_DATA); m->m_flags |= M_RDONLY; m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL); m->m_len = len; return (m); } static void nvmf_che_free_mext_pg(struct mbuf *m) { struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1; M_ASSERTEXTPG(m); che_release_command_buffer(cb); } static struct mbuf * nvmf_che_mext_pg(void *arg, int how) { struct nvmf_che_command_buffer *cb = arg; struct mbuf *m; m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY); m->m_ext.ext_arg1 = cb; che_hold_command_buffer(cb); return (m); } /* * Return an mbuf chain for a range of data belonging to a command * buffer. * * The mbuf chain uses M_EXT mbufs which hold references on the * command buffer so that it remains "alive" until the data has been * fully transmitted. If truncate_ok is true, then the mbuf chain * might return a short chain to avoid gratuitously splitting up a * page. */ static struct mbuf * nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb, uint32_t data_offset, uint32_t data_len, uint32_t *actual_len, bool can_truncate) { struct mbuf *m; size_t len; m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf, nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len, can_truncate); if (actual_len != NULL) *actual_len = len; return (m); } /* NB: cid and ttag and little-endian already. */ static void che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag, uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu) { struct nvme_tcp_h2c_data_hdr h2c; struct mbuf *top; memset(&h2c, 0, sizeof(h2c)); h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA; if (last_pdu) h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; h2c.cccid = cid; h2c.ttag = ttag; h2c.datao = htole32(data_offset); h2c.datal = htole32(len); top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len); nvmf_che_write_pdu(qp, top); } static int nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { const struct nvme_tcp_r2t_hdr *r2t; struct nvmf_che_command_buffer *cb; uint32_t data_len, data_offset; r2t = (const void *)pdu->hdr; mtx_lock(&qp->tx_buffers.lock); cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid); if (cb == NULL) { mtx_unlock(&qp->tx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD, offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } data_offset = le32toh(r2t->r2to); if (data_offset != cb->data_xfered) { mtx_unlock(&qp->tx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } /* * XXX: The spec does not specify how to handle R2T tranfers * out of range of the original command. 
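	 * This driver treats such a transfer as fatal: it is reported
	 * as DATA_TRANSFER_OUT_OF_RANGE below and the connection is
	 * torn down.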
*/ data_len = le32toh(r2t->r2tl); if (data_offset + data_len > cb->data_len) { mtx_unlock(&qp->tx_buffers.lock); nvmf_che_report_error(qp, NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m, pdu->hdr->hlen); nvmf_che_free_pdu(pdu); return (EBADMSG); } cb->data_xfered += data_len; if (cb->data_xfered == cb->data_len) che_remove_command_buffer(&qp->tx_buffers, cb); else che_hold_command_buffer(cb); mtx_unlock(&qp->tx_buffers.lock); /* * Queue one or more H2C_DATA PDUs containing the requested * data. */ while (data_len > 0) { struct mbuf *m; uint32_t sent, todo; todo = min(data_len, qp->max_tx_data); m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent, todo < data_len); che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m, sent, sent == data_len); data_offset += sent; data_len -= sent; } che_release_command_buffer(cb); nvmf_che_free_pdu(pdu); return (0); } static int nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { /* * The PDU header should always be contiguous in the mbuf from * CPL_NVMT_CMP. */ pdu->hdr = mtod(pdu->m, void *); KASSERT(pdu->m->m_len == pdu->hdr->hlen + ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ? sizeof(uint32_t) : 0), ("%s: mismatched PDU header mbuf length", __func__)); switch (pdu->hdr->pdu_type) { default: __assert_unreachable(); break; case NVME_TCP_PDU_TYPE_H2C_TERM_REQ: case NVME_TCP_PDU_TYPE_C2H_TERM_REQ: return (nvmf_che_handle_term_req(pdu)); case NVME_TCP_PDU_TYPE_CAPSULE_CMD: return (nvmf_che_save_command_capsule(qp, pdu)); case NVME_TCP_PDU_TYPE_CAPSULE_RESP: return (nvmf_che_save_response_capsule(qp, pdu)); case NVME_TCP_PDU_TYPE_H2C_DATA: return (nvmf_che_handle_h2c_data(qp, pdu)); case NVME_TCP_PDU_TYPE_C2H_DATA: return (nvmf_che_handle_c2h_data(qp, pdu)); case NVME_TCP_PDU_TYPE_R2T: return (nvmf_che_handle_r2t(qp, pdu)); } } static int nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu) { struct socket *so = qp->so; struct mbuf *m, *n; uint32_t tcp_seq; size_t len; int error; /* Check for DDP data. */ if (pdu->ddp) { counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1); counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets, pdu->data_len); return (0); } error = 0; len = pdu->data_len; tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq; m = pdu->m; SOCKBUF_LOCK(&so->so_rcv); while (len > 0) { n = mbufq_dequeue(&qp->rx_data); KASSERT(n != NULL, ("%s: missing %zu data", __func__, len)); if (n == NULL) { error = ENOBUFS; break; } KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq, ("%s: TCP seq mismatch", __func__)); KASSERT(n->m_pkthdr.len <= len, ("%s: too much data", __func__)); if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq || n->m_pkthdr.len > len) { m_freem(n); error = ENOBUFS; break; } #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq); #endif pdu->m->m_pkthdr.len += n->m_pkthdr.len; len -= n->m_pkthdr.len; tcp_seq += n->m_pkthdr.len; m_demote_pkthdr(n); m->m_next = n; m = m_last(n); } SOCKBUF_UNLOCK(&so->so_rcv); if (error == 0) { counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1); counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets, pdu->data_len); } return (error); } static void nvmf_che_receive(void *arg) { struct nvmf_che_qpair *qp = arg; struct socket *so = qp->so; struct nvmf_che_rxpdu pdu; struct mbuf *m; int error, terror; SOCKBUF_LOCK(&so->so_rcv); while (!qp->rx_shutdown) { /* Wait for a PDU. 
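		 * PDU headers arrive on qp->rx_pdus via CPL_NVMT_CMP
		 * and payloads on qp->rx_data via CPL_NVMT_DATA, both
		 * queued under the receive socket buffer lock.  This
		 * kthread sleeps on rx_cv until the CPL handler or the
		 * socket upcall wakes it.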
*/ if (so->so_error != 0 || so->so_rerror != 0) { if (so->so_error != 0) error = so->so_error; else error = so->so_rerror; SOCKBUF_UNLOCK(&so->so_rcv); error: nvmf_qpair_error(&qp->qp, error); SOCKBUF_LOCK(&so->so_rcv); while (!qp->rx_shutdown) cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); break; } m = mbufq_dequeue(&qp->rx_pdus); if (m == NULL) { if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) { error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto error; } cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv)); continue; } SOCKBUF_UNLOCK(&so->so_rcv); pdu.m = m; pdu.hdr = mtod(m, const void *); pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0; error = nvmf_che_validate_pdu(qp, &pdu); if (error == 0 && pdu.data_len != 0) error = nvmf_che_attach_pdu_data(qp, &pdu); if (error != 0) nvmf_che_free_pdu(&pdu); else error = nvmf_che_dispatch_pdu(qp, &pdu); if (error != 0) { /* * If we received a termination request, close * the connection immediately. */ if (error == ECONNRESET) goto error; /* * Wait for up to 30 seconds for the socket to * be closed by the other end. */ SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { terror = cv_timedwait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv), 30 * hz); if (terror == ETIMEDOUT) printf("NVMe/TCP: Timed out after sending terminate request\n"); } SOCKBUF_UNLOCK(&so->so_rcv); goto error; } SOCKBUF_LOCK(&so->so_rcv); } SOCKBUF_UNLOCK(&so->so_rcv); kthread_exit(); } static int nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag) { struct nvmf_che_qpair *qp = arg; cv_signal(&qp->rx_cv); return (SU_OK); } static int do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; const struct cpl_nvmt_data *cpl; u_int tid; struct toepcb *toep; struct nvmf_che_qpair *qp; struct socket *so; struct inpcb *inp; struct tcpcb *tp; int len __diagused; if (nca->nvmt_data_iqe) { cpl = (const void *)(rss + 1); } else { cpl = mtod(m, const void *); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); } tid = GET_TID(cpl); toep = lookup_tid(sc, tid); KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); len = m->m_pkthdr.len; KASSERT(len == be16toh(cpl->length), ("%s: payload length mismatch", __func__)); inp = toep->inp; tp = intotcpcb(inp); INP_WLOCK(inp); if (tp->t_flags & TF_DISCONNECTED) { CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), t_flags 0x%x", __func__, tid, len, tp->t_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } /* Save TCP sequence number. 
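	 * nvmf_che_attach_pdu_data() later matches data mbufs to their
	 * PDU header by comparing this saved sequence number.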
*/ m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); qp = toep->ulpcb; so = qp->so; SOCKBUF_LOCK(&so->so_rcv); mbufq_enqueue(&qp->rx_data, m); SOCKBUF_UNLOCK(&so->so_rcv); tp->t_rcvtime = ticks; #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len, be32toh(cpl->seq)); #endif INP_WUNLOCK(inp); return (0); } static int do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_nvmt_cmp *cpl = mtod(m, const void *); u_int tid = GET_TID(cpl); struct toepcb *toep = lookup_tid(sc, tid); struct nvmf_che_qpair *qp = toep->ulpcb; struct socket *so = qp->so; struct inpcb *inp = toep->inp; struct tcpcb *tp = intotcpcb(inp); u_int hlen __diagused; bool empty; KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); KASSERT(!(toep->flags & TPF_SYNQE), ("%s: toep %p claims to be a synq entry", __func__, toep)); /* strip off CPL header */ m_adj(m, sizeof(*cpl)); hlen = m->m_pkthdr.len; KASSERT(hlen == be16toh(cpl->length), ("%s: payload length mismatch", __func__)); INP_WLOCK(inp); if (tp->t_flags & TF_DISCONNECTED) { CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), t_flags 0x%x", __func__, tid, hlen, tp->t_flags); INP_WUNLOCK(inp); m_freem(m); return (0); } #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid, hlen, be32toh(cpl->seq), cpl->status); #endif /* Save TCP sequence number and CPL status. */ m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq); m->m_pkthdr.nvmf_cpl_status = cpl->status; SOCKBUF_LOCK(&so->so_rcv); empty = mbufq_len(&qp->rx_pdus) == 0; mbufq_enqueue(&qp->rx_pdus, m); SOCKBUF_UNLOCK(&so->so_rcv); INP_WUNLOCK(inp); if (empty) cv_signal(&qp->rx_cv); return (0); } static uint16_t che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid) { uint16_t new_cid; mtx_lock(&qp->fl_cid_lock); new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid); if (new_cid == 0) { new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0); MPASS(new_cid != 0); } new_cid--; FL_CID_BUSY(new_cid, qp->fl_cid_set); if (new_cid == CHE_MAX_FL_TAG) qp->next_cid = 0; else qp->next_cid = new_cid + 1; qp->fl_cids[new_cid] = original_cid; mtx_unlock(&qp->fl_cid_lock); return (new_cid | CHE_FL_TAG_MASK); } static uint16_t che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb) { mtx_assert(&qp->rx_buffers.lock, MA_OWNED); return (che_alloc_ddp_tag(qp, cb)); } static struct mbuf * che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) { struct nvmf_capsule *nc = &cc->nc; struct nvmf_che_command_buffer *cb; struct nvme_sgl_descriptor *sgl; struct nvme_tcp_cmd cmd; struct mbuf *top, *m; uint16_t cid; bool use_icd; use_icd = false; cb = NULL; m = NULL; if (nc->nc_data.io_len != 0) { cb = che_alloc_command_buffer(qp, &nc->nc_data, 0, nc->nc_data.io_len, nc->nc_sqe.cid); cb->original_cid = nc->nc_sqe.cid; if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) { cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); use_icd = true; m = nvmf_che_command_buffer_mbuf(cb, 0, nc->nc_data.io_len, NULL, false); cb->data_xfered = nc->nc_data.io_len; che_release_command_buffer(cb); } else if (nc->nc_send_data) { cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); cb->cid = htole16(cid); mtx_lock(&qp->tx_buffers.lock); che_add_command_buffer(&qp->tx_buffers, cb); mtx_unlock(&qp->tx_buffers.lock); } else { mtx_lock(&qp->rx_buffers.lock); cid = che_alloc_ddp_cid(qp, cb); if (cid == CHE_DDP_NO_TAG) { cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); 
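				/*
				 * Queue the buffer on rx_buffers so
				 * that the C2H_DATA handler can find
				 * it by CID and copy the payload in
				 * software.
				 */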
che_add_command_buffer(&qp->rx_buffers, cb); } cb->cid = htole16(cid); mtx_unlock(&qp->rx_buffers.lock); } } else cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid); #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__, qp->toep->tid, cid, nc->nc_sqe.cid); #endif memset(&cmd, 0, sizeof(cmd)); cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD; cmd.ccsqe = nc->nc_sqe; cmd.ccsqe.cid = htole16(cid); /* Populate SGL in SQE. */ sgl = &cmd.ccsqe.sgl; memset(sgl, 0, sizeof(*sgl)); sgl->address = 0; sgl->length = htole32(nc->nc_data.io_len); if (use_icd) { /* Use in-capsule data. */ sgl->type = NVME_SGL_TYPE_ICD; } else { /* Use a command buffer. */ sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER; } top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ? nc->nc_data.io_len : 0); return (top); } static struct mbuf * che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) { struct nvmf_capsule *nc = &cc->nc; struct nvme_tcp_rsp rsp; memset(&rsp, 0, sizeof(rsp)); rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP; rsp.rccqe = nc->nc_cqe; return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0)); } static struct mbuf * capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc) { if (cc->nc.nc_qe_len == sizeof(struct nvme_command)) return (che_command_pdu(qp, cc)); else return (che_response_pdu(qp, cc)); } static void nvmf_che_send(void *arg) { struct nvmf_che_qpair *qp = arg; struct nvmf_che_capsule *cc; struct socket *so = qp->so; struct mbuf *m; int error; m = NULL; SOCKBUF_LOCK(&so->so_snd); while (!qp->tx_shutdown) { if (so->so_error != 0) { error = so->so_error; SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); nvmf_qpair_error(&qp->qp, error); SOCKBUF_LOCK(&so->so_snd); while (!qp->tx_shutdown) cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); break; } if (STAILQ_EMPTY(&qp->tx_capsules)) { cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd)); continue; } /* Convert a capsule into a PDU. */ cc = STAILQ_FIRST(&qp->tx_capsules); STAILQ_REMOVE_HEAD(&qp->tx_capsules, link); SOCKBUF_UNLOCK(&so->so_snd); m = capsule_to_pdu(qp, cc); che_release_capsule(cc); nvmf_che_write_pdu(qp, m); SOCKBUF_LOCK(&so->so_snd); } SOCKBUF_UNLOCK(&so->so_snd); kthread_exit(); } static int nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace) { struct sockopt opt; int error, one = 1; /* Don't lower the buffer sizes, just enforce a minimum. */ SOCKBUF_LOCK(&so->so_snd); if (sspace < so->so_snd.sb_hiwat) sspace = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (rspace < so->so_rcv.sb_hiwat) rspace = so->so_rcv.sb_hiwat; SOCKBUF_UNLOCK(&so->so_rcv); error = soreserve(so, sspace, rspace); if (error != 0) return (error); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_rcv); /* * Disable Nagle. 
*/ bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error != 0) return (error); return (0); } static void t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val) { struct adapter *sc = td_adapter(toep->td); t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0); } static void set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda) { uint64_t val; CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u", __func__, toep->tid, ulp_submode, rxpda); val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode); t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE, V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val); val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val); val = V_TCB_RSVD((rxpda / 4) - 1); t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val); /* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. */ val = 0; t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ, V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val); } static u_int pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen, uint8_t pda) { u_int max_data_len; if (nvlist_get_bool(nvl, "header_digests")) hlen += sizeof(uint32_t); hlen = roundup(hlen, pda); max_data_len = max_pdu_len - hlen; if (nvlist_get_bool(nvl, "data_digests")) max_data_len -= sizeof(uint32_t); return (max_data_len); } static struct nvmf_qpair * che_allocate_qpair(bool controller, const nvlist_t *nvl) { struct nvmf_che_adapter *nca; struct nvmf_che_qpair *qp; struct adapter *sc; struct file *fp; struct socket *so; struct inpcb *inp; struct tcpcb *tp; struct toepcb *toep; cap_rights_t rights; u_int max_tx_pdu_len, num_ddp_tags; int error, ulp_submode; if (!nvlist_exists_number(nvl, "fd") || !nvlist_exists_number(nvl, "rxpda") || !nvlist_exists_number(nvl, "txpda") || !nvlist_exists_bool(nvl, "header_digests") || !nvlist_exists_bool(nvl, "data_digests") || !nvlist_exists_number(nvl, "maxr2t") || !nvlist_exists_number(nvl, "maxh2cdata") || !nvlist_exists_number(nvl, "max_icd")) return (NULL); error = fget(curthread, nvlist_get_number(nvl, "fd"), cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (NULL); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (NULL); } so = fp->f_data; if (so->so_type != SOCK_STREAM || so->so_proto->pr_protocol != IPPROTO_TCP) { fdrop(fp, curthread); return (NULL); } sc = find_offload_adapter(so); if (sc == NULL) { fdrop(fp, curthread); return (NULL); } nca = sc->nvme_ulp_softc; /* * Controller: Require advertised MAXH2CDATA to be small * enough. */ if (controller) { u_int max_rx_data; max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, sizeof(struct nvme_tcp_h2c_data_hdr), nvlist_get_number(nvl, "rxpda")); if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) { fdrop(fp, curthread); return (NULL); } } /* * Host: Require the queue size to be small enough that all of * the command ids allocated by nvmf(4) will fit in the * unallocated range. * * XXX: Alternatively this driver could just queue commands * when an unallocated ID isn't available. 
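	 *
	 * For example, an I/O queue with qsize 1024 uses at most 1023
	 * concurrent CIDs, and an admin queue adds up to 8 outstanding
	 * AERs; both are far below the 32768 (CHE_NUM_FL_TAGS)
	 * unallocated CIDs with the MSB set.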
*/ if (!controller) { u_int num_commands; num_commands = nvlist_get_number(nvl, "qsize") - 1; if (nvlist_get_bool(nvl, "admin")) num_commands += 8; /* Max AER */ if (num_commands > CHE_NUM_FL_TAGS) { fdrop(fp, curthread); return (NULL); } } qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO); qp->txpda = nvlist_get_number(nvl, "txpda"); qp->rxpda = nvlist_get_number(nvl, "rxpda"); qp->header_digests = nvlist_get_bool(nvl, "header_digests"); qp->data_digests = nvlist_get_bool(nvl, "data_digests"); qp->maxr2t = nvlist_get_number(nvl, "maxr2t"); if (controller) qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata"); if (controller) { /* NB: maxr2t is 0's based. */ qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS, nvlist_get_number(nvl, "qsize") * ((uint64_t)qp->maxr2t + 1)); qp->open_fl_ttags = mallocarray(qp->num_fl_ttags, sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO); } else { qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS, sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO); qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE, M_WAITOK); FL_CID_INIT(qp->fl_cid_set); mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF); } inp = sotoinpcb(so); INP_WLOCK(inp); tp = intotcpcb(inp); if (tp->t_flags & TF_DISCONNECTED) { INP_WUNLOCK(inp); free(qp->fl_cid_set, M_NVMF_CHE); free(qp->fl_cids, M_NVMF_CHE); free(qp->open_fl_ttags, M_NVMF_CHE); free(qp, M_NVMF_CHE); fdrop(fp, curthread); return (NULL); } MPASS(tp->t_flags & TF_TOE); MPASS(tp->tod != NULL); MPASS(tp->t_toe != NULL); toep = tp->t_toe; MPASS(toep->vi->adapter == sc); if (ulp_mode(toep) != ULP_MODE_NONE) { INP_WUNLOCK(inp); free(qp->fl_cid_set, M_NVMF_CHE); free(qp->fl_cids, M_NVMF_CHE); free(qp->open_fl_ttags, M_NVMF_CHE); free(qp, M_NVMF_CHE); fdrop(fp, curthread); return (NULL); } /* Claim socket from file descriptor. */ fp->f_ops = &badfileops; fp->f_data = NULL; qp->so = so; qp->toep = toep; qp->nca = nca; refcount_init(&qp->refs, 1); /* NB: C2H and H2C headers are the same size. */ qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu, sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda); qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu, sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda); if (!controller) { qp->max_tx_data = min(qp->max_tx_data, nvlist_get_number(nvl, "maxh2cdata")); qp->max_icd = min(nvlist_get_number(nvl, "max_icd"), pdu_max_data_len(nvl, nca->max_transmit_pdu, sizeof(struct nvme_tcp_cmd), qp->txpda)); } else { /* * IOCCSZ represents the size of a logical command * capsule including the 64 byte SQE and the * in-capsule data. Use pdu_max_data_len to compute * the maximum supported ICD length. */ qp->max_ioccsz = rounddown(pdu_max_data_len(nvl, nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd), qp->rxpda), 16) + sizeof(struct nvme_command); } ulp_submode = 0; if (qp->header_digests) ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC; if (qp->data_digests) ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC; if (!controller) ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR; max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr); if (qp->header_digests) max_tx_pdu_len += sizeof(uint32_t); max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda); max_tx_pdu_len += qp->max_tx_data; if (qp->data_digests) max_tx_pdu_len += sizeof(uint32_t); /* TODO: ISO limits */ if (controller) { /* Use the SUCCESS flag if SQ flow control is disabled. 
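 *
 * When SQ flow control is off, NVMe/TCP lets the controller elide the
 * response capsule for a successful read by setting the SUCCESS flag
 * on the final C2H_DATA PDU, as che_send_c2h_pdu() does further down:
 *
 *	if (success)
 *		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;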
*/ qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control"); } toep->params.ulp_mode = ULP_MODE_NVMET; toep->ulpcb = qp; send_txdataplen_max_flowc_wr(sc, toep, roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg)); set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda); INP_WUNLOCK(inp); fdrop(fp, curthread); error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu); if (error != 0) { free(qp->fl_cid_set, M_NVMF_CHE); free(qp->fl_cids, M_NVMF_CHE); free(qp->open_fl_ttags, M_NVMF_CHE); free(qp, M_NVMF_CHE); return (NULL); } num_ddp_tags = ddp_tags_per_qp; if (num_ddp_tags > 0) { qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags); if (qp->tpt_offset != T4_STAG_UNSET) { #ifdef VERBOSE_TRACES CTR(KTR_CXGBE, "%s: tid %u using %u tags at offset 0x%x", __func__, toep->tid, num_ddp_tags, qp->tpt_offset); #endif qp->num_ddp_tags = num_ddp_tags; qp->open_ddp_tags = mallocarray(qp->num_ddp_tags, sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK | M_ZERO); t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET, M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset)); } } TAILQ_INIT(&qp->rx_buffers.head); TAILQ_INIT(&qp->tx_buffers.head); mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF); mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF); cv_init(&qp->rx_cv, "-"); cv_init(&qp->tx_cv, "-"); mbufq_init(&qp->rx_data, 0); mbufq_init(&qp->rx_pdus, 0); STAILQ_INIT(&qp->tx_capsules); /* Register socket upcall for receive to handle remote FIN. */ SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp); SOCKBUF_UNLOCK(&so->so_rcv); /* Spin up kthreads. */ error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0, "nvmef che rx"); if (error != 0) { che_free_qpair(&qp->qp); return (NULL); } error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0, "nvmef che tx"); if (error != 0) { che_free_qpair(&qp->qp); return (NULL); } return (&qp->qp); } static void che_release_qpair(struct nvmf_che_qpair *qp) { if (refcount_release(&qp->refs)) free(qp, M_NVMF_CHE); } static void che_free_qpair(struct nvmf_qpair *nq) { struct nvmf_che_qpair *qp = CQP(nq); struct nvmf_che_command_buffer *ncb, *cb; struct nvmf_che_capsule *ncc, *cc; struct socket *so = qp->so; struct toepcb *toep = qp->toep; struct inpcb *inp = sotoinpcb(so); /* Shut down kthreads. 
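 *
 * The per-thread handshake below is (sketch, using the TX thread):
 *
 *	SOCKBUF_LOCK(&so->so_snd);
 *	qp->tx_shutdown = true;			/* request exit */
 *	cv_signal(&qp->tx_cv);			/* kick cv_wait() */
 *	mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, "nvchetx", 0);
 *	SOCKBUF_UNLOCK(&so->so_snd);
 *
 * The sleep uses the thread pointer as its wait channel on the
 * assumption that kthread_exit() issues a wakeup on the exiting
 * thread.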
*/ SOCKBUF_LOCK(&so->so_snd); qp->tx_shutdown = true; if (qp->tx_thread != NULL) { cv_signal(&qp->tx_cv); mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0, "nvchetx", 0); } SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); qp->rx_shutdown = true; if (qp->rx_thread != NULL) { cv_signal(&qp->rx_cv); mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0, "nvcherx", 0); } soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); mbufq_drain(&qp->rx_data); mbufq_drain(&qp->rx_pdus); STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) { nvmf_abort_capsule_data(&cc->nc, ECONNABORTED); che_release_capsule(cc); } cv_destroy(&qp->tx_cv); cv_destroy(&qp->rx_cv); if (qp->open_fl_ttags != NULL) { for (u_int i = 0; i < qp->num_fl_ttags; i++) { cb = qp->open_fl_ttags[i]; if (cb != NULL) { cb->cc->active_r2ts--; cb->error = ECONNABORTED; che_release_command_buffer(cb); } } free(qp->open_fl_ttags, M_NVMF_CHE); } if (qp->num_ddp_tags != 0) { for (u_int i = 0; i < qp->num_ddp_tags; i++) { cb = qp->open_ddp_tags[i]; if (cb != NULL) { if (cb->cc != NULL) cb->cc->active_r2ts--; cb->error = ECONNABORTED; mtx_lock(&qp->rx_buffers.lock); che_free_ddp_tag(qp, cb, cb->ttag); mtx_unlock(&qp->rx_buffers.lock); che_release_command_buffer(cb); } } free(qp->open_ddp_tags, M_NVMF_CHE); } mtx_lock(&qp->rx_buffers.lock); TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) { che_remove_command_buffer(&qp->rx_buffers, cb); mtx_unlock(&qp->rx_buffers.lock); #ifdef INVARIANTS if (cb->cc != NULL) cb->cc->pending_r2ts--; #endif cb->error = ECONNABORTED; che_release_command_buffer(cb); mtx_lock(&qp->rx_buffers.lock); } mtx_destroy(&qp->rx_buffers.lock); mtx_lock(&qp->tx_buffers.lock); TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) { che_remove_command_buffer(&qp->tx_buffers, cb); mtx_unlock(&qp->tx_buffers.lock); cb->error = ECONNABORTED; che_release_command_buffer(cb); mtx_lock(&qp->tx_buffers.lock); } mtx_destroy(&qp->tx_buffers.lock); if (qp->num_ddp_tags != 0) t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags); if (!qp->qp.nq_controller) { free(qp->fl_cids, M_NVMF_CHE); free(qp->fl_cid_set, M_NVMF_CHE); mtx_destroy(&qp->fl_cid_lock); } INP_WLOCK(inp); toep->ulpcb = NULL; mbufq_drain(&toep->ulp_pduq); /* * Grab a reference to use when waiting for the final CPL to * be received. If toep->inp is NULL, then * final_cpl_received() has already been called (e.g. due to * the peer sending a RST). */ if (toep->inp != NULL) { toep = hold_toepcb(toep); toep->flags |= TPF_WAITING_FOR_FINAL; } else toep = NULL; INP_WUNLOCK(inp); soclose(so); /* * Wait for the socket to fully close. This ensures any * pending received data has been received (and in particular, * any data that would be received by DDP has been handled). */ if (toep != NULL) { struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); mtx_lock(lock); while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0) mtx_sleep(toep, lock, PSOCK, "conclo2", 0); mtx_unlock(lock); free_toepcb(toep); } che_release_qpair(qp); } static uint32_t che_max_ioccsz(struct nvmf_qpair *nq) { struct nvmf_che_qpair *qp = CQP(nq); /* * Limit the command capsule size so that with maximum ICD it * fits within the limit of the largest PDU the adapter can * receive. */ return (qp->max_ioccsz); } static uint64_t che_max_xfer_size(struct nvmf_qpair *nq) { struct nvmf_che_qpair *qp = CQP(nq); /* * Limit host transfers to the size of the data payload in the * largest PDU the adapter can receive. 
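 *
 * Worked example (assumed values): with a 32512-byte maximum receive
 * PDU, a 24-byte C2H_DATA header, both digests enabled, and an RXPDA
 * of 4, pdu_max_data_len() yields
 *
 *	hlen = roundup(24 + 4, 4) = 28
 *	max_rx_data = 32512 - 28 - 4 = 32480
 *
 * bytes of data per PDU.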
*/ return (qp->max_rx_data); } static struct nvmf_capsule * che_allocate_capsule(struct nvmf_qpair *nq, int how) { struct nvmf_che_qpair *qp = CQP(nq); struct nvmf_che_capsule *cc; cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO); if (cc == NULL) return (NULL); refcount_init(&cc->refs, 1); refcount_acquire(&qp->refs); return (&cc->nc); } static void che_release_capsule(struct nvmf_che_capsule *cc) { struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair); if (!refcount_release(&cc->refs)) return; MPASS(cc->active_r2ts == 0); MPASS(cc->pending_r2ts == 0); nvmf_che_free_pdu(&cc->rx_pdu); free(cc, M_NVMF_CHE); che_release_qpair(qp); } static void che_free_capsule(struct nvmf_capsule *nc) { che_release_capsule(CCAP(nc)); } static int che_transmit_capsule(struct nvmf_capsule *nc) { struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); struct nvmf_che_capsule *cc = CCAP(nc); struct socket *so = qp->so; refcount_acquire(&cc->refs); SOCKBUF_LOCK(&so->so_snd); STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link); cv_signal(&qp->tx_cv); SOCKBUF_UNLOCK(&so->so_snd); return (0); } static uint8_t che_validate_command_capsule(struct nvmf_capsule *nc) { struct nvmf_che_capsule *cc = CCAP(nc); struct nvme_sgl_descriptor *sgl; KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received")); sgl = &nc->nc_sqe.sgl; switch (sgl->type) { case NVME_SGL_TYPE_ICD: if (cc->rx_pdu.data_len != le32toh(sgl->length)) { printf("NVMe/TCP: Command Capsule with mismatched ICD length\n"); return (NVME_SC_DATA_SGL_LENGTH_INVALID); } break; case NVME_SGL_TYPE_COMMAND_BUFFER: if (cc->rx_pdu.data_len != 0) { printf("NVMe/TCP: Command Buffer SGL with ICD\n"); return (NVME_SC_INVALID_FIELD); } break; default: printf("NVMe/TCP: Invalid SGL type in Command Capsule\n"); return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID); } if (sgl->address != 0) { printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n"); return (NVME_SC_SGL_OFFSET_INVALID); } return (NVME_SC_SUCCESS); } static size_t che_capsule_data_len(const struct nvmf_capsule *nc) { MPASS(nc->nc_qe_len == sizeof(struct nvme_command)); return (le32toh(nc->nc_sqe.sgl.length)); } static void che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io) { struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); struct nvmf_che_capsule *cc = CCAP(nc); struct nvmf_che_command_buffer *cb; cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len, nc->nc_sqe.cid); cb->cc = cc; refcount_acquire(&cc->refs); /* * If this command has too many active R2Ts or there are no * available transfer tags, queue the request for later. * * NB: maxr2t is 0's based. */ mtx_lock(&qp->rx_buffers.lock); if (cc->active_r2ts > qp->maxr2t || !nvmf_che_allocate_ttag(qp, cb)) { #ifdef INVARIANTS cc->pending_r2ts++; #endif TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link); mtx_unlock(&qp->rx_buffers.lock); return; } mtx_unlock(&qp->rx_buffers.lock); che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len); } static void che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io) { struct nvmf_che_capsule *cc = CCAP(nc); /* * The header is in rx_pdu.m, the padding is discarded, and * the data starts at rx_pdu.m->m_next. 
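 *
 * Conceptually (ignoring the scatter/gather layout described by 'io'),
 * the copy below behaves like copying into a flat buffer 'buf'
 * (hypothetical, illustration only):
 *
 *	m_copydata(cc->rx_pdu.m->m_next, data_offset, io->io_len, buf);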
*/ mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0); nvmf_complete_io_request(io, io->io_len, 0); } static int che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct nvmf_io_request *io) { struct nvme_sgl_descriptor *sgl; size_t data_len; if (nc->nc_qe_len != sizeof(struct nvme_command) || !nc->nc_qpair->nq_controller) return (EINVAL); sgl = &nc->nc_sqe.sgl; data_len = le32toh(sgl->length); if (data_offset + io->io_len > data_len) return (EFBIG); if (sgl->type == NVME_SGL_TYPE_ICD) che_receive_icd_data(nc, data_offset, io); else che_receive_r2t_data(nc, data_offset, io); return (0); } /* NB: cid is little-endian already. */ static void che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu, bool success) { struct nvme_tcp_c2h_data_hdr c2h; struct mbuf *top; memset(&c2h, 0, sizeof(c2h)); c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA; if (last_pdu) c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU; if (success) c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS; c2h.cccid = cid; c2h.datao = htole32(data_offset); c2h.datal = htole32(len); top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len); nvmf_che_write_pdu(qp, top); } static u_int che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset, struct mbuf *m, size_t len) { struct nvmf_che_qpair *qp = CQP(nc->nc_qpair); struct nvme_sgl_descriptor *sgl; uint32_t data_len; bool last_pdu, last_xfer; if (nc->nc_qe_len != sizeof(struct nvme_command) || !qp->qp.nq_controller) { m_freem(m); return (NVME_SC_INVALID_FIELD); } sgl = &nc->nc_sqe.sgl; data_len = le32toh(sgl->length); if (data_offset + len > data_len) { m_freem(m); return (NVME_SC_INVALID_FIELD); } last_xfer = (data_offset + len == data_len); if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) { m_freem(m); return (NVME_SC_INVALID_FIELD); } KASSERT(data_offset == CCAP(nc)->tx_data_offset, ("%s: starting data_offset %u doesn't match end of previous xfer %u", __func__, data_offset, CCAP(nc)->tx_data_offset)); /* Queue one or more C2H_DATA PDUs containing the data from 'm'. 
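 *
 * Example (assumed numbers): with qp->max_tx_data of 8192 and a
 * 20000-byte final transfer whose mbuf chain happens to split exactly
 * at that limit, the loop emits three PDUs carrying 8192, 8192, and
 * 3616 bytes, with LAST_PDU (and SUCCESS, if enabled) set only on the
 * last one.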
*/ while (m != NULL) { struct mbuf *n; uint32_t todo; if (m->m_len > qp->max_tx_data) { n = m_split(m, qp->max_tx_data, M_WAITOK); todo = m->m_len; } else { struct mbuf *p; todo = m->m_len; p = m; n = p->m_next; while (n != NULL) { if (todo + n->m_len > qp->max_tx_data) { p->m_next = NULL; break; } todo += n->m_len; p = n; n = p->m_next; } MPASS(m_length(m, NULL) == todo); } last_pdu = (n == NULL && last_xfer); che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo, last_pdu, last_pdu && qp->send_success); data_offset += todo; data_len -= todo; m = n; } MPASS(data_len == 0); #ifdef INVARIANTS CCAP(nc)->tx_data_offset = data_offset; #endif if (!last_xfer) return (NVMF_MORE); else if (qp->send_success) return (NVMF_SUCCESS_SENT); else return (NVME_SC_SUCCESS); } struct nvmf_transport_ops che_ops = { .allocate_qpair = che_allocate_qpair, .free_qpair = che_free_qpair, .max_ioccsz = che_max_ioccsz, .max_xfer_size = che_max_xfer_size, .allocate_capsule = che_allocate_capsule, .free_capsule = che_free_capsule, .transmit_capsule = che_transmit_capsule, .validate_command_capsule = che_validate_command_capsule, .capsule_data_len = che_capsule_data_len, .receive_controller_data = che_receive_controller_data, .send_controller_data = che_send_controller_data, .trtype = NVMF_TRTYPE_TCP, .priority = 10, }; NVMF_TRANSPORT(che, che_ops); static void read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len, uint32_t *max_rx_pdu_len) { uint32_t tx_len, rx_len, r, v; /* Copied from cxgbei, but not sure if this is correct. */ rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE); tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); r = t4_read_reg(sc, A_TP_PARA_REG2); rx_len = min(rx_len, G_MAXRXDATA(r)); tx_len = min(tx_len, G_MAXRXDATA(r)); r = t4_read_reg(sc, A_TP_PARA_REG7); v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r)); rx_len = min(rx_len, v); tx_len = min(tx_len, v); /* Cannot be larger than 32KB - 256. 
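 *
 * That is, 32 * 1024 - 256 = 32512 bytes.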
*/ rx_len = min(rx_len, 32512); tx_len = min(tx_len, 32512); *max_tx_pdu_len = tx_len; *max_rx_pdu_len = rx_len; } static int nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca) { struct sysctl_oid *oid; struct sysctl_oid_list *children; uint32_t val; read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu); if (nca->max_transmit_pdu > che_max_transmit_pdu) nca->max_transmit_pdu = che_max_transmit_pdu; if (nca->max_receive_pdu > che_max_receive_pdu) nca->max_receive_pdu = che_max_receive_pdu; val = t4_read_reg(sc, A_SGE_CONTROL2); nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0; sysctl_ctx_init(&nca->ctx); oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */ children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings"); children = SYSCTL_CHILDREN(oid); nca->ddp_threshold = 8192; SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold", CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold"); SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu", CTLFLAG_RW, &nca->max_transmit_pdu, 0, "Maximum size of a transmitted PDU"); SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu", CTLFLAG_RW, &nca->max_receive_pdu, 0, "Maximum size of a received PDU"); return (0); } static void nvmf_che_destroy(struct nvmf_che_adapter *nca) { sysctl_ctx_free(&nca->ctx); free(nca, M_CXGBE); } static int nvmf_che_activate(struct adapter *sc) { struct nvmf_che_adapter *nca; int rc; ASSERT_SYNCHRONIZED_OP(sc); if (uld_active(sc, ULD_NVME)) { KASSERT(0, ("%s: NVMe offload already enabled on adapter %p", __func__, sc)); return (0); } if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) { device_printf(sc->dev, "not NVMe offload capable, or capability disabled\n"); return (ENOSYS); } /* per-adapter softc for NVMe */ nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK); nca->sc = sc; rc = nvmf_che_init(sc, nca); if (rc != 0) { free(nca, M_CXGBE); return (rc); } sc->nvme_ulp_softc = nca; return (0); } static int nvmf_che_deactivate(struct adapter *sc) { struct nvmf_che_adapter *nca = sc->nvme_ulp_softc; ASSERT_SYNCHRONIZED_OP(sc); if (nca != NULL) { nvmf_che_destroy(nca); sc->nvme_ulp_softc = NULL; } return (0); } static void nvmf_che_activate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0) return; /* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. 
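 *
 * offload_map is a bitmask of ports with TOE active, so any non-zero
 * value (e.g. 0x1 when only port 0 has IFCAP_TOE set) enables the ULP
 * here.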
*/ if (sc->offload_map && !uld_active(sc, ULD_NVME)) (void) t4_activate_uld(sc, ULD_NVME); end_synchronized_op(sc, 0); } static void nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused) { if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0) return; if (uld_active(sc, ULD_NVME)) (void) t4_deactivate_uld(sc, ULD_NVME); end_synchronized_op(sc, 0); } static struct uld_info nvmf_che_uld_info = { .uld_activate = nvmf_che_activate, .uld_deactivate = nvmf_che_deactivate, }; static int nvmf_che_mod_load(void) { int rc; t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp); t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data); rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME); if (rc != 0) return (rc); t4_iterate(nvmf_che_activate_all, NULL); return (rc); } static int nvmf_che_mod_unload(void) { t4_iterate(nvmf_che_deactivate_all, NULL); if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY) return (EBUSY); t4_register_cpl_handler(CPL_NVMT_CMP, NULL); t4_register_cpl_handler(CPL_NVMT_DATA, NULL); return (0); } #endif static int nvmf_che_modevent(module_t mod, int cmd, void *arg) { int rc; #ifdef TCP_OFFLOAD switch (cmd) { case MOD_LOAD: rc = nvmf_che_mod_load(); break; case MOD_UNLOAD: rc = nvmf_che_mod_unload(); break; default: rc = EOPNOTSUPP; break; } #else printf("nvmf_che: compiled without TCP_OFFLOAD support.\n"); rc = EOPNOTSUPP; #endif return (rc); } static moduledata_t nvmf_che_mod = { "nvmf_che", nvmf_che_modevent, NULL, }; MODULE_VERSION(nvmf_che, 1); DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1); MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1);
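
/*
 * For reference (sketch matching cxgbe's cpl_handler_t; the actual
 * definitions of do_nvmt_cmp() and do_nvmt_data() appear earlier in
 * this file), the handlers registered in nvmf_che_mod_load() have the
 * shape:
 *
 *	static int
 *	do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss,
 *	    struct mbuf *m);
 */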