1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (C) 2012-2014 Intel Corporation
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/bus.h>
31 #include <sys/conf.h>
32 #include <sys/domainset.h>
33 #include <sys/proc.h>
34 #include <sys/sbuf.h>
35
36 #include <dev/pci/pcivar.h>
37
38 #include "nvme_private.h"
39
40 typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t;
41 #define DO_NOT_RETRY 1
42
43 static void _nvme_qpair_submit_request(struct nvme_qpair *qpair,
44 struct nvme_request *req);
45 static void nvme_qpair_destroy(struct nvme_qpair *qpair);
46
47 static const char *
get_opcode_string(bool admin,uint8_t opc,char * buf,size_t len)48 get_opcode_string(bool admin, uint8_t opc, char *buf, size_t len)
49 {
50 struct sbuf sb;
51
52 sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
53 nvme_opcode_sbuf(admin, opc, &sb);
54 if (sbuf_finish(&sb) != 0)
55 return ("");
56 return (buf);
57 }
58
59 static void
nvme_admin_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)60 nvme_admin_qpair_print_command(struct nvme_qpair *qpair,
61 struct nvme_command *cmd)
62 {
63 char buf[64];
64
65 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%x "
66 "cdw10:%08x cdw11:%08x\n",
67 get_opcode_string(true, cmd->opc, buf, sizeof(buf)), qpair->id,
68 cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10),
69 le32toh(cmd->cdw11));
70 }
71
72 static void
nvme_io_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)73 nvme_io_qpair_print_command(struct nvme_qpair *qpair,
74 struct nvme_command *cmd)
75 {
76 char buf[64];
77
78 switch (cmd->opc) {
79 case NVME_OPC_WRITE:
80 case NVME_OPC_READ:
81 case NVME_OPC_WRITE_UNCORRECTABLE:
82 case NVME_OPC_COMPARE:
83 case NVME_OPC_WRITE_ZEROES:
84 case NVME_OPC_VERIFY:
85 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d "
86 "lba:%llu len:%d\n",
87 get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
88 qpair->id, cmd->cid, le32toh(cmd->nsid),
89 ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10),
90 (le32toh(cmd->cdw12) & 0xFFFF) + 1);
91 break;
92 default:
93 nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n",
94 get_opcode_string(false, cmd->opc, buf, sizeof(buf)),
95 qpair->id, cmd->cid, le32toh(cmd->nsid));
96 break;
97 }
98 }
99
100 void
nvme_qpair_print_command(struct nvme_qpair * qpair,struct nvme_command * cmd)101 nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd)
102 {
103 if (qpair->id == 0)
104 nvme_admin_qpair_print_command(qpair, cmd);
105 else
106 nvme_io_qpair_print_command(qpair, cmd);
107 if (nvme_verbose_cmd_dump) {
108 nvme_printf(qpair->ctrlr,
109 "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n",
110 cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr,
111 (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2);
112 nvme_printf(qpair->ctrlr,
113 "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n",
114 cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14,
115 cmd->cdw15);
116 }
117 }
118
119 static const char *
get_status_string(const struct nvme_completion * cpl,char * buf,size_t len)120 get_status_string(const struct nvme_completion *cpl, char *buf, size_t len)
121 {
122 struct sbuf sb;
123
124 sbuf_new(&sb, buf, len, SBUF_FIXEDLEN);
125 nvme_sc_sbuf(cpl, &sb);
126 if (sbuf_finish(&sb) != 0)
127 return ("");
128 return (buf);
129 }
130
131 void
nvme_qpair_print_completion(struct nvme_qpair * qpair,struct nvme_completion * cpl)132 nvme_qpair_print_completion(struct nvme_qpair *qpair,
133 struct nvme_completion *cpl)
134 {
135 char buf[64];
136 uint8_t crd, m, dnr, p;
137
138 crd = NVME_STATUS_GET_CRD(cpl->status);
139 m = NVME_STATUS_GET_M(cpl->status);
140 dnr = NVME_STATUS_GET_DNR(cpl->status);
141 p = NVME_STATUS_GET_P(cpl->status);
142
143 nvme_printf(qpair->ctrlr, "%s crd:%x m:%x dnr:%x p:%d "
144 "sqid:%d cid:%d cdw0:%x\n",
145 get_status_string(cpl, buf, sizeof(buf)), crd, m, dnr, p,
146 cpl->sqid, cpl->cid, cpl->cdw0);
147 }
148
149 static bool
nvme_completion_is_retry(const struct nvme_completion * cpl)150 nvme_completion_is_retry(const struct nvme_completion *cpl)
151 {
152 uint8_t sct, sc, dnr;
153
154 sct = NVME_STATUS_GET_SCT(cpl->status);
155 sc = NVME_STATUS_GET_SC(cpl->status);
156 dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */
157
158 /*
159 * TODO: spec is not clear how commands that are aborted due
160 * to TLER will be marked. So for now, it seems
161 * NAMESPACE_NOT_READY is the only case where we should
162 * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST
163 * set the DNR bit correctly since the driver controls that.
164 */
165 switch (sct) {
166 case NVME_SCT_GENERIC:
167 switch (sc) {
168 case NVME_SC_ABORTED_BY_REQUEST:
169 case NVME_SC_NAMESPACE_NOT_READY:
170 if (dnr)
171 return (0);
172 else
173 return (1);
174 case NVME_SC_INVALID_OPCODE:
175 case NVME_SC_INVALID_FIELD:
176 case NVME_SC_COMMAND_ID_CONFLICT:
177 case NVME_SC_DATA_TRANSFER_ERROR:
178 case NVME_SC_ABORTED_POWER_LOSS:
179 case NVME_SC_INTERNAL_DEVICE_ERROR:
180 case NVME_SC_ABORTED_SQ_DELETION:
181 case NVME_SC_ABORTED_FAILED_FUSED:
182 case NVME_SC_ABORTED_MISSING_FUSED:
183 case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
184 case NVME_SC_COMMAND_SEQUENCE_ERROR:
185 case NVME_SC_LBA_OUT_OF_RANGE:
186 case NVME_SC_CAPACITY_EXCEEDED:
187 default:
188 return (0);
189 }
190 case NVME_SCT_COMMAND_SPECIFIC:
191 case NVME_SCT_MEDIA_ERROR:
192 return (0);
193 case NVME_SCT_PATH_RELATED:
194 switch (sc) {
195 case NVME_SC_INTERNAL_PATH_ERROR:
196 if (dnr)
197 return (0);
198 else
199 return (1);
200 default:
201 return (0);
202 }
203 case NVME_SCT_VENDOR_SPECIFIC:
204 default:
205 return (0);
206 }
207 }
208
/*
 * Complete the tracker for a finished command.  Handles retry accounting,
 * optional error logging, the request callback, and recycling (or
 * resubmitting) the tracker.  Must be entered without the qpair lock held;
 * the lock is taken below only to update the tracker lists and possibly
 * kick one queued request.
 */
static void
nvme_qpair_complete_tracker(struct nvme_tracker *tr,
    struct nvme_completion *cpl, error_print_t print_on_error)
{
	struct nvme_qpair *qpair = tr->qpair;
	struct nvme_request *req;
	bool retry, error, retriable;

	mtx_assert(&qpair->lock, MA_NOTOWNED);

	req = tr->req;
	error = nvme_completion_is_error(cpl);
	retriable = nvme_completion_is_retry(cpl);
	/* Retry only retriable errors that haven't exhausted the budget. */
	retry = error && retriable && req->retries < nvme_retry_count;
	if (retry)
		qpair->num_retries++;
	/* A retriable error that has run out of retries counts as a failure. */
	if (error && req->retries >= nvme_retry_count && retriable)
		qpair->num_failures++;

	if (error && (print_on_error == ERROR_PRINT_ALL ||
	    (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->act_tr[cpl->cid] = NULL;

	KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n"));

	if (!retry) {
		if (req->payload_valid) {
			bus_dmamap_sync(qpair->dma_tag_payload,
			    tr->payload_dma_map,
			    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		}
		/* The completion callback runs without the qpair lock held. */
		if (req->cb_fn)
			req->cb_fn(req->cb_arg, cpl);
	}

	mtx_lock(&qpair->lock);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
	} else {
		if (req->payload_valid) {
			bus_dmamap_unload(qpair->dma_tag_payload,
			    tr->payload_dma_map);
		}

		nvme_free_request(req);
		tr->req = NULL;

		TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq);
		TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);

		/*
		 * If the controller is in the middle of resetting, don't
		 * try to submit queued requests here - let the reset logic
		 * handle that instead.
		 */
		if (!STAILQ_EMPTY(&qpair->queued_req) &&
		    !qpair->ctrlr->is_resetting) {
			req = STAILQ_FIRST(&qpair->queued_req);
			STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
			_nvme_qpair_submit_request(qpair, req);
		}
	}

	mtx_unlock(&qpair->lock);
}
280
281 static uint32_t
nvme_qpair_make_status(uint32_t sct,uint32_t sc,uint32_t dnr)282 nvme_qpair_make_status(uint32_t sct, uint32_t sc, uint32_t dnr)
283 {
284 uint32_t status = 0;
285
286 status |= NVMEF(NVME_STATUS_SCT, sct);
287 status |= NVMEF(NVME_STATUS_SC, sc);
288 status |= NVMEF(NVME_STATUS_DNR, dnr);
289 /* M=0 : this is artificial so no data in error log page */
290 /* CRD=0 : this is artificial and no delayed retry support anyway */
291 /* P=0 : phase not checked */
292 return (status);
293 }
294
295 static void
nvme_qpair_manual_complete_tracker(struct nvme_tracker * tr,uint32_t sct,uint32_t sc,uint32_t dnr,error_print_t print_on_error)296 nvme_qpair_manual_complete_tracker(
297 struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
298 error_print_t print_on_error)
299 {
300 struct nvme_completion cpl;
301 struct nvme_qpair * qpair = tr->qpair;
302
303 mtx_assert(&qpair->lock, MA_NOTOWNED);
304
305 memset(&cpl, 0, sizeof(cpl));
306
307 cpl.sqid = qpair->id;
308 cpl.cid = tr->cid;
309 cpl.status = nvme_qpair_make_status(sct, sc, dnr);
310 nvme_qpair_complete_tracker(tr, &cpl, print_on_error);
311 }
312
313 static void
nvme_qpair_manual_complete_request(struct nvme_qpair * qpair,struct nvme_request * req,uint32_t sct,uint32_t sc,uint32_t dnr,error_print_t print_on_error)314 nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
315 struct nvme_request *req, uint32_t sct, uint32_t sc, uint32_t dnr,
316 error_print_t print_on_error)
317 {
318 struct nvme_completion cpl;
319 bool error;
320
321 memset(&cpl, 0, sizeof(cpl));
322 cpl.sqid = qpair->id;
323 cpl.status = nvme_qpair_make_status(sct, sc, dnr);
324 error = nvme_completion_is_error(&cpl);
325
326 if (error && print_on_error == ERROR_PRINT_ALL) {
327 nvme_qpair_print_command(qpair, &req->cmd);
328 nvme_qpair_print_completion(qpair, &cpl);
329 }
330
331 if (req->cb_fn)
332 req->cb_fn(req->cb_arg, &cpl);
333
334 nvme_free_request(req);
335 }
336
/*
 * Locked version of the completion processor.  Called with the qpair's
 * recovery lock held (from the interrupt handler, or from the watchdog
 * timeout as a last-ditch poll).  Drains every newly posted completion
 * queue entry, completes the associated trackers, and rings the completion
 * queue doorbell.  Returns true if at least one completion was consumed.
 */
static bool
_nvme_qpair_process_completions(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_completion cpl;
	bool done = false;
	bool in_panic = dumping || SCHEDULER_STOPPED();

	mtx_assert(&qpair->recovery, MA_OWNED);

	/*
	 * qpair is not enabled, likely because a controller reset is in
	 * progress.  Ignore the interrupt - any I/O that was associated with
	 * this interrupt will get retried when the reset is complete.  Any
	 * pending completions for when we're in startup will be completed
	 * as soon as initialization is complete and we start sending commands
	 * to the device.
	 */
	if (qpair->recovery_state != RECOVERY_NONE) {
		qpair->num_ignored++;
		return (false);
	}

	/*
	 * Sanity check initialization.  After we reset the hardware, the phase
	 * is defined to be 1.  So if we get here with zero prior calls and the
	 * phase is 0, it means that we've lost a race between the
	 * initialization and the ISR running.  With the phase wrong, we'll
	 * process a bunch of completions that aren't really completions leading
	 * to a KASSERT below.
	 */
	KASSERT(!(qpair->num_intr_handler_calls == 0 && qpair->phase == 0),
	    ("%s: Phase wrong for first interrupt call.",
	    device_get_nameunit(qpair->ctrlr->dev)));

	qpair->num_intr_handler_calls++;

	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
	/*
	 * A panic can stop the CPU this routine is running on at any point.  If
	 * we're called during a panic, complete the sq_head wrap protocol for
	 * the case where we are interrupted just after the increment at 1
	 * below, but before we can reset cq_head to zero at 2.  Also cope with
	 * the case where we do the zero at 2, but may or may not have done the
	 * phase adjustment at step 3.  The panic machinery flushes all pending
	 * memory writes, so we can make these strong ordering assumptions
	 * that would otherwise be unwise if we were racing in real time.
	 */
	if (__predict_false(in_panic)) {
		if (qpair->cq_head == qpair->num_entries) {
			/*
			 * Here we know that we need to zero cq_head and then
			 * negate the phase, which hasn't been assigned if
			 * cq_head isn't zero due to the atomic_store_rel.
			 */
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		} else if (qpair->cq_head == 0) {
			/*
			 * In this case, we know that the assignment at 2
			 * happened below, but we don't know if step 3 happened
			 * or not.  To find out, we look at the last completion
			 * entry and set the phase to the opposite phase that
			 * it has.  This gets us back in sync.
			 */
			cpl = qpair->cpl[qpair->num_entries - 1];
			nvme_completion_swapbytes(&cpl);
			qpair->phase = !NVME_STATUS_GET_P(cpl.status);
		}
	}

	while (1) {
		uint16_t status;

		/*
		 * We need to do this dance to avoid a race between the host and
		 * the device where the device overtakes the host while the host
		 * is reading this record, leaving the status field 'new' and
		 * the sqhd and cid fields potentially stale.  If the phase
		 * doesn't match, that means status hasn't yet been updated and
		 * we'll get any pending changes next time.  It also means that
		 * the phase must be the same the second time.  We have to sync
		 * before reading to ensure any bouncing completes.
		 */
		status = le16toh(qpair->cpl[qpair->cq_head].status);
		if (NVME_STATUS_GET_P(status) != qpair->phase)
			break;

		bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
		cpl = qpair->cpl[qpair->cq_head];
		nvme_completion_swapbytes(&cpl);

		KASSERT(
		    NVME_STATUS_GET_P(status) == NVME_STATUS_GET_P(cpl.status),
		    ("Phase unexpectedly inconsistent"));

		/* Guard against a corrupt cid indexing past act_tr. */
		if (cpl.cid < qpair->num_trackers)
			tr = qpair->act_tr[cpl.cid];
		else
			tr = NULL;

		done = true;
		if (tr != NULL) {
			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
			qpair->sq_head = cpl.sqhd;
		} else if (!in_panic) {
			/*
			 * A missing tracker is normally an error.  However, a
			 * panic can stop the CPU this routine is running on
			 * after completing an I/O but before updating
			 * qpair->cq_head at 1 below.  Later, we re-enter this
			 * routine to poll I/O associated with the kernel
			 * dump.  We find that the tr has been set to null before
			 * calling the completion routine.  If it hasn't
			 * completed (or it triggers a panic), then '1' below
			 * won't have updated cq_head.  Rather than panic again,
			 * ignore this condition because it's not unexpected.
			 */
			nvme_printf(qpair->ctrlr,
			    "cpl (cid = %u) does not map to outstanding cmd\n",
			    cpl.cid);
			nvme_qpair_print_completion(qpair,
			    &qpair->cpl[qpair->cq_head]);
			KASSERT(0, ("received completion for unknown cmd"));
		}

		/*
		 * There's a number of races with the following (see above) when
		 * the system panics.  We compensate for each one of them by
		 * using the atomic store to force strong ordering (at least when
		 * viewed in the aftermath of a panic).
		 */
		if (++qpair->cq_head == qpair->num_entries) {		/* 1 */
			atomic_store_rel_int(&qpair->cq_head, 0);	/* 2 */
			qpair->phase = !qpair->phase;			/* 3 */
		}
	}

	if (done) {
		bus_write_4(qpair->ctrlr->resource, qpair->cq_hdbl_off,
		    qpair->cq_head);
	}

	return (done);
}
485
486 bool
nvme_qpair_process_completions(struct nvme_qpair * qpair)487 nvme_qpair_process_completions(struct nvme_qpair *qpair)
488 {
489 bool done = false;
490
491 /*
492 * Interlock with reset / recovery code. This is an usually uncontended
493 * to make sure that we drain out of the ISRs before we reset the card
494 * and to prevent races with the recovery process called from a timeout
495 * context.
496 */
497 mtx_lock(&qpair->recovery);
498
499 if (__predict_true(qpair->recovery_state == RECOVERY_NONE))
500 done = _nvme_qpair_process_completions(qpair);
501 else
502 qpair->num_recovery_nolock++; // XXX likely need to rename
503
504 mtx_unlock(&qpair->recovery);
505
506 return (done);
507 }
508
/*
 * Per-queue MSI/MSI-X interrupt handler: just run the completion processor
 * for the qpair passed as the interrupt argument.
 */
static void
nvme_qpair_msi_handler(void *arg)
{
	nvme_qpair_process_completions((struct nvme_qpair *)arg);
}
516
517 int
nvme_qpair_construct(struct nvme_qpair * qpair,uint32_t num_entries,uint32_t num_trackers,struct nvme_controller * ctrlr)518 nvme_qpair_construct(struct nvme_qpair *qpair,
519 uint32_t num_entries, uint32_t num_trackers,
520 struct nvme_controller *ctrlr)
521 {
522 struct nvme_tracker *tr;
523 size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz;
524 uint64_t queuemem_phys, prpmem_phys, list_phys;
525 uint8_t *queuemem, *prpmem, *prp_list;
526 int i, err;
527
528 qpair->vector = ctrlr->msi_count > 1 ? qpair->id : 0;
529 qpair->num_entries = num_entries;
530 qpair->num_trackers = num_trackers;
531 qpair->ctrlr = ctrlr;
532
533 mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
534 mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
535
536 callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
537 qpair->timer_armed = false;
538 qpair->recovery_state = RECOVERY_WAITING;
539
540 /* Note: NVMe PRP format is restricted to 4-byte alignment. */
541 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
542 4, ctrlr->page_size, BUS_SPACE_MAXADDR,
543 BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->max_xfer_size,
544 howmany(ctrlr->max_xfer_size, ctrlr->page_size) + 1,
545 ctrlr->page_size, 0,
546 NULL, NULL, &qpair->dma_tag_payload);
547 if (err != 0) {
548 nvme_printf(ctrlr, "payload tag create failed %d\n", err);
549 goto out;
550 }
551
552 /*
553 * Each component must be page aligned, and individual PRP lists
554 * cannot cross a page boundary.
555 */
556 cmdsz = qpair->num_entries * sizeof(struct nvme_command);
557 cmdsz = roundup2(cmdsz, ctrlr->page_size);
558 cplsz = qpair->num_entries * sizeof(struct nvme_completion);
559 cplsz = roundup2(cplsz, ctrlr->page_size);
560 /*
561 * For commands requiring more than 2 PRP entries, one PRP will be
562 * embedded in the command (prp1), and the rest of the PRP entries
563 * will be in a list pointed to by the command (prp2).
564 */
565 prpsz = sizeof(uint64_t) *
566 howmany(ctrlr->max_xfer_size, ctrlr->page_size);
567 prpmemsz = qpair->num_trackers * prpsz;
568 allocsz = cmdsz + cplsz + prpmemsz;
569
570 err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
571 ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
572 allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag);
573 if (err != 0) {
574 nvme_printf(ctrlr, "tag create failed %d\n", err);
575 goto out;
576 }
577 bus_dma_tag_set_domain(qpair->dma_tag, qpair->domain);
578
579 if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem,
580 BUS_DMA_COHERENT | BUS_DMA_NOWAIT, &qpair->queuemem_map)) {
581 nvme_printf(ctrlr, "failed to alloc qpair memory\n");
582 goto out;
583 }
584
585 if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map,
586 queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) {
587 nvme_printf(ctrlr, "failed to load qpair memory\n");
588 bus_dmamem_free(qpair->dma_tag, qpair->cmd,
589 qpair->queuemem_map);
590 goto out;
591 }
592
593 qpair->num_cmds = 0;
594 qpair->num_intr_handler_calls = 0;
595 qpair->num_retries = 0;
596 qpair->num_failures = 0;
597 qpair->num_ignored = 0;
598 qpair->cmd = (struct nvme_command *)queuemem;
599 qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz);
600 prpmem = (uint8_t *)(queuemem + cmdsz + cplsz);
601 qpair->cmd_bus_addr = queuemem_phys;
602 qpair->cpl_bus_addr = queuemem_phys + cmdsz;
603 prpmem_phys = queuemem_phys + cmdsz + cplsz;
604
605 /*
606 * Calcuate the stride of the doorbell register. Many emulators set this
607 * value to correspond to a cache line. However, some hardware has set
608 * it to various small values.
609 */
610 qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[0]) +
611 (qpair->id << (ctrlr->dstrd + 1));
612 qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[0]) +
613 (qpair->id << (ctrlr->dstrd + 1)) + (1 << ctrlr->dstrd);
614
615 TAILQ_INIT(&qpair->free_tr);
616 TAILQ_INIT(&qpair->outstanding_tr);
617 STAILQ_INIT(&qpair->queued_req);
618
619 list_phys = prpmem_phys;
620 prp_list = prpmem;
621 for (i = 0; i < qpair->num_trackers; i++) {
622 if (list_phys + prpsz > prpmem_phys + prpmemsz) {
623 qpair->num_trackers = i;
624 break;
625 }
626
627 /*
628 * Make sure that the PRP list for this tracker doesn't
629 * overflow to another nvme page.
630 */
631 if (trunc_page(list_phys) !=
632 trunc_page(list_phys + prpsz - 1)) {
633 list_phys = roundup2(list_phys, ctrlr->page_size);
634 prp_list =
635 (uint8_t *)roundup2((uintptr_t)prp_list, ctrlr->page_size);
636 }
637
638 tr = malloc_domainset(sizeof(*tr), M_NVME,
639 DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
640 bus_dmamap_create(qpair->dma_tag_payload, 0,
641 &tr->payload_dma_map);
642 tr->cid = i;
643 tr->qpair = qpair;
644 tr->prp = (uint64_t *)prp_list;
645 tr->prp_bus_addr = list_phys;
646 TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq);
647 list_phys += prpsz;
648 prp_list += prpsz;
649 }
650
651 if (qpair->num_trackers == 0) {
652 nvme_printf(ctrlr, "failed to allocate enough trackers\n");
653 goto out;
654 }
655
656 qpair->act_tr = malloc_domainset(sizeof(struct nvme_tracker *) *
657 qpair->num_entries, M_NVME, DOMAINSET_PREF(qpair->domain),
658 M_ZERO | M_WAITOK);
659
660 if (ctrlr->msi_count > 1) {
661 /*
662 * MSI-X vector resource IDs start at 1, so we add one to
663 * the queue's vector to get the corresponding rid to use.
664 */
665 qpair->rid = qpair->vector + 1;
666
667 qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
668 &qpair->rid, RF_ACTIVE);
669 if (qpair->res == NULL) {
670 nvme_printf(ctrlr, "unable to allocate MSI\n");
671 goto out;
672 }
673 if (bus_setup_intr(ctrlr->dev, qpair->res,
674 INTR_TYPE_MISC | INTR_MPSAFE, NULL,
675 nvme_qpair_msi_handler, qpair, &qpair->tag) != 0) {
676 nvme_printf(ctrlr, "unable to setup MSI\n");
677 goto out;
678 }
679 if (qpair->id == 0) {
680 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
681 "admin");
682 } else {
683 bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag,
684 "io%d", qpair->id - 1);
685 }
686 }
687
688 return (0);
689
690 out:
691 nvme_qpair_destroy(qpair);
692 return (ENOMEM);
693 }
694
/*
 * Release all resources held by a qpair.  Also used to unwind a partially
 * constructed qpair from nvme_qpair_construct's error path, so each
 * teardown step is guarded by a check that the resource actually exists.
 */
static void
nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	/*
	 * Disarm the timer under the recovery lock, then wait for any
	 * in-flight timeout callout to finish before tearing anything down.
	 */
	mtx_lock(&qpair->recovery);
	qpair->timer_armed = false;
	mtx_unlock(&qpair->recovery);
	callout_drain(&qpair->timer);

	/* Tear down the interrupt handler before freeing what it uses. */
	if (qpair->tag) {
		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
		qpair->tag = NULL;
	}

	if (qpair->act_tr) {
		free(qpair->act_tr, M_NVME);
		qpair->act_tr = NULL;
	}

	while (!TAILQ_EMPTY(&qpair->free_tr)) {
		tr = TAILQ_FIRST(&qpair->free_tr);
		TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
		bus_dmamap_destroy(qpair->dma_tag_payload,
		    tr->payload_dma_map);
		free(tr, M_NVME);
	}

	/* qpair->cmd is set only once the queue memory was mapped. */
	if (qpair->cmd != NULL) {
		bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map);
		bus_dmamem_free(qpair->dma_tag, qpair->cmd,
		    qpair->queuemem_map);
		qpair->cmd = NULL;
	}

	if (qpair->dma_tag) {
		bus_dma_tag_destroy(qpair->dma_tag);
		qpair->dma_tag = NULL;
	}

	if (qpair->dma_tag_payload) {
		bus_dma_tag_destroy(qpair->dma_tag_payload);
		qpair->dma_tag_payload = NULL;
	}

	if (mtx_initialized(&qpair->lock))
		mtx_destroy(&qpair->lock);
	if (mtx_initialized(&qpair->recovery))
		mtx_destroy(&qpair->recovery);

	if (qpair->res) {
		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
		    rman_get_rid(qpair->res), qpair->res);
		qpair->res = NULL;
	}
}
751
/*
 * Manually fail every outstanding Asynchronous Event Request on the admin
 * queue with ABORTED_SQ_DELETION status (called when tearing the admin
 * qpair down).
 */
static void
nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	/*
	 * nvme_complete_tracker must be called without the qpair lock held. It
	 * takes the lock to adjust outstanding_tr list, so make sure we don't
	 * have it yet. We need the lock to make the list traverse safe, but
	 * have to drop the lock to complete any AER. We restart the list scan
	 * when we do this to make this safe. There's interlock with the ISR so
	 * we know this tracker won't be completed twice.
	 */
	mtx_assert(&qpair->lock, MA_NOTOWNED);

	mtx_lock(&qpair->lock);
	tr = TAILQ_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		if (tr->req->cmd.opc != NVME_OPC_ASYNC_EVENT_REQUEST) {
			tr = TAILQ_NEXT(tr, tailq);
			continue;
		}
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0,
		    ERROR_PRINT_NONE);
		mtx_lock(&qpair->lock);
		/* The lock was dropped; restart the scan from the head. */
		tr = TAILQ_FIRST(&qpair->outstanding_tr);
	}
	mtx_unlock(&qpair->lock);
}
783
784 void
nvme_admin_qpair_destroy(struct nvme_qpair * qpair)785 nvme_admin_qpair_destroy(struct nvme_qpair *qpair)
786 {
787 mtx_assert(&qpair->lock, MA_NOTOWNED);
788
789 nvme_admin_qpair_abort_aers(qpair);
790 nvme_qpair_destroy(qpair);
791 }
792
/*
 * Tear down an I/O qpair.  Unlike the admin queue, there are no AERs to
 * cancel, so this is a straight pass-through to the common destructor.
 */
void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{
	nvme_qpair_destroy(qpair);
}
798
/*
 * Completion callback for an ABORT command issued against a timed-out
 * command.  If the controller reports it could not perform the abort and
 * the target command is still outstanding, complete it manually.
 */
static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)
{
	struct nvme_tracker *tr = arg;

	/*
	 * If cdw0 bit 0 == 1, the controller was not able to abort the command
	 * we requested. We still need to check the active tracker array, to
	 * cover race where I/O timed out at same time controller was completing
	 * the I/O. An abort command always is on the admin queue, but affects
	 * either an admin or an I/O queue, so take the appropriate qpair lock
	 * for the original command's queue, since we'll need it to avoid races
	 * with the completion code and to complete the command manually.
	 */
	mtx_lock(&tr->qpair->lock);
	if ((status->cdw0 & 1) == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
		/*
		 * An I/O has timed out, and the controller was unable to abort
		 * it for some reason. And we've not processed a completion for
		 * it yet. Construct a fake completion status, and then complete
		 * the I/O's tracker manually.
		 */
		nvme_printf(tr->qpair->ctrlr,
		    "abort command failed, aborting command manually\n");
		nvme_qpair_manual_complete_tracker(tr,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
	}
	/*
	 * XXX We don't check status for the possible 'Could not abort because
	 * excess aborts were submitted to the controller'. We don't prevent
	 * that, either. Document for the future here, since the standard is
	 * squishy and only says 'may generate' but implies anything is possible
	 * including hangs if you exceed the ACL.
	 */
	mtx_unlock(&tr->qpair->lock);
}
835
836 static void
nvme_qpair_timeout(void * arg)837 nvme_qpair_timeout(void *arg)
838 {
839 struct nvme_qpair *qpair = arg;
840 struct nvme_controller *ctrlr = qpair->ctrlr;
841 struct nvme_tracker *tr;
842 sbintime_t now;
843 bool idle = true;
844 bool is_admin = qpair == &ctrlr->adminq;
845 bool fast;
846 uint32_t csts;
847 uint8_t cfs;
848
849 mtx_assert(&qpair->recovery, MA_OWNED);
850
851 /*
852 * If the controller is failed, then stop polling. This ensures that any
853 * failure processing that races with the qpair timeout will fail
854 * safely.
855 */
856 if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
857 nvme_printf(qpair->ctrlr,
858 "%sFailed controller, stopping watchdog timeout.\n",
859 is_admin ? "Complete " : "");
860 qpair->timer_armed = false;
861 return;
862 }
863
864 /*
865 * Shutdown condition: We set qpair->timer_armed to false in
866 * nvme_qpair_destroy before calling callout_drain. When we call that,
867 * this routine might get called one last time. Exit w/o setting a
868 * timeout. None of the watchdog stuff needs to be done since we're
869 * destroying the qpair.
870 */
871 if (!qpair->timer_armed) {
872 nvme_printf(qpair->ctrlr,
873 "Timeout fired during nvme_qpair_destroy\n");
874 return;
875 }
876
877 switch (qpair->recovery_state) {
878 case RECOVERY_NONE:
879 /*
880 * Read csts to get value of cfs - controller fatal status. If
881 * we are in the hot-plug or controller failed status proceed
882 * directly to reset. We also bail early if the status reads all
883 * 1's or the control fatal status bit is now 1. The latter is
884 * always true when the former is true, but not vice versa. The
885 * intent of the code is that if the card is gone (all 1's) or
		 * we've failed, then try to do a reset (which sometimes
887 * unwedges a card reading all 1's that's not gone away, but
888 * usually doesn't).
889 */
890 csts = nvme_mmio_read_4(ctrlr, csts);
891 cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
892 if (csts == NVME_GONE || cfs == 1) {
893 /*
894 * We've had a command timeout that we weren't able to
895 * abort or we have aborts disabled and any command
896 * timed out.
897 *
898 * If we get here due to a possible surprise hot-unplug
899 * event, then we let nvme_ctrlr_reset confirm and fail
900 * the controller.
901 */
902 do_reset:
903 nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
904 (csts == 0xffffffff) ? " and possible hot unplug" :
905 (cfs ? " and fatal error status" : ""));
906 qpair->recovery_state = RECOVERY_WAITING;
907 nvme_ctrlr_reset(ctrlr);
908 idle = false;
909 break;
910 }
911
912
913 /*
914 * See if there's any recovery needed. First, do a fast check to
915 * see if anything could have timed out. If not, then skip
916 * everything else.
917 */
918 fast = false;
919 mtx_lock(&qpair->lock);
920 now = getsbinuptime();
921 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
922 /*
923 * Skip async commands, they are posted to the card for
924 * an indefinite amount of time and have no deadline.
925 */
926 if (tr->deadline == SBT_MAX)
927 continue;
928
929 /*
930 * If the first real transaction is not in timeout, then
931 * we're done. Otherwise, we try recovery.
932 */
933 idle = false;
934 if (now <= tr->deadline)
935 fast = true;
936 break;
937 }
938 mtx_unlock(&qpair->lock);
939 if (idle || fast)
940 break;
941
942 /*
943 * There's a stale transaction at the start of the queue whose
		 * deadline has passed. Poll the completions as a last-ditch
		 * effort in case an interrupt has been missed. If stale
		 * transactions were found, warn the user of possible
		 * interrupt issues, but just once per controller.
948 */
949 if (_nvme_qpair_process_completions(qpair) && !ctrlr->isr_warned) {
950 nvme_printf(ctrlr, "System interrupt issues?\n");
951 ctrlr->isr_warned = true;
952 }
953
		/*
		 * Now that we've run the ISR, re-check to see if there are any
		 * timed out commands and abort them or reset the card if so.
		 */
958 mtx_lock(&qpair->lock);
959 idle = true;
960 TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
961 /*
962 * Skip async commands, they are posted to the card for
963 * an indefinite amount of time and have no deadline.
964 */
965 if (tr->deadline == SBT_MAX)
966 continue;
967
968 /*
969 * If we know this tracker hasn't timed out, we also
970 * know all subsequent ones haven't timed out. The tr
971 * queue is in submission order and all normal commands
972 * in a queue have the same timeout (or the timeout was
973 * changed by the user, but we eventually timeout then).
974 */
975 idle = false;
976 if (now <= tr->deadline)
977 break;
978
979 /*
980 * Timeout expired, abort it or reset controller.
981 */
982 if (ctrlr->enable_aborts &&
983 tr->req->cb_fn != nvme_abort_complete) {
984 /*
985 * This isn't an abort command, ask for a
986 * hardware abort. This goes to the admin
987 * queue which will reset the card if it
988 * times out.
989 */
990 nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
991 nvme_abort_complete, tr);
992 } else {
993 /*
994 * We have a live command in the card (either
995 * one we couldn't abort, or aborts weren't
996 * enabled). We can only reset.
997 */
998 mtx_unlock(&qpair->lock);
999 goto do_reset;
1000 }
1001 }
1002 mtx_unlock(&qpair->lock);
1003 break;
1004
1005 case RECOVERY_WAITING:
1006 /*
1007 * These messages aren't interesting while we're suspended. We
1008 * put the queues into waiting state while
1009 * suspending. Suspending takes a while, so we'll see these
1010 * during that time and they aren't diagnostic. At other times,
1011 * they indicate a problem that's worth complaining about.
1012 */
1013 if (!device_is_suspended(ctrlr->dev))
1014 nvme_printf(ctrlr, "Waiting for reset to complete\n");
1015 idle = false; /* We want to keep polling */
1016 break;
1017 }
1018
1019 /*
1020 * Rearm the timeout.
1021 */
1022 if (!idle) {
1023 callout_schedule_sbt(&qpair->timer, SBT_1S / 2, SBT_1S / 2, 0);
1024 } else {
1025 qpair->timer_armed = false;
1026 }
1027 }
1028
/*
 * Submit the tracker to the hardware. Must already be in the
 * outstanding queue when called. Called with qpair->lock held: sets the
 * tracker's deadline, arms the per-qpair timeout callout if needed,
 * copies the command into the submission queue, and rings the doorbell.
 */
void
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
{
	struct nvme_request *req;
	struct nvme_controller *ctrlr;
	int timeout;

	mtx_assert(&qpair->lock, MA_OWNED);

	req = tr->req;
	/* Tag the command with the tracker's id so the completion finds it. */
	req->cmd.cid = tr->cid;
	qpair->act_tr[tr->cid] = tr;
	ctrlr = qpair->ctrlr;

	if (req->timeout) {
		/*
		 * Polled commands get a short 1s deadline; otherwise the
		 * timeout period depends on whether this is the admin
		 * queue (id 0) or an I/O queue.
		 */
		if (req->cb_fn == nvme_completion_poll_cb)
			timeout = 1;
		else if (qpair->id == 0)
			timeout = ctrlr->admin_timeout_period;
		else
			timeout = ctrlr->timeout_period;
		tr->deadline = getsbinuptime() + timeout * SBT_1S;
		if (!qpair->timer_armed) {
			qpair->timer_armed = true;
			callout_reset_sbt_on(&qpair->timer, SBT_1S / 2, SBT_1S / 2,
			    nvme_qpair_timeout, qpair, qpair->cpu, 0);
		}
	} else
		tr->deadline = SBT_MAX;	/* Untimed request: no deadline. */

	/* Copy the command from the tracker to the submission queue. */
	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));

	/* The submission queue is circular; wrap the tail on overflow. */
	if (++qpair->sq_tail == qpair->num_entries)
		qpair->sq_tail = 0;

	/* Sync queue memory before the doorbell write makes it visible. */
	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
	bus_write_4(ctrlr->resource, qpair->sq_tdbl_off, qpair->sq_tail);
	qpair->num_cmds++;
}
1074
1075 static void
nvme_payload_map(void * arg,bus_dma_segment_t * seg,int nseg,int error)1076 nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error)
1077 {
1078 struct nvme_tracker *tr = arg;
1079 uint32_t cur_nseg;
1080
1081 /*
1082 * If the mapping operation failed, return immediately. The caller
1083 * is responsible for detecting the error status and failing the
1084 * tracker manually.
1085 */
1086 if (error != 0) {
1087 nvme_printf(tr->qpair->ctrlr,
1088 "nvme_payload_map err %d\n", error);
1089 return;
1090 }
1091
1092 /*
1093 * Note that we specified ctrlr->page_size for alignment and max
1094 * segment size when creating the bus dma tags. So here we can safely
1095 * just transfer each segment to its associated PRP entry.
1096 */
1097 tr->req->cmd.prp1 = htole64(seg[0].ds_addr);
1098
1099 if (nseg == 2) {
1100 tr->req->cmd.prp2 = htole64(seg[1].ds_addr);
1101 } else if (nseg > 2) {
1102 cur_nseg = 1;
1103 tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr);
1104 while (cur_nseg < nseg) {
1105 tr->prp[cur_nseg-1] =
1106 htole64((uint64_t)seg[cur_nseg].ds_addr);
1107 cur_nseg++;
1108 }
1109 } else {
1110 /*
1111 * prp2 should not be used by the controller
1112 * since there is only one segment, but set
1113 * to 0 just to be safe.
1114 */
1115 tr->req->cmd.prp2 = 0;
1116 }
1117
1118 bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map,
1119 BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1120 nvme_qpair_submit_tracker(tr->qpair, tr);
1121 }
1122
/*
 * Submission path proper: pair the request with a free tracker and hand
 * it to the hardware, DMA-mapping the payload first when there is one.
 * Called with the qpair lock held. Requests that cannot be started now
 * are either completed with an abort status (failed controller) or put
 * on the queued_req list for resubmission once recovery completes.
 */
static void
_nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	struct nvme_tracker *tr;
	int err = 0;
	bool is_admin = qpair == &qpair->ctrlr->adminq;

	mtx_assert(&qpair->lock, MA_OWNED);

	tr = TAILQ_FIRST(&qpair->free_tr);
	req->qpair = qpair;

	/*
	 * The controller has failed, so fail the request. Note, that this races
	 * the recovery / timeout code. Since we hold the qpair lock, we know
	 * it's safe to fail directly. is_failed is set when we fail the
	 * controller. It is only ever reset in the ioctl reset controller
	 * path, which is safe to race (for failed controllers, we make no
	 * guarantees about bringing it out of failed state relative to other
	 * commands). We try hard to allow admin commands when the entire
	 * controller hasn't failed, only something related to I/O queues.
	 */
	if (is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed) {
		nvme_qpair_manual_complete_request(qpair, req,
		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 1,
		    ERROR_PRINT_NONE);
		return;
	}

	/*
	 * No tracker is available, or the qpair is disabled due to an
	 * in-progress controller-level reset. If we lose the race with
	 * recovery_state, then we may add an extra request to the queue which
	 * will be resubmitted later. We only set recovery_state to NONE with
	 * qpair->lock also held, so if we observe that the state is not NONE,
	 * we know it won't transition back to NONE without retrying queued
	 * request.
	 */
	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		return;
	}

	/* Claim the tracker and pair it with this request. */
	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
	tr->deadline = SBT_MAX;
	tr->req = req;

	/* No payload to map: the command can go straight to the hardware. */
	if (!req->payload_valid) {
		nvme_qpair_submit_tracker(tr->qpair, tr);
		return;
	}

	/*
	 * tr->deadline updating when nvme_payload_map calls
	 * nvme_qpair_submit_tracker (we call it above directly
	 * when there's no map to load).
	 */
	err = bus_dmamap_load_mem(tr->qpair->dma_tag_payload,
	    tr->payload_dma_map, &req->payload, nvme_payload_map, tr, 0);
	if (err != 0) {
		/*
		 * The dmamap operation failed, so we manually fail the
		 * tracker here with DATA_TRANSFER_ERROR status.
		 *
		 * nvme_qpair_manual_complete_tracker must not be called
		 * with the qpair lock held.
		 */
		nvme_printf(qpair->ctrlr,
		    "bus_dmamap_load_mem returned 0x%x!\n", err);
		mtx_unlock(&qpair->lock);
		nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
		    NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL);
		mtx_lock(&qpair->lock);
	}
}
1199
/*
 * Public entry point for submitting a request to a qpair: takes the
 * qpair lock around the locked submission path.
 */
void
nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
{
	mtx_lock(&qpair->lock);
	_nvme_qpair_submit_request(qpair, req);
	mtx_unlock(&qpair->lock);
}
1207
/*
 * Take the qpair out of recovery so commands may be submitted again.
 * Both the recovery and qpair locks must be held (when initialized),
 * and the corresponding controller failure flag must not be set.
 */
static void
nvme_qpair_enable(struct nvme_qpair *qpair)
{
	bool is_admin __diagused = qpair == &qpair->ctrlr->adminq;

	if (mtx_initialized(&qpair->recovery))
		mtx_assert(&qpair->recovery, MA_OWNED);
	if (mtx_initialized(&qpair->lock))
		mtx_assert(&qpair->lock, MA_OWNED);
	KASSERT(!(is_admin ? qpair->ctrlr->is_failed_admin : qpair->ctrlr->is_failed),
	    ("Enabling a failed qpair\n"));

	qpair->recovery_state = RECOVERY_NONE;
}
1222
1223 void
nvme_qpair_reset(struct nvme_qpair * qpair)1224 nvme_qpair_reset(struct nvme_qpair *qpair)
1225 {
1226 qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0;
1227
1228 /*
1229 * First time through the completion queue, HW will set phase
1230 * bit on completions to 1. So set this to 1 here, indicating
1231 * we're looking for a 1 to know which entries have completed.
1232 * we'll toggle the bit each time when the completion queue
1233 * rolls over.
1234 */
1235 qpair->phase = 1;
1236
1237 memset(qpair->cmd, 0,
1238 qpair->num_entries * sizeof(struct nvme_command));
1239 memset(qpair->cpl, 0,
1240 qpair->num_entries * sizeof(struct nvme_completion));
1241 }
1242
1243 void
nvme_admin_qpair_enable(struct nvme_qpair * qpair)1244 nvme_admin_qpair_enable(struct nvme_qpair *qpair)
1245 {
1246 struct nvme_tracker *tr;
1247 struct nvme_tracker *tr_temp;
1248 bool rpt;
1249
1250 /*
1251 * Manually abort each outstanding admin command. Do not retry
1252 * admin commands found here, since they will be left over from
1253 * a controller reset and its likely the context in which the
1254 * command was issued no longer applies.
1255 */
1256 rpt = !TAILQ_EMPTY(&qpair->outstanding_tr);
1257 if (rpt)
1258 nvme_printf(qpair->ctrlr,
1259 "aborting outstanding admin command\n");
1260 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1261 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1262 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1263 }
1264 if (rpt)
1265 nvme_printf(qpair->ctrlr,
1266 "done aborting outstanding admin\n");
1267
1268 mtx_lock(&qpair->recovery);
1269 mtx_lock(&qpair->lock);
1270 nvme_qpair_enable(qpair);
1271 mtx_unlock(&qpair->lock);
1272 mtx_unlock(&qpair->recovery);
1273 }
1274
1275 void
nvme_io_qpair_enable(struct nvme_qpair * qpair)1276 nvme_io_qpair_enable(struct nvme_qpair *qpair)
1277 {
1278 STAILQ_HEAD(, nvme_request) temp;
1279 struct nvme_tracker *tr;
1280 struct nvme_tracker *tr_temp;
1281 struct nvme_request *req;
1282 bool report;
1283
1284 /*
1285 * Manually abort each outstanding I/O. This normally results in a
1286 * retry, unless the retry count on the associated request has
1287 * reached its limit.
1288 */
1289 report = !TAILQ_EMPTY(&qpair->outstanding_tr);
1290 if (report)
1291 nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n");
1292 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1293 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1294 NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY);
1295 }
1296 if (report)
1297 nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
1298
1299 mtx_lock(&qpair->recovery);
1300 mtx_lock(&qpair->lock);
1301 nvme_qpair_enable(qpair);
1302
1303 STAILQ_INIT(&temp);
1304 STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request);
1305
1306 report = !STAILQ_EMPTY(&temp);
1307 if (report)
1308 nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n");
1309 while (!STAILQ_EMPTY(&temp)) {
1310 req = STAILQ_FIRST(&temp);
1311 STAILQ_REMOVE_HEAD(&temp, stailq);
1312 nvme_qpair_print_command(qpair, &req->cmd);
1313 _nvme_qpair_submit_request(qpair, req);
1314 }
1315 if (report)
1316 nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
1317
1318 mtx_unlock(&qpair->lock);
1319 mtx_unlock(&qpair->recovery);
1320 }
1321
1322 static void
nvme_qpair_disable(struct nvme_qpair * qpair)1323 nvme_qpair_disable(struct nvme_qpair *qpair)
1324 {
1325 struct nvme_tracker *tr, *tr_temp;
1326
1327 if (mtx_initialized(&qpair->recovery))
1328 mtx_assert(&qpair->recovery, MA_OWNED);
1329 if (mtx_initialized(&qpair->lock))
1330 mtx_assert(&qpair->lock, MA_OWNED);
1331
1332 qpair->recovery_state = RECOVERY_WAITING;
1333 TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
1334 tr->deadline = SBT_MAX;
1335 }
1336 }
1337
/*
 * Disable the admin qpair: put it into the waiting state and abort the
 * posted asynchronous event requests. AERs have no deadline (they are
 * posted to the card indefinitely), so they must be aborted explicitly.
 * Lock order is recovery lock before qpair lock.
 */
void
nvme_admin_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);

	mtx_lock(&qpair->lock);
	nvme_qpair_disable(qpair);
	mtx_unlock(&qpair->lock);

	nvme_admin_qpair_abort_aers(qpair);

	mtx_unlock(&qpair->recovery);
}
1351
/*
 * Disable an I/O qpair: put it into the waiting state so no new
 * commands are started until it is re-enabled. Lock order is recovery
 * lock before qpair lock.
 */
void
nvme_io_qpair_disable(struct nvme_qpair *qpair)
{
	mtx_lock(&qpair->recovery);
	mtx_lock(&qpair->lock);

	nvme_qpair_disable(qpair);

	mtx_unlock(&qpair->lock);
	mtx_unlock(&qpair->recovery);
}
1363
1364 void
nvme_qpair_fail(struct nvme_qpair * qpair)1365 nvme_qpair_fail(struct nvme_qpair *qpair)
1366 {
1367 struct nvme_tracker *tr;
1368 struct nvme_request *req;
1369
1370 if (!mtx_initialized(&qpair->lock))
1371 return;
1372
1373 mtx_lock(&qpair->lock);
1374
1375 if (!STAILQ_EMPTY(&qpair->queued_req)) {
1376 nvme_printf(qpair->ctrlr, "failing queued i/o\n");
1377 }
1378 while (!STAILQ_EMPTY(&qpair->queued_req)) {
1379 req = STAILQ_FIRST(&qpair->queued_req);
1380 STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1381 mtx_unlock(&qpair->lock);
1382 nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1383 NVME_SC_ABORTED_BY_REQUEST, 1, ERROR_PRINT_ALL);
1384 mtx_lock(&qpair->lock);
1385 }
1386
1387 if (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1388 nvme_printf(qpair->ctrlr, "failing outstanding i/o\n");
1389 }
1390 /* Manually abort each outstanding I/O. */
1391 while (!TAILQ_EMPTY(&qpair->outstanding_tr)) {
1392 tr = TAILQ_FIRST(&qpair->outstanding_tr);
1393 /*
1394 * Do not remove the tracker. The abort_tracker path will
1395 * do that for us.
1396 */
1397 mtx_unlock(&qpair->lock);
1398 nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC,
1399 NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
1400 mtx_lock(&qpair->lock);
1401 }
1402
1403 mtx_unlock(&qpair->lock);
1404 }
1405