1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 *
16 *
17 * Notes on coding style
18 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in its comments
 * (i.e. the 'h' suffix instead of the '0x' prefix).
22 *
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
26 *
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]> \
39 * sriov_vq_flexible=<N[optional]> \
40 * sriov_vi_flexible=<N[optional]> \
41 * sriov_max_vi_per_vf=<N[optional]> \
42 * sriov_max_vq_per_vf=<N[optional]> \
 *              atomic.dn=<on|off[optional]>, \
 *              atomic.awun=<N[optional]>, \
 *              atomic.awupf=<N[optional]>, \
46 * subsys=<subsys_id>
47 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
48 * zoned=<true|false[optional]>, \
49 * subsys=<subsys_id>,shared=<true|false[optional]>, \
50 * detached=<true|false[optional]>, \
51 * zoned.zone_size=<N[optional]>, \
52 * zoned.zone_capacity=<N[optional]>, \
53 * zoned.descr_ext_size=<N[optional]>, \
54 * zoned.max_active=<N[optional]>, \
55 * zoned.max_open=<N[optional]>, \
56 * zoned.cross_read=<true|false[optional]>
57 *
58 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
59 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
60 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
61 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
62 *
63 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
64 * For example:
65 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
66 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
67 *
68 * The PMR will use BAR 4/5 exclusively.
69 *
 * To place controller(s) and namespace(s) into a subsystem, provide the
 * nvme-subsys device as shown above.
72 *
73 * nvme subsystem device parameters
74 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75 * - `nqn`
76 * This parameter provides the `<nqn_id>` part of the string
77 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
78 * of subsystem controllers. Note that `<nqn_id>` should be unique per
79 * subsystem, but this is not enforced by QEMU. If not specified, it will
80 * default to the value of the `id` parameter (`<subsys_id>`).
81 *
82 * nvme device parameters
83 * ~~~~~~~~~~~~~~~~~~~~~~
84 * - `subsys`
85 * Specifying this parameter attaches the controller to the subsystem and
86 * the SUBNQN field in the controller will report the NQN of the subsystem
87 * device. This also enables multi controller capability represented in
88 * Identify Controller data structure in CMIC (Controller Multi-path I/O and
89 * Namespace Sharing Capabilities).
90 *
91 * - `aerl`
92 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
95 *
96 * - `aer_max_queued`
97 * This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
100 *
101 * - `mdts`
102 * Indicates the maximum data transfer size for a command that transfers data
103 * between host-accessible memory and the controller. The value is specified
104 * as a power of two (2^n) and is in units of the minimum memory page size
105 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
106 *
107 * - `vsl`
108 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
109 * this value is specified as a power of two (2^n) and is in units of the
110 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
111 * KiB).
112 *
113 * - `zoned.zasl`
114 * Indicates the maximum data transfer size for the Zone Append command. Like
115 * `mdts`, the value is specified as a power of two (2^n) and is in units of
116 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
117 * defaulting to the value of `mdts`).
118 *
119 * - `zoned.auto_transition`
 *   Indicates whether zones in the implicitly opened state can be
 *   automatically transitioned to the closed state for resource management
 *   purposes.
122 * Defaults to 'on'.
123 *
124 * - `sriov_max_vfs`
125 * Indicates the maximum number of PCIe virtual functions supported
126 * by the controller. The default value is 0. Specifying a non-zero value
127 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
128 * Virtual function controllers will not report SR-IOV capability.
129 *
130 * NOTE: Single Root I/O Virtualization support is experimental.
131 * All the related parameters may be subject to change.
132 *
133 * - `sriov_vq_flexible`
134 * Indicates the total number of flexible queue resources assignable to all
135 * the secondary controllers. Implicitly sets the number of primary
136 * controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
137 *
138 * - `sriov_vi_flexible`
139 * Indicates the total number of flexible interrupt resources assignable to
140 * all the secondary controllers. Implicitly sets the number of primary
141 * controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
142 *
143 * - `sriov_max_vi_per_vf`
144 * Indicates the maximum number of virtual interrupt resources assignable
145 * to a secondary controller. The default 0 resolves to
146 * `(sriov_vi_flexible / sriov_max_vfs)`.
147 *
148 * - `sriov_max_vq_per_vf`
149 * Indicates the maximum number of virtual queue resources assignable to
150 * a secondary controller. The default 0 resolves to
151 * `(sriov_vq_flexible / sriov_max_vfs)`.
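 *
 *   For example, a controller with SR-IOV enabled might be configured as
 *   follows (illustrative values only; see docs/system/nvme.rst for
 *   validated examples):
 *   -device nvme,serial=<serial>,subsys=<subsys_id>,max_ioqpairs=20, \
 *           sriov_max_vfs=4,sriov_vq_flexible=8,sriov_vi_flexible=8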
152 *
153 * nvme namespace device parameters
154 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155 * - `shared`
156 * When the parent nvme device (as defined explicitly by the 'bus' parameter
157 * or implicitly by the most recently defined NvmeBus) is linked to an
158 * nvme-subsys device, the namespace will be attached to all controllers in
159 * the subsystem. If set to 'off' (the default), the namespace will remain a
160 * private namespace and may only be attached to a single controller at a
161 * time.
162 *
163 * - `detached`
164 * This parameter is only valid together with the `subsys` parameter. If left
165 * at the default value (`false/off`), the namespace will be attached to all
166 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
167 * namespace will be available in the subsystem but not attached to any
168 * controllers.
169 *
 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
171 * In this case, the following namespace properties are available to configure
172 * zoned operation:
173 * zoned.zone_size=<zone size in bytes, default: 128MiB>
174 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
175 *
176 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
177 * The value 0 (default) forces zone capacity to be the same as zone
178 * size. The value of this property may not exceed zone size.
179 *
180 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
181 * This value needs to be specified in 64B units. If it is zero,
182 * namespace(s) will not support zone descriptor extensions.
183 *
184 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
185 * The default value means there is no limit to the number of
186 * concurrently active zones.
187 *
188 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
189 * The default value means there is no limit to the number of
190 * concurrently open zones.
191 *
192 * zoned.cross_read=<enable RAZB, default: false>
193 * Setting this property to true enables Read Across Zone Boundaries.
194 */
195
196 #include "qemu/osdep.h"
197 #include "qemu/cutils.h"
198 #include "qemu/error-report.h"
199 #include "qemu/log.h"
200 #include "qemu/units.h"
201 #include "qemu/range.h"
202 #include "qapi/error.h"
203 #include "qapi/visitor.h"
204 #include "system/system.h"
205 #include "system/block-backend.h"
206 #include "system/hostmem.h"
207 #include "hw/pci/msix.h"
208 #include "hw/pci/pcie_sriov.h"
209 #include "system/spdm-socket.h"
210 #include "migration/vmstate.h"
211
212 #include "nvme.h"
213 #include "dif.h"
214 #include "trace.h"
215
216 #define NVME_MAX_IOQPAIRS 0xffff
217 #define NVME_DB_SIZE 4
218 #define NVME_SPEC_VER 0x00010400
219 #define NVME_CMB_BIR 2
220 #define NVME_PMR_BIR 4
221 #define NVME_TEMPERATURE 0x143
222 #define NVME_TEMPERATURE_WARNING 0x157
223 #define NVME_TEMPERATURE_CRITICAL 0x175
224 #define NVME_NUM_FW_SLOTS 1
225 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
226 #define NVME_VF_RES_GRANULARITY 1
227 #define NVME_VF_OFFSET 0x1
228 #define NVME_VF_STRIDE 1
229
230 #define NVME_GUEST_ERR(trace, fmt, ...) \
231 do { \
232 (trace_##trace)(__VA_ARGS__); \
233 qemu_log_mask(LOG_GUEST_ERROR, #trace \
234 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
235 } while (0)
236
237 static const bool nvme_feature_support[NVME_FID_MAX] = {
238 [NVME_ARBITRATION] = true,
239 [NVME_POWER_MANAGEMENT] = true,
240 [NVME_TEMPERATURE_THRESHOLD] = true,
241 [NVME_ERROR_RECOVERY] = true,
242 [NVME_VOLATILE_WRITE_CACHE] = true,
243 [NVME_NUMBER_OF_QUEUES] = true,
244 [NVME_INTERRUPT_COALESCING] = true,
245 [NVME_INTERRUPT_VECTOR_CONF] = true,
246 [NVME_WRITE_ATOMICITY] = true,
247 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
248 [NVME_TIMESTAMP] = true,
249 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
250 [NVME_COMMAND_SET_PROFILE] = true,
251 [NVME_FDP_MODE] = true,
252 [NVME_FDP_EVENTS] = true,
253 };
254
255 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
256 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
258 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
259 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
260 [NVME_WRITE_ATOMICITY] = NVME_FEAT_CAP_CHANGE,
261 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
262 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
263 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
264 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
265 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
266 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
267 };
268
269 static const uint32_t nvme_cse_acs_default[256] = {
270 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
277 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
280 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC |
281 NVME_CMD_EFF_CCC,
282 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
283 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
284 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
285 };
286
287 static const uint32_t nvme_cse_iocs_nvm_default[256] = {
288 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
289 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
290 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
291 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
292 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
294 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
295 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
296 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
297 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
298 };
299
300 static const uint32_t nvme_cse_iocs_zoned_default[256] = {
301 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
302 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
303 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
304 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
305 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
307 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
308 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
309 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
310 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
311
312 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
313 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
314 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
315 };
316
317 static void nvme_process_sq(void *opaque);
318 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
319 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
320
static uint16_t nvme_sqid(NvmeRequest *req)
322 {
323 return le16_to_cpu(req->sq->sqid);
324 }
325
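/*
 * Compose an FDP placement identifier. When reclaim groups are addressable
 * (rgif != 0), the reclaim group occupies the upper `rgif` bits and the
 * placement handle the remaining bits; otherwise the PID is simply the
 * placement handle.
 */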
static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
                                     uint16_t ph)
328 {
329 uint16_t rgif = ns->endgrp->fdp.rgif;
330
331 if (!rgif) {
332 return ph;
333 }
334
335 return (rg << (16 - rgif)) | ph;
336 }
337
static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
339 {
340 return ph < ns->fdp.nphs;
341 }
342
static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
344 {
345 return rg < endgrp->fdp.nrg;
346 }
347
static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
349 {
350 uint16_t rgif = ns->endgrp->fdp.rgif;
351
352 if (!rgif) {
353 return pid;
354 }
355
356 return pid & ((1 << (15 - rgif)) - 1);
357 }
358
static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
360 {
361 uint16_t rgif = ns->endgrp->fdp.rgif;
362
363 if (!rgif) {
364 return 0;
365 }
366
367 return pid >> (16 - rgif);
368 }
369
static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
                                  uint16_t *ph, uint16_t *rg)
372 {
373 *rg = nvme_pid2rg(ns, pid);
374 *ph = nvme_pid2ph(ns, pid);
375
376 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
377 }
378
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
381 {
382 if (QTAILQ_IN_USE(zone, entry)) {
383 switch (nvme_get_zone_state(zone)) {
384 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
385 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
386 break;
387 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
388 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
389 break;
390 case NVME_ZONE_STATE_CLOSED:
391 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
392 break;
393 case NVME_ZONE_STATE_FULL:
394 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
395 default:
396 ;
397 }
398 }
399
400 nvme_set_zone_state(zone, state);
401
402 switch (state) {
403 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
404 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
405 break;
406 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
407 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
408 break;
409 case NVME_ZONE_STATE_CLOSED:
410 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
411 break;
412 case NVME_ZONE_STATE_FULL:
413 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
414 case NVME_ZONE_STATE_READ_ONLY:
415 break;
416 default:
417 zone->d.za = 0;
418 }
419 }
420
static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
                                         uint32_t opn, uint32_t zrwa)
423 {
424 if (ns->params.max_active_zones != 0 &&
425 ns->nr_active_zones + act > ns->params.max_active_zones) {
426 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
427 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
428 }
429
430 if (ns->params.max_open_zones != 0 &&
431 ns->nr_open_zones + opn > ns->params.max_open_zones) {
432 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
433 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
434 }
435
436 if (zrwa > ns->zns.numzrwa) {
437 return NVME_NOZRWA | NVME_DNR;
438 }
439
440 return NVME_SUCCESS;
441 }
442
443 /*
444 * Check if we can open a zone without exceeding open/active limits.
445 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
446 */
static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
448 {
449 return nvme_zns_check_resources(ns, act, opn, 0);
450 }
451
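/*
 * Allocate a slot in the FDP host events ring buffer. If the buffer is full,
 * the oldest event is overwritten (the start pointer is advanced). The
 * returned event is zeroed and stamped with the current timestamp.
 */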
static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
453 {
454 NvmeFdpEvent *ret = NULL;
455 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
456
457 ret = &ebuf->events[ebuf->next++];
458 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
459 ebuf->next = 0;
460 }
461 if (is_full) {
462 ebuf->start = ebuf->next;
463 } else {
464 ebuf->nelems++;
465 }
466
467 memset(ret, 0, sizeof(NvmeFdpEvent));
468 ret->timestamp = nvme_get_timestamp(n);
469
470 return ret;
471 }
472
static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
474 {
475 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
476 }
477
static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
479 {
480 NvmeEnduranceGroup *endgrp = ns->endgrp;
481 NvmeRuHandle *ruh;
482 NvmeReclaimUnit *ru;
483 NvmeFdpEvent *e = NULL;
484 uint16_t ph, rg, ruhid;
485
486 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
487 return false;
488 }
489
490 ruhid = ns->fdp.phs[ph];
491
492 ruh = &endgrp->fdp.ruhs[ruhid];
493 ru = &ruh->rus[rg];
494
495 if (ru->ruamw) {
496 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
497 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
498 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
499 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
500 e->pid = cpu_to_le16(pid);
501 e->nsid = cpu_to_le32(ns->params.nsid);
502 e->rgid = cpu_to_le16(rg);
503 e->ruhid = cpu_to_le16(ruhid);
504 }
505
506 /* log (eventual) GC overhead of prematurely swapping the RU */
507 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
508 }
509
510 ru->ruamw = ruh->ruamw;
511
512 return true;
513 }
514
static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
516 {
517 hwaddr hi, lo;
518
519 if (!n->cmb.cmse) {
520 return false;
521 }
522
523 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
524 hi = lo + int128_get64(n->cmb.mem.size);
525
526 return addr >= lo && addr < hi;
527 }
528
static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
530 {
531 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
532 return &n->cmb.buf[addr - base];
533 }
534
static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
536 {
537 hwaddr hi;
538
539 if (!n->pmr.cmse) {
540 return false;
541 }
542
543 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
544
545 return addr >= n->pmr.cba && addr < hi;
546 }
547
static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
549 {
550 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
551 }
552
static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
554 {
555 hwaddr hi, lo;
556
557 /*
558 * The purpose of this check is to guard against invalid "local" access to
559 * the iomem (i.e. controller registers). Thus, we check against the range
560 * covered by the 'bar0' MemoryRegion since that is currently composed of
561 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
562 * that if the device model is ever changed to allow the CMB to be located
563 * in BAR0 as well, then this must be changed.
564 */
565 lo = n->bar0.addr;
566 hi = lo + int128_get64(n->bar0.size);
567
568 return addr >= lo && addr < hi;
569 }
570
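/*
 * Read `size` bytes from `addr`. Accesses that fall entirely within the CMB
 * or the PMR are serviced from local memory; everything else goes through
 * PCI DMA. Returns non-zero on failure (including address-range overflow).
 * nvme_addr_write() below is the mirror image for writes.
 */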
static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
572 {
573 hwaddr hi = addr + size - 1;
574 if (hi < addr) {
575 return 1;
576 }
577
578 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
579 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
580 return 0;
581 }
582
583 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
584 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
585 return 0;
586 }
587
588 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
589 }
590
static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
592 {
593 hwaddr hi = addr + size - 1;
594 if (hi < addr) {
595 return 1;
596 }
597
598 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
599 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
600 return 0;
601 }
602
603 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
604 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
605 return 0;
606 }
607
608 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
609 }
610
static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
612 {
613 return nsid &&
614 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
615 }
616
static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
618 {
619 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
620 }
621
static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
623 {
624 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
625 }
626
static void nvme_inc_cq_tail(NvmeCQueue *cq)
628 {
629 cq->tail++;
630 if (cq->tail >= cq->size) {
631 cq->tail = 0;
632 cq->phase = !cq->phase;
633 }
634 }
635
static void nvme_inc_sq_head(NvmeSQueue *sq)
637 {
638 sq->head = (sq->head + 1) % sq->size;
639 }
640
static uint8_t nvme_cq_full(NvmeCQueue *cq)
642 {
643 return (cq->tail + 1) % cq->size == cq->head;
644 }
645
static uint8_t nvme_sq_empty(NvmeSQueue *sq)
647 {
648 return sq->head == sq->tail;
649 }
650
static void nvme_irq_check(NvmeCtrl *n)
652 {
653 PCIDevice *pci = PCI_DEVICE(n);
654 uint32_t intms = ldl_le_p(&n->bar.intms);
655
656 if (msix_enabled(pci)) {
657 return;
658 }
659
    /* VFs do not implement INTx */
661 if (pci_is_vf(pci)) {
662 return;
663 }
664
665 if (~intms & n->irq_status) {
666 pci_irq_assert(pci);
667 } else {
668 pci_irq_deassert(pci);
669 }
670 }
671
static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
673 {
674 PCIDevice *pci = PCI_DEVICE(n);
675
676 if (cq->irq_enabled) {
677 if (msix_enabled(pci)) {
678 trace_pci_nvme_irq_msix(cq->vector);
679 msix_notify(pci, cq->vector);
680 } else {
681 trace_pci_nvme_irq_pin();
682 assert(cq->vector < 32);
683 n->irq_status |= 1 << cq->vector;
684 nvme_irq_check(n);
685 }
686 } else {
687 trace_pci_nvme_irq_masked();
688 }
689 }
690
static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
692 {
693 if (cq->irq_enabled) {
694 if (msix_enabled(PCI_DEVICE(n))) {
695 return;
696 } else {
697 assert(cq->vector < 32);
698 if (!n->cq_pending) {
699 n->irq_status &= ~(1 << cq->vector);
700 }
701 nvme_irq_check(n);
702 }
703 }
704 }
705
static void nvme_req_clear(NvmeRequest *req)
707 {
708 req->ns = NULL;
709 req->opaque = NULL;
710 req->aiocb = NULL;
711 memset(&req->cqe, 0x0, sizeof(req->cqe));
712 req->status = NVME_SUCCESS;
713 }
714
static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
716 {
717 if (dma) {
718 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
719 sg->flags = NVME_SG_DMA;
720 } else {
721 qemu_iovec_init(&sg->iov, 0);
722 }
723
724 sg->flags |= NVME_SG_ALLOC;
725 }
726
static inline void nvme_sg_unmap(NvmeSg *sg)
728 {
729 if (!(sg->flags & NVME_SG_ALLOC)) {
730 return;
731 }
732
733 if (sg->flags & NVME_SG_DMA) {
734 qemu_sglist_destroy(&sg->qsg);
735 } else {
736 qemu_iovec_destroy(&sg->iov);
737 }
738
739 memset(sg, 0x0, sizeof(*sg));
740 }
741
742 /*
743 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
744 * holds both data and metadata. This function splits the data and metadata
745 * into two separate QSG/IOVs.
746 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
749 {
750 NvmeSg *dst = data;
751 uint32_t trans_len, count = ns->lbasz;
752 uint64_t offset = 0;
753 bool dma = sg->flags & NVME_SG_DMA;
754 size_t sge_len;
755 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
756 int sg_idx = 0;
757
758 assert(sg->flags & NVME_SG_ALLOC);
759
760 while (sg_len) {
761 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
762
763 trans_len = MIN(sg_len, count);
764 trans_len = MIN(trans_len, sge_len - offset);
765
766 if (dst) {
767 if (dma) {
768 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
769 trans_len);
770 } else {
771 qemu_iovec_add(&dst->iov,
772 sg->iov.iov[sg_idx].iov_base + offset,
773 trans_len);
774 }
775 }
776
777 sg_len -= trans_len;
778 count -= trans_len;
779 offset += trans_len;
780
781 if (count == 0) {
782 dst = (dst == data) ? mdata : data;
783 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
784 }
785
786 if (sge_len == offset) {
787 offset = 0;
788 sg_idx++;
789 }
790 }
791 }
792
static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
795 {
796 if (!len) {
797 return NVME_SUCCESS;
798 }
799
800 trace_pci_nvme_map_addr_cmb(addr, len);
801
802 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
803 return NVME_DATA_TRAS_ERROR;
804 }
805
806 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
807
808 return NVME_SUCCESS;
809 }
810
static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
813 {
814 if (!len) {
815 return NVME_SUCCESS;
816 }
817
818 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
819 return NVME_DATA_TRAS_ERROR;
820 }
821
822 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
823
824 return NVME_SUCCESS;
825 }
826
static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
828 {
829 bool cmb = false, pmr = false;
830
831 if (!len) {
832 return NVME_SUCCESS;
833 }
834
835 trace_pci_nvme_map_addr(addr, len);
836
837 if (nvme_addr_is_iomem(n, addr)) {
838 return NVME_DATA_TRAS_ERROR;
839 }
840
841 if (nvme_addr_is_cmb(n, addr)) {
842 cmb = true;
843 } else if (nvme_addr_is_pmr(n, addr)) {
844 pmr = true;
845 }
846
847 if (cmb || pmr) {
848 if (sg->flags & NVME_SG_DMA) {
849 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
850 }
851
852 if (sg->iov.niov + 1 > IOV_MAX) {
853 goto max_mappings_exceeded;
854 }
855
856 if (cmb) {
857 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
858 } else {
859 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
860 }
861 }
862
863 if (!(sg->flags & NVME_SG_DMA)) {
864 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
865 }
866
867 if (sg->qsg.nsg + 1 > IOV_MAX) {
868 goto max_mappings_exceeded;
869 }
870
871 qemu_sglist_add(&sg->qsg, addr, len);
872
873 return NVME_SUCCESS;
874
875 max_mappings_exceeded:
876 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
877 "number of mappings exceed 1024");
878 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
879 }
880
static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
882 {
883 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
884 }
885
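/*
 * Map a PRP1/PRP2 pair. PRP1 maps the first (possibly page-offset) region.
 * If the remainder fits within a single page, PRP2 is a second data pointer;
 * otherwise PRP2 points to a (possibly chained) PRP list, which is read in
 * chunks of at most `max_prp_ents` entries.
 */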
static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
888 {
889 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
890 trans_len = MIN(len, trans_len);
891 int num_prps = (len >> n->page_bits) + 1;
892 uint16_t status;
893 int ret;
894
895 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
896
897 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
898
899 status = nvme_map_addr(n, sg, prp1, trans_len);
900 if (status) {
901 goto unmap;
902 }
903
904 len -= trans_len;
905 if (len) {
906 if (len > n->page_size) {
907 g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
908 uint32_t nents, prp_trans;
909 int i = 0;
910
911 /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Hence, we need to calculate the number of entries based
             * on that offset.
915 */
916 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
917 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
918 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
919 if (ret) {
920 trace_pci_nvme_err_addr_read(prp2);
921 status = NVME_DATA_TRAS_ERROR;
922 goto unmap;
923 }
924 while (len != 0) {
925 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
926
927 if (i == nents - 1 && len > n->page_size) {
928 if (unlikely(prp_ent & (n->page_size - 1))) {
929 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
930 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
931 goto unmap;
932 }
933
934 i = 0;
935 nents = (len + n->page_size - 1) >> n->page_bits;
936 nents = MIN(nents, n->max_prp_ents);
937 prp_trans = nents * sizeof(uint64_t);
938 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
939 prp_trans);
940 if (ret) {
941 trace_pci_nvme_err_addr_read(prp_ent);
942 status = NVME_DATA_TRAS_ERROR;
943 goto unmap;
944 }
945 prp_ent = le64_to_cpu(prp_list[i]);
946 }
947
948 if (unlikely(prp_ent & (n->page_size - 1))) {
949 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
950 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
951 goto unmap;
952 }
953
954 trans_len = MIN(len, n->page_size);
955 status = nvme_map_addr(n, sg, prp_ent, trans_len);
956 if (status) {
957 goto unmap;
958 }
959
960 len -= trans_len;
961 i++;
962 }
963 } else {
964 if (unlikely(prp2 & (n->page_size - 1))) {
965 trace_pci_nvme_err_invalid_prp2_align(prp2);
966 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
967 goto unmap;
968 }
969 status = nvme_map_addr(n, sg, prp2, len);
970 if (status) {
971 goto unmap;
972 }
973 }
974 }
975
976 return NVME_SUCCESS;
977
978 unmap:
979 nvme_sg_unmap(sg);
980 return status;
981 }
982
983 /*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped from *len.
986 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
990 {
991 dma_addr_t addr, trans_len;
992 uint32_t dlen;
993 uint16_t status;
994
995 for (int i = 0; i < nsgld; i++) {
996 uint8_t type = NVME_SGL_TYPE(segment[i].type);
997
998 switch (type) {
999 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
1000 break;
1001 case NVME_SGL_DESCR_TYPE_SEGMENT:
1002 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1003 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
1004 default:
1005 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
1006 }
1007
1008 dlen = le32_to_cpu(segment[i].len);
1009
1010 if (!dlen) {
1011 continue;
1012 }
1013
1014 if (*len == 0) {
1015 /*
1016 * All data has been mapped, but the SGL contains additional
1017 * segments and/or descriptors. The controller might accept
1018 * ignoring the rest of the SGL.
1019 */
1020 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1021 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1022 break;
1023 }
1024
1025 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1026 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1027 }
1028
1029 trans_len = MIN(*len, dlen);
1030
1031 addr = le64_to_cpu(segment[i].addr);
1032
1033 if (UINT64_MAX - addr < dlen) {
1034 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1035 }
1036
1037 status = nvme_map_addr(n, sg, addr, trans_len);
1038 if (status) {
1039 return status;
1040 }
1041
1042 *len -= trans_len;
1043 }
1044
1045 return NVME_SUCCESS;
1046 }
1047
static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
1050 {
1051 /*
1052 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1053 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1054 * to be larger (as in number of bytes required to describe the SGL
1055 * descriptors and segment chain) than the command transfer size, so it is
1056 * not bounded by MDTS.
1057 */
1058 #define SEG_CHUNK_SIZE 256
1059
1060 QEMU_UNINITIALIZED NvmeSglDescriptor segment[SEG_CHUNK_SIZE];
1061 NvmeSglDescriptor *sgld, *last_sgld;
1062 uint64_t nsgld;
1063 uint32_t seg_len;
1064 uint16_t status;
1065 hwaddr addr;
1066 int ret;
1067
1068 sgld = &sgl;
1069 addr = le64_to_cpu(sgl.addr);
1070
1071 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1072
1073 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1074
1075 /*
1076 * If the entire transfer can be described with a single data block it can
1077 * be mapped directly.
1078 */
1079 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1080 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1081 if (status) {
1082 goto unmap;
1083 }
1084
1085 goto out;
1086 }
1087
1088 for (;;) {
1089 switch (NVME_SGL_TYPE(sgld->type)) {
1090 case NVME_SGL_DESCR_TYPE_SEGMENT:
1091 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1092 break;
1093 default:
1094 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1095 }
1096
1097 seg_len = le32_to_cpu(sgld->len);
1098
1099 /* check the length of the (Last) Segment descriptor */
1100 if (!seg_len || seg_len & 0xf) {
1101 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1102 }
1103
1104 if (UINT64_MAX - addr < seg_len) {
1105 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1106 }
1107
1108 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1109
1110 while (nsgld > SEG_CHUNK_SIZE) {
1111 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1112 trace_pci_nvme_err_addr_read(addr);
1113 status = NVME_DATA_TRAS_ERROR;
1114 goto unmap;
1115 }
1116
1117 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1118 &len, cmd);
1119 if (status) {
1120 goto unmap;
1121 }
1122
1123 nsgld -= SEG_CHUNK_SIZE;
1124 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1125 }
1126
1127 ret = nvme_addr_read(n, addr, segment, nsgld *
1128 sizeof(NvmeSglDescriptor));
1129 if (ret) {
1130 trace_pci_nvme_err_addr_read(addr);
1131 status = NVME_DATA_TRAS_ERROR;
1132 goto unmap;
1133 }
1134
1135 last_sgld = &segment[nsgld - 1];
1136
1137 /*
1138 * If the segment ends with a Data Block, then we are done.
1139 */
1140 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1141 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1142 if (status) {
1143 goto unmap;
1144 }
1145
1146 goto out;
1147 }
1148
1149 /*
1150 * If the last descriptor was not a Data Block, then the current
1151 * segment must not be a Last Segment.
1152 */
1153 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1154 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1155 goto unmap;
1156 }
1157
1158 sgld = last_sgld;
1159 addr = le64_to_cpu(sgld->addr);
1160
1161 /*
1162 * Do not map the last descriptor; it will be a Segment or Last Segment
1163 * descriptor and is handled by the next iteration.
1164 */
1165 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1166 if (status) {
1167 goto unmap;
1168 }
1169 }
1170
1171 out:
1172 /* if there is any residual left in len, the SGL was too short */
1173 if (len) {
1174 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1175 goto unmap;
1176 }
1177
1178 return NVME_SUCCESS;
1179
1180 unmap:
1181 nvme_sg_unmap(sg);
1182 return status;
1183 }
1184
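/*
 * Map the Data Pointer (DPTR) of a command, dispatching on the PSDT field:
 * either a PRP1/PRP2 pair or an SGL.
 */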
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
1187 {
1188 uint64_t prp1, prp2;
1189
1190 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1191 case NVME_PSDT_PRP:
1192 prp1 = le64_to_cpu(cmd->dptr.prp1);
1193 prp2 = le64_to_cpu(cmd->dptr.prp2);
1194
1195 return nvme_map_prp(n, sg, prp1, prp2, len);
1196 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1197 case NVME_PSDT_SGL_MPTR_SGL:
1198 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1199 default:
1200 return NVME_INVALID_FIELD;
1201 }
1202 }
1203
static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
1206 {
1207 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1208 hwaddr mptr = le64_to_cpu(cmd->mptr);
1209 uint16_t status;
1210
1211 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1212 NvmeSglDescriptor sgl;
1213
1214 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1215 return NVME_DATA_TRAS_ERROR;
1216 }
1217
1218 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1219 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1220 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1221 }
1222
1223 return status;
1224 }
1225
1226 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1227 status = nvme_map_addr(n, sg, mptr, len);
1228 if (status) {
1229 nvme_sg_unmap(sg);
1230 }
1231
1232 return status;
1233 }
1234
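/*
 * Map the user data of an I/O command. For extended LBA formats (metadata
 * interleaved with data), the DPTR covers both data and metadata, so the
 * mapping is split and only the data portion is kept in req->sg. The
 * exception is when PRACT is set and the metadata consists solely of the
 * protection information tuple, in which case no metadata is transferred.
 */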
static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1236 {
1237 NvmeNamespace *ns = req->ns;
1238 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1239 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1240 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1241 size_t len = nvme_l2b(ns, nlb);
1242 uint16_t status;
1243
1244 if (nvme_ns_ext(ns) &&
1245 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1246 NvmeSg sg;
1247
1248 len += nvme_m2b(ns, nlb);
1249
1250 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1251 if (status) {
1252 return status;
1253 }
1254
1255 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1256 nvme_sg_split(&sg, ns, &req->sg, NULL);
1257 nvme_sg_unmap(&sg);
1258
1259 return NVME_SUCCESS;
1260 }
1261
1262 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1263 }
1264
static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1266 {
1267 NvmeNamespace *ns = req->ns;
1268 size_t len = nvme_m2b(ns, nlb);
1269 uint16_t status;
1270
1271 if (nvme_ns_ext(ns)) {
1272 NvmeSg sg;
1273
1274 len += nvme_l2b(ns, nlb);
1275
1276 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1277 if (status) {
1278 return status;
1279 }
1280
1281 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1282 nvme_sg_split(&sg, ns, NULL, &req->sg);
1283 nvme_sg_unmap(&sg);
1284
1285 return NVME_SUCCESS;
1286 }
1287
1288 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1289 }
1290
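/*
 * Copy between `ptr` and an interleaved (extended LBA) mapping: starting at
 * `offset`, transfer `bytes` bytes at a time and then skip `skip_bytes`,
 * repeating until `len` bytes have been copied in the given direction.
 */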
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
1295 {
1296 hwaddr addr;
1297 uint32_t trans_len, count = bytes;
1298 bool dma = sg->flags & NVME_SG_DMA;
1299 int64_t sge_len;
1300 int sg_idx = 0;
1301 int ret;
1302
1303 assert(sg->flags & NVME_SG_ALLOC);
1304
1305 while (len) {
1306 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1307
1308 if (sge_len - offset < 0) {
1309 offset -= sge_len;
1310 sg_idx++;
1311 continue;
1312 }
1313
1314 if (sge_len == offset) {
1315 offset = 0;
1316 sg_idx++;
1317 continue;
1318 }
1319
1320 trans_len = MIN(len, count);
1321 trans_len = MIN(trans_len, sge_len - offset);
1322
1323 if (dma) {
1324 addr = sg->qsg.sg[sg_idx].base + offset;
1325 } else {
1326 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1327 }
1328
1329 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1330 ret = nvme_addr_read(n, addr, ptr, trans_len);
1331 } else {
1332 ret = nvme_addr_write(n, addr, ptr, trans_len);
1333 }
1334
1335 if (ret) {
1336 return NVME_DATA_TRAS_ERROR;
1337 }
1338
1339 ptr += trans_len;
1340 len -= trans_len;
1341 count -= trans_len;
1342 offset += trans_len;
1343
1344 if (count == 0) {
1345 count = bytes;
1346 offset += skip_bytes;
1347 }
1348 }
1349
1350 return NVME_SUCCESS;
1351 }
1352
static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
                        NvmeTxDirection dir)
1355 {
1356 assert(sg->flags & NVME_SG_ALLOC);
1357
1358 if (sg->flags & NVME_SG_DMA) {
1359 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1360 dma_addr_t residual;
1361
1362 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1363 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1364 } else {
1365 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1366 }
1367
1368 if (unlikely(residual)) {
1369 trace_pci_nvme_err_invalid_dma();
1370 return NVME_INVALID_FIELD | NVME_DNR;
1371 }
1372 } else {
1373 size_t bytes;
1374
1375 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1376 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1377 } else {
1378 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1379 }
1380
1381 if (unlikely(bytes != len)) {
1382 trace_pci_nvme_err_invalid_dma();
1383 return NVME_INVALID_FIELD | NVME_DNR;
1384 }
1385 }
1386
1387 return NVME_SUCCESS;
1388 }
1389
static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
1392 {
1393 uint16_t status;
1394
1395 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1396 if (status) {
1397 return status;
1398 }
1399
1400 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1401 }
1402
static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
1405 {
1406 uint16_t status;
1407
1408 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1409 if (status) {
1410 return status;
1411 }
1412
1413 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1414 }
1415
uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
1418 {
1419 NvmeNamespace *ns = req->ns;
1420 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1421 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1422 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1423
1424 if (nvme_ns_ext(ns) &&
1425 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1426 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1427 ns->lbaf.ms, 0, dir);
1428 }
1429
1430 return nvme_tx(n, &req->sg, ptr, len, dir);
1431 }
1432
uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
1435 {
1436 NvmeNamespace *ns = req->ns;
1437 uint16_t status;
1438
1439 if (nvme_ns_ext(ns)) {
1440 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1441 ns->lbasz, ns->lbasz, dir);
1442 }
1443
1444 nvme_sg_unmap(&req->sg);
1445
1446 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1447 if (status) {
1448 return status;
1449 }
1450
1451 return nvme_tx(n, &req->sg, ptr, len, dir);
1452 }
1453
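/*
 * Submit an asynchronous read on the block backend using the previously
 * mapped request SG list: DMA-capable mappings go through the scatter/gather
 * helpers, CMB/PMR mappings through the iovec path.
 */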
static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 uint32_t align, BlockCompletionFunc *cb,
                                 NvmeRequest *req)
1457 {
1458 assert(req->sg.flags & NVME_SG_ALLOC);
1459
1460 if (req->sg.flags & NVME_SG_DMA) {
1461 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1462 } else {
1463 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1464 }
1465 }
1466
static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  uint32_t align, BlockCompletionFunc *cb,
                                  NvmeRequest *req)
1470 {
1471 assert(req->sg.flags & NVME_SG_ALLOC);
1472
1473 if (req->sg.flags & NVME_SG_DMA) {
1474 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1475 } else {
1476 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1477 }
1478 }
1479
static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1481 {
1482 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1483
1484 stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1485 MEMTXATTRS_UNSPECIFIED);
1486 }
1487
static void nvme_update_cq_head(NvmeCQueue *cq)
1489 {
1490 ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1491 MEMTXATTRS_UNSPECIFIED);
1492
1493 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1494 }
1495
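/*
 * Bottom half that posts completion queue entries: for each finished request,
 * write the CQE at the current tail (honoring shadow doorbells when enabled),
 * advance the tail and recycle the request onto its submission queue's free
 * list. The interrupt is asserted if any entries remain unconsumed.
 */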
static void nvme_post_cqes(void *opaque)
1497 {
1498 NvmeCQueue *cq = opaque;
1499 NvmeCtrl *n = cq->ctrl;
1500 NvmeRequest *req, *next;
1501 bool pending = cq->head != cq->tail;
1502 int ret;
1503
1504 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1505 NvmeSQueue *sq;
1506 hwaddr addr;
1507
1508 if (n->dbbuf_enabled) {
1509 nvme_update_cq_eventidx(cq);
1510 nvme_update_cq_head(cq);
1511 }
1512
1513 if (nvme_cq_full(cq)) {
1514 break;
1515 }
1516
1517 sq = req->sq;
1518 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1519 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1520 req->cqe.sq_head = cpu_to_le16(sq->head);
1521 addr = cq->dma_addr + (cq->tail << NVME_CQES);
1522 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1523 sizeof(req->cqe));
1524 if (ret) {
1525 trace_pci_nvme_err_addr_write(addr);
1526 trace_pci_nvme_err_cfs();
1527 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1528 break;
1529 }
1530
1531 QTAILQ_REMOVE(&cq->req_list, req, entry);
1532
1533 nvme_inc_cq_tail(cq);
1534 nvme_sg_unmap(&req->sg);
1535
1536 if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
1537 qemu_bh_schedule(sq->bh);
1538 }
1539
1540 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1541 }
1542 if (cq->tail != cq->head) {
1543 if (cq->irq_enabled && !pending) {
1544 n->cq_pending++;
1545 }
1546
1547 nvme_irq_assert(n, cq);
1548 }
1549 }
1550
static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1552 {
1553 assert(cq->cqid == req->sq->cqid);
1554 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1555 le32_to_cpu(req->cqe.result),
1556 le32_to_cpu(req->cqe.dw1),
1557 req->status);
1558
1559 if (req->status) {
1560 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1561 req->status, req->cmd.opcode);
1562 }
1563
1564 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1565 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1566
1567 qemu_bh_schedule(cq->bh);
1568 }
1569
static void nvme_process_aers(void *opaque)
1571 {
1572 NvmeCtrl *n = opaque;
1573 NvmeAsyncEvent *event, *next;
1574
1575 trace_pci_nvme_process_aers(n->aer_queued);
1576
1577 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1578 NvmeRequest *req;
1579 NvmeAerResult *result;
1580
1581 /* can't post cqe if there is nothing to complete */
1582 if (!n->outstanding_aers) {
1583 trace_pci_nvme_no_outstanding_aers();
1584 break;
1585 }
1586
1587 /* ignore if masked (cqe posted, but event not cleared) */
1588 if (n->aer_mask & (1 << event->result.event_type)) {
1589 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1590 continue;
1591 }
1592
1593 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1594 n->aer_queued--;
1595
1596 n->aer_mask |= 1 << event->result.event_type;
1597 n->outstanding_aers--;
1598
1599 req = n->aer_reqs[n->outstanding_aers];
1600
1601 result = (NvmeAerResult *) &req->cqe.result;
1602 result->event_type = event->result.event_type;
1603 result->event_info = event->result.event_info;
1604 result->log_page = event->result.log_page;
1605 g_free(event);
1606
1607 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1608 result->log_page);
1609
1610 nvme_enqueue_req_completion(&n->admin_cq, req);
1611 }
1612 }
1613
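/*
 * Queue an asynchronous event and try to post it immediately. If
 * `aer_max_queued` events are already pending, the new event is dropped.
 */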
static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
1616 {
1617 NvmeAsyncEvent *event;
1618
1619 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1620
1621 if (n->aer_queued == n->params.aer_max_queued) {
1622 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1623 return;
1624 }
1625
1626 event = g_new(NvmeAsyncEvent, 1);
1627 event->result = (NvmeAerResult) {
1628 .event_type = event_type,
1629 .event_info = event_info,
1630 .log_page = log_page,
1631 };
1632
1633 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1634 n->aer_queued++;
1635
1636 nvme_process_aers(n);
1637 }
1638
static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1640 {
1641 uint8_t aer_info;
1642
    /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1644 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1645 return;
1646 }
1647
1648 switch (event) {
1649 case NVME_SMART_SPARE:
1650 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1651 break;
1652 case NVME_SMART_TEMPERATURE:
1653 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1654 break;
1655 case NVME_SMART_RELIABILITY:
1656 case NVME_SMART_MEDIA_READ_ONLY:
1657 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1658 case NVME_SMART_PMR_UNRELIABLE:
1659 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1660 break;
1661 default:
1662 return;
1663 }
1664
1665 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1666 }
1667
static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1669 {
1670 NvmeAsyncEvent *event, *next;
1671
1672 n->aer_mask &= ~(1 << event_type);
1673
1674 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1675 if (event->result.event_type == event_type) {
1676 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1677 n->aer_queued--;
1678 g_free(event);
1679 }
1680 }
1681 }
1682
static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1684 {
1685 uint8_t mdts = n->params.mdts;
1686
1687 if (mdts && len > n->page_size << mdts) {
1688 trace_pci_nvme_err_mdts(len);
1689 return NVME_INVALID_FIELD | NVME_DNR;
1690 }
1691
1692 return NVME_SUCCESS;
1693 }
1694
static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
1697 {
1698 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1699
1700 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1701 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1702 return NVME_LBA_RANGE | NVME_DNR;
1703 }
1704
1705 return NVME_SUCCESS;
1706 }
1707
static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb, int flags)
1710 {
1711 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1712
1713 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1714 int64_t offset = nvme_l2b(ns, slba);
1715 int ret;
1716
1717 /*
     * `pnum` holds the number of bytes after offset that share the same
     * allocation status as the byte at offset. If `pnum` is different from
1720 * `bytes`, we should check the allocation status of the next range and
1721 * continue this until all bytes have been checked.
1722 */
1723 do {
1724 bytes -= pnum;
1725
1726 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1727 if (ret < 0) {
1728 return ret;
1729 }
1730
1731
1732 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1733 !!(ret & BDRV_BLOCK_ZERO));
1734
1735 if (!(ret & flags)) {
1736 return 1;
1737 }
1738
1739 offset += pnum;
1740 } while (pnum != bytes);
1741
1742 return 0;
1743 }
1744
static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
1747 {
1748 int ret;
1749 Error *err = NULL;
1750
1751 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1752 if (ret) {
1753 if (ret < 0) {
1754 error_setg_errno(&err, -ret, "unable to get block status");
1755 error_report_err(err);
1756
1757 return NVME_INTERNAL_DEV_ERROR;
1758 }
1759
1760 return NVME_DULB;
1761 }
1762
1763 return NVME_SUCCESS;
1764 }
1765
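/*
 * Translate an LBA into a zone index: a shift when the zone size is a power
 * of two, otherwise a division.
 */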
static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1767 {
1768 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1769 slba / ns->zone_size;
1770 }
1771
static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1773 {
1774 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1775
1776 if (zone_idx >= ns->num_zones) {
1777 return NULL;
1778 }
1779
1780 return &ns->zone_array[zone_idx];
1781 }
1782
static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1784 {
1785 uint64_t zslba = zone->d.zslba;
1786
1787 switch (nvme_get_zone_state(zone)) {
1788 case NVME_ZONE_STATE_EMPTY:
1789 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1790 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1791 case NVME_ZONE_STATE_CLOSED:
1792 return NVME_SUCCESS;
1793 case NVME_ZONE_STATE_FULL:
1794 trace_pci_nvme_err_zone_is_full(zslba);
1795 return NVME_ZONE_FULL;
1796 case NVME_ZONE_STATE_OFFLINE:
1797 trace_pci_nvme_err_zone_is_offline(zslba);
1798 return NVME_ZONE_OFFLINE;
1799 case NVME_ZONE_STATE_READ_ONLY:
1800 trace_pci_nvme_err_zone_is_read_only(zslba);
1801 return NVME_ZONE_READ_ONLY;
1802 default:
1803 g_assert_not_reached();
1804 }
1805
1806 return NVME_INTERNAL_DEV_ERROR;
1807 }
1808
static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
1811 {
1812 uint64_t zcap = nvme_zone_wr_boundary(zone);
1813 uint16_t status;
1814
1815 status = nvme_check_zone_state_for_write(zone);
1816 if (status) {
1817 return status;
1818 }
1819
1820 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1821 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1822
1823 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1824 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1825 return NVME_ZONE_INVALID_WRITE;
1826 }
1827 } else {
1828 if (unlikely(slba != zone->w_ptr)) {
1829 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1830 zone->w_ptr);
1831 return NVME_ZONE_INVALID_WRITE;
1832 }
1833 }
1834
1835 if (unlikely((slba + nlb) > zcap)) {
1836 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1837 return NVME_ZONE_BOUNDARY_ERROR;
1838 }
1839
1840 return NVME_SUCCESS;
1841 }
1842
1843 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1844 {
1845 switch (nvme_get_zone_state(zone)) {
1846 case NVME_ZONE_STATE_EMPTY:
1847 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1848 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1849 case NVME_ZONE_STATE_FULL:
1850 case NVME_ZONE_STATE_CLOSED:
1851 case NVME_ZONE_STATE_READ_ONLY:
1852 return NVME_SUCCESS;
1853 case NVME_ZONE_STATE_OFFLINE:
1854 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1855 return NVME_ZONE_OFFLINE;
1856 default:
1857 g_assert_not_reached();
1858 }
1859
1860 return NVME_INTERNAL_DEV_ERROR;
1861 }
1862
1863 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1864 uint32_t nlb)
1865 {
1866 NvmeZone *zone;
1867 uint64_t bndry, end;
1868 uint16_t status;
1869
1870 zone = nvme_get_zone_by_slba(ns, slba);
1871 assert(zone);
1872
1873 bndry = nvme_zone_rd_boundary(ns, zone);
1874 end = slba + nlb;
1875
1876 status = nvme_check_zone_state_for_read(zone);
1877 if (status) {
1878 ;
1879 } else if (unlikely(end > bndry)) {
1880 if (!ns->params.cross_zone_read) {
1881 status = NVME_ZONE_BOUNDARY_ERROR;
1882 } else {
1883 /*
1884 * Read across zone boundary - check that all subsequent
1885 * zones that are being read have an appropriate state.
1886 */
1887 do {
1888 zone++;
1889 status = nvme_check_zone_state_for_read(zone);
1890 if (status) {
1891 break;
1892 }
1893 } while (end > nvme_zone_rd_boundary(ns, zone));
1894 }
1895 }
1896
1897 return status;
1898 }
1899
1900 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1901 {
1902 switch (nvme_get_zone_state(zone)) {
1903 case NVME_ZONE_STATE_FULL:
1904 return NVME_SUCCESS;
1905
1906 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1907 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1908 nvme_aor_dec_open(ns);
1909 /* fallthrough */
1910 case NVME_ZONE_STATE_CLOSED:
1911 nvme_aor_dec_active(ns);
1912
1913 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1914 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1915 if (ns->params.numzrwa) {
1916 ns->zns.numzrwa++;
1917 }
1918 }
1919
1920 /* fallthrough */
1921 case NVME_ZONE_STATE_EMPTY:
1922 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1923 return NVME_SUCCESS;
1924
1925 default:
1926 return NVME_ZONE_INVAL_TRANSITION;
1927 }
1928 }
1929
1930 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1931 {
1932 switch (nvme_get_zone_state(zone)) {
1933 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1934 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1935 nvme_aor_dec_open(ns);
1936 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1937 /* fall through */
1938 case NVME_ZONE_STATE_CLOSED:
1939 return NVME_SUCCESS;
1940
1941 default:
1942 return NVME_ZONE_INVAL_TRANSITION;
1943 }
1944 }
1945
1946 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1947 {
1948 switch (nvme_get_zone_state(zone)) {
1949 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1950 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1951 nvme_aor_dec_open(ns);
1952 /* fallthrough */
1953 case NVME_ZONE_STATE_CLOSED:
1954 nvme_aor_dec_active(ns);
1955
1956 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1957 if (ns->params.numzrwa) {
1958 ns->zns.numzrwa++;
1959 }
1960 }
1961
1962 /* fallthrough */
1963 case NVME_ZONE_STATE_FULL:
1964 zone->w_ptr = zone->d.zslba;
1965 zone->d.wp = zone->w_ptr;
1966 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1967 /* fallthrough */
1968 case NVME_ZONE_STATE_EMPTY:
1969 return NVME_SUCCESS;
1970
1971 default:
1972 return NVME_ZONE_INVAL_TRANSITION;
1973 }
1974 }
1975
1976 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1977 {
1978 NvmeZone *zone;
1979
1980 if (ns->params.max_open_zones &&
1981 ns->nr_open_zones == ns->params.max_open_zones) {
1982 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1983 if (zone) {
1984 /*
1985 * Automatically close this implicitly open zone.
1986 */
1987 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1988 nvme_zrm_close(ns, zone);
1989 }
1990 }
1991 }
1992
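/*
 * Flags for nvme_zrm_open_flags(). NVME_ZRM_AUTO marks an implicit open
 * triggered by a write (the zone transitions to Implicitly Opened);
 * NVME_ZRM_ZRWA additionally allocates a Zone Random Write Area for the
 * zone being opened.
 */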
1993 enum {
1994 NVME_ZRM_AUTO = 1 << 0,
1995 NVME_ZRM_ZRWA = 1 << 1,
1996 };
1997
1998 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1999 NvmeZone *zone, int flags)
2000 {
2001 int act = 0;
2002 uint16_t status;
2003
2004 switch (nvme_get_zone_state(zone)) {
2005 case NVME_ZONE_STATE_EMPTY:
2006 act = 1;
2007
2008 /* fallthrough */
2009
2010 case NVME_ZONE_STATE_CLOSED:
2011 if (n->params.auto_transition_zones) {
2012 nvme_zrm_auto_transition_zone(ns);
2013 }
2014 status = nvme_zns_check_resources(ns, act, 1,
2015 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2016 if (status) {
2017 return status;
2018 }
2019
2020 if (act) {
2021 nvme_aor_inc_active(ns);
2022 }
2023
2024 nvme_aor_inc_open(ns);
2025
2026 if (flags & NVME_ZRM_AUTO) {
2027 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2028 return NVME_SUCCESS;
2029 }
2030
2031 /* fallthrough */
2032
2033 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2034 if (flags & NVME_ZRM_AUTO) {
2035 return NVME_SUCCESS;
2036 }
2037
2038 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2039
2040 /* fallthrough */
2041
2042 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2043 if (flags & NVME_ZRM_ZRWA) {
2044 ns->zns.numzrwa--;
2045
2046 zone->d.za |= NVME_ZA_ZRWA_VALID;
2047 }
2048
2049 return NVME_SUCCESS;
2050
2051 default:
2052 return NVME_ZONE_INVAL_TRANSITION;
2053 }
2054 }
2055
2056 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2057 NvmeZone *zone)
2058 {
2059 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2060 }
2061
2062 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2063 uint32_t nlb)
2064 {
2065 zone->d.wp += nlb;
2066
2067 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2068 nvme_zrm_finish(ns, zone);
2069 }
2070 }
2071
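/*
 * Implicitly flush part of a Zone Random Write Area. The number of logical
 * blocks to flush is rounded up to a multiple of the ZRWA flush granularity
 * (zrwafg) before the zone write pointer is advanced.
 */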
2072 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2073 uint32_t nlbc)
2074 {
2075 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2076
2077 nlbc = nzrwafgs * ns->zns.zrwafg;
2078
2079 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2080
2081 zone->w_ptr += nlbc;
2082
2083 nvme_advance_zone_wp(ns, zone, nlbc);
2084 }
2085
2086 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2087 {
2088 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2089 NvmeZone *zone;
2090 uint64_t slba;
2091 uint32_t nlb;
2092
2093 slba = le64_to_cpu(rw->slba);
2094 nlb = le16_to_cpu(rw->nlb) + 1;
2095 zone = nvme_get_zone_by_slba(ns, slba);
2096 assert(zone);
2097
2098 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2099 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2100 uint64_t elba = slba + nlb - 1;
2101
2102 if (elba > ezrwa) {
2103 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2104 }
2105
2106 return;
2107 }
2108
2109 nvme_advance_zone_wp(ns, zone, nlb);
2110 }
2111
2112 static inline bool nvme_is_write(NvmeRequest *req)
2113 {
2114 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2115
2116 return rw->opcode == NVME_CMD_WRITE ||
2117 rw->opcode == NVME_CMD_ZONE_APPEND ||
2118 rw->opcode == NVME_CMD_WRITE_ZEROES;
2119 }
2120
2121 static void nvme_misc_cb(void *opaque, int ret)
2122 {
2123 NvmeRequest *req = opaque;
2124 uint16_t cid = nvme_cid(req);
2125
2126 trace_pci_nvme_misc_cb(cid);
2127
2128 if (ret) {
2129 if (!req->status) {
2130 req->status = NVME_INTERNAL_DEV_ERROR;
2131 }
2132
2133 trace_pci_nvme_err_aio(cid, strerror(-ret), req->status);
2134 }
2135
2136 nvme_enqueue_req_completion(nvme_cq(req), req);
2137 }
2138
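/*
 * Common completion path for read/write I/O. On error the errno is mapped
 * to an NVMe status code (Unrecovered Read Error or Write Fault), block
 * accounting is finalized, zoned writes advance the zone write pointer, and
 * the completion queue entry is posted.
 */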
2139 void nvme_rw_complete_cb(void *opaque, int ret)
2140 {
2141 NvmeRequest *req = opaque;
2142 NvmeNamespace *ns = req->ns;
2143 BlockBackend *blk = ns->blkconf.blk;
2144 BlockAcctCookie *acct = &req->acct;
2145 BlockAcctStats *stats = blk_get_stats(blk);
2146
2147 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2148
2149 if (ret) {
2150 Error *err = NULL;
2151
2152 block_acct_failed(stats, acct);
2153
2154 switch (req->cmd.opcode) {
2155 case NVME_CMD_READ:
2156 req->status = NVME_UNRECOVERED_READ;
2157 break;
2158
2159 case NVME_CMD_WRITE:
2160 case NVME_CMD_WRITE_ZEROES:
2161 case NVME_CMD_ZONE_APPEND:
2162 req->status = NVME_WRITE_FAULT;
2163 break;
2164
2165 default:
2166 req->status = NVME_INTERNAL_DEV_ERROR;
2167 break;
2168 }
2169
2170 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2171
2172 error_setg_errno(&err, -ret, "aio failed");
2173 error_report_err(err);
2174 } else {
2175 block_acct_done(stats, acct);
2176 }
2177
2178 if (ns->params.zoned && nvme_is_write(req)) {
2179 nvme_finalize_zoned_write(ns, req);
2180 }
2181
2182 nvme_enqueue_req_completion(nvme_cq(req), req);
2183 }
2184
2185 static void nvme_rw_cb(void *opaque, int ret)
2186 {
2187 NvmeRequest *req = opaque;
2188 NvmeNamespace *ns = req->ns;
2189
2190 BlockBackend *blk = ns->blkconf.blk;
2191
2192 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2193
2194 if (ret) {
2195 goto out;
2196 }
2197
2198 if (ns->lbaf.ms) {
2199 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2200 uint64_t slba = le64_to_cpu(rw->slba);
2201 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2202 uint64_t offset = nvme_moff(ns, slba);
2203
2204 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2205 size_t mlen = nvme_m2b(ns, nlb);
2206
2207 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2208 BDRV_REQ_MAY_UNMAP,
2209 nvme_rw_complete_cb, req);
2210 return;
2211 }
2212
2213 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2214 uint16_t status;
2215
2216 nvme_sg_unmap(&req->sg);
2217 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2218 if (status) {
2219 ret = -EFAULT;
2220 goto out;
2221 }
2222
2223 if (req->cmd.opcode == NVME_CMD_READ) {
2224 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2225 }
2226
2227 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2228 }
2229 }
2230
2231 out:
2232 nvme_rw_complete_cb(req, ret);
2233 }
2234
2235 static void nvme_verify_cb(void *opaque, int ret)
2236 {
2237 NvmeBounceContext *ctx = opaque;
2238 NvmeRequest *req = ctx->req;
2239 NvmeNamespace *ns = req->ns;
2240 BlockBackend *blk = ns->blkconf.blk;
2241 BlockAcctCookie *acct = &req->acct;
2242 BlockAcctStats *stats = blk_get_stats(blk);
2243 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2244 uint64_t slba = le64_to_cpu(rw->slba);
2245 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2246 uint16_t apptag = le16_to_cpu(rw->apptag);
2247 uint16_t appmask = le16_to_cpu(rw->appmask);
2248 uint64_t reftag = le32_to_cpu(rw->reftag);
2249 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2250 uint16_t status;
2251
2252 reftag |= cdw3 << 32;
2253
2254 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2255
2256 if (ret) {
2257 block_acct_failed(stats, acct);
2258 req->status = NVME_UNRECOVERED_READ;
2259
2260 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2261
2262 goto out;
2263 }
2264
2265 block_acct_done(stats, acct);
2266
2267 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2268 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2269 ctx->mdata.iov.size, slba);
2270 if (status) {
2271 req->status = status;
2272 goto out;
2273 }
2274
2275 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2276 ctx->mdata.bounce, ctx->mdata.iov.size,
2277 prinfo, slba, apptag, appmask, &reftag);
2278 }
2279
2280 out:
2281 qemu_iovec_destroy(&ctx->data.iov);
2282 g_free(ctx->data.bounce);
2283
2284 qemu_iovec_destroy(&ctx->mdata.iov);
2285 g_free(ctx->mdata.bounce);
2286
2287 g_free(ctx);
2288
2289 nvme_enqueue_req_completion(nvme_cq(req), req);
2290 }
2291
2292
2293 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2294 {
2295 NvmeBounceContext *ctx = opaque;
2296 NvmeRequest *req = ctx->req;
2297 NvmeNamespace *ns = req->ns;
2298 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2299 uint64_t slba = le64_to_cpu(rw->slba);
2300 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2301 size_t mlen = nvme_m2b(ns, nlb);
2302 uint64_t offset = nvme_moff(ns, slba);
2303 BlockBackend *blk = ns->blkconf.blk;
2304
2305 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2306
2307 if (ret) {
2308 goto out;
2309 }
2310
2311 ctx->mdata.bounce = g_malloc(mlen);
2312
2313 qemu_iovec_reset(&ctx->mdata.iov);
2314 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2315
2316 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2317 nvme_verify_cb, ctx);
2318 return;
2319
2320 out:
2321 nvme_verify_cb(ctx, ret);
2322 }
2323
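/*
 * Bounce buffers for the Compare command: the data (and, if present,
 * metadata) on the medium is read into these buffers and then compared
 * against the data transferred from the host.
 */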
2324 struct nvme_compare_ctx {
2325 struct {
2326 QEMUIOVector iov;
2327 uint8_t *bounce;
2328 } data;
2329
2330 struct {
2331 QEMUIOVector iov;
2332 uint8_t *bounce;
2333 } mdata;
2334 };
2335
2336 static void nvme_compare_mdata_cb(void *opaque, int ret)
2337 {
2338 NvmeRequest *req = opaque;
2339 NvmeNamespace *ns = req->ns;
2340 NvmeCtrl *n = nvme_ctrl(req);
2341 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2342 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2343 uint16_t apptag = le16_to_cpu(rw->apptag);
2344 uint16_t appmask = le16_to_cpu(rw->appmask);
2345 uint64_t reftag = le32_to_cpu(rw->reftag);
2346 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2347 struct nvme_compare_ctx *ctx = req->opaque;
2348 g_autofree uint8_t *buf = NULL;
2349 BlockBackend *blk = ns->blkconf.blk;
2350 BlockAcctCookie *acct = &req->acct;
2351 BlockAcctStats *stats = blk_get_stats(blk);
2352 uint16_t status = NVME_SUCCESS;
2353
2354 reftag |= cdw3 << 32;
2355
2356 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2357
2358 if (ret) {
2359 block_acct_failed(stats, acct);
2360 req->status = NVME_UNRECOVERED_READ;
2361
2362 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2363
2364 goto out;
2365 }
2366
2367 buf = g_malloc(ctx->mdata.iov.size);
2368
2369 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2370 NVME_TX_DIRECTION_TO_DEVICE, req);
2371 if (status) {
2372 req->status = status;
2373 goto out;
2374 }
2375
2376 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2377 uint64_t slba = le64_to_cpu(rw->slba);
2378 uint8_t *bufp;
2379 uint8_t *mbufp = ctx->mdata.bounce;
2380 uint8_t *end = mbufp + ctx->mdata.iov.size;
2381 int16_t pil = 0;
2382
2383 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2384 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2385 slba, apptag, appmask, &reftag);
2386 if (status) {
2387 req->status = status;
2388 goto out;
2389 }
2390
2391 /*
2392 * When formatted with protection information, do not compare the DIF
2393 * tuple.
2394 */
2395 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2396 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2397 }
2398
2399 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2400 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2401 req->status = NVME_CMP_FAILURE | NVME_DNR;
2402 goto out;
2403 }
2404 }
2405
2406 goto out;
2407 }
2408
2409 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2410 req->status = NVME_CMP_FAILURE | NVME_DNR;
2411 goto out;
2412 }
2413
2414 block_acct_done(stats, acct);
2415
2416 out:
2417 qemu_iovec_destroy(&ctx->data.iov);
2418 g_free(ctx->data.bounce);
2419
2420 qemu_iovec_destroy(&ctx->mdata.iov);
2421 g_free(ctx->mdata.bounce);
2422
2423 g_free(ctx);
2424
2425 nvme_enqueue_req_completion(nvme_cq(req), req);
2426 }
2427
2428 static void nvme_compare_data_cb(void *opaque, int ret)
2429 {
2430 NvmeRequest *req = opaque;
2431 NvmeCtrl *n = nvme_ctrl(req);
2432 NvmeNamespace *ns = req->ns;
2433 BlockBackend *blk = ns->blkconf.blk;
2434 BlockAcctCookie *acct = &req->acct;
2435 BlockAcctStats *stats = blk_get_stats(blk);
2436
2437 struct nvme_compare_ctx *ctx = req->opaque;
2438 g_autofree uint8_t *buf = NULL;
2439 uint16_t status;
2440
2441 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2442
2443 if (ret) {
2444 block_acct_failed(stats, acct);
2445 req->status = NVME_UNRECOVERED_READ;
2446
2447 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2448
2449 goto out;
2450 }
2451
2452 buf = g_malloc(ctx->data.iov.size);
2453
2454 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2455 NVME_TX_DIRECTION_TO_DEVICE, req);
2456 if (status) {
2457 req->status = status;
2458 goto out;
2459 }
2460
2461 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2462 req->status = NVME_CMP_FAILURE | NVME_DNR;
2463 goto out;
2464 }
2465
2466 if (ns->lbaf.ms) {
2467 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2468 uint64_t slba = le64_to_cpu(rw->slba);
2469 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2470 size_t mlen = nvme_m2b(ns, nlb);
2471 uint64_t offset = nvme_moff(ns, slba);
2472
2473 ctx->mdata.bounce = g_malloc(mlen);
2474
2475 qemu_iovec_init(&ctx->mdata.iov, 1);
2476 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2477
2478 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2479 nvme_compare_mdata_cb, req);
2480 return;
2481 }
2482
2483 block_acct_done(stats, acct);
2484
2485 out:
2486 qemu_iovec_destroy(&ctx->data.iov);
2487 g_free(ctx->data.bounce);
2488 g_free(ctx);
2489
2490 nvme_enqueue_req_completion(nvme_cq(req), req);
2491 }
2492
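/*
 * State for the Dataset Management (deallocate) AIO chain. nvme_dsm_cb()
 * discards one range at a time; nvme_dsm_md_cb() additionally zeroes the
 * corresponding metadata once the discarded blocks read back as zeroes.
 */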
2493 typedef struct NvmeDSMAIOCB {
2494 BlockAIOCB common;
2495 BlockAIOCB *aiocb;
2496 NvmeRequest *req;
2497 int ret;
2498
2499 NvmeDsmRange *range;
2500 unsigned int nr;
2501 unsigned int idx;
2502 } NvmeDSMAIOCB;
2503
2504 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2505 {
2506 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2507
2508 /* break nvme_dsm_cb loop */
2509 iocb->idx = iocb->nr;
2510 iocb->ret = -ECANCELED;
2511
2512 if (iocb->aiocb) {
2513 blk_aio_cancel_async(iocb->aiocb);
2514 iocb->aiocb = NULL;
2515 } else {
2516 /*
2517 * We only reach this if nvme_dsm_cancel() has already been called or
2518 * the command ran to completion.
2519 */
2520 assert(iocb->idx == iocb->nr);
2521 }
2522 }
2523
2524 static const AIOCBInfo nvme_dsm_aiocb_info = {
2525 .aiocb_size = sizeof(NvmeDSMAIOCB),
2526 .cancel_async = nvme_dsm_cancel,
2527 };
2528
2529 static void nvme_dsm_cb(void *opaque, int ret);
2530
2531 static void nvme_dsm_md_cb(void *opaque, int ret)
2532 {
2533 NvmeDSMAIOCB *iocb = opaque;
2534 NvmeRequest *req = iocb->req;
2535 NvmeNamespace *ns = req->ns;
2536 NvmeDsmRange *range;
2537 uint64_t slba;
2538 uint32_t nlb;
2539
2540 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2541 goto done;
2542 }
2543
2544 range = &iocb->range[iocb->idx - 1];
2545 slba = le64_to_cpu(range->slba);
2546 nlb = le32_to_cpu(range->nlb);
2547
2548 /*
2549 * Check that all blocks were discarded (zeroed); otherwise we do not zero
2550 * the metadata.
2551 */
2552
2553 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2554 if (ret) {
2555 if (ret < 0) {
2556 goto done;
2557 }
2558
2559 nvme_dsm_cb(iocb, 0);
2560 return;
2561 }
2562
2563 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2564 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2565 nvme_dsm_cb, iocb);
2566 return;
2567
2568 done:
2569 nvme_dsm_cb(iocb, ret);
2570 }
2571
2572 static void nvme_dsm_cb(void *opaque, int ret)
2573 {
2574 NvmeDSMAIOCB *iocb = opaque;
2575 NvmeRequest *req = iocb->req;
2576 NvmeCtrl *n = nvme_ctrl(req);
2577 NvmeNamespace *ns = req->ns;
2578 NvmeDsmRange *range;
2579 uint64_t slba;
2580 uint32_t nlb;
2581
2582 if (iocb->ret < 0) {
2583 goto done;
2584 } else if (ret < 0) {
2585 iocb->ret = ret;
2586 goto done;
2587 }
2588
2589 next:
2590 if (iocb->idx == iocb->nr) {
2591 goto done;
2592 }
2593
2594 range = &iocb->range[iocb->idx++];
2595 slba = le64_to_cpu(range->slba);
2596 nlb = le32_to_cpu(range->nlb);
2597
2598 trace_pci_nvme_dsm_deallocate(slba, nlb);
2599
2600 if (nlb > n->dmrsl) {
2601 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2602 goto next;
2603 }
2604
2605 if (nvme_check_bounds(ns, slba, nlb)) {
2606 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2607 ns->id_ns.nsze);
2608 goto next;
2609 }
2610
2611 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2612 nvme_l2b(ns, nlb),
2613 nvme_dsm_md_cb, iocb);
2614 return;
2615
2616 done:
2617 iocb->aiocb = NULL;
2618 iocb->common.cb(iocb->common.opaque, iocb->ret);
2619 g_free(iocb->range);
2620 qemu_aio_unref(iocb);
2621 }
2622
2623 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2624 {
2625 NvmeNamespace *ns = req->ns;
2626 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2627 uint32_t attr = le32_to_cpu(dsm->attributes);
2628 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2629 uint16_t status = NVME_SUCCESS;
2630
2631 trace_pci_nvme_dsm(nr, attr);
2632
2633 if (attr & NVME_DSMGMT_AD) {
2634 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2635 nvme_misc_cb, req);
2636
2637 iocb->req = req;
2638 iocb->ret = 0;
2639 iocb->range = g_new(NvmeDsmRange, nr);
2640 iocb->nr = nr;
2641 iocb->idx = 0;
2642
2643 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2644 req);
2645 if (status) {
2646 g_free(iocb->range);
2647 qemu_aio_unref(iocb);
2648
2649 return status;
2650 }
2651
2652 req->aiocb = &iocb->common;
2653 nvme_dsm_cb(iocb, 0);
2654
2655 return NVME_NO_COMPLETE;
2656 }
2657
2658 return status;
2659 }
2660
2661 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2662 {
2663 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2664 NvmeNamespace *ns = req->ns;
2665 BlockBackend *blk = ns->blkconf.blk;
2666 uint64_t slba = le64_to_cpu(rw->slba);
2667 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2668 size_t len = nvme_l2b(ns, nlb);
2669 size_t data_len = len;
2670 int64_t offset = nvme_l2b(ns, slba);
2671 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2672 uint32_t reftag = le32_to_cpu(rw->reftag);
2673 NvmeBounceContext *ctx = NULL;
2674 uint16_t status;
2675
2676 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2677
2678 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2679 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2680 if (status) {
2681 return status;
2682 }
2683
2684 if (prinfo & NVME_PRINFO_PRACT) {
2685 return NVME_INVALID_PROT_INFO | NVME_DNR;
2686 }
2687 }
2688
2689 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
2690 data_len += nvme_m2b(ns, nlb);
2691 }
2692
2693 if (data_len > (n->page_size << n->params.vsl)) {
2694 return NVME_INVALID_FIELD | NVME_DNR;
2695 }
2696
2697 status = nvme_check_bounds(ns, slba, nlb);
2698 if (status) {
2699 return status;
2700 }
2701
2702 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2703 status = nvme_check_dulbe(ns, slba, nlb);
2704 if (status) {
2705 return status;
2706 }
2707 }
2708
2709 ctx = g_new0(NvmeBounceContext, 1);
2710 ctx->req = req;
2711
2712 ctx->data.bounce = g_malloc(len);
2713
2714 qemu_iovec_init(&ctx->data.iov, 1);
2715 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2716
2717 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2718 BLOCK_ACCT_READ);
2719
2720 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2721 nvme_verify_mdata_in_cb, ctx);
2722 return NVME_NO_COMPLETE;
2723 }
2724
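/*
 * State for the Copy command AIO chain. nvme_do_copy() reads one source
 * range into the bounce buffer, nvme_copy_in_cb() reads its metadata,
 * nvme_copy_in_completed_cb() runs protection information and zone checks
 * and writes the data, nvme_copy_out_cb() writes the metadata, and
 * nvme_copy_out_completed_cb() advances to the next range.
 */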
2725 typedef struct NvmeCopyAIOCB {
2726 BlockAIOCB common;
2727 BlockAIOCB *aiocb;
2728 NvmeRequest *req;
2729 NvmeCtrl *n;
2730 int ret;
2731
2732 void *ranges;
2733 unsigned int format;
2734 int nr;
2735 int idx;
2736
2737 uint8_t *bounce;
2738 QEMUIOVector iov;
2739 struct {
2740 BlockAcctCookie read;
2741 BlockAcctCookie write;
2742 } acct;
2743
2744 uint64_t reftag;
2745 uint64_t slba;
2746
2747 NvmeZone *zone;
2748 NvmeNamespace *sns;
2749 uint32_t tcl;
2750 } NvmeCopyAIOCB;
2751
2752 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2753 {
2754 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2755
2756 iocb->ret = -ECANCELED;
2757
2758 if (iocb->aiocb) {
2759 blk_aio_cancel_async(iocb->aiocb);
2760 iocb->aiocb = NULL;
2761 }
2762 }
2763
2764 static const AIOCBInfo nvme_copy_aiocb_info = {
2765 .aiocb_size = sizeof(NvmeCopyAIOCB),
2766 .cancel_async = nvme_copy_cancel,
2767 };
2768
2769 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2770 {
2771 NvmeRequest *req = iocb->req;
2772 NvmeNamespace *ns = req->ns;
2773 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2774
2775 if (iocb->idx != iocb->nr) {
2776 req->cqe.result = cpu_to_le32(iocb->idx);
2777 }
2778
2779 qemu_iovec_destroy(&iocb->iov);
2780 g_free(iocb->bounce);
2781
2782 if (iocb->ret < 0) {
2783 block_acct_failed(stats, &iocb->acct.read);
2784 block_acct_failed(stats, &iocb->acct.write);
2785 } else {
2786 block_acct_done(stats, &iocb->acct.read);
2787 block_acct_done(stats, &iocb->acct.write);
2788 }
2789
2790 iocb->common.cb(iocb->common.opaque, iocb->ret);
2791 qemu_aio_unref(iocb);
2792 }
2793
2794 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2795
2796 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2797 int idx, uint64_t *slba,
2798 uint32_t *nlb,
2799 uint32_t *snsid,
2800 uint16_t *apptag,
2801 uint16_t *appmask,
2802 uint64_t *reftag)
2803 {
2804 NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2805
2806 if (snsid) {
2807 *snsid = le32_to_cpu(_ranges[idx].sparams);
2808 }
2809
2810 if (slba) {
2811 *slba = le64_to_cpu(_ranges[idx].slba);
2812 }
2813
2814 if (nlb) {
2815 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2816 }
2817
2818 if (apptag) {
2819 *apptag = le16_to_cpu(_ranges[idx].apptag);
2820 }
2821
2822 if (appmask) {
2823 *appmask = le16_to_cpu(_ranges[idx].appmask);
2824 }
2825
2826 if (reftag) {
2827 *reftag = le32_to_cpu(_ranges[idx].reftag);
2828 }
2829 }
2830
2831 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2832 uint64_t *slba,
2833 uint32_t *nlb,
2834 uint32_t *snsid,
2835 uint16_t *apptag,
2836 uint16_t *appmask,
2837 uint64_t *reftag)
2838 {
2839 NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2840
2841 if (snsid) {
2842 *snsid = le32_to_cpu(_ranges[idx].sparams);
2843 }
2844
2845 if (slba) {
2846 *slba = le64_to_cpu(_ranges[idx].slba);
2847 }
2848
2849 if (nlb) {
2850 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2851 }
2852
2853 if (apptag) {
2854 *apptag = le16_to_cpu(_ranges[idx].apptag);
2855 }
2856
2857 if (appmask) {
2858 *appmask = le16_to_cpu(_ranges[idx].appmask);
2859 }
2860
2861 if (reftag) {
2862 *reftag = 0;
2863
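/* assemble the 48-bit reference tag from bytes 4..9 of the descriptor */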
2864 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2865 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2866 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2867 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2868 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2869 *reftag |= (uint64_t)_ranges[idx].sr[9];
2870 }
2871 }
2872
2873 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2874 uint64_t *slba, uint32_t *nlb,
2875 uint32_t *snsid, uint16_t *apptag,
2876 uint16_t *appmask, uint64_t *reftag)
2877 {
2878 switch (format) {
2879 case NVME_COPY_FORMAT_0:
2880 case NVME_COPY_FORMAT_2:
2881 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2882 apptag, appmask, reftag);
2883 break;
2884
2885 case NVME_COPY_FORMAT_1:
2886 case NVME_COPY_FORMAT_3:
2887 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2888 apptag, appmask, reftag);
2889 break;
2890
2891 default:
2892 abort();
2893 }
2894 }
2895
2896 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2897 NvmeCopyAIOCB *iocb, uint16_t nr)
2898 {
2899 uint32_t copy_len = 0;
2900
2901 for (int idx = 0; idx < nr; idx++) {
2902 uint32_t nlb;
2903 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2904 &nlb, NULL, NULL, NULL, NULL);
2905 copy_len += nlb;
2906 }
2907 iocb->tcl = copy_len;
2908 if (copy_len > ns->id_ns.mcl) {
2909 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2910 }
2911
2912 return NVME_SUCCESS;
2913 }
2914
2915 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2916 {
2917 NvmeCopyAIOCB *iocb = opaque;
2918 NvmeRequest *req = iocb->req;
2919 NvmeNamespace *dns = req->ns;
2920 uint32_t nlb;
2921
2922 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2923 &nlb, NULL, NULL, NULL, NULL);
2924
2925 if (ret < 0) {
2926 iocb->ret = ret;
2927 req->status = NVME_WRITE_FAULT;
2928 goto out;
2929 } else if (iocb->ret < 0) {
2930 goto out;
2931 }
2932
2933 if (dns->params.zoned) {
2934 nvme_advance_zone_wp(dns, iocb->zone, nlb);
2935 }
2936
2937 iocb->idx++;
2938 iocb->slba += nlb;
2939 out:
2940 nvme_do_copy(iocb);
2941 }
2942
2943 static void nvme_copy_out_cb(void *opaque, int ret)
2944 {
2945 NvmeCopyAIOCB *iocb = opaque;
2946 NvmeRequest *req = iocb->req;
2947 NvmeNamespace *dns = req->ns;
2948 uint32_t nlb;
2949 size_t mlen;
2950 uint8_t *mbounce;
2951
2952 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2953 goto out;
2954 }
2955
2956 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2957 &nlb, NULL, NULL, NULL, NULL);
2958
2959 mlen = nvme_m2b(dns, nlb);
2960 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2961
2962 qemu_iovec_reset(&iocb->iov);
2963 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2964
2965 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2966 &iocb->iov, 0, nvme_copy_out_completed_cb,
2967 iocb);
2968
2969 return;
2970
2971 out:
2972 nvme_copy_out_completed_cb(iocb, ret);
2973 }
2974
2975 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2976 {
2977 NvmeCopyAIOCB *iocb = opaque;
2978 NvmeRequest *req = iocb->req;
2979 NvmeNamespace *sns = iocb->sns;
2980 NvmeNamespace *dns = req->ns;
2981 NvmeCopyCmd *copy = NULL;
2982 uint8_t *mbounce = NULL;
2983 uint32_t nlb;
2984 uint64_t slba;
2985 uint16_t apptag, appmask;
2986 uint64_t reftag;
2987 size_t len, mlen;
2988 uint16_t status;
2989
2990 if (ret < 0) {
2991 iocb->ret = ret;
2992 req->status = NVME_UNRECOVERED_READ;
2993 goto out;
2994 } else if (iocb->ret < 0) {
2995 goto out;
2996 }
2997
2998 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2999 &nlb, NULL, &apptag, &appmask, &reftag);
3000
3001 trace_pci_nvme_copy_out(iocb->slba, nlb);
3002
3003 len = nvme_l2b(sns, nlb);
3004
3005 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
3006 copy = (NvmeCopyCmd *)&req->cmd;
3007
3008 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3009
3010 mlen = nvme_m2b(sns, nlb);
3011 mbounce = iocb->bounce + nvme_l2b(sns, nlb);
3012
3013 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
3014 if (status) {
3015 goto invalid;
3016 }
3017 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
3018 slba, apptag, appmask, &reftag);
3019 if (status) {
3020 goto invalid;
3021 }
3022 }
3023
3024 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3025 copy = (NvmeCopyCmd *)&req->cmd;
3026 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3027
3028 mlen = nvme_m2b(dns, nlb);
3029 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3030
3031 apptag = le16_to_cpu(copy->apptag);
3032 appmask = le16_to_cpu(copy->appmask);
3033
3034 if (prinfow & NVME_PRINFO_PRACT) {
3035 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3036 if (status) {
3037 goto invalid;
3038 }
3039
3040 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3041 apptag, &iocb->reftag);
3042 } else {
3043 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3044 prinfow, iocb->slba, apptag, appmask,
3045 &iocb->reftag);
3046 if (status) {
3047 goto invalid;
3048 }
3049 }
3050 }
3051
3052 status = nvme_check_bounds(dns, iocb->slba, nlb);
3053 if (status) {
3054 goto invalid;
3055 }
3056
3057 if (dns->params.zoned) {
3058 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3059 if (status) {
3060 goto invalid;
3061 }
3062
3063 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3064 iocb->zone->w_ptr += nlb;
3065 }
3066 }
3067
3068 qemu_iovec_reset(&iocb->iov);
3069 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3070
3071 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3072 BLOCK_ACCT_WRITE);
3073
3074 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3075 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3076
3077 return;
3078
3079 invalid:
3080 req->status = status;
3081 iocb->ret = -1;
3082 out:
3083 nvme_do_copy(iocb);
3084 }
3085
3086 static void nvme_copy_in_cb(void *opaque, int ret)
3087 {
3088 NvmeCopyAIOCB *iocb = opaque;
3089 NvmeNamespace *sns = iocb->sns;
3090 uint64_t slba;
3091 uint32_t nlb;
3092
3093 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3094 goto out;
3095 }
3096
3097 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3098 &nlb, NULL, NULL, NULL, NULL);
3099
3100 qemu_iovec_reset(&iocb->iov);
3101 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3102 nvme_m2b(sns, nlb));
3103
3104 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3105 &iocb->iov, 0, nvme_copy_in_completed_cb,
3106 iocb);
3107 return;
3108
3109 out:
3110 nvme_copy_in_completed_cb(iocb, ret);
3111 }
3112
3113 static inline bool nvme_csi_supports_copy(uint8_t csi)
3114 {
3115 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3116 }
3117
3118 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3119 NvmeNamespace *dns)
3120 {
3121 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3122 }
3123
3124 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3125 bool pi_enable)
3126 {
3127 if (!nvme_csi_supports_copy(sns->csi) ||
3128 !nvme_csi_supports_copy(dns->csi)) {
3129 return false;
3130 }
3131
3132 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3133 return false;
3134 }
3135
3136 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3137 sns->id_ns.dps != dns->id_ns.dps)) {
3138 return false;
3139 }
3140
3141 return true;
3142 }
3143
3144 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3145 NvmeNamespace *dns)
3146 {
3147 return sns->lbaf.ms == 0 &&
3148 ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3149 (dns->lbaf.ms == 16 && dns->pif == 1));
3150 }
3151
3152 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3153 bool sns_pi_en)
3154 {
3155 if (!nvme_csi_supports_copy(sns->csi) ||
3156 !nvme_csi_supports_copy(dns->csi)) {
3157 return false;
3158 }
3159
3160 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3161 return false;
3162 }
3163
3164 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3165 return false;
3166 }
3167
3168 return true;
3169 }
3170
3171 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3172 {
3173 NvmeRequest *req = iocb->req;
3174 NvmeNamespace *sns;
3175 NvmeNamespace *dns = req->ns;
3176 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3177 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3178 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3179 uint64_t slba;
3180 uint32_t nlb;
3181 size_t len;
3182 uint16_t status;
3183 uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3184 uint32_t snsid = dnsid;
3185
3186 if (iocb->ret < 0) {
3187 goto done;
3188 }
3189
3190 if (iocb->idx == iocb->nr) {
3191 goto done;
3192 }
3193
3194 if (iocb->format == 2 || iocb->format == 3) {
3195 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3196 &slba, &nlb, &snsid, NULL, NULL, NULL);
3197 if (snsid != dnsid) {
3198 if (snsid == NVME_NSID_BROADCAST ||
3199 !nvme_nsid_valid(iocb->n, snsid)) {
3200 status = NVME_INVALID_NSID | NVME_DNR;
3201 goto invalid;
3202 }
3203 iocb->sns = nvme_ns(iocb->n, snsid);
3204 if (unlikely(!iocb->sns)) {
3205 status = NVME_INVALID_FIELD | NVME_DNR;
3206 goto invalid;
3207 }
3208 } else {
3209 if (((slba + nlb) > iocb->slba) &&
3210 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3211 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3212 goto invalid;
3213 }
3214 }
3215 } else {
3216 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3217 &slba, &nlb, NULL, NULL, NULL, NULL);
3218 }
3219
3220 sns = iocb->sns;
3221 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3222 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3223 status = NVME_INVALID_FIELD | NVME_DNR;
3224 goto invalid;
3225 } else if (snsid != dnsid) {
3226 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3227 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3228 if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3229 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3230 goto invalid;
3231 }
3232 }
3233 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3234 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3235 if ((prinfor & NVME_PRINFO_PRACT) !=
3236 (prinfow & NVME_PRINFO_PRACT)) {
3237 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3238 goto invalid;
3239 } else {
3240 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3241 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3242 goto invalid;
3243 }
3244 }
3245 }
3246
3247 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3248 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3249 if (!(prinfow & NVME_PRINFO_PRACT)) {
3250 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3251 goto invalid;
3252 } else {
3253 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3254 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3255 goto invalid;
3256 }
3257 }
3258 }
3259
3260 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3261 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3262 if (!(prinfor & NVME_PRINFO_PRACT)) {
3263 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3264 goto invalid;
3265 } else {
3266 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3267 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3268 goto invalid;
3269 }
3270 }
3271 }
3272 }
3273 len = nvme_l2b(sns, nlb);
3274
3275 trace_pci_nvme_copy_source_range(slba, nlb);
3276
3277 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3278 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3279 goto invalid;
3280 }
3281
3282 status = nvme_check_bounds(sns, slba, nlb);
3283 if (status) {
3284 goto invalid;
3285 }
3286
3287 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3288 status = nvme_check_dulbe(sns, slba, nlb);
3289 if (status) {
3290 goto invalid;
3291 }
3292 }
3293
3294 if (sns->params.zoned) {
3295 status = nvme_check_zone_read(sns, slba, nlb);
3296 if (status) {
3297 goto invalid;
3298 }
3299 }
3300
3301 g_free(iocb->bounce);
3302 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3303 sns->lbasz + sns->lbaf.ms);
3304
3305 qemu_iovec_reset(&iocb->iov);
3306 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3307
3308 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3309 BLOCK_ACCT_READ);
3310
3311 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3312 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3313 return;
3314
3315 invalid:
3316 req->status = status;
3317 iocb->ret = -1;
3318 done:
3319 nvme_copy_done(iocb);
3320 }
3321
3322 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3323 {
3324 NvmeNamespace *ns = req->ns;
3325 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3326 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3327 nvme_misc_cb, req);
3328 uint16_t nr = copy->nr + 1;
3329 uint8_t format = copy->control[0] & 0xf;
3330 size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3331
3332 uint16_t status;
3333
3334 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3335
3336 iocb->ranges = NULL;
3337 iocb->zone = NULL;
3338
3339 if (!(n->id_ctrl.ocfs & (1 << format)) ||
3340 ((format == 2 || format == 3) &&
3341 !(n->features.hbs.cdfe & (1 << format)))) {
3342 trace_pci_nvme_err_copy_invalid_format(format);
3343 status = NVME_INVALID_FIELD | NVME_DNR;
3344 goto invalid;
3345 }
3346
3347 if (nr > ns->id_ns.msrc + 1) {
3348 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3349 goto invalid;
3350 }
3351
3352 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3353 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3354 status = NVME_INVALID_FORMAT | NVME_DNR;
3355 goto invalid;
3356 }
3357
3358 if (ns->pif) {
3359 len = sizeof(NvmeCopySourceRangeFormat1_3);
3360 }
3361
3362 iocb->format = format;
3363 iocb->ranges = g_malloc_n(nr, len);
3364 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3365 if (status) {
3366 goto invalid;
3367 }
3368
3369 iocb->slba = le64_to_cpu(copy->sdlba);
3370
3371 if (ns->params.zoned) {
3372 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3373 if (!iocb->zone) {
3374 status = NVME_LBA_RANGE | NVME_DNR;
3375 goto invalid;
3376 }
3377
3378 status = nvme_zrm_auto(n, ns, iocb->zone);
3379 if (status) {
3380 goto invalid;
3381 }
3382 }
3383
3384 status = nvme_check_copy_mcl(ns, iocb, nr);
3385 if (status) {
3386 goto invalid;
3387 }
3388
3389 iocb->req = req;
3390 iocb->ret = 0;
3391 iocb->nr = nr;
3392 iocb->idx = 0;
3393 iocb->reftag = le32_to_cpu(copy->reftag);
3394 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3395
3396 qemu_iovec_init(&iocb->iov, 1);
3397
3398 req->aiocb = &iocb->common;
3399 iocb->sns = req->ns;
3400 iocb->n = n;
3401 iocb->bounce = NULL;
3402 nvme_do_copy(iocb);
3403
3404 return NVME_NO_COMPLETE;
3405
3406 invalid:
3407 g_free(iocb->ranges);
3408 qemu_aio_unref(iocb);
3409 return status;
3410 }
3411
3412 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3413 {
3414 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3415 NvmeNamespace *ns = req->ns;
3416 BlockBackend *blk = ns->blkconf.blk;
3417 uint64_t slba = le64_to_cpu(rw->slba);
3418 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3419 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3420 size_t data_len = nvme_l2b(ns, nlb);
3421 size_t len = data_len;
3422 int64_t offset = nvme_l2b(ns, slba);
3423 struct nvme_compare_ctx *ctx = NULL;
3424 uint16_t status;
3425
3426 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3427
3428 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3429 return NVME_INVALID_PROT_INFO | NVME_DNR;
3430 }
3431
3432 if (nvme_ns_ext(ns)) {
3433 len += nvme_m2b(ns, nlb);
3434 }
3435
3436 if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) {
3437 status = nvme_check_mdts(n, data_len);
3438 } else {
3439 status = nvme_check_mdts(n, len);
3440 }
3441 if (status) {
3442 return status;
3443 }
3444
3445 status = nvme_check_bounds(ns, slba, nlb);
3446 if (status) {
3447 return status;
3448 }
3449
3450 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3451 status = nvme_check_dulbe(ns, slba, nlb);
3452 if (status) {
3453 return status;
3454 }
3455 }
3456
3457 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3458 if (status) {
3459 return status;
3460 }
3461
3462 ctx = g_new(struct nvme_compare_ctx, 1);
3463 ctx->data.bounce = g_malloc(data_len);
3464
3465 req->opaque = ctx;
3466
3467 qemu_iovec_init(&ctx->data.iov, 1);
3468 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3469
3470 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3471 BLOCK_ACCT_READ);
3472 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3473 nvme_compare_data_cb, req);
3474
3475 return NVME_NO_COMPLETE;
3476 }
3477
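/*
 * State for the Flush AIO chain. A flush with the broadcast NSID iterates
 * over all attached namespaces, flushing them one at a time via
 * nvme_flush_ns_cb().
 */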
3478 typedef struct NvmeFlushAIOCB {
3479 BlockAIOCB common;
3480 BlockAIOCB *aiocb;
3481 NvmeRequest *req;
3482 int ret;
3483
3484 NvmeNamespace *ns;
3485 uint32_t nsid;
3486 bool broadcast;
3487 } NvmeFlushAIOCB;
3488
3489 static void nvme_flush_cancel(BlockAIOCB *acb)
3490 {
3491 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3492
3493 iocb->ret = -ECANCELED;
3494
3495 if (iocb->aiocb) {
3496 blk_aio_cancel_async(iocb->aiocb);
3497 iocb->aiocb = NULL;
3498 }
3499 }
3500
3501 static const AIOCBInfo nvme_flush_aiocb_info = {
3502 .aiocb_size = sizeof(NvmeFlushAIOCB),
3503 .cancel_async = nvme_flush_cancel,
3504 };
3505
3506 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3507
3508 static void nvme_flush_ns_cb(void *opaque, int ret)
3509 {
3510 NvmeFlushAIOCB *iocb = opaque;
3511 NvmeNamespace *ns = iocb->ns;
3512
3513 if (ret < 0) {
3514 iocb->ret = ret;
3515 iocb->req->status = NVME_WRITE_FAULT;
3516 goto out;
3517 } else if (iocb->ret < 0) {
3518 goto out;
3519 }
3520
3521 if (ns) {
3522 trace_pci_nvme_flush_ns(iocb->nsid);
3523
3524 iocb->ns = NULL;
3525 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3526 return;
3527 }
3528
3529 out:
3530 nvme_do_flush(iocb);
3531 }
3532
3533 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3534 {
3535 NvmeRequest *req = iocb->req;
3536 NvmeCtrl *n = nvme_ctrl(req);
3537 int i;
3538
3539 if (iocb->ret < 0) {
3540 goto done;
3541 }
3542
3543 if (iocb->broadcast) {
3544 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3545 iocb->ns = nvme_ns(n, i);
3546 if (iocb->ns) {
3547 iocb->nsid = i;
3548 break;
3549 }
3550 }
3551 }
3552
3553 if (!iocb->ns) {
3554 goto done;
3555 }
3556
3557 nvme_flush_ns_cb(iocb, 0);
3558 return;
3559
3560 done:
3561 iocb->common.cb(iocb->common.opaque, iocb->ret);
3562 qemu_aio_unref(iocb);
3563 }
3564
3565 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3566 {
3567 NvmeFlushAIOCB *iocb;
3568 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3569 uint16_t status;
3570
3571 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3572
3573 iocb->req = req;
3574 iocb->ret = 0;
3575 iocb->ns = NULL;
3576 iocb->nsid = 0;
3577 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3578
3579 if (!iocb->broadcast) {
3580 if (!nvme_nsid_valid(n, nsid)) {
3581 status = NVME_INVALID_NSID | NVME_DNR;
3582 goto out;
3583 }
3584
3585 iocb->ns = nvme_ns(n, nsid);
3586 if (!iocb->ns) {
3587 status = NVME_INVALID_FIELD | NVME_DNR;
3588 goto out;
3589 }
3590
3591 iocb->nsid = nsid;
3592 }
3593
3594 req->aiocb = &iocb->common;
3595 nvme_do_flush(iocb);
3596
3597 return NVME_NO_COMPLETE;
3598
3599 out:
3600 qemu_aio_unref(iocb);
3601
3602 return status;
3603 }
3604
3605 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3606 {
3607 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3608 NvmeNamespace *ns = req->ns;
3609 uint64_t slba = le64_to_cpu(rw->slba);
3610 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3611 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3612 uint64_t data_size = nvme_l2b(ns, nlb);
3613 uint64_t mapped_size = data_size;
3614 uint64_t data_offset;
3615 BlockBackend *blk = ns->blkconf.blk;
3616 uint16_t status;
3617
3618 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3619 mapped_size += nvme_m2b(ns, nlb);
3620
3621 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3622 bool pract = prinfo & NVME_PRINFO_PRACT;
3623
3624 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3625 mapped_size = data_size;
3626 }
3627 }
3628 }
3629
3630 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3631
3632 status = nvme_check_mdts(n, mapped_size);
3633 if (status) {
3634 goto invalid;
3635 }
3636
3637 status = nvme_check_bounds(ns, slba, nlb);
3638 if (status) {
3639 goto invalid;
3640 }
3641
3642 if (ns->params.zoned) {
3643 status = nvme_check_zone_read(ns, slba, nlb);
3644 if (status) {
3645 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3646 goto invalid;
3647 }
3648 }
3649
3650 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3651 status = nvme_check_dulbe(ns, slba, nlb);
3652 if (status) {
3653 goto invalid;
3654 }
3655 }
3656
3657 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3658 return nvme_dif_rw(n, req);
3659 }
3660
3661 status = nvme_map_data(n, nlb, req);
3662 if (status) {
3663 goto invalid;
3664 }
3665
3666 data_offset = nvme_l2b(ns, slba);
3667
3668 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3669 BLOCK_ACCT_READ);
3670 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3671 return NVME_NO_COMPLETE;
3672
3673 invalid:
3674 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3675 return status | NVME_DNR;
3676 }
3677
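/*
 * Account a write against Flexible Data Placement state: update the host
 * and media bytes-written statistics and consume available media writes
 * from the targeted reclaim unit, moving on to a new reclaim unit when the
 * current one is exhausted.
 */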
3678 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3679 uint32_t nlb)
3680 {
3681 NvmeNamespace *ns = req->ns;
3682 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3683 uint64_t data_size = nvme_l2b(ns, nlb);
3684 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3685 uint8_t dtype = (dw12 >> 20) & 0xf;
3686 uint16_t pid = le16_to_cpu(rw->dspec);
3687 uint16_t ph, rg, ruhid;
3688 NvmeReclaimUnit *ru;
3689
3690 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3691 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3692 ph = 0;
3693 rg = 0;
3694 }
3695
3696 ruhid = ns->fdp.phs[ph];
3697 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3698
3699 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3700 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3701
3702 while (nlb) {
3703 if (nlb < ru->ruamw) {
3704 ru->ruamw -= nlb;
3705 break;
3706 }
3707
3708 nlb -= ru->ruamw;
3709 nvme_update_ruh(n, ns, pid);
3710 }
3711 }
3712
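/*
 * Common implementation of Write, Write Zeroes and Zone Append. `append`
 * selects Zone Append semantics (the write is placed at the zone write
 * pointer and the resulting SLBA is returned in the completion entry);
 * `wrz` selects Write Zeroes (no data transfer, so the MDTS check is
 * skipped).
 */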
3713 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3714 bool wrz)
3715 {
3716 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3717 NvmeNamespace *ns = req->ns;
3718 uint64_t slba = le64_to_cpu(rw->slba);
3719 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3720 uint16_t ctrl = le16_to_cpu(rw->control);
3721 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3722 uint64_t data_size = nvme_l2b(ns, nlb);
3723 uint64_t mapped_size = data_size;
3724 uint64_t data_offset;
3725 NvmeZone *zone;
3726 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3727 BlockBackend *blk = ns->blkconf.blk;
3728 uint16_t status;
3729
3730 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3731 mapped_size += nvme_m2b(ns, nlb);
3732
3733 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3734 bool pract = prinfo & NVME_PRINFO_PRACT;
3735
3736 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3737 mapped_size -= nvme_m2b(ns, nlb);
3738 }
3739 }
3740 }
3741
3742 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3743 nvme_nsid(ns), nlb, mapped_size, slba);
3744
3745 if (!wrz) {
3746 status = nvme_check_mdts(n, mapped_size);
3747 if (status) {
3748 goto invalid;
3749 }
3750 }
3751
3752 status = nvme_check_bounds(ns, slba, nlb);
3753 if (status) {
3754 goto invalid;
3755 }
3756
3757 if (ns->params.zoned) {
3758 zone = nvme_get_zone_by_slba(ns, slba);
3759 assert(zone);
3760
3761 if (append) {
3762 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3763
3764 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3765 return NVME_INVALID_ZONE_OP | NVME_DNR;
3766 }
3767
3768 if (unlikely(slba != zone->d.zslba)) {
3769 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3770 status = NVME_INVALID_FIELD;
3771 goto invalid;
3772 }
3773
3774 if (n->params.zasl &&
3775 data_size > (uint64_t)n->page_size << n->params.zasl) {
3776 trace_pci_nvme_err_zasl(data_size);
3777 return NVME_INVALID_FIELD | NVME_DNR;
3778 }
3779
3780 slba = zone->w_ptr;
3781 rw->slba = cpu_to_le64(slba);
3782 res->slba = cpu_to_le64(slba);
3783
3784 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3785 case NVME_ID_NS_DPS_TYPE_1:
3786 if (!piremap) {
3787 return NVME_INVALID_PROT_INFO | NVME_DNR;
3788 }
3789
3790 /* fallthrough */
3791
3792 case NVME_ID_NS_DPS_TYPE_2:
3793 if (piremap) {
3794 uint32_t reftag = le32_to_cpu(rw->reftag);
3795 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3796 }
3797
3798 break;
3799
3800 case NVME_ID_NS_DPS_TYPE_3:
3801 if (piremap) {
3802 return NVME_INVALID_PROT_INFO | NVME_DNR;
3803 }
3804
3805 break;
3806 }
3807 }
3808
3809 status = nvme_check_zone_write(ns, zone, slba, nlb);
3810 if (status) {
3811 goto invalid;
3812 }
3813
3814 status = nvme_zrm_auto(n, ns, zone);
3815 if (status) {
3816 goto invalid;
3817 }
3818
3819 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3820 zone->w_ptr += nlb;
3821 }
3822 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3823 nvme_do_write_fdp(n, req, slba, nlb);
3824 }
3825
3826 data_offset = nvme_l2b(ns, slba);
3827
3828 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3829 return nvme_dif_rw(n, req);
3830 }
3831
3832 if (!wrz) {
3833 status = nvme_map_data(n, nlb, req);
3834 if (status) {
3835 goto invalid;
3836 }
3837
3838 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3839 BLOCK_ACCT_WRITE);
3840 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3841 } else {
3842 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3843 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3844 req);
3845 }
3846
3847 return NVME_NO_COMPLETE;
3848
3849 invalid:
3850 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3851 return status | NVME_DNR;
3852 }
3853
3854 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3855 {
3856 return nvme_do_write(n, req, false, false);
3857 }
3858
3859 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3860 {
3861 return nvme_do_write(n, req, false, true);
3862 }
3863
3864 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3865 {
3866 return nvme_do_write(n, req, true, false);
3867 }
3868
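/*
 * Extract the target SLBA from CDW10/CDW11 of a zone management command and
 * translate it into a zone index, validating it against the namespace size.
 */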
3869 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3870 uint64_t *slba, uint32_t *zone_idx)
3871 {
3872 uint32_t dw10 = le32_to_cpu(c->cdw10);
3873 uint32_t dw11 = le32_to_cpu(c->cdw11);
3874
3875 if (!ns->params.zoned) {
3876 trace_pci_nvme_err_invalid_opc(c->opcode);
3877 return NVME_INVALID_OPCODE | NVME_DNR;
3878 }
3879
3880 *slba = ((uint64_t)dw11) << 32 | dw10;
3881 if (unlikely(*slba >= ns->id_ns.nsze)) {
3882 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3883 *slba = 0;
3884 return NVME_LBA_RANGE | NVME_DNR;
3885 }
3886
3887 *zone_idx = nvme_zone_idx(ns, *slba);
3888 assert(*zone_idx < ns->num_zones);
3889
3890 return NVME_SUCCESS;
3891 }
3892
3893 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3894 NvmeRequest *);
3895
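/*
 * Masks selecting which zone states a bulk zone operation applies to. A
 * mask of NVME_PROC_CURRENT_ZONE (0) restricts the operation to the
 * addressed zone only.
 */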
3896 enum NvmeZoneProcessingMask {
3897 NVME_PROC_CURRENT_ZONE = 0,
3898 NVME_PROC_OPENED_ZONES = 1 << 0,
3899 NVME_PROC_CLOSED_ZONES = 1 << 1,
3900 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3901 NVME_PROC_FULL_ZONES = 1 << 3,
3902 };
3903
3904 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3905 NvmeZoneState state, NvmeRequest *req)
3906 {
3907 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3908 int flags = 0;
3909
3910 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3911 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3912
3913 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3914 return NVME_INVALID_ZONE_OP | NVME_DNR;
3915 }
3916
3917 if (zone->w_ptr % ns->zns.zrwafg) {
3918 return NVME_NOZRWA | NVME_DNR;
3919 }
3920
3921 flags = NVME_ZRM_ZRWA;
3922 }
3923
3924 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3925 }
3926
3927 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3928 NvmeZoneState state, NvmeRequest *req)
3929 {
3930 return nvme_zrm_close(ns, zone);
3931 }
3932
3933 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3934 NvmeZoneState state, NvmeRequest *req)
3935 {
3936 return nvme_zrm_finish(ns, zone);
3937 }
3938
3939 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3940 NvmeZoneState state, NvmeRequest *req)
3941 {
3942 switch (state) {
3943 case NVME_ZONE_STATE_READ_ONLY:
3944 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3945 /* fall through */
3946 case NVME_ZONE_STATE_OFFLINE:
3947 return NVME_SUCCESS;
3948 default:
3949 return NVME_ZONE_INVAL_TRANSITION;
3950 }
3951 }
3952
3953 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3954 {
3955 uint16_t status;
3956 uint8_t state = nvme_get_zone_state(zone);
3957
3958 if (state == NVME_ZONE_STATE_EMPTY) {
3959 status = nvme_aor_check(ns, 1, 0);
3960 if (status) {
3961 return status;
3962 }
3963 nvme_aor_inc_active(ns);
3964 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3965 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3966 return NVME_SUCCESS;
3967 }
3968
3969 return NVME_ZONE_INVAL_TRANSITION;
3970 }
3971
3972 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3973 enum NvmeZoneProcessingMask proc_mask,
3974 op_handler_t op_hndlr, NvmeRequest *req)
3975 {
3976 uint16_t status = NVME_SUCCESS;
3977 NvmeZoneState zs = nvme_get_zone_state(zone);
3978 bool proc_zone;
3979
3980 switch (zs) {
3981 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3982 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3983 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3984 break;
3985 case NVME_ZONE_STATE_CLOSED:
3986 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3987 break;
3988 case NVME_ZONE_STATE_READ_ONLY:
3989 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3990 break;
3991 case NVME_ZONE_STATE_FULL:
3992 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3993 break;
3994 default:
3995 proc_zone = false;
3996 }
3997
3998 if (proc_zone) {
3999 status = op_hndlr(ns, zone, zs, req);
4000 }
4001
4002 return status;
4003 }
4004
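/*
 * Apply op_hndlr to the targeted zone (empty proc_mask) or to all zones
 * matching proc_mask. Closed, open and full zones are tracked on dedicated
 * lists; read-only zones are not, so that case walks the whole zone array.
 */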
4005 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
4006 enum NvmeZoneProcessingMask proc_mask,
4007 op_handler_t op_hndlr, NvmeRequest *req)
4008 {
4009 NvmeZone *next;
4010 uint16_t status = NVME_SUCCESS;
4011 int i;
4012
4013 if (!proc_mask) {
4014 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
4015 } else {
4016 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
4017 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
4018 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4019 req);
4020 if (status && status != NVME_NO_COMPLETE) {
4021 goto out;
4022 }
4023 }
4024 }
4025 if (proc_mask & NVME_PROC_OPENED_ZONES) {
4026 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4027 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4028 req);
4029 if (status && status != NVME_NO_COMPLETE) {
4030 goto out;
4031 }
4032 }
4033
4034 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4035 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4036 req);
4037 if (status && status != NVME_NO_COMPLETE) {
4038 goto out;
4039 }
4040 }
4041 }
4042 if (proc_mask & NVME_PROC_FULL_ZONES) {
4043 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4044 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4045 req);
4046 if (status && status != NVME_NO_COMPLETE) {
4047 goto out;
4048 }
4049 }
4050 }
4051
4052 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4053 for (i = 0; i < ns->num_zones; i++, zone++) {
4054 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4055 req);
4056 if (status && status != NVME_NO_COMPLETE) {
4057 goto out;
4058 }
4059 }
4060 }
4061 }
4062
4063 out:
4064 return status;
4065 }
4066
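/*
 * Zone reset runs as a chain of asynchronous block operations. For each
 * affected zone the data range is zeroed, the epilogue then zeroes the
 * metadata region (if the LBA format carries metadata), and the callback
 * advances to the next zone until either the single addressed zone or all
 * zones have been reset.
 */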
4067 typedef struct NvmeZoneResetAIOCB {
4068 BlockAIOCB common;
4069 BlockAIOCB *aiocb;
4070 NvmeRequest *req;
4071 int ret;
4072
4073 bool all;
4074 int idx;
4075 NvmeZone *zone;
4076 } NvmeZoneResetAIOCB;
4077
4078 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4079 {
4080 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4081 NvmeRequest *req = iocb->req;
4082 NvmeNamespace *ns = req->ns;
4083
4084 iocb->idx = ns->num_zones;
4085
4086 iocb->ret = -ECANCELED;
4087
4088 if (iocb->aiocb) {
4089 blk_aio_cancel_async(iocb->aiocb);
4090 iocb->aiocb = NULL;
4091 }
4092 }
4093
4094 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4095 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4096 .cancel_async = nvme_zone_reset_cancel,
4097 };
4098
4099 static void nvme_zone_reset_cb(void *opaque, int ret);
4100
4101 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4102 {
4103 NvmeZoneResetAIOCB *iocb = opaque;
4104 NvmeRequest *req = iocb->req;
4105 NvmeNamespace *ns = req->ns;
4106 int64_t moff;
4107 int count;
4108
4109 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4110 goto out;
4111 }
4112
4113 moff = nvme_moff(ns, iocb->zone->d.zslba);
4114 count = nvme_m2b(ns, ns->zone_size);
4115
4116 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4117 BDRV_REQ_MAY_UNMAP,
4118 nvme_zone_reset_cb, iocb);
4119 return;
4120
4121 out:
4122 nvme_zone_reset_cb(iocb, ret);
4123 }
4124
4125 static void nvme_zone_reset_cb(void *opaque, int ret)
4126 {
4127 NvmeZoneResetAIOCB *iocb = opaque;
4128 NvmeRequest *req = iocb->req;
4129 NvmeNamespace *ns = req->ns;
4130
4131 if (iocb->ret < 0) {
4132 goto done;
4133 } else if (ret < 0) {
4134 iocb->ret = ret;
4135 goto done;
4136 }
4137
4138 if (iocb->zone) {
4139 nvme_zrm_reset(ns, iocb->zone);
4140
4141 if (!iocb->all) {
4142 goto done;
4143 }
4144 }
4145
4146 while (iocb->idx < ns->num_zones) {
4147 NvmeZone *zone = &ns->zone_array[iocb->idx++];
4148
4149 switch (nvme_get_zone_state(zone)) {
4150 case NVME_ZONE_STATE_EMPTY:
4151 if (!iocb->all) {
4152 goto done;
4153 }
4154
4155 continue;
4156
4157 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4158 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4159 case NVME_ZONE_STATE_CLOSED:
4160 case NVME_ZONE_STATE_FULL:
4161 iocb->zone = zone;
4162 break;
4163
4164 default:
4165 continue;
4166 }
4167
4168 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4169
4170 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4171 nvme_l2b(ns, zone->d.zslba),
4172 nvme_l2b(ns, ns->zone_size),
4173 BDRV_REQ_MAY_UNMAP,
4174 nvme_zone_reset_epilogue_cb,
4175 iocb);
4176 return;
4177 }
4178
4179 done:
4180 iocb->aiocb = NULL;
4181
4182 iocb->common.cb(iocb->common.opaque, iocb->ret);
4183 qemu_aio_unref(iocb);
4184 }
4185
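/*
 * Explicit ZRWA flush: commit the LBAs from the zone write pointer up to and
 * including the given ELBA. The ELBA must fall inside the ZRWA window and
 * the number of committed blocks must be a multiple of the flush granularity
 * (ZRWAFG).
 */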
4186 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4187 uint64_t elba, NvmeRequest *req)
4188 {
4189 NvmeNamespace *ns = req->ns;
4190 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4191 uint64_t wp = zone->d.wp;
4192 uint32_t nlb = elba - wp + 1;
4193 uint16_t status;
4194
4195
4196 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4197 return NVME_INVALID_ZONE_OP | NVME_DNR;
4198 }
4199
4200 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4201 return NVME_INVALID_FIELD | NVME_DNR;
4202 }
4203
4204 if (elba < wp || elba > wp + ns->zns.zrwas) {
4205 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4206 }
4207
4208 if (nlb % ns->zns.zrwafg) {
4209 return NVME_INVALID_FIELD | NVME_DNR;
4210 }
4211
4212 status = nvme_zrm_auto(n, ns, zone);
4213 if (status) {
4214 return status;
4215 }
4216
4217 zone->w_ptr += nlb;
4218
4219 nvme_advance_zone_wp(ns, zone, nlb);
4220
4221 return NVME_SUCCESS;
4222 }
4223
4224 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4225 {
4226 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4227 NvmeNamespace *ns = req->ns;
4228 NvmeZone *zone;
4229 NvmeZoneResetAIOCB *iocb;
4230 uint8_t *zd_ext;
4231 uint64_t slba = 0;
4232 uint32_t zone_idx = 0;
4233 uint16_t status;
4234 uint8_t action = cmd->zsa;
4235 bool all;
4236 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4237
4238 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4239
4240 req->status = NVME_SUCCESS;
4241
4242 if (!all) {
4243 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4244 if (status) {
4245 return status;
4246 }
4247 }
4248
4249 zone = &ns->zone_array[zone_idx];
4250 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4251 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4252 return NVME_INVALID_FIELD | NVME_DNR;
4253 }
4254
4255 switch (action) {
4256
4257 case NVME_ZONE_ACTION_OPEN:
4258 if (all) {
4259 proc_mask = NVME_PROC_CLOSED_ZONES;
4260 }
4261 trace_pci_nvme_open_zone(slba, zone_idx, all);
4262 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4263 break;
4264
4265 case NVME_ZONE_ACTION_CLOSE:
4266 if (all) {
4267 proc_mask = NVME_PROC_OPENED_ZONES;
4268 }
4269 trace_pci_nvme_close_zone(slba, zone_idx, all);
4270 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4271 break;
4272
4273 case NVME_ZONE_ACTION_FINISH:
4274 if (all) {
4275 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4276 }
4277 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4278 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4279 break;
4280
4281 case NVME_ZONE_ACTION_RESET:
4282 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4283
4284 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4285 nvme_misc_cb, req);
4286
4287 iocb->req = req;
4288 iocb->ret = 0;
4289 iocb->all = all;
4290 iocb->idx = zone_idx;
4291 iocb->zone = NULL;
4292
4293 req->aiocb = &iocb->common;
4294 nvme_zone_reset_cb(iocb, 0);
4295
4296 return NVME_NO_COMPLETE;
4297
4298 case NVME_ZONE_ACTION_OFFLINE:
4299 if (all) {
4300 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4301 }
4302 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4303 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4304 break;
4305
4306 case NVME_ZONE_ACTION_SET_ZD_EXT:
4307 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4308 if (all || !ns->params.zd_extension_size) {
4309 return NVME_INVALID_FIELD | NVME_DNR;
4310 }
4311 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4312 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4313 if (status) {
4314 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4315 return status;
4316 }
4317
4318 status = nvme_set_zd_ext(ns, zone);
4319 if (status == NVME_SUCCESS) {
4320 trace_pci_nvme_zd_extension_set(zone_idx);
4321 return status;
4322 }
4323 break;
4324
4325 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4326 if (all) {
4327 return NVME_INVALID_FIELD | NVME_DNR;
4328 }
4329
4330 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4331
4332 default:
4333 trace_pci_nvme_err_invalid_mgmt_action(action);
4334 status = NVME_INVALID_FIELD;
4335 }
4336
4337 if (status == NVME_ZONE_INVAL_TRANSITION) {
4338 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4339 zone->d.za);
4340 }
4341 if (status) {
4342 status |= NVME_DNR;
4343 }
4344
4345 return status;
4346 }
4347
4348 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4349 {
4350 NvmeZoneState zs = nvme_get_zone_state(zl);
4351
4352 switch (zafs) {
4353 case NVME_ZONE_REPORT_ALL:
4354 return true;
4355 case NVME_ZONE_REPORT_EMPTY:
4356 return zs == NVME_ZONE_STATE_EMPTY;
4357 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4358 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4359 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4360 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4361 case NVME_ZONE_REPORT_CLOSED:
4362 return zs == NVME_ZONE_STATE_CLOSED;
4363 case NVME_ZONE_REPORT_FULL:
4364 return zs == NVME_ZONE_STATE_FULL;
4365 case NVME_ZONE_REPORT_READ_ONLY:
4366 return zs == NVME_ZONE_STATE_READ_ONLY;
4367 case NVME_ZONE_REPORT_OFFLINE:
4368 return zs == NVME_ZONE_STATE_OFFLINE;
4369 default:
4370 return false;
4371 }
4372 }
4373
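/*
 * Zone Management Receive (Report Zones). The returned buffer starts with a
 * report header holding the number of matching zones, followed by one zone
 * descriptor (plus an optional zone descriptor extension) per reported zone.
 * With the partial bit set, the header only counts the zones that fit in the
 * host buffer.
 */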
4374 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4375 {
4376 NvmeCmd *cmd = &req->cmd;
4377 NvmeNamespace *ns = req->ns;
4378      /* cdw12 is the zero-based number of dwords to return; convert to bytes */
4379 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4380 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4381 uint32_t zone_idx, zra, zrasf, partial;
4382 uint64_t max_zones, nr_zones = 0;
4383 uint16_t status;
4384 uint64_t slba;
4385 NvmeZoneDescr *z;
4386 NvmeZone *zone;
4387 NvmeZoneReportHeader *header;
4388 void *buf, *buf_p;
4389 size_t zone_entry_sz;
4390 int i;
4391
4392 req->status = NVME_SUCCESS;
4393
4394 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4395 if (status) {
4396 return status;
4397 }
4398
4399 zra = dw13 & 0xff;
4400 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4401 return NVME_INVALID_FIELD | NVME_DNR;
4402 }
4403 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4404 return NVME_INVALID_FIELD | NVME_DNR;
4405 }
4406
4407 zrasf = (dw13 >> 8) & 0xff;
4408 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4409 return NVME_INVALID_FIELD | NVME_DNR;
4410 }
4411
4412 if (data_size < sizeof(NvmeZoneReportHeader)) {
4413 return NVME_INVALID_FIELD | NVME_DNR;
4414 }
4415
4416 status = nvme_check_mdts(n, data_size);
4417 if (status) {
4418 return status;
4419 }
4420
4421 partial = (dw13 >> 16) & 0x01;
4422
4423 zone_entry_sz = sizeof(NvmeZoneDescr);
4424 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4425 zone_entry_sz += ns->params.zd_extension_size;
4426 }
4427
4428 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4429 buf = g_malloc0(data_size);
4430
4431 zone = &ns->zone_array[zone_idx];
4432 for (i = zone_idx; i < ns->num_zones; i++) {
4433 if (partial && nr_zones >= max_zones) {
4434 break;
4435 }
4436 if (nvme_zone_matches_filter(zrasf, zone++)) {
4437 nr_zones++;
4438 }
4439 }
4440 header = buf;
4441 header->nr_zones = cpu_to_le64(nr_zones);
4442
4443 buf_p = buf + sizeof(NvmeZoneReportHeader);
4444 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4445 zone = &ns->zone_array[zone_idx];
4446 if (nvme_zone_matches_filter(zrasf, zone)) {
4447 z = buf_p;
4448 buf_p += sizeof(NvmeZoneDescr);
4449
4450 z->zt = zone->d.zt;
4451 z->zs = zone->d.zs;
4452 z->zcap = cpu_to_le64(zone->d.zcap);
4453 z->zslba = cpu_to_le64(zone->d.zslba);
4454 z->za = zone->d.za;
4455
4456 if (nvme_wp_is_valid(zone)) {
4457 z->wp = cpu_to_le64(zone->d.wp);
4458 } else {
4459 z->wp = cpu_to_le64(~0ULL);
4460 }
4461
4462 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4463 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4464 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4465 ns->params.zd_extension_size);
4466 }
4467 buf_p += ns->params.zd_extension_size;
4468 }
4469
4470 max_zones--;
4471 }
4472 }
4473
4474 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4475
4476 g_free(buf);
4477
4478 return status;
4479 }
4480
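/*
 * Build the Reclaim Unit Handle Status data structure for I/O Management
 * Receive: one descriptor per (placement handle, reclaim group) pair of the
 * endurance group, reporting the placement identifier and the remaining
 * writable capacity (RUAMW) of the referenced reclaim unit.
 */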
4481 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4482 size_t len)
4483 {
4484 NvmeNamespace *ns = req->ns;
4485 NvmeEnduranceGroup *endgrp;
4486 NvmeRuhStatus *hdr;
4487 NvmeRuhStatusDescr *ruhsd;
4488 unsigned int nruhsd;
4489 uint16_t rg, ph, *ruhid;
4490 size_t trans_len;
4491 g_autofree uint8_t *buf = NULL;
4492
4493 if (!n->subsys) {
4494 return NVME_INVALID_FIELD | NVME_DNR;
4495 }
4496
4497 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4498 return NVME_INVALID_NSID | NVME_DNR;
4499 }
4500
4501 if (!n->subsys->endgrp.fdp.enabled) {
4502 return NVME_FDP_DISABLED | NVME_DNR;
4503 }
4504
4505 endgrp = ns->endgrp;
4506
4507 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4508 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4509 buf = g_malloc0(trans_len);
4510
4511 trans_len = MIN(trans_len, len);
4512
4513 hdr = (NvmeRuhStatus *)buf;
4514 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4515
4516 hdr->nruhsd = cpu_to_le16(nruhsd);
4517
4518 ruhid = ns->fdp.phs;
4519
4520 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4521 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4522
4523 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4524 uint16_t pid = nvme_make_pid(ns, rg, ph);
4525
4526 ruhsd->pid = cpu_to_le16(pid);
4527 ruhsd->ruhid = *ruhid;
4528 ruhsd->earutr = 0;
4529 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4530 }
4531 }
4532
4533 return nvme_c2h(n, buf, trans_len, req);
4534 }
4535
4536 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4537 {
4538 NvmeCmd *cmd = &req->cmd;
4539 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4540 uint32_t numd = le32_to_cpu(cmd->cdw11);
4541 uint8_t mo = (cdw10 & 0xff);
4542 size_t len = (numd + 1) << 2;
4543
4544 switch (mo) {
4545 case NVME_IOMR_MO_NOP:
4546 return 0;
4547 case NVME_IOMR_MO_RUH_STATUS:
4548 return nvme_io_mgmt_recv_ruhs(n, req, len);
4549 default:
4550 return NVME_INVALID_FIELD | NVME_DNR;
4551 };
4552 }
4553
4554 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4555 {
4556 NvmeCmd *cmd = &req->cmd;
4557 NvmeNamespace *ns = req->ns;
4558 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4559 uint16_t ret = NVME_SUCCESS;
4560 uint32_t npid = (cdw10 >> 16) + 1;
4561 unsigned int i = 0;
4562 g_autofree uint16_t *pids = NULL;
4563 uint32_t maxnpid;
4564
4565 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4566 return NVME_FDP_DISABLED | NVME_DNR;
4567 }
4568
4569 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4570
4571 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4572 return NVME_INVALID_FIELD | NVME_DNR;
4573 }
4574
4575 pids = g_new(uint16_t, npid);
4576
4577 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4578 if (ret) {
4579 return ret;
4580 }
4581
4582 for (; i < npid; i++) {
4583 if (!nvme_update_ruh(n, ns, pids[i])) {
4584 return NVME_INVALID_FIELD | NVME_DNR;
4585 }
4586 }
4587
4588 return ret;
4589 }
4590
4591 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4592 {
4593 NvmeCmd *cmd = &req->cmd;
4594 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4595 uint8_t mo = (cdw10 & 0xff);
4596
4597 switch (mo) {
4598 case NVME_IOMS_MO_NOP:
4599 return 0;
4600 case NVME_IOMS_MO_RUH_UPDATE:
4601 return nvme_io_mgmt_send_ruh_update(n, req);
4602 default:
4603 return NVME_INVALID_FIELD | NVME_DNR;
4604 };
4605 }
4606
4607 static uint16_t __nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req)
4608 {
4609 switch (req->cmd.opcode) {
4610 case NVME_CMD_WRITE:
4611 return nvme_write(n, req);
4612 case NVME_CMD_READ:
4613 return nvme_read(n, req);
4614 case NVME_CMD_COMPARE:
4615 return nvme_compare(n, req);
4616 case NVME_CMD_WRITE_ZEROES:
4617 return nvme_write_zeroes(n, req);
4618 case NVME_CMD_DSM:
4619 return nvme_dsm(n, req);
4620 case NVME_CMD_VERIFY:
4621 return nvme_verify(n, req);
4622 case NVME_CMD_COPY:
4623 return nvme_copy(n, req);
4624 case NVME_CMD_IO_MGMT_RECV:
4625 return nvme_io_mgmt_recv(n, req);
4626 case NVME_CMD_IO_MGMT_SEND:
4627 return nvme_io_mgmt_send(n, req);
4628 }
4629
4630 g_assert_not_reached();
4631 }
4632
4633 static uint16_t nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req)
4634 {
4635 if (!(n->cse.iocs.nvm[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4636 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4637 return NVME_INVALID_OPCODE | NVME_DNR;
4638 }
4639
4640 return __nvme_io_cmd_nvm(n, req);
4641 }
4642
4643 static uint16_t nvme_io_cmd_zoned(NvmeCtrl *n, NvmeRequest *req)
4644 {
4645 if (!(n->cse.iocs.zoned[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4646 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4647 return NVME_INVALID_OPCODE | NVME_DNR;
4648 }
4649
4650 switch (req->cmd.opcode) {
4651 case NVME_CMD_ZONE_APPEND:
4652 return nvme_zone_append(n, req);
4653 case NVME_CMD_ZONE_MGMT_SEND:
4654 return nvme_zone_mgmt_send(n, req);
4655 case NVME_CMD_ZONE_MGMT_RECV:
4656 return nvme_zone_mgmt_recv(n, req);
4657 }
4658
4659 return __nvme_io_cmd_nvm(n, req);
4660 }
4661
4662 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4663 {
4664 NvmeNamespace *ns;
4665 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4666
4667 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4668 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4669
4670 /*
4671 * In the base NVM command set, Flush may apply to all namespaces
4672 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4673 * along with TP 4056 (Namespace Types), it may be pretty screwed up.
4674 *
4675 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4676 * opcode with a specific command since we cannot determine a unique I/O
4677 * command set. Opcode 0h could have any other meaning than something
4678 * equivalent to flushing and say it DOES have completely different
4679 * semantics in some other command set - does an NSID of FFFFFFFFh then
4680 * mean "for all namespaces, apply whatever command set specific command
4681 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4682 * whatever command that uses the 0h opcode if, and only if, it allows NSID
4683 * to be FFFFFFFFh"?
4684 *
4685 * Anyway (and luckily), for now, we do not care about this since the
4686      * device only supports namespace types that include the NVM Flush command
4687 * (NVM and Zoned), so always do an NVM Flush.
4688 */
4689
4690 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4691 return nvme_flush(n, req);
4692 }
4693
4694 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4695 return NVME_INVALID_NSID | NVME_DNR;
4696 }
4697
4698 ns = nvme_ns(n, nsid);
4699 if (unlikely(!ns)) {
4700 return NVME_INVALID_FIELD | NVME_DNR;
4701 }
4702
4703 if (ns->status) {
4704 return ns->status;
4705 }
4706
4707 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4708 return NVME_INVALID_FIELD;
4709 }
4710
4711 req->ns = ns;
4712
4713 switch (ns->csi) {
4714 case NVME_CSI_NVM:
4715 return nvme_io_cmd_nvm(n, req);
4716 case NVME_CSI_ZONED:
4717 return nvme_io_cmd_zoned(n, req);
4718 }
4719
4720 g_assert_not_reached();
4721 }
4722
4723 static void nvme_cq_notifier(EventNotifier *e)
4724 {
4725 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4726 NvmeCtrl *n = cq->ctrl;
4727
4728 if (!event_notifier_test_and_clear(e)) {
4729 return;
4730 }
4731
4732 nvme_update_cq_head(cq);
4733
4734 if (cq->tail == cq->head) {
4735 if (cq->irq_enabled) {
4736 n->cq_pending--;
4737 }
4738
4739 nvme_irq_deassert(n, cq);
4740 }
4741
4742 qemu_bh_schedule(cq->bh);
4743 }
4744
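/*
 * With the shadow doorbell (DBBUF) extension enabled, MMIO doorbell writes
 * may additionally be serviced via ioeventfd. Doorbell registers start at
 * BAR0 offset 1000h with a stride of 8 bytes per queue pair: the submission
 * queue tail doorbell at (qid << 3) and the completion queue head doorbell
 * at (qid << 3) + 4.
 */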
4745 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4746 {
4747 NvmeCtrl *n = cq->ctrl;
4748 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4749 int ret;
4750
4751 ret = event_notifier_init(&cq->notifier, 0);
4752 if (ret < 0) {
4753 return ret;
4754 }
4755
4756 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4757 memory_region_add_eventfd(&n->iomem,
4758 0x1000 + offset, 4, false, 0, &cq->notifier);
4759
4760 return 0;
4761 }
4762
4763 static void nvme_sq_notifier(EventNotifier *e)
4764 {
4765 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4766
4767 if (!event_notifier_test_and_clear(e)) {
4768 return;
4769 }
4770
4771 nvme_process_sq(sq);
4772 }
4773
4774 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4775 {
4776 NvmeCtrl *n = sq->ctrl;
4777 uint16_t offset = sq->sqid << 3;
4778 int ret;
4779
4780 ret = event_notifier_init(&sq->notifier, 0);
4781 if (ret < 0) {
4782 return ret;
4783 }
4784
4785 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4786 memory_region_add_eventfd(&n->iomem,
4787 0x1000 + offset, 4, false, 0, &sq->notifier);
4788
4789 return 0;
4790 }
4791
4792 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4793 {
4794 uint16_t offset = sq->sqid << 3;
4795
4796 n->sq[sq->sqid] = NULL;
4797 qemu_bh_delete(sq->bh);
4798 if (sq->ioeventfd_enabled) {
4799 memory_region_del_eventfd(&n->iomem,
4800 0x1000 + offset, 4, false, 0, &sq->notifier);
4801 event_notifier_set_handler(&sq->notifier, NULL);
4802 event_notifier_cleanup(&sq->notifier);
4803 }
4804 g_free(sq->io_req);
4805 if (sq->sqid) {
4806 g_free(sq);
4807 }
4808 }
4809
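/*
 * Delete I/O Submission Queue. Outstanding AIOs on the queue are cancelled
 * (their requests complete with Command Aborted due to SQ Deletion), pending
 * completions are posted to the associated completion queue, and any
 * requests still parked on the completion queue are returned to the
 * submission queue before it is freed.
 */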
4810 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4811 {
4812 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4813 NvmeRequest *r, *next;
4814 NvmeSQueue *sq;
4815 NvmeCQueue *cq;
4816 uint16_t qid = le16_to_cpu(c->qid);
4817
4818 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4819 trace_pci_nvme_err_invalid_del_sq(qid);
4820 return NVME_INVALID_QID | NVME_DNR;
4821 }
4822
4823 trace_pci_nvme_del_sq(qid);
4824
4825 sq = n->sq[qid];
4826 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4827 r = QTAILQ_FIRST(&sq->out_req_list);
4828 assert(r->aiocb);
4829 r->status = NVME_CMD_ABORT_SQ_DEL;
4830 blk_aio_cancel(r->aiocb);
4831 }
4832
4833 assert(QTAILQ_EMPTY(&sq->out_req_list));
4834
4835 if (!nvme_check_cqid(n, sq->cqid)) {
4836 cq = n->cq[sq->cqid];
4837 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4838
4839 nvme_post_cqes(cq);
4840 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4841 if (r->sq == sq) {
4842 QTAILQ_REMOVE(&cq->req_list, r, entry);
4843 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4844 }
4845 }
4846 }
4847
4848 nvme_free_sq(sq, n);
4849 return NVME_SUCCESS;
4850 }
4851
4852 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4853 uint16_t sqid, uint16_t cqid, uint16_t size)
4854 {
4855 int i;
4856 NvmeCQueue *cq;
4857
4858 sq->ctrl = n;
4859 sq->dma_addr = dma_addr;
4860 sq->sqid = sqid;
4861 sq->size = size;
4862 sq->cqid = cqid;
4863 sq->head = sq->tail = 0;
4864 sq->io_req = g_new0(NvmeRequest, sq->size);
4865
4866 QTAILQ_INIT(&sq->req_list);
4867 QTAILQ_INIT(&sq->out_req_list);
4868 for (i = 0; i < sq->size; i++) {
4869 sq->io_req[i].sq = sq;
4870 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4871 }
4872
4873 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4874 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4875
4876 if (n->dbbuf_enabled) {
4877 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4878 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4879
4880 if (n->params.ioeventfd && sq->sqid != 0) {
4881 if (!nvme_init_sq_ioeventfd(sq)) {
4882 sq->ioeventfd_enabled = true;
4883 }
4884 }
4885 }
4886
4887 assert(n->cq[cqid]);
4888 cq = n->cq[cqid];
4889 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4890 n->sq[sqid] = sq;
4891 }
4892
4893 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4894 {
4895 NvmeSQueue *sq;
4896 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4897
4898 uint16_t cqid = le16_to_cpu(c->cqid);
4899 uint16_t sqid = le16_to_cpu(c->sqid);
4900 uint16_t qsize = le16_to_cpu(c->qsize);
4901 uint16_t qflags = le16_to_cpu(c->sq_flags);
4902 uint64_t prp1 = le64_to_cpu(c->prp1);
4903
4904 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4905
4906 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4907 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4908 return NVME_INVALID_CQID | NVME_DNR;
4909 }
4910 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4911 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4912 return NVME_INVALID_QID | NVME_DNR;
4913 }
4914 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4915 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4916 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4917 }
4918 if (unlikely(prp1 & (n->page_size - 1))) {
4919 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4920 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4921 }
4922 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4923 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4924 return NVME_INVALID_FIELD | NVME_DNR;
4925 }
4926 sq = g_malloc0(sizeof(*sq));
4927 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4928 return NVME_SUCCESS;
4929 }
4930
4931 struct nvme_stats {
4932 uint64_t units_read;
4933 uint64_t units_written;
4934 uint64_t read_commands;
4935 uint64_t write_commands;
4936 };
4937
4938 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4939 {
4940 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4941
4942 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4943 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4944 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4945 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4946 }
4947
4948 static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
4949 uint32_t buf_len, uint64_t off,
4950 NvmeRequest *req)
4951 {
4952 NvmeNamespace *ns = NULL;
4953 NvmeSmartLogExtended smart_l = { 0 };
4954 struct nvme_stats stats = { 0 };
4955 uint32_t trans_len;
4956
4957 if (off >= sizeof(smart_l)) {
4958 return NVME_INVALID_FIELD | NVME_DNR;
4959 }
4960
4961 /* accumulate all stats from all namespaces */
4962 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4963 ns = nvme_ns(n, i);
4964 if (ns) {
4965 nvme_set_blk_stats(ns, &stats);
4966 }
4967 }
4968
4969 smart_l.physical_media_units_written[0] = cpu_to_le64(stats.units_written);
4970 smart_l.physical_media_units_read[0] = cpu_to_le64(stats.units_read);
4971 smart_l.log_page_version = 0x0005;
4972
4973 static const uint8_t guid[16] = {
4974 0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
4975 0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
4976 };
4977 memcpy(smart_l.log_page_guid, guid, sizeof(smart_l.log_page_guid));
4978
4979 if (!rae) {
4980 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4981 }
4982
4983 trans_len = MIN(sizeof(smart_l) - off, buf_len);
4984 return nvme_c2h(n, (uint8_t *) &smart_l + off, trans_len, req);
4985 }
4986
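/*
 * SMART / Health Information log page. Data units are reported in the
 * spec-defined granularity of 1000 512-byte units, rounded up, and the
 * temperature warning bit is set when the composite temperature crosses
 * either configured threshold.
 */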
4987 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4988 uint64_t off, NvmeRequest *req)
4989 {
4990 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4991 struct nvme_stats stats = { 0 };
4992 NvmeSmartLog smart = { 0 };
4993 uint32_t trans_len;
4994 NvmeNamespace *ns;
4995 time_t current_ms;
4996 uint64_t u_read, u_written;
4997
4998 if (off >= sizeof(smart)) {
4999 return NVME_INVALID_FIELD | NVME_DNR;
5000 }
5001
5002 if (nsid != 0xffffffff) {
5003 ns = nvme_ns(n, nsid);
5004 if (!ns) {
5005 return NVME_INVALID_NSID | NVME_DNR;
5006 }
5007 nvme_set_blk_stats(ns, &stats);
5008 } else {
5009 int i;
5010
5011 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5012 ns = nvme_ns(n, i);
5013 if (!ns) {
5014 continue;
5015 }
5016 nvme_set_blk_stats(ns, &stats);
5017 }
5018 }
5019
5020 trans_len = MIN(sizeof(smart) - off, buf_len);
5021 smart.critical_warning = n->smart_critical_warning;
5022
5023 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
5024 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
5025
5026 smart.data_units_read[0] = cpu_to_le64(u_read);
5027 smart.data_units_written[0] = cpu_to_le64(u_written);
5028 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5029 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5030
5031 smart.temperature = cpu_to_le16(n->temperature);
5032
5033 if ((n->temperature >= n->features.temp_thresh_hi) ||
5034 (n->temperature <= n->features.temp_thresh_low)) {
5035 smart.critical_warning |= NVME_SMART_TEMPERATURE;
5036 }
5037
5038 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5039 smart.power_on_hours[0] =
5040 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
5041
5042 if (!rae) {
5043 nvme_clear_events(n, NVME_AER_TYPE_SMART);
5044 }
5045
5046 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
5047 }
5048
5049 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5050 uint64_t off, NvmeRequest *req)
5051 {
5052 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
5053 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
5054 struct nvme_stats stats = {};
5055 NvmeEndGrpLog info = {};
5056 int i;
5057
5058 if (!n->subsys || endgrpid != 0x1) {
5059 return NVME_INVALID_FIELD | NVME_DNR;
5060 }
5061
5062 if (off >= sizeof(info)) {
5063 return NVME_INVALID_FIELD | NVME_DNR;
5064 }
5065
5066 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5067 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
5068 if (!ns) {
5069 continue;
5070 }
5071
5072 nvme_set_blk_stats(ns, &stats);
5073 }
5074
5075 info.data_units_read[0] =
5076 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
5077 info.data_units_written[0] =
5078 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5079 info.media_units_written[0] =
5080 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5081
5082 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5083 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5084
5085 buf_len = MIN(sizeof(info) - off, buf_len);
5086
5087 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
5088 }
5089
5090
5091 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
5092 NvmeRequest *req)
5093 {
5094 uint32_t trans_len;
5095 NvmeFwSlotInfoLog fw_log = {
5096 .afi = 0x1,
5097 };
5098
5099 if (off >= sizeof(fw_log)) {
5100 return NVME_INVALID_FIELD | NVME_DNR;
5101 }
5102
5103 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5104 trans_len = MIN(sizeof(fw_log) - off, buf_len);
5105
5106 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5107 }
5108
5109 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5110 uint64_t off, NvmeRequest *req)
5111 {
5112 uint32_t trans_len;
5113 NvmeErrorLog errlog;
5114
5115 if (off >= sizeof(errlog)) {
5116 return NVME_INVALID_FIELD | NVME_DNR;
5117 }
5118
5119 if (!rae) {
5120 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5121 }
5122
5123 memset(&errlog, 0x0, sizeof(errlog));
5124 trans_len = MIN(sizeof(errlog) - off, buf_len);
5125
5126 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5127 }
5128
5129 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5130 uint64_t off, NvmeRequest *req)
5131 {
5132 uint32_t nslist[1024] = {};
5133 uint32_t trans_len;
5134 int i = 0;
5135 uint32_t nsid;
5136
5137 if (off >= sizeof(nslist)) {
5138 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5139 return NVME_INVALID_FIELD | NVME_DNR;
5140 }
5141
5142 trans_len = MIN(sizeof(nslist) - off, buf_len);
5143
5144 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5145 NVME_CHANGED_NSID_SIZE) {
5146 /*
5147       * If there are more than 1024 namespaces, the first entry in the log
5148       * page should be set to FFFFFFFFh and the others to 0, as per the spec.
5149 */
5150 if (i == ARRAY_SIZE(nslist)) {
5151 memset(nslist, 0x0, sizeof(nslist));
5152 nslist[0] = 0xffffffff;
5153 break;
5154 }
5155
5156 nslist[i++] = nsid;
5157 clear_bit(nsid, n->changed_nsids);
5158 }
5159
5160 /*
5161      * Remove all the remaining list entries in case we return early because
5162      * there are more than 1024 namespaces.
5163 */
5164 if (nslist[0] == 0xffffffff) {
5165 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5166 }
5167
5168 if (!rae) {
5169 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5170 }
5171
5172 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5173 }
5174
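/*
 * Commands Supported and Effects log page. The reported I/O command set
 * effects follow CC.CSS: with the NVM command set selected the NVM effects
 * are returned, and with all command sets enabled the CSI field of the
 * command selects between the NVM and Zoned effects.
 */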
5175 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5176 uint64_t off, NvmeRequest *req)
5177 {
5178 NvmeEffectsLog log = {};
5179 const uint32_t *iocs = NULL;
5180 uint32_t trans_len;
5181
5182 if (off >= sizeof(log)) {
5183 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5184 return NVME_INVALID_FIELD | NVME_DNR;
5185 }
5186
5187 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5188 case NVME_CC_CSS_NVM:
5189 iocs = n->cse.iocs.nvm;
5190 break;
5191
5192 case NVME_CC_CSS_ALL:
5193 switch (csi) {
5194 case NVME_CSI_NVM:
5195 iocs = n->cse.iocs.nvm;
5196 break;
5197 case NVME_CSI_ZONED:
5198 iocs = n->cse.iocs.zoned;
5199 break;
5200 }
5201
5202 break;
5203 }
5204
5205 memcpy(log.acs, n->cse.acs, sizeof(log.acs));
5206
5207 if (iocs) {
5208 memcpy(log.iocs, iocs, sizeof(log.iocs));
5209 }
5210
5211 trans_len = MIN(sizeof(log) - off, buf_len);
5212
5213 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5214 }
5215
5216 static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae,
5217 uint32_t buf_len, uint64_t off,
5218 NvmeRequest *req, uint8_t lid)
5219 {
5220 switch (lid) {
5221 case NVME_OCP_EXTENDED_SMART_INFO:
5222 if (n->params.ocp) {
5223 return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
5224 }
5225 break;
5226 /* add a case for each additional vendor specific log id */
5227 }
5228
5229 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5230 return NVME_INVALID_FIELD | NVME_DNR;
5231 }
5232
5233 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5234 {
5235 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5236 + vss;
5237 return ROUND_UP(entry_siz, 8);
5238 }
5239
5240 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5241 uint64_t off, NvmeRequest *req)
5242 {
5243 uint32_t log_size, trans_len;
5244 g_autofree uint8_t *buf = NULL;
5245 NvmeFdpDescrHdr *hdr;
5246 NvmeRuhDescr *ruhd;
5247 NvmeEnduranceGroup *endgrp;
5248 NvmeFdpConfsHdr *log;
5249 size_t nruh, fdp_descr_size;
5250 int i;
5251
5252 if (endgrpid != 1 || !n->subsys) {
5253 return NVME_INVALID_FIELD | NVME_DNR;
5254 }
5255
5256 endgrp = &n->subsys->endgrp;
5257
5258 if (endgrp->fdp.enabled) {
5259 nruh = endgrp->fdp.nruh;
5260 } else {
5261 nruh = 1;
5262 }
5263
5264 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5265 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5266
5267 if (off >= log_size) {
5268 return NVME_INVALID_FIELD | NVME_DNR;
5269 }
5270
5271 trans_len = MIN(log_size - off, buf_len);
5272
5273 buf = g_malloc0(log_size);
5274 log = (NvmeFdpConfsHdr *)buf;
5275 hdr = (NvmeFdpDescrHdr *)(log + 1);
5276 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5277
5278 log->num_confs = cpu_to_le16(0);
5279 log->size = cpu_to_le32(log_size);
5280
5281 hdr->descr_size = cpu_to_le16(fdp_descr_size);
5282 if (endgrp->fdp.enabled) {
5283 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5284 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5285 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5286 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5287 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5288 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5289 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5290
5291 for (i = 0; i < nruh; i++) {
5292 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5293 ruhd++;
5294 }
5295 } else {
5296 /* 1 bit for RUH in PIF -> 2 RUHs max. */
5297 hdr->nrg = cpu_to_le16(1);
5298 hdr->nruh = cpu_to_le16(1);
5299 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5300 hdr->nnss = cpu_to_le32(1);
5301 hdr->runs = cpu_to_le64(96 * MiB);
5302
5303 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5304 }
5305
5306 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5307 }
5308
5309 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5310 uint32_t dw10, uint32_t dw12,
5311 uint32_t buf_len, uint64_t off,
5312 NvmeRequest *req)
5313 {
5314 NvmeRuHandle *ruh;
5315 NvmeRuhuLog *hdr;
5316 NvmeRuhuDescr *ruhud;
5317 NvmeEnduranceGroup *endgrp;
5318 g_autofree uint8_t *buf = NULL;
5319 uint32_t log_size, trans_len;
5320 uint16_t i;
5321
5322 if (endgrpid != 1 || !n->subsys) {
5323 return NVME_INVALID_FIELD | NVME_DNR;
5324 }
5325
5326 endgrp = &n->subsys->endgrp;
5327
5328 if (!endgrp->fdp.enabled) {
5329 return NVME_FDP_DISABLED | NVME_DNR;
5330 }
5331
5332 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5333
5334 if (off >= log_size) {
5335 return NVME_INVALID_FIELD | NVME_DNR;
5336 }
5337
5338 trans_len = MIN(log_size - off, buf_len);
5339
5340 buf = g_malloc0(log_size);
5341 hdr = (NvmeRuhuLog *)buf;
5342 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5343
5344 ruh = endgrp->fdp.ruhs;
5345 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5346
5347 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5348 ruhud->ruha = ruh->ruha;
5349 }
5350
5351 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5352 }
5353
5354 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5355 uint64_t off, NvmeRequest *req)
5356 {
5357 NvmeEnduranceGroup *endgrp;
5358 NvmeFdpStatsLog log = {};
5359 uint32_t trans_len;
5360
5361 if (off >= sizeof(NvmeFdpStatsLog)) {
5362 return NVME_INVALID_FIELD | NVME_DNR;
5363 }
5364
5365 if (endgrpid != 1 || !n->subsys) {
5366 return NVME_INVALID_FIELD | NVME_DNR;
5367 }
5368
5369 if (!n->subsys->endgrp.fdp.enabled) {
5370 return NVME_FDP_DISABLED | NVME_DNR;
5371 }
5372
5373 endgrp = &n->subsys->endgrp;
5374
5375 trans_len = MIN(sizeof(log) - off, buf_len);
5376
5377 /* spec value is 128 bit, we only use 64 bit */
5378 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5379 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5380 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5381
5382 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5383 }
5384
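/*
 * FDP events log. Events are kept in a fixed-size circular buffer; when the
 * buffer has wrapped (start == next with elements present) the copy is done
 * in two chunks, [start; NVME_FDP_MAX_EVENTS[ followed by [0; next[.
 */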
5385 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5386 uint32_t buf_len, uint64_t off,
5387 NvmeRequest *req)
5388 {
5389 NvmeEnduranceGroup *endgrp;
5390 NvmeCmd *cmd = &req->cmd;
5391 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5392 uint32_t log_size, trans_len;
5393 NvmeFdpEventBuffer *ebuf;
5394 g_autofree NvmeFdpEventsLog *elog = NULL;
5395 NvmeFdpEvent *event;
5396
5397 if (endgrpid != 1 || !n->subsys) {
5398 return NVME_INVALID_FIELD | NVME_DNR;
5399 }
5400
5401 endgrp = &n->subsys->endgrp;
5402
5403 if (!endgrp->fdp.enabled) {
5404 return NVME_FDP_DISABLED | NVME_DNR;
5405 }
5406
5407 if (host_events) {
5408 ebuf = &endgrp->fdp.host_events;
5409 } else {
5410 ebuf = &endgrp->fdp.ctrl_events;
5411 }
5412
5413 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5414
5415 if (off >= log_size) {
5416 return NVME_INVALID_FIELD | NVME_DNR;
5417 }
5418
5419 trans_len = MIN(log_size - off, buf_len);
5420 elog = g_malloc0(log_size);
5421 elog->num_events = cpu_to_le32(ebuf->nelems);
5422 event = (NvmeFdpEvent *)(elog + 1);
5423
5424 if (ebuf->nelems && ebuf->start == ebuf->next) {
5425 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5426 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
5427 memcpy(event, &ebuf->events[ebuf->start],
5428 sizeof(NvmeFdpEvent) * nelems);
5429 memcpy(event + nelems, ebuf->events,
5430 sizeof(NvmeFdpEvent) * ebuf->next);
5431 } else if (ebuf->start < ebuf->next) {
5432 memcpy(event, &ebuf->events[ebuf->start],
5433 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5434 }
5435
5436 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5437 }
5438
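/*
 * Get Log Page. The transfer length is given by the NUMDL/NUMDU split field
 * as a zero-based number of dwords, and the log page offset by the LPOL/LPOU
 * pair, which must be dword aligned.
 */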
5439 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5440 {
5441 NvmeCmd *cmd = &req->cmd;
5442
5443 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5444 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5445 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5446 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5447 uint8_t lid = dw10 & 0xff;
5448 uint8_t lsp = (dw10 >> 8) & 0xf;
5449 uint8_t rae = (dw10 >> 15) & 0x1;
5450 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5451 uint32_t numdl, numdu, lspi;
5452 uint64_t off, lpol, lpou;
5453 size_t len;
5454 uint16_t status;
5455
5456 numdl = (dw10 >> 16);
5457 numdu = (dw11 & 0xffff);
5458 lspi = (dw11 >> 16);
5459 lpol = dw12;
5460 lpou = dw13;
5461
5462 len = (((numdu << 16) | numdl) + 1) << 2;
5463 off = (lpou << 32ULL) | lpol;
5464
5465 if (off & 0x3) {
5466 return NVME_INVALID_FIELD | NVME_DNR;
5467 }
5468
5469 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5470
5471 status = nvme_check_mdts(n, len);
5472 if (status) {
5473 return status;
5474 }
5475
5476 switch (lid) {
5477 case NVME_LOG_ERROR_INFO:
5478 return nvme_error_info(n, rae, len, off, req);
5479 case NVME_LOG_SMART_INFO:
5480 return nvme_smart_info(n, rae, len, off, req);
5481 case NVME_LOG_FW_SLOT_INFO:
5482 return nvme_fw_log_info(n, len, off, req);
5483 case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
5484 return nvme_vendor_specific_log(n, rae, len, off, req, lid);
5485 case NVME_LOG_CHANGED_NSLIST:
5486 return nvme_changed_nslist(n, rae, len, off, req);
5487 case NVME_LOG_CMD_EFFECTS:
5488 return nvme_cmd_effects(n, csi, len, off, req);
5489 case NVME_LOG_ENDGRP:
5490 return nvme_endgrp_info(n, rae, len, off, req);
5491 case NVME_LOG_FDP_CONFS:
5492 return nvme_fdp_confs(n, lspi, len, off, req);
5493 case NVME_LOG_FDP_RUH_USAGE:
5494 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5495 case NVME_LOG_FDP_STATS:
5496 return nvme_fdp_stats(n, lspi, len, off, req);
5497 case NVME_LOG_FDP_EVENTS:
5498 return nvme_fdp_events(n, lspi, len, off, req);
5499 default:
5500 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5501 return NVME_INVALID_FIELD | NVME_DNR;
5502 }
5503 }
5504
5505 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5506 {
5507 PCIDevice *pci = PCI_DEVICE(n);
5508 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5509
5510 n->cq[cq->cqid] = NULL;
5511 qemu_bh_delete(cq->bh);
5512 if (cq->ioeventfd_enabled) {
5513 memory_region_del_eventfd(&n->iomem,
5514 0x1000 + offset, 4, false, 0, &cq->notifier);
5515 event_notifier_set_handler(&cq->notifier, NULL);
5516 event_notifier_cleanup(&cq->notifier);
5517 }
5518 if (msix_enabled(pci) && cq->irq_enabled) {
5519 msix_vector_unuse(pci, cq->vector);
5520 }
5521 if (cq->cqid) {
5522 g_free(cq);
5523 }
5524 }
5525
5526 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5527 {
5528 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5529 NvmeCQueue *cq;
5530 uint16_t qid = le16_to_cpu(c->qid);
5531
5532 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5533 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5534 return NVME_INVALID_CQID | NVME_DNR;
5535 }
5536
5537 cq = n->cq[qid];
5538 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5539 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5540 return NVME_INVALID_QUEUE_DEL;
5541 }
5542
5543 if (cq->irq_enabled && cq->tail != cq->head) {
5544 n->cq_pending--;
5545 }
5546
5547 nvme_irq_deassert(n, cq);
5548 trace_pci_nvme_del_cq(qid);
5549 nvme_free_cq(cq, n);
5550 return NVME_SUCCESS;
5551 }
5552
5553 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5554 uint16_t cqid, uint16_t vector, uint16_t size,
5555 uint16_t irq_enabled)
5556 {
5557 PCIDevice *pci = PCI_DEVICE(n);
5558
5559 if (msix_enabled(pci) && irq_enabled) {
5560 msix_vector_use(pci, vector);
5561 }
5562
5563 cq->ctrl = n;
5564 cq->cqid = cqid;
5565 cq->size = size;
5566 cq->dma_addr = dma_addr;
5567 cq->phase = 1;
5568 cq->irq_enabled = irq_enabled;
5569 cq->vector = vector;
5570 cq->head = cq->tail = 0;
5571 QTAILQ_INIT(&cq->req_list);
5572 QTAILQ_INIT(&cq->sq_list);
5573 if (n->dbbuf_enabled) {
5574 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5575 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5576
5577 if (n->params.ioeventfd && cqid != 0) {
5578 if (!nvme_init_cq_ioeventfd(cq)) {
5579 cq->ioeventfd_enabled = true;
5580 }
5581 }
5582 }
5583 n->cq[cqid] = cq;
5584 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5585 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5586 }
5587
5588 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5589 {
5590 NvmeCQueue *cq;
5591 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5592 uint16_t cqid = le16_to_cpu(c->cqid);
5593 uint16_t vector = le16_to_cpu(c->irq_vector);
5594 uint16_t qsize = le16_to_cpu(c->qsize);
5595 uint16_t qflags = le16_to_cpu(c->cq_flags);
5596 uint64_t prp1 = le64_to_cpu(c->prp1);
5597 uint32_t cc = ldq_le_p(&n->bar.cc);
5598 uint8_t iocqes = NVME_CC_IOCQES(cc);
5599 uint8_t iosqes = NVME_CC_IOSQES(cc);
5600
5601 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5602 NVME_CQ_FLAGS_IEN(qflags) != 0);
5603
5604 if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5605 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5606 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5607 }
5608
5609 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5610 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5611 return NVME_INVALID_QID | NVME_DNR;
5612 }
5613 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5614 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5615 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5616 }
5617 if (unlikely(prp1 & (n->page_size - 1))) {
5618 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5619 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5620 }
5621 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5622 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5623 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5624 }
5625 if (unlikely(vector >= n->conf_msix_qsize)) {
5626 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5627 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5628 }
5629 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5630 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5631 return NVME_INVALID_FIELD | NVME_DNR;
5632 }
5633
5634 cq = g_malloc0(sizeof(*cq));
5635 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5636 NVME_CQ_FLAGS_IEN(qflags));
5637
5638 /*
5639 * It is only required to set qs_created when creating a completion queue;
5640 * creating a submission queue without a matching completion queue will
5641 * fail.
5642 */
5643 n->qs_created = true;
5644 return NVME_SUCCESS;
5645 }
5646
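/*
 * Several Identify CNS values fall back to returning a zero-filled data
 * structure, e.g. when the requested namespace exists in the subsystem but
 * is not attached to this controller.
 */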
5647 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5648 {
5649 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5650
5651 return nvme_c2h(n, id, sizeof(id), req);
5652 }
5653
5654 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5655 {
5656 trace_pci_nvme_identify_ctrl();
5657
5658 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5659 }
5660
5661 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5662 {
5663 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5664 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5665 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5666
5667 trace_pci_nvme_identify_ctrl_csi(c->csi);
5668
5669 switch (c->csi) {
5670 case NVME_CSI_NVM:
5671 id_nvm->vsl = n->params.vsl;
5672 id_nvm->dmrl = NVME_ID_CTRL_NVM_DMRL_MAX;
5673 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5674 id_nvm->dmsl = NVME_ID_CTRL_NVM_DMRL_MAX * n->dmrsl;
5675 break;
5676
5677 case NVME_CSI_ZONED:
5678 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5679 break;
5680
5681 default:
5682 return NVME_INVALID_FIELD | NVME_DNR;
5683 }
5684
5685 return nvme_c2h(n, id, sizeof(id), req);
5686 }
5687
5688 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5689 {
5690 NvmeNamespace *ns;
5691 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5692 uint32_t nsid = le32_to_cpu(c->nsid);
5693
5694 trace_pci_nvme_identify_ns(nsid);
5695
5696 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5697 return NVME_INVALID_NSID | NVME_DNR;
5698 }
5699
5700 ns = nvme_ns(n, nsid);
5701 if (unlikely(!ns)) {
5702 if (!active) {
5703 ns = nvme_subsys_ns(n->subsys, nsid);
5704 if (!ns) {
5705 return nvme_rpt_empty_id_struct(n, req);
5706 }
5707 } else {
5708 return nvme_rpt_empty_id_struct(n, req);
5709 }
5710 }
5711
5712 if (active || ns->csi == NVME_CSI_NVM) {
5713 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5714 }
5715
5716 return NVME_INVALID_IOCS | NVME_DNR;
5717 }
5718
5719 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5720 bool attached)
5721 {
5722 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5723 uint32_t nsid = le32_to_cpu(c->nsid);
5724 uint16_t min_id = le16_to_cpu(c->ctrlid);
5725 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5726 uint16_t *ids = &list[1];
5727 NvmeNamespace *ns;
5728 NvmeCtrl *ctrl;
5729 int cntlid, nr_ids = 0;
5730
5731 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5732
5733 if (!n->subsys) {
5734 return NVME_INVALID_FIELD | NVME_DNR;
5735 }
5736
5737 if (attached) {
5738 if (nsid == NVME_NSID_BROADCAST) {
5739 return NVME_INVALID_FIELD | NVME_DNR;
5740 }
5741
5742 ns = nvme_subsys_ns(n->subsys, nsid);
5743 if (!ns) {
5744 return NVME_INVALID_FIELD | NVME_DNR;
5745 }
5746 }
5747
5748 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5749 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5750 if (!ctrl) {
5751 continue;
5752 }
5753
5754 if (attached && !nvme_ns(ctrl, nsid)) {
5755 continue;
5756 }
5757
5758 ids[nr_ids++] = cntlid;
5759 }
5760
5761 list[0] = nr_ids;
5762
5763 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5764 }
5765
5766 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5767 {
5768 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5769
5770 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5771 sizeof(NvmePriCtrlCap), req);
5772 }
5773
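/*
 * Identify Secondary Controller List: return the secondary controller
 * entries with an identifier greater than or equal to the requested CNTID,
 * capped at the 127 entries that fit in the data structure.
 */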
5774 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5775 {
5776 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5777 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5778 uint16_t min_id = le16_to_cpu(c->ctrlid);
5779 uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5780 NvmeSecCtrlList list = {0};
5781 uint8_t i;
5782
5783 for (i = 0; i < num_sec_ctrl; i++) {
5784 if (n->sec_ctrl_list[i].scid >= min_id) {
5785 list.numcntl = MIN(num_sec_ctrl - i, 127);
5786 memcpy(&list.sec, n->sec_ctrl_list + i,
5787 list.numcntl * sizeof(NvmeSecCtrlEntry));
5788 break;
5789 }
5790 }
5791
5792 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5793
5794 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5795 }
5796
5797 static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc)
5798 {
5799 NvmeNamespace *ns;
5800 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5801 uint32_t nsid = le32_to_cpu(c->nsid);
5802
5803 trace_pci_nvme_identify_ns_ind(nsid);
5804
5805 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5806 return NVME_INVALID_NSID | NVME_DNR;
5807 }
5808
5809 ns = nvme_ns(n, nsid);
5810 if (unlikely(!ns)) {
5811 if (alloc) {
5812 ns = nvme_subsys_ns(n->subsys, nsid);
5813 if (!ns) {
5814 return nvme_rpt_empty_id_struct(n, req);
5815 }
5816 } else {
5817 return nvme_rpt_empty_id_struct(n, req);
5818 }
5819 }
5820
5821 return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req);
5822 }
5823
5824 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5825 bool active)
5826 {
5827 NvmeNamespace *ns;
5828 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5829 uint32_t nsid = le32_to_cpu(c->nsid);
5830
5831 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5832
5833 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5834 return NVME_INVALID_NSID | NVME_DNR;
5835 }
5836
5837 ns = nvme_ns(n, nsid);
5838 if (unlikely(!ns)) {
5839 if (!active) {
5840 ns = nvme_subsys_ns(n->subsys, nsid);
5841 if (!ns) {
5842 return nvme_rpt_empty_id_struct(n, req);
5843 }
5844 } else {
5845 return nvme_rpt_empty_id_struct(n, req);
5846 }
5847 }
5848
5849 if (c->csi == NVME_CSI_NVM) {
5850 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5851 req);
5852 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5853 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5854 req);
5855 }
5856
5857 return NVME_INVALID_FIELD | NVME_DNR;
5858 }
5859
5860 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5861 bool active)
5862 {
5863 NvmeNamespace *ns;
5864 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5865 uint32_t min_nsid = le32_to_cpu(c->nsid);
5866 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5867 static const int data_len = sizeof(list);
5868 uint32_t *list_ptr = (uint32_t *)list;
5869 int i, j = 0;
5870
5871 trace_pci_nvme_identify_nslist(min_nsid);
5872
5873 /*
5874 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5875 * since the Active Namespace ID List should return namespaces with ids
5876 * *higher* than the NSID specified in the command. This is also specified
5877 * in the spec (NVM Express v1.3d, Section 5.15.4).
5878 */
5879 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5880 return NVME_INVALID_NSID | NVME_DNR;
5881 }
5882
5883 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5884 ns = nvme_ns(n, i);
5885 if (!ns) {
5886 if (!active) {
5887 ns = nvme_subsys_ns(n->subsys, i);
5888 if (!ns) {
5889 continue;
5890 }
5891 } else {
5892 continue;
5893 }
5894 }
5895 if (ns->params.nsid <= min_nsid) {
5896 continue;
5897 }
5898 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5899 if (j == data_len / sizeof(uint32_t)) {
5900 break;
5901 }
5902 }
5903
5904 return nvme_c2h(n, list, data_len, req);
5905 }
5906
5907 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5908 bool active)
5909 {
5910 NvmeNamespace *ns;
5911 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5912 uint32_t min_nsid = le32_to_cpu(c->nsid);
5913 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5914 static const int data_len = sizeof(list);
5915 uint32_t *list_ptr = (uint32_t *)list;
5916 int i, j = 0;
5917
5918 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5919
5920 /*
5921 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5922 */
5923 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5924 return NVME_INVALID_NSID | NVME_DNR;
5925 }
5926
5927 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5928 return NVME_INVALID_FIELD | NVME_DNR;
5929 }
5930
5931 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5932 ns = nvme_ns(n, i);
5933 if (!ns) {
5934 if (!active) {
5935 ns = nvme_subsys_ns(n->subsys, i);
5936 if (!ns) {
5937 continue;
5938 }
5939 } else {
5940 continue;
5941 }
5942 }
5943 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5944 continue;
5945 }
5946 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5947 if (j == data_len / sizeof(uint32_t)) {
5948 break;
5949 }
5950 }
5951
5952 return nvme_c2h(n, list, data_len, req);
5953 }
5954
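/*
 * NVME_ID_CNS_ENDURANCE_GROUP_LIST: return the endurance group ids
 * greater than the id given in CDW11. Since the emulated subsystem
 * implements a single endurance group, the list contains only #1 (and
 * only when the given id is 0).
 */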
5955 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5956 {
5957 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5958 uint16_t *nr_ids = &list[0];
5959 uint16_t *ids = &list[1];
5960 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5961
5962 /*
5963 * The current nvme-subsys only supports Endurance Group #1.
5964 */
5965 if (!endgid) {
5966 *nr_ids = 1;
5967 ids[0] = 1;
5968 } else {
5969 *nr_ids = 0;
5970 }
5971
5972 return nvme_c2h(n, list, sizeof(list), req);
5973 }
5974
5975 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5976 {
5977 NvmeNamespace *ns;
5978 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5979 uint32_t nsid = le32_to_cpu(c->nsid);
5980 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5981 uint8_t *pos = list;
5982 struct {
5983 NvmeIdNsDescr hdr;
5984 uint8_t v[NVME_NIDL_UUID];
5985 } QEMU_PACKED uuid = {};
5986 struct {
5987 NvmeIdNsDescr hdr;
5988 uint8_t v[NVME_NIDL_NGUID];
5989 } QEMU_PACKED nguid = {};
5990 struct {
5991 NvmeIdNsDescr hdr;
5992 uint64_t v;
5993 } QEMU_PACKED eui64 = {};
5994 struct {
5995 NvmeIdNsDescr hdr;
5996 uint8_t v;
5997 } QEMU_PACKED csi = {};
5998
5999 trace_pci_nvme_identify_ns_descr_list(nsid);
6000
6001 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6002 return NVME_INVALID_NSID | NVME_DNR;
6003 }
6004
6005 ns = nvme_ns(n, nsid);
6006 if (unlikely(!ns)) {
6007 return NVME_INVALID_FIELD | NVME_DNR;
6008 }
6009
6010 if (!qemu_uuid_is_null(&ns->params.uuid)) {
6011 uuid.hdr.nidt = NVME_NIDT_UUID;
6012 uuid.hdr.nidl = NVME_NIDL_UUID;
6013 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
6014 memcpy(pos, &uuid, sizeof(uuid));
6015 pos += sizeof(uuid);
6016 }
6017
6018 if (!nvme_nguid_is_null(&ns->params.nguid)) {
6019 nguid.hdr.nidt = NVME_NIDT_NGUID;
6020 nguid.hdr.nidl = NVME_NIDL_NGUID;
6021 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
6022 memcpy(pos, &nguid, sizeof(nguid));
6023 pos += sizeof(nguid);
6024 }
6025
6026 if (ns->params.eui64) {
6027 eui64.hdr.nidt = NVME_NIDT_EUI64;
6028 eui64.hdr.nidl = NVME_NIDL_EUI64;
6029 eui64.v = cpu_to_be64(ns->params.eui64);
6030 memcpy(pos, &eui64, sizeof(eui64));
6031 pos += sizeof(eui64);
6032 }
6033
6034 csi.hdr.nidt = NVME_NIDT_CSI;
6035 csi.hdr.nidl = NVME_NIDL_CSI;
6036 csi.v = ns->csi;
6037 memcpy(pos, &csi, sizeof(csi));
6038 pos += sizeof(csi);
6039
6040 return nvme_c2h(n, list, sizeof(list), req);
6041 }
6042
6043 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
6044 {
6045 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
6046 static const int data_len = sizeof(list);
6047
6048 trace_pci_nvme_identify_cmd_set();
6049
6050 NVME_SET_CSI(*list, NVME_CSI_NVM);
6051 NVME_SET_CSI(*list, NVME_CSI_ZONED);
6052
6053 return nvme_c2h(n, list, data_len, req);
6054 }
6055
6056 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
6057 {
6058 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
6059
6060 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
6061 c->csi);
6062
6063 switch (c->cns) {
6064 case NVME_ID_CNS_NS:
6065 return nvme_identify_ns(n, req, true);
6066 case NVME_ID_CNS_NS_PRESENT:
6067 return nvme_identify_ns(n, req, false);
6068 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
6069 return nvme_identify_ctrl_list(n, req, true);
6070 case NVME_ID_CNS_CTRL_LIST:
6071 return nvme_identify_ctrl_list(n, req, false);
6072 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
6073 return nvme_identify_pri_ctrl_cap(n, req);
6074 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
6075 return nvme_identify_sec_ctrl_list(n, req);
6076 case NVME_ID_CNS_CS_NS:
6077 return nvme_identify_ns_csi(n, req, true);
6078 case NVME_ID_CNS_CS_IND_NS:
6079 return nvme_identify_ns_ind(n, req, false);
6080 case NVME_ID_CNS_CS_IND_NS_ALLOCATED:
6081 return nvme_identify_ns_ind(n, req, true);
6082 case NVME_ID_CNS_CS_NS_PRESENT:
6083 return nvme_identify_ns_csi(n, req, false);
6084 case NVME_ID_CNS_CTRL:
6085 return nvme_identify_ctrl(n, req);
6086 case NVME_ID_CNS_CS_CTRL:
6087 return nvme_identify_ctrl_csi(n, req);
6088 case NVME_ID_CNS_NS_ACTIVE_LIST:
6089 return nvme_identify_nslist(n, req, true);
6090 case NVME_ID_CNS_NS_PRESENT_LIST:
6091 return nvme_identify_nslist(n, req, false);
6092 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
6093 return nvme_identify_nslist_csi(n, req, true);
6094 case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
6095 return nvme_endurance_group_list(n, req);
6096 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
6097 return nvme_identify_nslist_csi(n, req, false);
6098 case NVME_ID_CNS_NS_DESCR_LIST:
6099 return nvme_identify_ns_descr_list(n, req);
6100 case NVME_ID_CNS_IO_COMMAND_SET:
6101 return nvme_identify_cmd_set(n, req);
6102 default:
6103 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
6104 return NVME_INVALID_FIELD | NVME_DNR;
6105 }
6106 }
6107
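/*
 * Abort command. CQE Dword 0 bit 0 is initialized to 1 ("not aborted")
 * and only cleared when an outstanding Asynchronous Event Request is
 * actually removed; for in-flight I/O the request status is set to
 * NVME_CMD_ABORT_REQ and the backing AIO is cancelled asynchronously.
 */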
6108 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
6109 {
6110 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
6111 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
6112 NvmeSQueue *sq = n->sq[sqid];
6113 NvmeRequest *r, *next;
6114 int i;
6115
6116 req->cqe.result = 1;
6117 if (nvme_check_sqid(n, sqid)) {
6118 return NVME_INVALID_FIELD | NVME_DNR;
6119 }
6120
6121 if (sqid == 0) {
6122 for (i = 0; i < n->outstanding_aers; i++) {
6123 NvmeRequest *re = n->aer_reqs[i];
6124 if (re->cqe.cid == cid) {
6125 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
6126 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
6127 n->outstanding_aers--;
6128 re->status = NVME_CMD_ABORT_REQ;
6129 req->cqe.result = 0;
6130 nvme_enqueue_req_completion(&n->admin_cq, re);
6131 return NVME_SUCCESS;
6132 }
6133 }
6134 }
6135
6136 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
6137 if (r->cqe.cid == cid) {
6138 if (r->aiocb) {
6139 r->status = NVME_CMD_ABORT_REQ;
6140 blk_aio_cancel_async(r->aiocb);
6141 }
6142 break;
6143 }
6144 }
6145
6146 return NVME_SUCCESS;
6147 }
6148
6149 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6150 {
6151 trace_pci_nvme_setfeat_timestamp(ts);
6152
6153 n->host_timestamp = le64_to_cpu(ts);
6154 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6155 }
6156
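/*
 * Reconstruct the current timestamp as the last host-supplied value plus
 * the milliseconds elapsed on QEMU_CLOCK_VIRTUAL since it was set. The
 * origin field is reported as 01b only if the host has set a non-zero
 * timestamp.
 */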
6157 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6158 {
6159 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6160 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6161
6162 union nvme_timestamp {
6163 struct {
6164 uint64_t timestamp:48;
6165 uint64_t sync:1;
6166 uint64_t origin:3;
6167 uint64_t rsvd1:12;
6168 };
6169 uint64_t all;
6170 };
6171
6172 union nvme_timestamp ts;
6173 ts.all = 0;
6174 ts.timestamp = n->host_timestamp + elapsed_time;
6175
6176 /* If the host timestamp is non-zero, set the timestamp origin */
6177 ts.origin = n->host_timestamp ? 0x01 : 0x00;
6178
6179 trace_pci_nvme_getfeat_timestamp(ts.all);
6180
6181 return cpu_to_le64(ts.all);
6182 }
6183
6184 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6185 {
6186 uint64_t timestamp = nvme_get_timestamp(n);
6187
6188 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6189 }
6190
6191 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6192 uint32_t *result)
6193 {
6194 *result = 0;
6195
6196 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6197 return NVME_INVALID_FIELD | NVME_DNR;
6198 }
6199
6200 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6201 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6202
6203 return NVME_SUCCESS;
6204 }
6205
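/*
 * Get Features, FDP Events: return up to 'noet' event descriptors
 * reflecting the event filter of the reclaim unit handle backing the
 * placement handle in CDW11; the number of descriptors returned is
 * reported through *result.
 */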
6206 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6207 NvmeRequest *req, uint32_t *result)
6208 {
6209 NvmeCmd *cmd = &req->cmd;
6210 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6211 uint16_t ph = cdw11 & 0xffff;
6212 uint8_t noet = (cdw11 >> 16) & 0xff;
6213 uint16_t ruhid, ret;
6214 uint32_t nentries = 0;
6215 uint8_t s_events_ndx = 0;
6216 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6217 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6218 NvmeRuHandle *ruh;
6219 NvmeFdpEventDescr *s_event;
6220
6221 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6222 return NVME_FDP_DISABLED | NVME_DNR;
6223 }
6224
6225 if (!nvme_ph_valid(ns, ph)) {
6226 return NVME_INVALID_FIELD | NVME_DNR;
6227 }
6228
6229 ruhid = ns->fdp.phs[ph];
6230 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6231
6232 assert(ruh);
6233
6234 if (unlikely(noet == 0)) {
6235 return NVME_INVALID_FIELD | NVME_DNR;
6236 }
6237
6238 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6239 uint8_t shift = nvme_fdp_evf_shifts[event_type];
6240 if (!shift && event_type) {
6241 /*
6242 * Only the first entry (event_type == 0) legitimately has a shift
6243 * value of 0; other zero-shift entries are simply unpopulated.
6244 */
6245 continue;
6246 }
6247
6248 nentries++;
6249
6250 s_event = &s_events[s_events_ndx];
6251 s_event->evt = event_type;
6252 s_event->evta = (ruh->event_filter >> shift) & 0x1;
6253
6254 /* break if all `noet` entries are filled */
6255 if ((++s_events_ndx) == noet) {
6256 break;
6257 }
6258 }
6259
6260 ret = nvme_c2h(n, s_events, s_events_siz, req);
6261 if (ret) {
6262 return ret;
6263 }
6264
6265 *result = nentries;
6266 return NVME_SUCCESS;
6267 }
6268
6269 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6270 {
6271 NvmeCmd *cmd = &req->cmd;
6272 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6273 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6274 uint32_t nsid = le32_to_cpu(cmd->nsid);
6275 uint32_t result = 0;
6276 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6277 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6278 uint16_t iv;
6279 NvmeNamespace *ns;
6280 int i;
6281 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6282
6283 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6284 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6285 };
6286
6287 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6288
6289 if (!nvme_feature_support[fid]) {
6290 return NVME_INVALID_FIELD | NVME_DNR;
6291 }
6292
6293 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6294 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6295 /*
6296 * The Reservation Notification Mask and Reservation Persistence
6297 * features require a status code of Invalid Field in Command when
6298 * NSID is FFFFFFFFh. Since the device does not support those
6299 * features we can always return Invalid Namespace or Format as we
6300 * should do for all other features.
6301 */
6302 return NVME_INVALID_NSID | NVME_DNR;
6303 }
6304
6305 if (!nvme_ns(n, nsid)) {
6306 return NVME_INVALID_FIELD | NVME_DNR;
6307 }
6308 }
6309
6310 switch (sel) {
6311 case NVME_GETFEAT_SELECT_CURRENT:
6312 break;
6313 case NVME_GETFEAT_SELECT_SAVED:
6314 /* no features are saveable by the controller; fallthrough */
6315 case NVME_GETFEAT_SELECT_DEFAULT:
6316 goto defaults;
6317 case NVME_GETFEAT_SELECT_CAP:
6318 result = nvme_feature_cap[fid];
6319 goto out;
6320 }
6321
6322 switch (fid) {
6323 case NVME_TEMPERATURE_THRESHOLD:
6324 result = 0;
6325
6326 /*
6327 * The controller only implements the Composite Temperature sensor, so
6328 * return 0 for all other sensors.
6329 */
6330 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6331 goto out;
6332 }
6333
6334 switch (NVME_TEMP_THSEL(dw11)) {
6335 case NVME_TEMP_THSEL_OVER:
6336 result = n->features.temp_thresh_hi;
6337 goto out;
6338 case NVME_TEMP_THSEL_UNDER:
6339 result = n->features.temp_thresh_low;
6340 goto out;
6341 }
6342
6343 return NVME_INVALID_FIELD | NVME_DNR;
6344 case NVME_ERROR_RECOVERY:
6345 if (!nvme_nsid_valid(n, nsid)) {
6346 return NVME_INVALID_NSID | NVME_DNR;
6347 }
6348
6349 ns = nvme_ns(n, nsid);
6350 if (unlikely(!ns)) {
6351 return NVME_INVALID_FIELD | NVME_DNR;
6352 }
6353
6354 result = ns->features.err_rec;
6355 goto out;
6356 case NVME_VOLATILE_WRITE_CACHE:
6357 result = 0;
6358 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6359 ns = nvme_ns(n, i);
6360 if (!ns) {
6361 continue;
6362 }
6363
6364 result = blk_enable_write_cache(ns->blkconf.blk);
6365 if (result) {
6366 break;
6367 }
6368 }
6369 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6370 goto out;
6371 case NVME_ASYNCHRONOUS_EVENT_CONF:
6372 result = n->features.async_config;
6373 goto out;
6374 case NVME_TIMESTAMP:
6375 return nvme_get_feature_timestamp(n, req);
6376 case NVME_HOST_BEHAVIOR_SUPPORT:
6377 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6378 sizeof(n->features.hbs), req);
6379 case NVME_FDP_MODE:
6380 endgrpid = dw11 & 0xff;
6381
6382 if (endgrpid != 0x1) {
6383 return NVME_INVALID_FIELD | NVME_DNR;
6384 }
6385
6386 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6387 if (ret) {
6388 return ret;
6389 }
6390 goto out;
6391 case NVME_FDP_EVENTS:
6392 if (!nvme_nsid_valid(n, nsid)) {
6393 return NVME_INVALID_NSID | NVME_DNR;
6394 }
6395
6396 ns = nvme_ns(n, nsid);
6397 if (unlikely(!ns)) {
6398 return NVME_INVALID_FIELD | NVME_DNR;
6399 }
6400
6401 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6402 if (ret) {
6403 return ret;
6404 }
6405 goto out;
6406 default:
6407 break;
6408 }
6409
6410 defaults:
6411 switch (fid) {
6412 case NVME_TEMPERATURE_THRESHOLD:
6413 result = 0;
6414
6415 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6416 break;
6417 }
6418
6419 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6420 result = NVME_TEMPERATURE_WARNING;
6421 }
6422
6423 break;
6424 case NVME_NUMBER_OF_QUEUES:
6425 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6426 trace_pci_nvme_getfeat_numq(result);
6427 break;
6428 case NVME_INTERRUPT_VECTOR_CONF:
6429 iv = dw11 & 0xffff;
6430 if (iv >= n->conf_ioqpairs + 1) {
6431 return NVME_INVALID_FIELD | NVME_DNR;
6432 }
6433
6434 result = iv;
6435 if (iv == n->admin_cq.vector) {
6436 result |= NVME_INTVC_NOCOALESCING;
6437 }
6438 break;
6439 case NVME_FDP_MODE:
6440 endgrpid = dw11 & 0xff;
6441
6442 if (endgrpid != 0x1) {
6443 return NVME_INVALID_FIELD | NVME_DNR;
6444 }
6445
6446 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6447 if (ret) {
6448 return ret;
6449 }
6450 break;
6451
6452 case NVME_WRITE_ATOMICITY:
6453 result = n->dn;
6454 break;
6455 default:
6456 result = nvme_feature_default[fid];
6457 break;
6458 }
6459
6460 out:
6461 req->cqe.result = cpu_to_le32(result);
6462 return ret;
6463 }
6464
6465 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6466 {
6467 uint16_t ret;
6468 uint64_t timestamp;
6469
6470 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6471 if (ret) {
6472 return ret;
6473 }
6474
6475 nvme_set_timestamp(n, timestamp);
6476
6477 return NVME_SUCCESS;
6478 }
6479
6480 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6481 NvmeRequest *req)
6482 {
6483 NvmeCmd *cmd = &req->cmd;
6484 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6485 uint16_t ph = cdw11 & 0xffff;
6486 uint8_t noet = (cdw11 >> 16) & 0xff;
6487 uint16_t ret, ruhid;
6488 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6489 uint8_t event_mask = 0;
6490 unsigned int i;
6491 g_autofree uint8_t *events = g_malloc0(noet);
6492 NvmeRuHandle *ruh = NULL;
6493
6494 assert(ns);
6495
6496 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6497 return NVME_FDP_DISABLED | NVME_DNR;
6498 }
6499
6500 if (!nvme_ph_valid(ns, ph)) {
6501 return NVME_INVALID_FIELD | NVME_DNR;
6502 }
6503
6504 ruhid = ns->fdp.phs[ph];
6505 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6506
6507 ret = nvme_h2c(n, events, noet, req);
6508 if (ret) {
6509 return ret;
6510 }
6511
6512 for (i = 0; i < noet; i++) {
6513 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6514 }
6515
6516 if (enable) {
6517 ruh->event_filter |= event_mask;
6518 } else {
6519 ruh->event_filter = ruh->event_filter & ~event_mask;
6520 }
6521
6522 return NVME_SUCCESS;
6523 }
6524
6525 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6526 {
6527 NvmeNamespace *ns = NULL;
6528
6529 NvmeCmd *cmd = &req->cmd;
6530 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6531 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6532 uint32_t nsid = le32_to_cpu(cmd->nsid);
6533 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6534 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6535 uint16_t status;
6536 int i;
6537 NvmeIdCtrl *id = &n->id_ctrl;
6538 NvmeAtomic *atomic = &n->atomic;
6539
6540 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6541
6542 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6543 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6544 }
6545
6546 if (!nvme_feature_support[fid]) {
6547 return NVME_INVALID_FIELD | NVME_DNR;
6548 }
6549
6550 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6551 if (nsid != NVME_NSID_BROADCAST) {
6552 if (!nvme_nsid_valid(n, nsid)) {
6553 return NVME_INVALID_NSID | NVME_DNR;
6554 }
6555
6556 ns = nvme_ns(n, nsid);
6557 if (unlikely(!ns)) {
6558 return NVME_INVALID_FIELD | NVME_DNR;
6559 }
6560 }
6561 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6562 if (!nvme_nsid_valid(n, nsid)) {
6563 return NVME_INVALID_NSID | NVME_DNR;
6564 }
6565
6566 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6567 }
6568
6569 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6570 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6571 }
6572
6573 switch (fid) {
6574 case NVME_TEMPERATURE_THRESHOLD:
6575 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6576 break;
6577 }
6578
6579 switch (NVME_TEMP_THSEL(dw11)) {
6580 case NVME_TEMP_THSEL_OVER:
6581 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6582 break;
6583 case NVME_TEMP_THSEL_UNDER:
6584 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6585 break;
6586 default:
6587 return NVME_INVALID_FIELD | NVME_DNR;
6588 }
6589
6590 if ((n->temperature >= n->features.temp_thresh_hi) ||
6591 (n->temperature <= n->features.temp_thresh_low)) {
6592 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6593 }
6594
6595 break;
6596 case NVME_ERROR_RECOVERY:
6597 if (nsid == NVME_NSID_BROADCAST) {
6598 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6599 ns = nvme_ns(n, i);
6600
6601 if (!ns) {
6602 continue;
6603 }
6604
6605 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6606 ns->features.err_rec = dw11;
6607 }
6608 }
6609
6610 break;
6611 }
6612
6613 assert(ns);
6614 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6615 ns->features.err_rec = dw11;
6616 }
6617 break;
6618 case NVME_VOLATILE_WRITE_CACHE:
6619 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6620 ns = nvme_ns(n, i);
6621 if (!ns) {
6622 continue;
6623 }
6624
6625 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6626 blk_flush(ns->blkconf.blk);
6627 }
6628
6629 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6630 }
6631
6632 break;
6633
6634 case NVME_NUMBER_OF_QUEUES:
6635 if (n->qs_created) {
6636 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6637 }
6638
6639 /*
6640 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6641 * and NSQR.
6642 */
6643 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6644 return NVME_INVALID_FIELD | NVME_DNR;
6645 }
6646
6647 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6648 ((dw11 >> 16) & 0xffff) + 1,
6649 n->conf_ioqpairs,
6650 n->conf_ioqpairs);
6651 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6652 ((n->conf_ioqpairs - 1) << 16));
6653 break;
6654 case NVME_ASYNCHRONOUS_EVENT_CONF:
6655 n->features.async_config = dw11;
6656 break;
6657 case NVME_TIMESTAMP:
6658 return nvme_set_feature_timestamp(n, req);
6659 case NVME_HOST_BEHAVIOR_SUPPORT:
6660 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6661 sizeof(n->features.hbs), req);
6662 if (status) {
6663 return status;
6664 }
6665
6666 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6667 ns = nvme_ns(n, i);
6668
6669 if (!ns) {
6670 continue;
6671 }
6672
6673 ns->id_ns.nlbaf = ns->nlbaf - 1;
6674 if (!n->features.hbs.lbafee) {
6675 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6676 }
6677 }
6678
6679 return status;
6680 case NVME_COMMAND_SET_PROFILE:
6681 if (dw11 & 0x1ff) {
6682 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6683 return NVME_IOCS_COMBINATION_REJECTED | NVME_DNR;
6684 }
6685 break;
6686 case NVME_FDP_MODE:
6687 /* spec: abort with cmd seq err if there are one or more namespaces in the endgrp */
6688 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6689 case NVME_FDP_EVENTS:
6690 return nvme_set_feature_fdp_events(n, ns, req);
6691 case NVME_WRITE_ATOMICITY:
6692
6693 n->dn = 0x1 & dw11;
6694
6695 if (n->dn) {
6696 atomic->atomic_max_write_size = le16_to_cpu(id->awupf) + 1;
6697 } else {
6698 atomic->atomic_max_write_size = le16_to_cpu(id->awun) + 1;
6699 }
6700
6701 if (atomic->atomic_max_write_size == 1) {
6702 atomic->atomic_writes = 0;
6703 } else {
6704 atomic->atomic_writes = 1;
6705 }
6706 break;
6707 default:
6708 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6709 }
6710 return NVME_SUCCESS;
6711 }
6712
6713 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6714 {
6715 trace_pci_nvme_aer(nvme_cid(req));
6716
6717 if (n->outstanding_aers > n->params.aerl) {
6718 trace_pci_nvme_aer_aerl_exceeded();
6719 return NVME_AER_LIMIT_EXCEEDED;
6720 }
6721
6722 n->aer_reqs[n->outstanding_aers] = req;
6723 n->outstanding_aers++;
6724
6725 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6726 nvme_process_aers(n);
6727 }
6728
6729 return NVME_NO_COMPLETE;
6730 }
6731
6732 static void nvme_update_dsm_limits(NvmeCtrl *n, NvmeNamespace *ns)
6733 {
6734 if (ns) {
6735 n->dmrsl =
6736 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6737
6738 return;
6739 }
6740
6741 for (uint32_t nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6742 ns = nvme_ns(n, nsid);
6743 if (!ns) {
6744 continue;
6745 }
6746
6747 n->dmrsl =
6748 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6749 }
6750 }
6751
6752 static bool nvme_csi_supported(NvmeCtrl *n, uint8_t csi)
6753 {
6754 uint32_t cc;
6755
6756 switch (csi) {
6757 case NVME_CSI_NVM:
6758 return true;
6759
6760 case NVME_CSI_ZONED:
6761 cc = ldl_le_p(&n->bar.cc);
6762
6763 return NVME_CC_CSS(cc) == NVME_CC_CSS_ALL;
6764 }
6765
6766 g_assert_not_reached();
6767 }
6768
6769 static void nvme_detach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6770 {
6771 assert(ns->attached > 0);
6772
6773 n->namespaces[ns->params.nsid] = NULL;
6774 ns->attached--;
6775 }
6776
6777 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6778 {
6779 NvmeNamespace *ns;
6780 NvmeCtrl *ctrl;
6781 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6782 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6783 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6784 uint8_t sel = dw10 & 0xf;
6785 uint16_t *nr_ids = &list[0];
6786 uint16_t *ids = &list[1];
6787 uint16_t ret;
6788 int i;
6789
6790 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6791
6792 if (!nvme_nsid_valid(n, nsid)) {
6793 return NVME_INVALID_NSID | NVME_DNR;
6794 }
6795
6796 ns = nvme_subsys_ns(n->subsys, nsid);
6797 if (!ns) {
6798 return NVME_INVALID_FIELD | NVME_DNR;
6799 }
6800
6801 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6802 if (ret) {
6803 return ret;
6804 }
6805
6806 if (!*nr_ids) {
6807 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6808 }
6809
6810 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6811 for (i = 0; i < *nr_ids; i++) {
6812 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6813 if (!ctrl) {
6814 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6815 }
6816
6817 switch (sel) {
6818 case NVME_NS_ATTACHMENT_ATTACH:
6819 if (nvme_ns(n, nsid)) {
6820 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6821 }
6822
6823 if (ns->attached && !ns->params.shared) {
6824 return NVME_NS_PRIVATE | NVME_DNR;
6825 }
6826
6827 if (!nvme_csi_supported(n, ns->csi)) {
6828 return NVME_IOCS_NOT_SUPPORTED | NVME_DNR;
6829 }
6830
6831 nvme_attach_ns(ctrl, ns);
6832 nvme_update_dsm_limits(ctrl, ns);
6833
6834 break;
6835
6836 case NVME_NS_ATTACHMENT_DETACH:
6837 nvme_detach_ns(ctrl, ns);
6838 nvme_update_dsm_limits(ctrl, NULL);
6839
6840 break;
6841
6842 default:
6843 return NVME_INVALID_FIELD | NVME_DNR;
6844 }
6845
6846 /*
6847 * Add namespace id to the changed namespace id list for event clearing
6848 * via Get Log Page command.
6849 */
6850 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6851 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6852 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6853 NVME_LOG_CHANGED_NSLIST);
6854 }
6855 }
6856
6857 return NVME_SUCCESS;
6858 }
6859
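/*
 * Format NVM is implemented as an asynchronous operation driven by
 * NvmeFormatAIOCB: nvme_do_format() selects the (next) namespace to
 * format (iterating over all namespaces for the broadcast NSID),
 * nvme_format_ns_cb() zeroes it out in BDRV_REQUEST_MAX_BYTES sized
 * chunks and, when done, applies the new LBA format via
 * nvme_format_set().
 */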
6860 typedef struct NvmeFormatAIOCB {
6861 BlockAIOCB common;
6862 BlockAIOCB *aiocb;
6863 NvmeRequest *req;
6864 int ret;
6865
6866 NvmeNamespace *ns;
6867 uint32_t nsid;
6868 bool broadcast;
6869 int64_t offset;
6870
6871 uint8_t lbaf;
6872 uint8_t mset;
6873 uint8_t pi;
6874 uint8_t pil;
6875 } NvmeFormatAIOCB;
6876
6877 static void nvme_format_cancel(BlockAIOCB *aiocb)
6878 {
6879 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6880
6881 iocb->ret = -ECANCELED;
6882
6883 if (iocb->aiocb) {
6884 blk_aio_cancel_async(iocb->aiocb);
6885 iocb->aiocb = NULL;
6886 }
6887 }
6888
6889 static const AIOCBInfo nvme_format_aiocb_info = {
6890 .aiocb_size = sizeof(NvmeFormatAIOCB),
6891 .cancel_async = nvme_format_cancel,
6892 };
6893
6894 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6895 uint8_t pi, uint8_t pil)
6896 {
6897 uint8_t lbafl = lbaf & 0xf;
6898 uint8_t lbafu = lbaf >> 4;
6899
6900 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6901
6902 ns->id_ns.dps = (pil << 3) | pi;
6903 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6904
6905 nvme_ns_init_format(ns);
6906 }
6907
6908 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6909
6910 static void nvme_format_ns_cb(void *opaque, int ret)
6911 {
6912 NvmeFormatAIOCB *iocb = opaque;
6913 NvmeNamespace *ns = iocb->ns;
6914 int bytes;
6915
6916 if (iocb->ret < 0) {
6917 goto done;
6918 } else if (ret < 0) {
6919 iocb->ret = ret;
6920 goto done;
6921 }
6922
6923 assert(ns);
6924
6925 if (iocb->offset < ns->size) {
6926 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6927
6928 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6929 bytes, BDRV_REQ_MAY_UNMAP,
6930 nvme_format_ns_cb, iocb);
6931
6932 iocb->offset += bytes;
6933 return;
6934 }
6935
6936 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6937 ns->status = 0x0;
6938 iocb->ns = NULL;
6939 iocb->offset = 0;
6940
6941 done:
6942 nvme_do_format(iocb);
6943 }
6944
6945 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6946 {
6947 if (ns->params.zoned) {
6948 return NVME_INVALID_FORMAT | NVME_DNR;
6949 }
6950
6951 if (lbaf > ns->id_ns.nlbaf) {
6952 return NVME_INVALID_FORMAT | NVME_DNR;
6953 }
6954
6955 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6956 return NVME_INVALID_FORMAT | NVME_DNR;
6957 }
6958
6959 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6960 return NVME_INVALID_FIELD | NVME_DNR;
6961 }
6962
6963 return NVME_SUCCESS;
6964 }
6965
6966 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6967 {
6968 NvmeRequest *req = iocb->req;
6969 NvmeCtrl *n = nvme_ctrl(req);
6970 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6971 uint8_t lbaf = dw10 & 0xf;
6972 uint8_t pi = (dw10 >> 5) & 0x7;
6973 uint16_t status;
6974 int i;
6975
6976 if (iocb->ret < 0) {
6977 goto done;
6978 }
6979
6980 if (iocb->broadcast) {
6981 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6982 iocb->ns = nvme_ns(n, i);
6983 if (iocb->ns) {
6984 iocb->nsid = i;
6985 break;
6986 }
6987 }
6988 }
6989
6990 if (!iocb->ns) {
6991 goto done;
6992 }
6993
6994 status = nvme_format_check(iocb->ns, lbaf, pi);
6995 if (status) {
6996 req->status = status;
6997 goto done;
6998 }
6999
7000 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
7001 nvme_format_ns_cb(iocb, 0);
7002 return;
7003
7004 done:
7005 iocb->common.cb(iocb->common.opaque, iocb->ret);
7006 qemu_aio_unref(iocb);
7007 }
7008
7009 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
7010 {
7011 NvmeFormatAIOCB *iocb;
7012 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7013 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7014 uint8_t lbaf = dw10 & 0xf;
7015 uint8_t mset = (dw10 >> 4) & 0x1;
7016 uint8_t pi = (dw10 >> 5) & 0x7;
7017 uint8_t pil = (dw10 >> 8) & 0x1;
7018 uint8_t lbafu = (dw10 >> 12) & 0x3;
7019 uint16_t status;
7020
7021 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
7022
7023 iocb->req = req;
7024 iocb->ret = 0;
7025 iocb->ns = NULL;
7026 iocb->nsid = 0;
7027 iocb->lbaf = lbaf;
7028 iocb->mset = mset;
7029 iocb->pi = pi;
7030 iocb->pil = pil;
7031 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
7032 iocb->offset = 0;
7033
7034 if (n->features.hbs.lbafee) {
7035 iocb->lbaf |= lbafu << 4;
7036 }
7037
7038 if (!iocb->broadcast) {
7039 if (!nvme_nsid_valid(n, nsid)) {
7040 status = NVME_INVALID_NSID | NVME_DNR;
7041 goto out;
7042 }
7043
7044 iocb->ns = nvme_ns(n, nsid);
7045 if (!iocb->ns) {
7046 status = NVME_INVALID_FIELD | NVME_DNR;
7047 goto out;
7048 }
7049 }
7050
7051 req->aiocb = &iocb->common;
7052 nvme_do_format(iocb);
7053
7054 return NVME_NO_COMPLETE;
7055
7056 out:
7057 qemu_aio_unref(iocb);
7058
7059 return status;
7060 }
7061
7062 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
7063 int *num_prim, int *num_sec)
7064 {
7065 *num_total = le32_to_cpu(rt ?
7066 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
7067 *num_prim = le16_to_cpu(rt ?
7068 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
7069 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
7070 }
7071
7072 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
7073 uint16_t cntlid, uint8_t rt,
7074 int nr)
7075 {
7076 int num_total, num_prim, num_sec;
7077
7078 if (cntlid != n->cntlid) {
7079 return NVME_INVALID_CTRL_ID | NVME_DNR;
7080 }
7081
7082 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7083
7084 if (nr > num_total) {
7085 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7086 }
7087
7088 if (nr > num_total - num_sec) {
7089 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7090 }
7091
7092 if (rt) {
7093 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
7094 } else {
7095 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
7096 }
7097
7098 req->cqe.result = cpu_to_le32(nr);
7099 return req->status;
7100 }
7101
7102 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
7103 uint8_t rt, int nr)
7104 {
7105 int prev_nr, prev_total;
7106
7107 if (rt) {
7108 prev_nr = le16_to_cpu(sctrl->nvi);
7109 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
7110 sctrl->nvi = cpu_to_le16(nr);
7111 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
7112 } else {
7113 prev_nr = le16_to_cpu(sctrl->nvq);
7114 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
7115 sctrl->nvq = cpu_to_le16(nr);
7116 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
7117 }
7118 }
7119
7120 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
7121 uint16_t cntlid, uint8_t rt, int nr)
7122 {
7123 int num_total, num_prim, num_sec, num_free, diff, limit;
7124 NvmeSecCtrlEntry *sctrl;
7125
7126 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7127 if (!sctrl) {
7128 return NVME_INVALID_CTRL_ID | NVME_DNR;
7129 }
7130
7131 if (sctrl->scs) {
7132 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7133 }
7134
7135 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
7136 if (nr > limit) {
7137 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7138 }
7139
7140 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7141 num_free = num_total - num_prim - num_sec;
7142 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
7143
7144 if (diff > num_free) {
7145 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7146 }
7147
7148 nvme_update_virt_res(n, sctrl, rt, nr);
7149 req->cqe.result = cpu_to_le32(nr);
7150
7151 return req->status;
7152 }
7153
7154 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
7155 {
7156 PCIDevice *pci = PCI_DEVICE(n);
7157 NvmeCtrl *sn = NULL;
7158 NvmeSecCtrlEntry *sctrl;
7159 int vf_index;
7160
7161 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7162 if (!sctrl) {
7163 return NVME_INVALID_CTRL_ID | NVME_DNR;
7164 }
7165
7166 if (!pci_is_vf(pci)) {
7167 vf_index = le16_to_cpu(sctrl->vfn) - 1;
7168 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
7169 }
7170
7171 if (online) {
7172 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
7173 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7174 }
7175
7176 if (!sctrl->scs) {
7177 sctrl->scs = 0x1;
7178 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7179 }
7180 } else {
7181 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7182 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7183
7184 if (sctrl->scs) {
7185 sctrl->scs = 0x0;
7186 if (sn) {
7187 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7188 }
7189 }
7190 }
7191
7192 return NVME_SUCCESS;
7193 }
7194
7195 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7196 {
7197 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7198 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7199 uint8_t act = dw10 & 0xf;
7200 uint8_t rt = (dw10 >> 8) & 0x7;
7201 uint16_t cntlid = (dw10 >> 16) & 0xffff;
7202 int nr = dw11 & 0xffff;
7203
7204 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7205
7206 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7207 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7208 }
7209
7210 switch (act) {
7211 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7212 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7213 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7214 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7215 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7216 return nvme_virt_set_state(n, cntlid, true);
7217 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7218 return nvme_virt_set_state(n, cntlid, false);
7219 default:
7220 return NVME_INVALID_FIELD | NVME_DNR;
7221 }
7222 }
7223
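/*
 * Doorbell Buffer Config. PRP1 points to the shadow doorbell buffer and
 * PRP2 to the EventIdx buffer; both must be page aligned. Per-queue
 * shadow addresses are derived with the same stride that
 * nvme_process_db() assumes (CAP.DSTRD is 0).
 */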
7224 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7225 {
7226 PCIDevice *pci = PCI_DEVICE(n);
7227 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7228 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7229 int i;
7230
7231 /* Address should be page aligned */
7232 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7233 return NVME_INVALID_FIELD | NVME_DNR;
7234 }
7235
7236 /* Save shadow buffer base addr for use during queue creation */
7237 n->dbbuf_dbs = dbs_addr;
7238 n->dbbuf_eis = eis_addr;
7239 n->dbbuf_enabled = true;
7240
7241 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7242 NvmeSQueue *sq = n->sq[i];
7243 NvmeCQueue *cq = n->cq[i];
7244
7245 if (sq) {
7246 /*
7247 * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3).
7248 * nvme_process_db() uses this hard-coded way to calculate
7249 * doorbell offsets. Be consistent with that here.
7250 */
7251 sq->db_addr = dbs_addr + (i << 3);
7252 sq->ei_addr = eis_addr + (i << 3);
7253 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7254
7255 if (n->params.ioeventfd && sq->sqid != 0) {
7256 if (!nvme_init_sq_ioeventfd(sq)) {
7257 sq->ioeventfd_enabled = true;
7258 }
7259 }
7260 }
7261
7262 if (cq) {
7263 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7264 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7265 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7266 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7267
7268 if (n->params.ioeventfd && cq->cqid != 0) {
7269 if (!nvme_init_cq_ioeventfd(cq)) {
7270 cq->ioeventfd_enabled = true;
7271 }
7272 }
7273 }
7274 }
7275
7276 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7277
7278 return NVME_SUCCESS;
7279 }
7280
7281 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7282 {
7283 return NVME_INVALID_FIELD | NVME_DNR;
7284 }
7285
7286 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7287 {
7288 NvmeNamespace *ns;
7289 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7290 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7291 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7292 uint8_t doper, dtype;
7293 uint32_t numd, trans_len;
7294 NvmeDirectiveIdentify id = {
7295 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7296 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7297 };
7298
7299 numd = dw10 + 1;
7300 doper = dw11 & 0xff;
7301 dtype = (dw11 >> 8) & 0xff;
7302
7303 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7304
7305 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7306 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7307 return NVME_INVALID_FIELD | NVME_DNR;
7308 }
7309
7310 ns = nvme_ns(n, nsid);
7311 if (!ns) {
7312 return NVME_INVALID_FIELD | NVME_DNR;
7313 }
7314
7315 switch (dtype) {
7316 case NVME_DIRECTIVE_IDENTIFY:
7317 switch (doper) {
7318 case NVME_DIRECTIVE_RETURN_PARAMS:
7319 if (ns->endgrp && ns->endgrp->fdp.enabled) {
7320 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7321 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7322 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7323 }
7324
7325 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7326
7327 default:
7328 return NVME_INVALID_FIELD | NVME_DNR;
7329 }
7330
7331 default:
7332 return NVME_INVALID_FIELD;
7333 }
7334 }
7335
7336 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7337 {
7338 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7339 nvme_adm_opc_str(req->cmd.opcode));
7340
7341 if (!(n->cse.acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7342 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7343 return NVME_INVALID_OPCODE | NVME_DNR;
7344 }
7345
7346 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7347 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7348 return NVME_INVALID_FIELD | NVME_DNR;
7349 }
7350
7351 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7352 return NVME_INVALID_FIELD;
7353 }
7354
7355 switch (req->cmd.opcode) {
7356 case NVME_ADM_CMD_DELETE_SQ:
7357 return nvme_del_sq(n, req);
7358 case NVME_ADM_CMD_CREATE_SQ:
7359 return nvme_create_sq(n, req);
7360 case NVME_ADM_CMD_GET_LOG_PAGE:
7361 return nvme_get_log(n, req);
7362 case NVME_ADM_CMD_DELETE_CQ:
7363 return nvme_del_cq(n, req);
7364 case NVME_ADM_CMD_CREATE_CQ:
7365 return nvme_create_cq(n, req);
7366 case NVME_ADM_CMD_IDENTIFY:
7367 return nvme_identify(n, req);
7368 case NVME_ADM_CMD_ABORT:
7369 return nvme_abort(n, req);
7370 case NVME_ADM_CMD_SET_FEATURES:
7371 return nvme_set_feature(n, req);
7372 case NVME_ADM_CMD_GET_FEATURES:
7373 return nvme_get_feature(n, req);
7374 case NVME_ADM_CMD_ASYNC_EV_REQ:
7375 return nvme_aer(n, req);
7376 case NVME_ADM_CMD_NS_ATTACHMENT:
7377 return nvme_ns_attachment(n, req);
7378 case NVME_ADM_CMD_VIRT_MNGMT:
7379 return nvme_virt_mngmt(n, req);
7380 case NVME_ADM_CMD_DBBUF_CONFIG:
7381 return nvme_dbbuf_config(n, req);
7382 case NVME_ADM_CMD_FORMAT_NVM:
7383 return nvme_format(n, req);
7384 case NVME_ADM_CMD_DIRECTIVE_SEND:
7385 return nvme_directive_send(n, req);
7386 case NVME_ADM_CMD_DIRECTIVE_RECV:
7387 return nvme_directive_receive(n, req);
7388 default:
7389 g_assert_not_reached();
7390 }
7391
7392 return NVME_INVALID_OPCODE | NVME_DNR;
7393 }
7394
7395 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7396 {
7397 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7398
7399 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7400 MEMTXATTRS_UNSPECIFIED);
7401 }
7402
7403 static void nvme_update_sq_tail(NvmeSQueue *sq)
7404 {
7405 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7406 MEMTXATTRS_UNSPECIFIED);
7407
7408 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7409 }
7410
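/*
 * Return values of nvme_atomic_write_check(): NO_START means the command
 * overlaps an in-flight request it must not race with and queue
 * processing should be retried later; START_ATOMIC and START_NONATOMIC
 * indicate that the command may proceed and whether it is tracked as an
 * atomic write.
 */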
7411 #define NVME_ATOMIC_NO_START 0
7412 #define NVME_ATOMIC_START_ATOMIC 1
7413 #define NVME_ATOMIC_START_NONATOMIC 2
7414
7415 static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd,
7416 NvmeAtomic *atomic)
7417 {
7418 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
7419 uint64_t slba = le64_to_cpu(rw->slba);
7420 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
7421 uint64_t elba = slba + nlb;
7422 bool cmd_atomic_wr = true;
7423 int i;
7424
7425 if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
7426 ((rw->nlb + 1) > atomic->atomic_max_write_size))) {
7427 cmd_atomic_wr = false;
7428 }
7429
7430 /*
7431 * Walk the queues to see if there are any atomic conflicts.
7432 */
7433 for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
7434 NvmeSQueue *sq;
7435 NvmeRequest *req;
7436 NvmeRwCmd *req_rw;
7437 uint64_t req_slba;
7438 uint32_t req_nlb;
7439 uint64_t req_elba;
7440
7441 sq = n->sq[i];
7442 if (!sq) {
7443 continue;
7444 }
7445
7446 /*
7447 * Walk all the requests on a given queue.
7448 */
7449 QTAILQ_FOREACH(req, &sq->out_req_list, entry) {
7450 req_rw = (NvmeRwCmd *)&req->cmd;
7451
7452 if (((req_rw->opcode == NVME_CMD_WRITE) ||
7453 (req_rw->opcode == NVME_CMD_READ)) &&
7454 (cmd->nsid == req->ns->params.nsid)) {
7455 req_slba = le64_to_cpu(req_rw->slba);
7456 req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
7457 req_elba = req_slba + req_nlb;
7458
7459 if (cmd_atomic_wr) {
7460 if ((elba >= req_slba) && (slba <= req_elba)) {
7461 return NVME_ATOMIC_NO_START;
7462 }
7463 } else {
7464 if (req->atomic_write && ((elba >= req_slba) &&
7465 (slba <= req_elba))) {
7466 return NVME_ATOMIC_NO_START;
7467 }
7468 }
7469 }
7470 }
7471 }
7472 if (cmd_atomic_wr) {
7473 return NVME_ATOMIC_START_ATOMIC;
7474 }
7475 return NVME_ATOMIC_START_NONATOMIC;
7476 }
7477
7478 static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd)
7479 {
7480 if (n->atomic.atomic_writes) {
7481 return &n->atomic;
7482 }
7483 return NULL;
7484 }
7485
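/*
 * Submission queue processing bottom half: fetch commands from the queue
 * (honoring the shadow doorbell tail if enabled), check for atomic write
 * conflicts and dispatch to the admin or I/O command handler. Commands
 * returning NVME_NO_COMPLETE are completed later from their AIO
 * callbacks.
 */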
7486 static void nvme_process_sq(void *opaque)
7487 {
7488 NvmeSQueue *sq = opaque;
7489 NvmeCtrl *n = sq->ctrl;
7490 NvmeCQueue *cq = n->cq[sq->cqid];
7491
7492 uint16_t status;
7493 hwaddr addr;
7494 NvmeCmd cmd;
7495 NvmeRequest *req;
7496
7497 if (n->dbbuf_enabled) {
7498 nvme_update_sq_tail(sq);
7499 }
7500
7501 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7502 NvmeAtomic *atomic;
7503 bool cmd_is_atomic;
7504
7505 addr = sq->dma_addr + (sq->head << NVME_SQES);
7506 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7507 trace_pci_nvme_err_addr_read(addr);
7508 trace_pci_nvme_err_cfs();
7509 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7510 break;
7511 }
7512
7513 atomic = nvme_get_atomic(n, &cmd);
7514
7515 cmd_is_atomic = false;
7516 if (sq->sqid && atomic) {
7517 int ret;
7518
7519 ret = nvme_atomic_write_check(n, &cmd, atomic);
7520 switch (ret) {
7521 case NVME_ATOMIC_NO_START:
7522 qemu_bh_schedule(sq->bh);
7523 return;
7524 case NVME_ATOMIC_START_ATOMIC:
7525 cmd_is_atomic = true;
7526 break;
7527 case NVME_ATOMIC_START_NONATOMIC:
7528 default:
7529 break;
7530 }
7531 }
7532 nvme_inc_sq_head(sq);
7533
7534 req = QTAILQ_FIRST(&sq->req_list);
7535 QTAILQ_REMOVE(&sq->req_list, req, entry);
7536 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7537 nvme_req_clear(req);
7538 req->cqe.cid = cmd.cid;
7539 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7540
7541 if (sq->sqid && atomic) {
7542 req->atomic_write = cmd_is_atomic;
7543 }
7544
7545 status = sq->sqid ? nvme_io_cmd(n, req) :
7546 nvme_admin_cmd(n, req);
7547 if (status != NVME_NO_COMPLETE) {
7548 req->status = status;
7549 nvme_enqueue_req_completion(cq, req);
7550 }
7551
7552 if (n->dbbuf_enabled) {
7553 nvme_update_sq_eventidx(sq);
7554 nvme_update_sq_tail(sq);
7555 }
7556 }
7557 }
7558
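/*
 * Update the Table Size field of the MSI-X capability to expose
 * 'table_size' vectors (the field is encoded as N - 1).
 */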
7559 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7560 {
7561 uint8_t *config;
7562
7563 if (!msix_present(pci_dev)) {
7564 return;
7565 }
7566
7567 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7568
7569 config = pci_dev->config + pci_dev->msix_cap;
7570 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7571 table_size - 1);
7572 }
7573
7574 static void nvme_activate_virt_res(NvmeCtrl *n)
7575 {
7576 PCIDevice *pci_dev = PCI_DEVICE(n);
7577 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7578 NvmeSecCtrlEntry *sctrl;
7579
7580 /* -1 to account for the admin queue */
7581 if (pci_is_vf(pci_dev)) {
7582 sctrl = nvme_sctrl(n);
7583 cap->vqprt = sctrl->nvq;
7584 cap->viprt = sctrl->nvi;
7585 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7586 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7587 } else {
7588 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7589 cap->virfap = n->next_pri_ctrl_cap.virfap;
7590 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7591 le16_to_cpu(cap->vqrfap) - 1;
7592 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7593 le16_to_cpu(cap->virfap);
7594 }
7595 }
7596
7597 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7598 {
7599 PCIDevice *pci_dev = PCI_DEVICE(n);
7600 NvmeSecCtrlEntry *sctrl;
7601 NvmeNamespace *ns;
7602 int i;
7603
7604 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7605 ns = nvme_ns(n, i);
7606 if (!ns) {
7607 continue;
7608 }
7609
7610 nvme_ns_drain(ns);
7611 }
7612
7613 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7614 if (n->sq[i] != NULL) {
7615 nvme_free_sq(n->sq[i], n);
7616 }
7617 }
7618 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7619 if (n->cq[i] != NULL) {
7620 nvme_free_cq(n->cq[i], n);
7621 }
7622 }
7623
7624 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7625 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7626 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7627 g_free(event);
7628 }
7629
7630 if (n->params.sriov_max_vfs) {
7631 if (!pci_is_vf(pci_dev)) {
7632 for (i = 0; i < n->nr_sec_ctrls; i++) {
7633 sctrl = &n->sec_ctrl_list[i];
7634 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7635 }
7636 }
7637
7638 if (rst != NVME_RESET_CONTROLLER) {
7639 nvme_activate_virt_res(n);
7640 }
7641 }
7642
7643 n->aer_queued = 0;
7644 n->aer_mask = 0;
7645 n->outstanding_aers = 0;
7646 n->qs_created = false;
7647
7648 n->dn = n->params.atomic_dn; /* Set Disable Normal */
7649
7650 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7651
7652 if (pci_is_vf(pci_dev)) {
7653 sctrl = nvme_sctrl(n);
7654
7655 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7656 } else {
7657 stl_le_p(&n->bar.csts, 0);
7658 }
7659
7660 stl_le_p(&n->bar.intms, 0);
7661 stl_le_p(&n->bar.intmc, 0);
7662 stl_le_p(&n->bar.cc, 0);
7663
7664 n->dbbuf_dbs = 0;
7665 n->dbbuf_eis = 0;
7666 n->dbbuf_enabled = false;
7667 }
7668
7669 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7670 {
7671 NvmeNamespace *ns;
7672 int i;
7673
7674 if (n->pmr.dev) {
7675 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7676 }
7677
7678 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7679 ns = nvme_ns(n, i);
7680 if (!ns) {
7681 continue;
7682 }
7683
7684 nvme_ns_shutdown(ns);
7685 }
7686 }
7687
7688 static int nvme_start_ctrl(NvmeCtrl *n)
7689 {
7690 uint64_t cap = ldq_le_p(&n->bar.cap);
7691 uint32_t cc = ldl_le_p(&n->bar.cc);
7692 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7693 uint64_t asq = ldq_le_p(&n->bar.asq);
7694 uint64_t acq = ldq_le_p(&n->bar.acq);
7695 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7696 uint32_t page_size = 1 << page_bits;
7697 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7698
7699 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7700 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7701 le16_to_cpu(sctrl->nvq));
7702 return -1;
7703 }
7704 if (unlikely(n->cq[0])) {
7705 trace_pci_nvme_err_startfail_cq();
7706 return -1;
7707 }
7708 if (unlikely(n->sq[0])) {
7709 trace_pci_nvme_err_startfail_sq();
7710 return -1;
7711 }
7712 if (unlikely(asq & (page_size - 1))) {
7713 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7714 return -1;
7715 }
7716 if (unlikely(acq & (page_size - 1))) {
7717 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7718 return -1;
7719 }
7720 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7721 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7722 return -1;
7723 }
7724 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7725 trace_pci_nvme_err_startfail_page_too_small(
7726 NVME_CC_MPS(cc),
7727 NVME_CAP_MPSMIN(cap));
7728 return -1;
7729 }
7730 if (unlikely(NVME_CC_MPS(cc) >
7731 NVME_CAP_MPSMAX(cap))) {
7732 trace_pci_nvme_err_startfail_page_too_large(
7733 NVME_CC_MPS(cc),
7734 NVME_CAP_MPSMAX(cap));
7735 return -1;
7736 }
7737 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7738 trace_pci_nvme_err_startfail_asqent_sz_zero();
7739 return -1;
7740 }
7741 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7742 trace_pci_nvme_err_startfail_acqent_sz_zero();
7743 return -1;
7744 }
7745
7746 n->page_bits = page_bits;
7747 n->page_size = page_size;
7748 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7749 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7750 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7751
7752 nvme_set_timestamp(n, 0ULL);
7753
7754 /* verify that the command sets of attached namespaces are supported */
7755 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7756 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
7757
7758 if (!ns || (!ns->params.shared && ns->ctrl != n)) {
7759 continue;
7760 }
7761
7762 if (nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
7763 if (!ns->attached || ns->params.shared) {
7764 nvme_attach_ns(n, ns);
7765 }
7766 }
7767 }
7768
7769 nvme_update_dsm_limits(n, NULL);
7770
7771 return 0;
7772 }
7773
7774 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7775 {
7776 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7777 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7778
7779 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7780 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7781 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7782 stl_le_p(&n->bar.cmbloc, cmbloc);
7783
7784 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7785 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7786 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7787 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7788 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7789 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7790 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7791 stl_le_p(&n->bar.cmbsz, cmbsz);
7792 }
7793
7794 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7795 unsigned size)
7796 {
7797 PCIDevice *pci = PCI_DEVICE(n);
7798 uint64_t cap = ldq_le_p(&n->bar.cap);
7799 uint32_t cc = ldl_le_p(&n->bar.cc);
7800 uint32_t intms = ldl_le_p(&n->bar.intms);
7801 uint32_t csts = ldl_le_p(&n->bar.csts);
7802 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7803
7804 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7805 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7806 "MMIO write not 32-bit aligned,"
7807 " offset=0x%"PRIx64"", offset);
7808 /* should be ignored, fall through for now */
7809 }
7810
7811 if (unlikely(size < sizeof(uint32_t))) {
7812 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7813 "MMIO write smaller than 32-bits,"
7814 " offset=0x%"PRIx64", size=%u",
7815 offset, size);
7816 /* should be ignored, fall through for now */
7817 }
7818
7819 switch (offset) {
7820 case NVME_REG_INTMS:
7821 if (unlikely(msix_enabled(pci))) {
7822 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7823 "undefined access to interrupt mask set"
7824 " when MSI-X is enabled");
7825 /* should be ignored, fall through for now */
7826 }
7827 intms |= data;
7828 stl_le_p(&n->bar.intms, intms);
7829 n->bar.intmc = n->bar.intms;
7830 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7831 nvme_irq_check(n);
7832 break;
7833 case NVME_REG_INTMC:
7834 if (unlikely(msix_enabled(pci))) {
7835 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7836 "undefined access to interrupt mask clr"
7837 " when MSI-X is enabled");
7838 /* should be ignored, fall through for now */
7839 }
7840 intms &= ~data;
7841 stl_le_p(&n->bar.intms, intms);
7842 n->bar.intmc = n->bar.intms;
7843 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7844 nvme_irq_check(n);
7845 break;
7846 case NVME_REG_CC:
7847 stl_le_p(&n->bar.cc, data);
7848
7849 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7850
7851 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7852 trace_pci_nvme_mmio_shutdown_set();
7853 nvme_ctrl_shutdown(n);
7854 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7855 csts |= NVME_CSTS_SHST_COMPLETE;
7856 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7857 trace_pci_nvme_mmio_shutdown_cleared();
7858 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7859 }
7860
7861 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7862 if (unlikely(nvme_start_ctrl(n))) {
7863 trace_pci_nvme_err_startfail();
7864 csts = NVME_CSTS_FAILED;
7865 } else {
7866 trace_pci_nvme_mmio_start_success();
7867 csts = NVME_CSTS_READY;
7868 }
7869 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7870 trace_pci_nvme_mmio_stopped();
7871 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7872
7873 break;
7874 }
7875
7876 stl_le_p(&n->bar.csts, csts);
7877
7878 break;
7879 case NVME_REG_CSTS:
7880 if (data & (1 << 4)) {
7881 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7882 "attempted to W1C CSTS.NSSRO"
7883 " but CAP.NSSRS is zero (not supported)");
7884 } else if (data != 0) {
7885 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7886 "attempted to set a read only bit"
7887 " of controller status");
7888 }
7889 break;
7890 case NVME_REG_NSSR:
7891 if (data == 0x4e564d65) {
7892 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7893 } else {
7894 /* The spec says that writes of other values have no effect */
7895 return;
7896 }
7897 break;
7898 case NVME_REG_AQA:
7899 stl_le_p(&n->bar.aqa, data);
7900 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7901 break;
7902 case NVME_REG_ASQ:
7903 stn_le_p(&n->bar.asq, size, data);
7904 trace_pci_nvme_mmio_asqaddr(data);
7905 break;
7906 case NVME_REG_ASQ + 4:
7907 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7908 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7909 break;
7910 case NVME_REG_ACQ:
7911 trace_pci_nvme_mmio_acqaddr(data);
7912 stn_le_p(&n->bar.acq, size, data);
7913 break;
7914 case NVME_REG_ACQ + 4:
7915 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7916 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7917 break;
7918 case NVME_REG_CMBLOC:
7919 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7920 "invalid write to reserved CMBLOC"
7921 " when CMBSZ is zero, ignored");
7922 return;
7923 case NVME_REG_CMBSZ:
7924 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7925 "invalid write to read only CMBSZ, ignored");
7926 return;
7927 case NVME_REG_CMBMSC:
7928 if (!NVME_CAP_CMBS(cap)) {
7929 return;
7930 }
7931
7932 stn_le_p(&n->bar.cmbmsc, size, data);
7933 n->cmb.cmse = false;
7934
7935 if (NVME_CMBMSC_CRE(data)) {
7936 nvme_cmb_enable_regs(n);
7937
7938 if (NVME_CMBMSC_CMSE(data)) {
7939 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7940 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7941 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7942 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7943 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7944 stl_le_p(&n->bar.cmbsts, cmbsts);
7945 return;
7946 }
7947
7948 n->cmb.cba = cba;
7949 n->cmb.cmse = true;
7950 }
7951 } else {
7952 n->bar.cmbsz = 0;
7953 n->bar.cmbloc = 0;
7954 }
7955
7956 return;
7957 case NVME_REG_CMBMSC + 4:
7958 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7959 return;
7960
7961 case NVME_REG_PMRCAP:
7962 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7963 "invalid write to PMRCAP register, ignored");
7964 return;
7965 case NVME_REG_PMRCTL:
7966 if (!NVME_CAP_PMRS(cap)) {
7967 return;
7968 }
7969
7970 stl_le_p(&n->bar.pmrctl, data);
7971 if (NVME_PMRCTL_EN(data)) {
7972 memory_region_set_enabled(&n->pmr.dev->mr, true);
7973 pmrsts = 0;
7974 } else {
7975 memory_region_set_enabled(&n->pmr.dev->mr, false);
7976 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7977 n->pmr.cmse = false;
7978 }
7979 stl_le_p(&n->bar.pmrsts, pmrsts);
7980 return;
7981 case NVME_REG_PMRSTS:
7982 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7983 "invalid write to PMRSTS register, ignored");
7984 return;
7985 case NVME_REG_PMREBS:
7986 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7987 "invalid write to PMREBS register, ignored");
7988 return;
7989 case NVME_REG_PMRSWTP:
7990 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7991 "invalid write to PMRSWTP register, ignored");
7992 return;
7993 case NVME_REG_PMRMSCL:
7994 if (!NVME_CAP_PMRS(cap)) {
7995 return;
7996 }
7997
7998 stl_le_p(&n->bar.pmrmscl, data);
7999 n->pmr.cmse = false;
8000
8001 if (NVME_PMRMSCL_CMSE(data)) {
8002 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
8003 hwaddr cba = pmrmscu << 32 |
8004 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
8005 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
8006 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
8007 stl_le_p(&n->bar.pmrsts, pmrsts);
8008 return;
8009 }
8010
8011 n->pmr.cmse = true;
8012 n->pmr.cba = cba;
8013 }
8014
8015 return;
8016 case NVME_REG_PMRMSCU:
8017 if (!NVME_CAP_PMRS(cap)) {
8018 return;
8019 }
8020
8021 stl_le_p(&n->bar.pmrmscu, data);
8022 return;
8023 default:
8024 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
8025 "invalid MMIO write,"
8026 " offset=0x%"PRIx64", data=%"PRIx64"",
8027 offset, data);
8028 break;
8029 }
8030 }
8031
8032 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
8033 {
8034 NvmeCtrl *n = (NvmeCtrl *)opaque;
8035 uint8_t *ptr = (uint8_t *)&n->bar;
8036
8037 trace_pci_nvme_mmio_read(addr, size);
8038
8039 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
8040 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
8041 "MMIO read not 32-bit aligned,"
8042 " offset=0x%"PRIx64"", addr);
8043 /* should RAZ, fall through for now */
8044 } else if (unlikely(size < sizeof(uint32_t))) {
8045 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
8046 "MMIO read smaller than 32-bits,"
8047 " offset=0x%"PRIx64"", addr);
8048 /* should RAZ, fall through for now */
8049 }
8050
8051 if (addr > sizeof(n->bar) - size) {
8052 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
8053 "MMIO read beyond last register,"
8054 " offset=0x%"PRIx64", returning 0", addr);
8055
8056 return 0;
8057 }
8058
8059 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8060 addr != NVME_REG_CSTS) {
8061 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8062 return 0;
8063 }
8064
8065 /*
8066      * When PMRWBM bit 1 is set, a read from PMRSTS should ensure that
8067      * prior writes have made it to persistent media before the read
8068      * completes
8069 */
8070 if (addr == NVME_REG_PMRSTS &&
8071 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
8072 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
8073 }
8074
8075 return ldn_le_p(ptr + addr, size);
8076 }
8077
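/*
 * Doorbell decode, as implemented below: the doorbell area starts at
 * offset 0x1000 and uses a 4-byte stride (i.e. CAP.DSTRD = 0), so
 * submission queue y has its tail doorbell at 0x1000 + 8 * y and
 * completion queue y has its head doorbell at 0x1004 + 8 * y. For
 * example, the admin SQ tail doorbell is at 0x1000 and the head
 * doorbell of I/O CQ 1 is at 0x100c.
 */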
8078 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
8079 {
8080 PCIDevice *pci = PCI_DEVICE(n);
8081 uint32_t qid;
8082
8083 if (unlikely(addr & ((1 << 2) - 1))) {
8084 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
8085 "doorbell write not 32-bit aligned,"
8086 " offset=0x%"PRIx64", ignoring", addr);
8087 return;
8088 }
8089
8090 if (((addr - 0x1000) >> 2) & 1) {
8091 /* Completion queue doorbell write */
8092
8093 uint16_t new_head = val & 0xffff;
8094 NvmeCQueue *cq;
8095
8096 qid = (addr - (0x1000 + (1 << 2))) >> 3;
8097 if (unlikely(nvme_check_cqid(n, qid))) {
8098 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
8099 "completion queue doorbell write"
8100 " for nonexistent queue,"
8101 " cqid=%"PRIu32", ignoring", qid);
8102
8103 /*
8104 * NVM Express v1.3d, Section 4.1 states: "If host software writes
8105 * an invalid value to the Submission Queue Tail Doorbell or
8106 * Completion Queue Head Doorbell register and an Asynchronous Event
8107 * Request command is outstanding, then an asynchronous event is
8108 * posted to the Admin Completion Queue with a status code of
8109 * Invalid Doorbell Write Value."
8110 *
8111 * Also note that the spec includes the "Invalid Doorbell Register"
8112 * status code, but nowhere does it specify when to use it.
8113 * However, it seems reasonable to use it here in a similar
8114 * fashion.
8115 */
8116 if (n->outstanding_aers) {
8117 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8118 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8119 NVME_LOG_ERROR_INFO);
8120 }
8121
8122 return;
8123 }
8124
8125 cq = n->cq[qid];
8126 if (unlikely(new_head >= cq->size)) {
8127 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
8128 "completion queue doorbell write value"
8129 " beyond queue size, cqid=%"PRIu32","
8130 " new_head=%"PRIu16", ignoring",
8131 qid, new_head);
8132
8133 if (n->outstanding_aers) {
8134 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8135 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8136 NVME_LOG_ERROR_INFO);
8137 }
8138
8139 return;
8140 }
8141
8142 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
8143
8144 /* schedule deferred cqe posting if the queue was previously full */
8145 if (nvme_cq_full(cq)) {
8146 qemu_bh_schedule(cq->bh);
8147 }
8148
8149 cq->head = new_head;
8150 if (!qid && n->dbbuf_enabled) {
8151 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
8152 }
8153
8154 if (cq->tail == cq->head) {
8155 if (cq->irq_enabled) {
8156 n->cq_pending--;
8157 }
8158
8159 nvme_irq_deassert(n, cq);
8160 }
8161 } else {
8162 /* Submission queue doorbell write */
8163
8164 uint16_t new_tail = val & 0xffff;
8165 NvmeSQueue *sq;
8166
8167 qid = (addr - 0x1000) >> 3;
8168 if (unlikely(nvme_check_sqid(n, qid))) {
8169 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
8170 "submission queue doorbell write"
8171 " for nonexistent queue,"
8172 " sqid=%"PRIu32", ignoring", qid);
8173
8174 if (n->outstanding_aers) {
8175 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8176 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8177 NVME_LOG_ERROR_INFO);
8178 }
8179
8180 return;
8181 }
8182
8183 sq = n->sq[qid];
8184 if (unlikely(new_tail >= sq->size)) {
8185 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
8186 "submission queue doorbell write value"
8187 " beyond queue size, sqid=%"PRIu32","
8188 " new_tail=%"PRIu16", ignoring",
8189 qid, new_tail);
8190
8191 if (n->outstanding_aers) {
8192 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8193 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8194 NVME_LOG_ERROR_INFO);
8195 }
8196
8197 return;
8198 }
8199
8200 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
8201
8202 sq->tail = new_tail;
8203 if (!qid && n->dbbuf_enabled) {
8204 /*
8205 * The spec states "the host shall also update the controller's
8206 * corresponding doorbell property to match the value of that entry
8207 * in the Shadow Doorbell buffer."
8208 *
8209 * Since this context is currently a VM trap, we can safely enforce
8210 * the requirement from the device side in case the host is
8211 * misbehaving.
8212 *
8213 * Note, we shouldn't have to do this, but various drivers,
8214 * including ones that run on Linux, do not update the admin queue
8215 * shadow entries, so we can't trust them for an appropriate sq tail.
8216 */
8217 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
8218 }
8219
8220 qemu_bh_schedule(sq->bh);
8221 }
8222 }
8223
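/*
 * MMIO write dispatch: offsets below sizeof(NvmeBar) are register
 * writes handled by nvme_write_bar(); anything at or above that falls
 * into the doorbell area and is decoded by nvme_process_db(). Note
 * that an offline VF (scs clear) ignores all MMIO writes except those
 * to CSTS.
 */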
8224 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
8225 unsigned size)
8226 {
8227 NvmeCtrl *n = (NvmeCtrl *)opaque;
8228
8229 trace_pci_nvme_mmio_write(addr, data, size);
8230
8231 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8232 addr != NVME_REG_CSTS) {
8233 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8234 return;
8235 }
8236
8237 if (addr < sizeof(n->bar)) {
8238 nvme_write_bar(n, addr, data, size);
8239 } else {
8240 nvme_process_db(n, addr, data);
8241 }
8242 }
8243
8244 static const MemoryRegionOps nvme_mmio_ops = {
8245 .read = nvme_mmio_read,
8246 .write = nvme_mmio_write,
8247 .endianness = DEVICE_LITTLE_ENDIAN,
8248 .impl = {
8249 .min_access_size = 2,
8250 .max_access_size = 8,
8251 },
8252 };
8253
8254 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
8255 unsigned size)
8256 {
8257 NvmeCtrl *n = (NvmeCtrl *)opaque;
8258 stn_le_p(&n->cmb.buf[addr], size, data);
8259 }
8260
8261 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
8262 {
8263 NvmeCtrl *n = (NvmeCtrl *)opaque;
8264 return ldn_le_p(&n->cmb.buf[addr], size);
8265 }
8266
8267 static const MemoryRegionOps nvme_cmb_ops = {
8268 .read = nvme_cmb_read,
8269 .write = nvme_cmb_write,
8270 .endianness = DEVICE_LITTLE_ENDIAN,
8271 .impl = {
8272 .min_access_size = 1,
8273 .max_access_size = 8,
8274 },
8275 };
8276
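/*
 * One example of a parameter set that satisfies the SR-IOV constraints
 * checked below (assuming a subsystem is attached and neither CMB nor
 * PMR is configured): sriov_max_vfs=2, sriov_vq_flexible=4
 * (>= 2 * sriov_max_vfs), sriov_vi_flexible=2 (>= sriov_max_vfs),
 * max_ioqpairs=6 (>= sriov_vq_flexible + 2) and msix_qsize=3
 * (>= sriov_vi_flexible + 1).
 */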
8277 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8278 {
8279 NvmeParams *params = &n->params;
8280
8281 if (params->num_queues) {
8282 warn_report("num_queues is deprecated; please use max_ioqpairs "
8283 "instead");
8284
8285 params->max_ioqpairs = params->num_queues - 1;
8286 }
8287
8288 if (n->namespace.blkconf.blk && n->subsys) {
8289 error_setg(errp, "subsystem support is unavailable with legacy "
8290 "namespace ('drive' property)");
8291 return false;
8292 }
8293
8294 if (params->max_ioqpairs < 1 ||
8295 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8296 error_setg(errp, "max_ioqpairs must be between 1 and %d",
8297 NVME_MAX_IOQPAIRS);
8298 return false;
8299 }
8300
8301 if (params->msix_qsize < 1 ||
8302 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8303 error_setg(errp, "msix_qsize must be between 1 and %d",
8304 PCI_MSIX_FLAGS_QSIZE + 1);
8305 return false;
8306 }
8307
8308 if (!params->serial) {
8309 error_setg(errp, "serial property not set");
8310 return false;
8311 }
8312
8313 if (params->mqes < 1) {
8314 error_setg(errp, "mqes property cannot be less than 1");
8315 return false;
8316 }
8317
8318 if (n->pmr.dev) {
8319 if (params->msix_exclusive_bar) {
8320 error_setg(errp, "not enough BARs available to enable PMR");
8321 return false;
8322 }
8323
8324 if (host_memory_backend_is_mapped(n->pmr.dev)) {
8325 error_setg(errp, "can't use already busy memdev: %s",
8326 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8327 return false;
8328 }
8329
8330 if (!is_power_of_2(n->pmr.dev->size)) {
8331 error_setg(errp, "pmr backend size needs to be a power of 2");
8332 return false;
8333 }
8334
8335 host_memory_backend_set_mapped(n->pmr.dev, true);
8336 }
8337
8338 if (n->params.zasl > n->params.mdts) {
8339 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8340 "than or equal to mdts (Maximum Data Transfer Size)");
8341 return false;
8342 }
8343
8344 if (!n->params.vsl) {
8345 error_setg(errp, "vsl must be non-zero");
8346 return false;
8347 }
8348
8349 if (params->sriov_max_vfs) {
8350 if (!n->subsys) {
8351 error_setg(errp, "subsystem is required for the use of SR-IOV");
8352 return false;
8353 }
8354
8355 if (params->cmb_size_mb) {
8356 error_setg(errp, "CMB is not supported with SR-IOV");
8357 return false;
8358 }
8359
8360 if (n->pmr.dev) {
8361 error_setg(errp, "PMR is not supported with SR-IOV");
8362 return false;
8363 }
8364
8365 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8366 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8367 " must be set for the use of SR-IOV");
8368 return false;
8369 }
8370
8371 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8372 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8373 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8374 return false;
8375 }
8376
8377 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8378 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8379 " greater than or equal to 2");
8380 return false;
8381 }
8382
8383 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8384 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8385 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8386 return false;
8387 }
8388
8389 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8390 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8391 " greater than or equal to 1");
8392 return false;
8393 }
8394
8395 if (params->sriov_max_vi_per_vf &&
8396 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8397 error_setg(errp, "sriov_max_vi_per_vf must meet:"
8398 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8399 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8400 return false;
8401 }
8402
8403 if (params->sriov_max_vq_per_vf &&
8404 (params->sriov_max_vq_per_vf < 2 ||
8405 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8406 error_setg(errp, "sriov_max_vq_per_vf must meet:"
8407 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8408 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8409 return false;
8410 }
8411 }
8412
8413 return true;
8414 }
8415
8416 static void nvme_init_state(NvmeCtrl *n)
8417 {
8418 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8419 NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8420 NvmeSecCtrlEntry *sctrl;
8421 PCIDevice *pci = PCI_DEVICE(n);
8422 NvmeAtomic *atomic = &n->atomic;
8423 NvmeIdCtrl *id = &n->id_ctrl;
8424 uint8_t max_vfs;
8425 int i;
8426
8427 if (pci_is_vf(pci)) {
8428 sctrl = nvme_sctrl(n);
8429 max_vfs = 0;
8430 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8431 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8432 } else {
8433 max_vfs = n->params.sriov_max_vfs;
8434 n->conf_ioqpairs = n->params.max_ioqpairs;
8435 n->conf_msix_qsize = n->params.msix_qsize;
8436 }
8437
8438 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8439 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8440 n->temperature = NVME_TEMPERATURE;
8441 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8442 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8443 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8444 QTAILQ_INIT(&n->aer_queue);
8445
8446 n->nr_sec_ctrls = max_vfs;
8447 for (i = 0; i < max_vfs; i++) {
8448 sctrl = &list[i];
8449 sctrl->pcid = cpu_to_le16(n->cntlid);
8450 sctrl->vfn = cpu_to_le16(i + 1);
8451 }
8452
8453 cap->cntlid = cpu_to_le16(n->cntlid);
8454 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8455
8456 if (pci_is_vf(pci)) {
8457 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8458 } else {
8459 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8460 n->params.sriov_vq_flexible);
8461 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8462 cap->vqrfap = cap->vqfrt;
8463 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8464 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8465 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8466 cap->vqfrt / MAX(max_vfs, 1);
8467 }
8468
8469 if (pci_is_vf(pci)) {
8470 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8471 } else {
8472 cap->viprt = cpu_to_le16(n->params.msix_qsize -
8473 n->params.sriov_vi_flexible);
8474 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8475 cap->virfap = cap->vifrt;
8476 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8477 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8478 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8479 cap->vifrt / MAX(max_vfs, 1);
8480 }
8481
8482 /* Atomic Write */
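/*
 * AWUN and AWUPF are 0's based values in units of logical blocks. For
 * example, atomic.awun=63 with atomic.dn=off yields an effective
 * atomic_max_write_size of awun + 1 = 64 blocks, while atomic.dn=on
 * switches the limit to the stricter awupf + 1.
 */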
8483 id->awun = cpu_to_le16(n->params.atomic_awun);
8484 id->awupf = cpu_to_le16(n->params.atomic_awupf);
8485 n->dn = n->params.atomic_dn;
8486
8487 if (id->awun || id->awupf) {
8488 if (id->awupf > id->awun) {
8489 id->awupf = 0;
8490 }
8491
8492 if (n->dn) {
8493 atomic->atomic_max_write_size = id->awupf + 1;
8494 } else {
8495 atomic->atomic_max_write_size = id->awun + 1;
8496 }
8497
8498 if (atomic->atomic_max_write_size == 1) {
8499 atomic->atomic_writes = 0;
8500 } else {
8501 atomic->atomic_writes = 1;
8502 }
8503 }
8504 }
8505
8506 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8507 {
8508 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8509 uint64_t cap = ldq_le_p(&n->bar.cap);
8510
8511 n->cmb.buf = g_malloc0(cmb_size);
8512 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8513 "nvme-cmb", cmb_size);
8514 pci_register_bar(pci_dev, NVME_CMB_BIR,
8515 PCI_BASE_ADDRESS_SPACE_MEMORY |
8516 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8517 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8518
8519 NVME_CAP_SET_CMBS(cap, 1);
8520 stq_le_p(&n->bar.cap, cap);
8521
8522 if (n->params.legacy_cmb) {
8523 nvme_cmb_enable_regs(n);
8524 n->cmb.cmse = true;
8525 }
8526 }
8527
8528 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8529 {
8530 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8531
8532 NVME_PMRCAP_SET_RDS(pmrcap, 1);
8533 NVME_PMRCAP_SET_WDS(pmrcap, 1);
8534 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8535 /* Turn on PMRWBM bit 1 support (a read of PMRSTS acts as a write barrier) */
8536 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8537 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8538 stl_le_p(&n->bar.pmrcap, pmrcap);
8539
8540 pci_register_bar(pci_dev, NVME_PMR_BIR,
8541 PCI_BASE_ADDRESS_SPACE_MEMORY |
8542 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8543 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8544
8545 memory_region_set_enabled(&n->pmr.dev->mr, false);
8546 }
8547
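/*
 * Rough worked example of the BAR sizing below, assuming sizeof(NvmeBar)
 * is 4 KiB and NVME_DB_SIZE is 4 bytes: with 65 queue pairs (64 I/O
 * pairs plus the admin pair) and 65 interrupt vectors, the doorbells
 * need 2 * 65 * 4 = 520 bytes after the registers, the MSI-X table
 * (16 bytes per vector) starts at the next 4 KiB boundary (8 KiB), the
 * PBA at 12 KiB, and pow2ceil() rounds the total up to a 16 KiB BAR.
 */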
8548 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8549 unsigned *msix_table_offset,
8550 unsigned *msix_pba_offset)
8551 {
8552 uint64_t bar_size, msix_table_size;
8553
8554 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8555
8556 if (total_irqs == 0) {
8557 goto out;
8558 }
8559
8560 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8561
8562 if (msix_table_offset) {
8563 *msix_table_offset = bar_size;
8564 }
8565
8566 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8567 bar_size += msix_table_size;
8568 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8569
8570 if (msix_pba_offset) {
8571 *msix_pba_offset = bar_size;
8572 }
8573
8574 bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8575
8576 out:
8577 return pow2ceil(bar_size);
8578 }
8579
8580 static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
8581 Error **errp)
8582 {
8583 uint16_t vf_dev_id = n->params.use_intel_id ?
8584 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8585 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8586 uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8587 le16_to_cpu(cap->vifrsm),
8588 NULL, NULL);
8589
8590 if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8591 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8592 NVME_VF_OFFSET, NVME_VF_STRIDE, errp)) {
8593 return false;
8594 }
8595
8596 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8597 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8598
8599 return true;
8600 }
8601
8602 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8603 {
8604 Error *err = NULL;
8605 int ret;
8606
8607 ret = pci_pm_init(pci_dev, offset, &err);
8608 if (err) {
8609 error_report_err(err);
8610 return ret;
8611 }
8612
8613 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8614 PCI_PM_CAP_VER_1_2);
8615 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8616 PCI_PM_CTRL_NO_SOFT_RESET);
8617 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8618 PCI_PM_CTRL_STATE_MASK);
8619
8620 return 0;
8621 }
8622
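/*
 * DOE object lengths are carried in dwords, hence the conversions in
 * the SPDM responder below: the request length is scaled to bytes
 * (* 4) before being handed to the SPDM socket, and the number of
 * bytes received is converted back to dwords for the read mailbox
 * length.
 */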
8623 static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8624 {
8625 void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8626 uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8627 void *rsp = doe_cap->read_mbox;
8628 uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8629
8630 uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8631 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8632 req, req_len, rsp, rsp_len);
8633 doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8634
8635 return recvd != 0;
8636 }
8637
8638 static DOEProtocol doe_spdm_prot[] = {
8639 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8640 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8641 { }
8642 };
8643
8644 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8645 {
8646 ERRP_GUARD();
8647 uint8_t *pci_conf = pci_dev->config;
8648 uint64_t bar_size;
8649 unsigned msix_table_offset = 0, msix_pba_offset = 0;
8650 unsigned nr_vectors;
8651 int ret;
8652
8653 pci_conf[PCI_INTERRUPT_PIN] = pci_is_vf(pci_dev) ? 0 : 1;
8654 pci_config_set_prog_interface(pci_conf, 0x2);
8655
8656 if (n->params.use_intel_id) {
8657 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8658 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8659 } else {
8660 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8661 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8662 }
8663
8664 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8665 nvme_add_pm_capability(pci_dev, 0x60);
8666 pcie_endpoint_cap_init(pci_dev, 0x80);
8667 pcie_cap_flr_init(pci_dev);
8668 if (n->params.sriov_max_vfs) {
8669 pcie_ari_init(pci_dev, 0x100);
8670 }
8671
8672 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8673 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8674 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8675 bar_size);
8676 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8677 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8678 ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8679 } else {
8680 assert(n->params.msix_qsize >= 1);
8681
8682 /* add one to max_ioqpairs to account for the admin queue pair */
8683 if (!pci_is_vf(pci_dev)) {
8684 nr_vectors = n->params.msix_qsize;
8685 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8686 nr_vectors, &msix_table_offset,
8687 &msix_pba_offset);
8688 } else {
8689 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8690 NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8691
8692 nr_vectors = le16_to_cpu(cap->vifrsm);
8693 bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8694 &msix_table_offset, &msix_pba_offset);
8695 }
8696
8697 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8698 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8699 msix_table_offset);
8700 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8701
8702 if (pci_is_vf(pci_dev)) {
8703 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8704 } else {
8705 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8706 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8707 }
8708
8709 ret = msix_init(pci_dev, nr_vectors,
8710 &n->bar0, 0, msix_table_offset,
8711 &n->bar0, 0, msix_pba_offset, 0, errp);
8712 }
8713
8714 if (ret == -ENOTSUP) {
8715 /* report that msix is not supported, but do not error out */
8716 warn_report_err(*errp);
8717 *errp = NULL;
8718 } else if (ret < 0) {
8719 /* propagate error to caller */
8720 return false;
8721 }
8722
8723 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
8724 !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
8725 return false;
8726 }
8727
8728 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8729
8730 pcie_cap_deverr_init(pci_dev);
8731
8732 /* DOE Initialisation */
8733 if (pci_dev->spdm_port) {
8734 uint16_t doe_offset = n->params.sriov_max_vfs ?
8735 PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8736 : PCI_CONFIG_SPACE_SIZE;
8737
8738 pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8739 doe_spdm_prot, true, 0);
8740
8741 pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8742 errp);
8743
8744 if (pci_dev->doe_spdm.spdm_socket < 0) {
8745 return false;
8746 }
8747 }
8748
8749 if (n->params.cmb_size_mb) {
8750 nvme_init_cmb(n, pci_dev);
8751 }
8752
8753 if (n->pmr.dev) {
8754 nvme_init_pmr(n, pci_dev);
8755 }
8756
8757 return true;
8758 }
8759
8760 static void nvme_init_subnqn(NvmeCtrl *n)
8761 {
8762 NvmeSubsystem *subsys = n->subsys;
8763 NvmeIdCtrl *id = &n->id_ctrl;
8764
8765 if (!subsys) {
8766 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8767 "nqn.2019-08.org.qemu:%s", n->params.serial);
8768 } else {
8769 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8770 }
8771 }
8772
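/*
 * Note on the CAP defaults programmed below: MQES is 0's based, so the
 * default mqes=0x7ff advertises 2048-entry queues, and MPSMAX=4 allows
 * host page sizes up to 2^(12 + 4) = 64 KiB.
 */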
8773 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8774 {
8775 NvmeIdCtrl *id = &n->id_ctrl;
8776 uint8_t *pci_conf = pci_dev->config;
8777 uint64_t cap = ldq_le_p(&n->bar.cap);
8778 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8779 uint32_t ctratt;
8780 uint16_t oacs;
8781
8782 memcpy(n->cse.acs, nvme_cse_acs_default, sizeof(n->cse.acs));
8783 memcpy(n->cse.iocs.nvm, nvme_cse_iocs_nvm_default, sizeof(n->cse.iocs.nvm));
8784 memcpy(n->cse.iocs.zoned, nvme_cse_iocs_zoned_default,
8785 sizeof(n->cse.iocs.zoned));
8786
8787 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8788 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8789 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8790 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8791 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8792
8793 id->cntlid = cpu_to_le16(n->cntlid);
8794
8795 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8796
8797 ctratt = NVME_CTRATT_ELBAS;
8798 if (n->params.ctratt.mem) {
8799 ctratt |= NVME_CTRATT_MEM;
8800 }
8801
8802 id->rab = 6;
8803
8804 if (n->params.use_intel_id) {
8805 id->ieee[0] = 0xb3;
8806 id->ieee[1] = 0x02;
8807 id->ieee[2] = 0x00;
8808 } else {
8809 id->ieee[0] = 0x00;
8810 id->ieee[1] = 0x54;
8811 id->ieee[2] = 0x52;
8812 }
8813
8814 id->mdts = n->params.mdts;
8815 id->ver = cpu_to_le32(NVME_SPEC_VER);
8816
8817 oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DIRECTIVES;
8818
8819 if (n->params.dbcs) {
8820 oacs |= NVME_OACS_DBCS;
8821
8822 n->cse.acs[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP;
8823 }
8824
8825 if (n->params.sriov_max_vfs) {
8826 oacs |= NVME_OACS_VMS;
8827
8828 n->cse.acs[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP;
8829 }
8830
8831 id->oacs = cpu_to_le16(oacs);
8832
8833 id->cntrltype = 0x1;
8834
8835 /*
8836 * Because the controller always completes the Abort command immediately,
8837 * there can never be more than one concurrently executing Abort command,
8838 * so this value is never used for anything. Note that there can easily be
8839 * many Abort commands in the queues, but they are not considered
8840 * "executing" until processed by nvme_abort.
8841 *
8842 * The specification recommends a value of 3 for Abort Command Limit (four
8843 * concurrently outstanding Abort commands), so let's use that though it is
8844 * inconsequential.
8845 */
8846 id->acl = 3;
8847 id->aerl = n->params.aerl;
8848 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8849 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8850
8851 /* recommended default value (~70 C) */
8852 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8853 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8854
8855 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8856 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8857 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8858 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8859 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8860 NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8861 NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8862
8863 /*
8864 * NOTE: If this device ever supports a command set that does NOT use 0x0
8865 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8866 * should probably be removed.
8867 *
8868 * See comment in nvme_io_cmd.
8869 */
8870 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8871
8872 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8873 NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8874 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
8875 NVME_CTRL_SGLS_MPTR_SGL);
8876
8877 nvme_init_subnqn(n);
8878
8879 id->psd[0].mp = cpu_to_le16(0x9c4);
8880 id->psd[0].enlat = cpu_to_le32(0x10);
8881 id->psd[0].exlat = cpu_to_le32(0x4);
8882
8883 id->cmic |= NVME_CMIC_MULTI_CTRL;
8884 ctratt |= NVME_CTRATT_ENDGRPS;
8885
8886 id->endgidmax = cpu_to_le16(0x1);
8887
8888 if (n->subsys->endgrp.fdp.enabled) {
8889 ctratt |= NVME_CTRATT_FDPS;
8890 }
8891
8892 id->ctratt = cpu_to_le32(ctratt);
8893
8894 NVME_CAP_SET_MQES(cap, n->params.mqes);
8895 NVME_CAP_SET_CQR(cap, 1);
8896 NVME_CAP_SET_TO(cap, 0xf);
8897 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NCSS);
8898 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_IOCSS);
8899 NVME_CAP_SET_MPSMAX(cap, 4);
8900 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8901 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8902 stq_le_p(&n->bar.cap, cap);
8903
8904 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8905 n->bar.intmc = n->bar.intms = 0;
8906
8907 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8908 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8909 }
8910 }
8911
8912 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8913 {
8914 int cntlid;
8915
8916 if (!n->subsys) {
8917 DeviceState *dev = qdev_new(TYPE_NVME_SUBSYS);
8918
8919 qdev_prop_set_string(dev, "nqn", n->params.serial);
8920
8921 if (!qdev_realize(dev, NULL, errp)) {
8922 return -1;
8923 }
8924
8925 n->subsys = NVME_SUBSYS(dev);
8926 }
8927
8928 cntlid = nvme_subsys_register_ctrl(n, errp);
8929 if (cntlid < 0) {
8930 return -1;
8931 }
8932
8933 n->cntlid = cntlid;
8934
8935 return 0;
8936 }
8937
8938 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8939 {
8940 uint32_t nsid = ns->params.nsid;
8941 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8942
8943 n->namespaces[nsid] = ns;
8944 ns->attached++;
8945 }
8946
8947 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8948 {
8949 NvmeCtrl *n = NVME(pci_dev);
8950 DeviceState *dev = DEVICE(pci_dev);
8951 NvmeNamespace *ns;
8952 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8953
8954 if (pci_is_vf(pci_dev)) {
8955 /*
8956 * VFs derive settings from the parent. The PF's lifespan exceeds
8957 * that of the VFs.
8958 */
8959 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8960
8961 /*
8962 * Duplicate the PF's serial string so that releasing a VF's 'serial'
8963 * property does not free the PF's copy when the VF is removed.
8964 */
8965 n->params.serial = g_strdup(pn->params.serial);
8966 n->subsys = pn->subsys;
8967
8968 /*
8969 * Assigning this link (strong link) causes an `object_unref` later in
8970 * `object_release_link_property`. Increment the refcount to balance
8971 * this out.
8972 */
8973 object_ref(OBJECT(pn->subsys));
8974 }
8975
8976 if (!nvme_check_params(n, errp)) {
8977 return;
8978 }
8979
8980 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8981
8982 if (nvme_init_subsys(n, errp)) {
8983 return;
8984 }
8985 nvme_init_state(n);
8986 if (!nvme_init_pci(n, pci_dev, errp)) {
8987 return;
8988 }
8989 nvme_init_ctrl(n, pci_dev);
8990
8991 /* setup a namespace if the controller drive property was given */
8992 if (n->namespace.blkconf.blk) {
8993 ns = &n->namespace;
8994 ns->params.nsid = 1;
8995 ns->ctrl = n;
8996
8997 if (nvme_ns_setup(ns, errp)) {
8998 return;
8999 }
9000
9001 n->subsys->namespaces[ns->params.nsid] = ns;
9002 }
9003 }
9004
9005 static void nvme_exit(PCIDevice *pci_dev)
9006 {
9007 NvmeCtrl *n = NVME(pci_dev);
9008 NvmeNamespace *ns;
9009 int i;
9010
9011 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9012
9013 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
9014 ns = nvme_ns(n, i);
9015 if (ns) {
9016 ns->attached--;
9017 }
9018 }
9019
9020 nvme_subsys_unregister_ctrl(n->subsys, n);
9021
9022 g_free(n->cq);
9023 g_free(n->sq);
9024 g_free(n->aer_reqs);
9025
9026 if (n->params.cmb_size_mb) {
9027 g_free(n->cmb.buf);
9028 }
9029
9030 if (pci_dev->doe_spdm.spdm_socket > 0) {
9031 spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
9032 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
9033 }
9034
9035 if (n->pmr.dev) {
9036 host_memory_backend_set_mapped(n->pmr.dev, false);
9037 }
9038
9039 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
9040 pcie_sriov_pf_exit(pci_dev);
9041 }
9042
9043 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
9044 msix_uninit_exclusive_bar(pci_dev);
9045 } else {
9046 msix_uninit(pci_dev, &n->bar0, &n->bar0);
9047 }
9048
9049 memory_region_del_subregion(&n->bar0, &n->iomem);
9050 }
9051
9052 static const Property nvme_props[] = {
9053 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
9054 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
9055 HostMemoryBackend *),
9056 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
9057 NvmeSubsystem *),
9058 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
9059 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
9060 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
9061 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
9062 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
9063 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
9064 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
9065 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
9066 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
9067 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
9068 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
9069 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
9070 DEFINE_PROP_BOOL("dbcs", NvmeCtrl, params.dbcs, true),
9071 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
9072 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
9073 params.auto_transition_zones, true),
9074 DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
9075 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
9076 params.sriov_vq_flexible, 0),
9077 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
9078 params.sriov_vi_flexible, 0),
9079 DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
9080 params.sriov_max_vi_per_vf, 0),
9081 DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
9082 params.sriov_max_vq_per_vf, 0),
9083 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
9084 false),
9085 DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
9086 DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
9087 DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
9088 DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
9089 DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
9090 DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
9091 DEFINE_PROP_BOOL("ocp", NvmeCtrl, params.ocp, false),
9092 };
9093
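/*
 * The "smart_critical_warning" QOM property defined below exposes the
 * SMART critical warning byte and allows it to be changed at run time;
 * setting a bit that was previously clear injects the corresponding
 * SMART asynchronous event (see the setter). For example, writing a
 * value with only NVME_SMART_TEMPERATURE set would raise a temperature
 * warning event.
 */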
9094 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
9095 void *opaque, Error **errp)
9096 {
9097 NvmeCtrl *n = NVME(obj);
9098 uint8_t value = n->smart_critical_warning;
9099
9100 visit_type_uint8(v, name, &value, errp);
9101 }
9102
9103 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
9104 void *opaque, Error **errp)
9105 {
9106 NvmeCtrl *n = NVME(obj);
9107 uint8_t value, old_value, cap = 0, index, event;
9108
9109 if (!visit_type_uint8(v, name, &value, errp)) {
9110 return;
9111 }
9112
9113 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
9114 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
9115 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
9116 cap |= NVME_SMART_PMR_UNRELIABLE;
9117 }
9118
9119 if ((value & cap) != value) {
9120 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
9121 value & ~cap);
9122 return;
9123 }
9124
9125 old_value = n->smart_critical_warning;
9126 n->smart_critical_warning = value;
9127
9128 /* only inject new bits of smart critical warning */
9129 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
9130 event = 1 << index;
9131 if (value & ~old_value & event)
9132 nvme_smart_event(n, event);
9133 }
9134 }
9135
9136 static void nvme_pci_reset(DeviceState *qdev)
9137 {
9138 PCIDevice *pci_dev = PCI_DEVICE(qdev);
9139 NvmeCtrl *n = NVME(pci_dev);
9140
9141 trace_pci_nvme_pci_reset();
9142 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9143 }
9144
9145 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
9146 {
9147 NvmeCtrl *n = NVME(dev);
9148 NvmeSecCtrlEntry *sctrl;
9149 int i;
9150
9151 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
9152 sctrl = &n->sec_ctrl_list[i];
9153 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
9154 }
9155 }
9156
9157 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
9158 uint32_t val, int len)
9159 {
9160 uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
9161
9162 if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9163 pcie_doe_write_config(&dev->doe_spdm, address, val, len);
9164 }
9165 pci_default_write_config(dev, address, val, len);
9166 pcie_cap_flr_write_config(dev, address, val, len);
9167 nvme_sriov_post_write_config(dev, old_num_vfs);
9168 }
9169
9170 static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
9171 {
9172 uint32_t val;
9173 if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9174 if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
9175 return val;
9176 }
9177 }
9178 return pci_default_read_config(dev, address, len);
9179 }
9180
9181 static const VMStateDescription nvme_vmstate = {
9182 .name = "nvme",
9183 .unmigratable = 1,
9184 };
9185
9186 static void nvme_class_init(ObjectClass *oc, const void *data)
9187 {
9188 DeviceClass *dc = DEVICE_CLASS(oc);
9189 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
9190
9191 pc->realize = nvme_realize;
9192 pc->config_write = nvme_pci_write_config;
9193 pc->config_read = nvme_pci_read_config;
9194 pc->exit = nvme_exit;
9195 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
9196 pc->revision = 2;
9197
9198 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
9199 dc->desc = "Non-Volatile Memory Express";
9200 device_class_set_props(dc, nvme_props);
9201 dc->vmsd = &nvme_vmstate;
9202 device_class_set_legacy_reset(dc, nvme_pci_reset);
9203 }
9204
9205 static void nvme_instance_init(Object *obj)
9206 {
9207 NvmeCtrl *n = NVME(obj);
9208
9209 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
9210 "bootindex", "/namespace@1,0",
9211 DEVICE(obj));
9212
9213 object_property_add(obj, "smart_critical_warning", "uint8",
9214 nvme_get_smart_warning,
9215 nvme_set_smart_warning, NULL, NULL);
9216 }
9217
9218 static const TypeInfo nvme_info = {
9219 .name = TYPE_NVME,
9220 .parent = TYPE_PCI_DEVICE,
9221 .instance_size = sizeof(NvmeCtrl),
9222 .instance_init = nvme_instance_init,
9223 .class_init = nvme_class_init,
9224 .interfaces = (const InterfaceInfo[]) {
9225 { INTERFACE_PCIE_DEVICE },
9226 { }
9227 },
9228 };
9229
9230 static const TypeInfo nvme_bus_info = {
9231 .name = TYPE_NVME_BUS,
9232 .parent = TYPE_BUS,
9233 .instance_size = sizeof(NvmeBus),
9234 };
9235
9236 static void nvme_register_types(void)
9237 {
9238 type_register_static(&nvme_info);
9239 type_register_static(&nvme_bus_info);
9240 }
9241
9242 type_init(nvme_register_types)
9243