/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 * https://nvmexpress.org/developers/nvme-specification/
 *
 *
 * Notes on coding style
 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in comments
 * (i.e. 'h' suffix instead of '0x' prefix).
 *
 * Usage
 * -----
 * See docs/system/nvme.rst for extensive documentation.
 *
 * Add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
 *      -device nvme,serial=<serial>,id=<bus_name>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>, \
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
 *              mdts=<N[optional]>,vsl=<N[optional]>, \
 *              zoned.zasl=<N[optional]>, \
 *              zoned.auto_transition=<on|off[optional]>, \
 *              sriov_max_vfs=<N[optional]> \
 *              sriov_vq_flexible=<N[optional]> \
 *              sriov_vi_flexible=<N[optional]> \
 *              sriov_max_vi_per_vf=<N[optional]> \
 *              sriov_max_vq_per_vf=<N[optional]> \
 *              atomic.dn=<on|off[optional]>, \
 *              atomic.awun=<N[optional]>, \
 *              atomic.awupf=<N[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>, \
 *              zoned=<true|false[optional]>, \
 *              subsys=<subsys_id>,shared=<true|false[optional]>, \
 *              detached=<true|false[optional]>, \
 *              zoned.zone_size=<N[optional]>, \
 *              zoned.zone_capacity=<N[optional]>, \
 *              zoned.descr_ext_size=<N[optional]>, \
 *              zoned.max_active=<N[optional]>, \
 *              zoned.max_open=<N[optional]>, \
 *              zoned.cross_read=<true|false[optional]>
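 *
 * As an illustrative example (the image path, IDs, serial and sizes below are
 * placeholders, not required values), a minimal zoned namespace setup could
 * look like:
 *
 *      -drive file=nvme.img,if=none,id=nvm
 *      -device nvme-subsys,id=nvme-subsys-0,nqn=subsys0
 *      -device nvme,serial=deadbeef,id=nvme0,subsys=nvme-subsys-0
 *      -device nvme-ns,drive=nvm,bus=nvme0,nsid=1,zoned=true, \
 *              zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32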
 *
 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
 * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
 * default, the device will use the "v1.4 CMB scheme" - use the `legacy-cmb`
 * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
 *
 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
 * For example:
 *      -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 *              size=<size> .... -device nvme,...,pmrdev=<mem_id>
 *
 * The PMR will use BAR 4/5 exclusively.
 *
 * To place controller(s) and namespace(s) in a subsystem, provide the
 * nvme-subsys device as shown above.
 *
 * nvme subsystem device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `nqn`
 *   This parameter provides the `<nqn_id>` part of the string
 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
 *   subsystem, but this is not enforced by QEMU. If not specified, it will
 *   default to the value of the `id` parameter (`<subsys_id>`).
 *
 * nvme device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`
 *   Specifying this parameter attaches the controller to the subsystem and
 *   the SUBNQN field in the controller will report the NQN of the subsystem
 *   device. This also enables multi controller capability represented in
 *   Identify Controller data structure in CMIC (Controller Multi-path I/O and
 *   Namespace Sharing Capabilities).
 *
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events is reached, subsequent events will be dropped.
 *
 * - `mdts`
 *   Indicates the maximum data transfer size for a command that transfers data
 *   between host-accessible memory and the controller. The value is specified
 *   as a power of two (2^n) and is in units of the minimum memory page size
 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
 *   KiB).
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * - `zoned.auto_transition`
 *   Indicates if zones in the Implicitly Opened state can be automatically
 *   transitioned to the Closed state for resource management purposes.
 *   Defaults to 'on'.
 *
 * - `sriov_max_vfs`
 *   Indicates the maximum number of PCIe virtual functions supported
 *   by the controller. The default value is 0. Specifying a non-zero value
 *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
 *   Virtual function controllers will not report SR-IOV capability.
 *
 *   NOTE: Single Root I/O Virtualization support is experimental.
 *   All the related parameters may be subject to change.
 *
 * - `sriov_vq_flexible`
 *   Indicates the total number of flexible queue resources assignable to all
 *   the secondary controllers. Implicitly sets the number of primary
 *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
 *
 * - `sriov_vi_flexible`
 *   Indicates the total number of flexible interrupt resources assignable to
 *   all the secondary controllers. Implicitly sets the number of primary
 *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
 *
 * - `sriov_max_vi_per_vf`
 *   Indicates the maximum number of virtual interrupt resources assignable
 *   to a secondary controller. The default 0 resolves to
 *   `(sriov_vi_flexible / sriov_max_vfs)`.
 *
 * - `sriov_max_vq_per_vf`
 *   Indicates the maximum number of virtual queue resources assignable to
 *   a secondary controller. The default 0 resolves to
 *   `(sriov_vq_flexible / sriov_max_vfs)`.
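 *
 *   As an illustrative example (values are placeholders chosen only to be
 *   mutually consistent), the following gives up to 4 virtual functions a
 *   shared pool of 8 flexible queue and 4 flexible interrupt resources, i.e.
 *   by default 2 queue pairs and 1 interrupt vector per VF, leaving the
 *   primary controller 2 private queue pairs:
 *
 *      -device nvme,serial=<serial>,id=<bus_name>,subsys=<subsys_id>, \
 *              max_ioqpairs=10,msix_qsize=12,sriov_max_vfs=4, \
 *              sriov_vq_flexible=8,sriov_vi_flexible=4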
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `shared`
 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 *   or implicitly by the most recently defined NvmeBus) is linked to an
 *   nvme-subsys device, the namespace will be attached to all controllers in
 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 *   private namespace and may only be attached to a single controller at a
 *   time.
 *
 * - `detached`
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
 *   namespace will be available in the subsystem but not attached to any
 *   controllers.
 *
 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
 * In this case, the following namespace properties are available to configure
 * zoned operation:
 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 *
 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 *         The value 0 (default) forces zone capacity to be the same as zone
 *         size. The value of this property may not exceed zone size.
 *
 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
 *         This value needs to be specified in 64B units. If it is zero,
 *         namespace(s) will not support zone descriptor extensions.
 *
 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently active zones.
 *
 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently open zones.
 *
 *     zoned.cross_read=<enable RAZB, default: false>
 *         Setting this property to true enables Read Across Zone Boundaries.
 */

196 #include "qemu/osdep.h"
197 #include "qemu/cutils.h"
198 #include "qemu/error-report.h"
199 #include "qemu/log.h"
200 #include "qemu/units.h"
201 #include "qemu/range.h"
202 #include "qapi/error.h"
203 #include "qapi/visitor.h"
204 #include "system/system.h"
205 #include "system/block-backend.h"
206 #include "system/hostmem.h"
207 #include "hw/pci/msix.h"
208 #include "hw/pci/pcie_sriov.h"
209 #include "system/spdm-socket.h"
210 #include "migration/vmstate.h"
211
212 #include "nvme.h"
213 #include "dif.h"
214 #include "trace.h"
215
216 #define NVME_MAX_IOQPAIRS 0xffff
217 #define NVME_DB_SIZE 4
218 #define NVME_SPEC_VER 0x00010400
219 #define NVME_CMB_BIR 2
220 #define NVME_PMR_BIR 4
221 #define NVME_TEMPERATURE 0x143
222 #define NVME_TEMPERATURE_WARNING 0x157
223 #define NVME_TEMPERATURE_CRITICAL 0x175
224 #define NVME_NUM_FW_SLOTS 1
225 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
226 #define NVME_VF_RES_GRANULARITY 1
227 #define NVME_VF_OFFSET 0x1
228 #define NVME_VF_STRIDE 1
229
230 #define NVME_GUEST_ERR(trace, fmt, ...) \
231 do { \
232 (trace_##trace)(__VA_ARGS__); \
233 qemu_log_mask(LOG_GUEST_ERROR, #trace \
234 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
235 } while (0)
236
237 static const bool nvme_feature_support[NVME_FID_MAX] = {
238 [NVME_ARBITRATION] = true,
239 [NVME_POWER_MANAGEMENT] = true,
240 [NVME_TEMPERATURE_THRESHOLD] = true,
241 [NVME_ERROR_RECOVERY] = true,
242 [NVME_VOLATILE_WRITE_CACHE] = true,
243 [NVME_NUMBER_OF_QUEUES] = true,
244 [NVME_INTERRUPT_COALESCING] = true,
245 [NVME_INTERRUPT_VECTOR_CONF] = true,
246 [NVME_WRITE_ATOMICITY] = true,
247 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
248 [NVME_TIMESTAMP] = true,
249 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
250 [NVME_COMMAND_SET_PROFILE] = true,
251 [NVME_FDP_MODE] = true,
252 [NVME_FDP_EVENTS] = true,
253 };
254
255 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
256 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
257 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
258 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
259 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
260 [NVME_WRITE_ATOMICITY] = NVME_FEAT_CAP_CHANGE,
261 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
262 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
263 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
264 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
265 [NVME_FDP_MODE] = NVME_FEAT_CAP_CHANGE,
266 [NVME_FDP_EVENTS] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
267 };
268
269 static const uint32_t nvme_cse_acs_default[256] = {
270 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
271 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
272 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
273 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
274 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
275 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
276 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
277 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
278 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
279 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
280 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC |
281 NVME_CMD_EFF_CCC,
282 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
283 [NVME_ADM_CMD_DIRECTIVE_RECV] = NVME_CMD_EFF_CSUPP,
284 [NVME_ADM_CMD_DIRECTIVE_SEND] = NVME_CMD_EFF_CSUPP,
285 };
286
287 static const uint32_t nvme_cse_iocs_nvm_default[256] = {
288 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
289 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
290 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
291 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
292 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
294 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
295 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
296 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
297 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
298 };
299
300 static const uint32_t nvme_cse_iocs_zoned_default[256] = {
301 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
302 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
303 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
304 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
305 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
307 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
308 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
309 [NVME_CMD_IO_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
310 [NVME_CMD_IO_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
311
312 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
313 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
314 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
315 };
316
317 static void nvme_process_sq(void *opaque);
318 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
319 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
320
static uint16_t nvme_sqid(NvmeRequest *req)
322 {
323 return le16_to_cpu(req->sq->sqid);
324 }
325
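/*
 * FDP placement identifier (PID) helpers. When the endurance group reports a
 * nonzero Reclaim Group Identifier Format (rgif), the reclaim group index is
 * stored in the upper `rgif` bits of the 16-bit PID and the placement handle
 * in the lower bits. For example, with rgif = 4, rg = 0x3 and ph = 0x12, the
 * PID is (0x3 << 12) | 0x12 = 0x3012. With rgif = 0 there is only a single
 * reclaim group and the PID is simply the placement handle.
 */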
static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
327 uint16_t ph)
328 {
329 uint16_t rgif = ns->endgrp->fdp.rgif;
330
331 if (!rgif) {
332 return ph;
333 }
334
335 return (rg << (16 - rgif)) | ph;
336 }
337
static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
339 {
340 return ph < ns->fdp.nphs;
341 }
342
static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
344 {
345 return rg < endgrp->fdp.nrg;
346 }
347
static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
349 {
350 uint16_t rgif = ns->endgrp->fdp.rgif;
351
352 if (!rgif) {
353 return pid;
354 }
355
356 return pid & ((1 << (15 - rgif)) - 1);
357 }
358
static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
360 {
361 uint16_t rgif = ns->endgrp->fdp.rgif;
362
363 if (!rgif) {
364 return 0;
365 }
366
367 return pid >> (16 - rgif);
368 }
369
static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
371 uint16_t *ph, uint16_t *rg)
372 {
373 *rg = nvme_pid2rg(ns, pid);
374 *ph = nvme_pid2ph(ns, pid);
375
376 return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
377 }
378
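/*
 * Transition a zone to a new state while keeping the per-state zone lists
 * consistent: the zone is first removed from the list matching its current
 * state (if it is on one) and then, depending on the new state, inserted on
 * the corresponding list. Entering a state without a list (e.g. Empty or
 * Offline) also clears the zone attributes.
 */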
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
380 NvmeZoneState state)
381 {
382 if (QTAILQ_IN_USE(zone, entry)) {
383 switch (nvme_get_zone_state(zone)) {
384 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
385 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
386 break;
387 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
388 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
389 break;
390 case NVME_ZONE_STATE_CLOSED:
391 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
392 break;
393 case NVME_ZONE_STATE_FULL:
394 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
395 default:
396 ;
397 }
398 }
399
400 nvme_set_zone_state(zone, state);
401
402 switch (state) {
403 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
404 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
405 break;
406 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
407 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
408 break;
409 case NVME_ZONE_STATE_CLOSED:
410 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
411 break;
412 case NVME_ZONE_STATE_FULL:
413 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
414 case NVME_ZONE_STATE_READ_ONLY:
415 break;
416 default:
417 zone->d.za = 0;
418 }
419 }
420
static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
422 uint32_t opn, uint32_t zrwa)
423 {
424 if (ns->params.max_active_zones != 0 &&
425 ns->nr_active_zones + act > ns->params.max_active_zones) {
426 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
427 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
428 }
429
430 if (ns->params.max_open_zones != 0 &&
431 ns->nr_open_zones + opn > ns->params.max_open_zones) {
432 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
433 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
434 }
435
436 if (zrwa > ns->zns.numzrwa) {
437 return NVME_NOZRWA | NVME_DNR;
438 }
439
440 return NVME_SUCCESS;
441 }
442
443 /*
444 * Check if we can open a zone without exceeding open/active limits.
445 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
446 */
static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
448 {
449 return nvme_zns_check_resources(ns, act, opn, 0);
450 }
451
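/*
 * Allocate a slot in the circular FDP event buffer. If the buffer is full,
 * the oldest event is overwritten (the start index is advanced), so the
 * buffer always holds the most recent NVME_FDP_MAX_EVENTS events.
 */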
static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
453 {
454 NvmeFdpEvent *ret = NULL;
455 bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
456
457 ret = &ebuf->events[ebuf->next++];
458 if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
459 ebuf->next = 0;
460 }
461 if (is_full) {
462 ebuf->start = ebuf->next;
463 } else {
464 ebuf->nelems++;
465 }
466
467 memset(ret, 0, sizeof(NvmeFdpEvent));
468 ret->timestamp = nvme_get_timestamp(n);
469
470 return ret;
471 }
472
static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
474 {
475 return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
476 }
477
static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
479 {
480 NvmeEnduranceGroup *endgrp = ns->endgrp;
481 NvmeRuHandle *ruh;
482 NvmeReclaimUnit *ru;
483 NvmeFdpEvent *e = NULL;
484 uint16_t ph, rg, ruhid;
485
486 if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
487 return false;
488 }
489
490 ruhid = ns->fdp.phs[ph];
491
492 ruh = &endgrp->fdp.ruhs[ruhid];
493 ru = &ruh->rus[rg];
494
495 if (ru->ruamw) {
496 if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
497 e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
498 e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
499 e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
500 e->pid = cpu_to_le16(pid);
501 e->nsid = cpu_to_le32(ns->params.nsid);
502 e->rgid = cpu_to_le16(rg);
503 e->ruhid = cpu_to_le16(ruhid);
504 }
505
506 /* log (eventual) GC overhead of prematurely swapping the RU */
507 nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
508 }
509
510 ru->ruamw = ruh->ruamw;
511
512 return true;
513 }
514
static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
516 {
517 hwaddr hi, lo;
518
519 if (!n->cmb.cmse) {
520 return false;
521 }
522
523 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
524 hi = lo + int128_get64(n->cmb.mem.size);
525
526 return addr >= lo && addr < hi;
527 }
528
static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
530 {
531 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
532 return &n->cmb.buf[addr - base];
533 }
534
static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
536 {
537 hwaddr hi;
538
539 if (!n->pmr.cmse) {
540 return false;
541 }
542
543 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
544
545 return addr >= n->pmr.cba && addr < hi;
546 }
547
static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
549 {
550 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
551 }
552
static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
554 {
555 hwaddr hi, lo;
556
557 /*
558 * The purpose of this check is to guard against invalid "local" access to
559 * the iomem (i.e. controller registers). Thus, we check against the range
560 * covered by the 'bar0' MemoryRegion since that is currently composed of
561 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
562 * that if the device model is ever changed to allow the CMB to be located
563 * in BAR0 as well, then this must be changed.
564 */
565 lo = n->bar0.addr;
566 hi = lo + int128_get64(n->bar0.size);
567
568 return addr >= lo && addr < hi;
569 }
570
static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
572 {
573 hwaddr hi = addr + size - 1;
574 if (hi < addr) {
575 return 1;
576 }
577
578 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
579 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
580 return 0;
581 }
582
583 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
584 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
585 return 0;
586 }
587
588 return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
589 }
590
static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
592 {
593 hwaddr hi = addr + size - 1;
594 if (hi < addr) {
595 return 1;
596 }
597
598 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
599 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
600 return 0;
601 }
602
603 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
604 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
605 return 0;
606 }
607
608 return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
609 }
610
static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
612 {
613 return nsid &&
614 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
615 }
616
static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
618 {
619 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
620 }
621
static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
623 {
624 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
625 }
626
static void nvme_inc_cq_tail(NvmeCQueue *cq)
628 {
629 cq->tail++;
630 if (cq->tail >= cq->size) {
631 cq->tail = 0;
632 cq->phase = !cq->phase;
633 }
634 }
635
static void nvme_inc_sq_head(NvmeSQueue *sq)
637 {
638 sq->head = (sq->head + 1) % sq->size;
639 }
640
static uint8_t nvme_cq_full(NvmeCQueue *cq)
642 {
643 return (cq->tail + 1) % cq->size == cq->head;
644 }
645
static uint8_t nvme_sq_empty(NvmeSQueue *sq)
647 {
648 return sq->head == sq->tail;
649 }
650
static void nvme_irq_check(NvmeCtrl *n)
652 {
653 PCIDevice *pci = PCI_DEVICE(n);
654 uint32_t intms = ldl_le_p(&n->bar.intms);
655
656 if (msix_enabled(pci)) {
657 return;
658 }
659
    /* VFs do not implement INTx */
661 if (pci_is_vf(pci)) {
662 return;
663 }
664
665 if (~intms & n->irq_status) {
666 pci_irq_assert(pci);
667 } else {
668 pci_irq_deassert(pci);
669 }
670 }
671
static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
673 {
674 PCIDevice *pci = PCI_DEVICE(n);
675
676 if (cq->irq_enabled) {
677 if (msix_enabled(pci)) {
678 trace_pci_nvme_irq_msix(cq->vector);
679 msix_notify(pci, cq->vector);
680 } else {
681 trace_pci_nvme_irq_pin();
682 assert(cq->vector < 32);
683 n->irq_status |= 1 << cq->vector;
684 nvme_irq_check(n);
685 }
686 } else {
687 trace_pci_nvme_irq_masked();
688 }
689 }
690
static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
692 {
693 if (cq->irq_enabled) {
694 if (msix_enabled(PCI_DEVICE(n))) {
695 return;
696 } else {
697 assert(cq->vector < 32);
698 if (!n->cq_pending) {
699 n->irq_status &= ~(1 << cq->vector);
700 }
701 nvme_irq_check(n);
702 }
703 }
704 }
705
static void nvme_req_clear(NvmeRequest *req)
707 {
708 req->ns = NULL;
709 req->opaque = NULL;
710 req->aiocb = NULL;
711 memset(&req->cqe, 0x0, sizeof(req->cqe));
712 req->status = NVME_SUCCESS;
713 }
714
static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
716 {
717 if (dma) {
718 pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
719 sg->flags = NVME_SG_DMA;
720 } else {
721 qemu_iovec_init(&sg->iov, 0);
722 }
723
724 sg->flags |= NVME_SG_ALLOC;
725 }
726
static inline void nvme_sg_unmap(NvmeSg *sg)
728 {
729 if (!(sg->flags & NVME_SG_ALLOC)) {
730 return;
731 }
732
733 if (sg->flags & NVME_SG_DMA) {
734 qemu_sglist_destroy(&sg->qsg);
735 } else {
736 qemu_iovec_destroy(&sg->iov);
737 }
738
739 memset(sg, 0x0, sizeof(*sg));
740 }
741
742 /*
743 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
744 * holds both data and metadata. This function splits the data and metadata
745 * into two separate QSG/IOVs.
746 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
748 NvmeSg *mdata)
749 {
750 NvmeSg *dst = data;
751 uint32_t trans_len, count = ns->lbasz;
752 uint64_t offset = 0;
753 bool dma = sg->flags & NVME_SG_DMA;
754 size_t sge_len;
755 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
756 int sg_idx = 0;
757
758 assert(sg->flags & NVME_SG_ALLOC);
759
760 while (sg_len) {
761 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
762
763 trans_len = MIN(sg_len, count);
764 trans_len = MIN(trans_len, sge_len - offset);
765
766 if (dst) {
767 if (dma) {
768 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
769 trans_len);
770 } else {
771 qemu_iovec_add(&dst->iov,
772 sg->iov.iov[sg_idx].iov_base + offset,
773 trans_len);
774 }
775 }
776
777 sg_len -= trans_len;
778 count -= trans_len;
779 offset += trans_len;
780
781 if (count == 0) {
782 dst = (dst == data) ? mdata : data;
783 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
784 }
785
786 if (sge_len == offset) {
787 offset = 0;
788 sg_idx++;
789 }
790 }
791 }
792
static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
794 size_t len)
795 {
796 if (!len) {
797 return NVME_SUCCESS;
798 }
799
800 trace_pci_nvme_map_addr_cmb(addr, len);
801
802 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
803 return NVME_DATA_TRAS_ERROR;
804 }
805
806 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
807
808 return NVME_SUCCESS;
809 }
810
static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
812 size_t len)
813 {
814 if (!len) {
815 return NVME_SUCCESS;
816 }
817
818 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
819 return NVME_DATA_TRAS_ERROR;
820 }
821
822 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
823
824 return NVME_SUCCESS;
825 }
826
static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
828 {
829 bool cmb = false, pmr = false;
830
831 if (!len) {
832 return NVME_SUCCESS;
833 }
834
835 trace_pci_nvme_map_addr(addr, len);
836
837 if (nvme_addr_is_iomem(n, addr)) {
838 return NVME_DATA_TRAS_ERROR;
839 }
840
841 if (nvme_addr_is_cmb(n, addr)) {
842 cmb = true;
843 } else if (nvme_addr_is_pmr(n, addr)) {
844 pmr = true;
845 }
846
847 if (cmb || pmr) {
848 if (sg->flags & NVME_SG_DMA) {
849 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
850 }
851
852 if (sg->iov.niov + 1 > IOV_MAX) {
853 goto max_mappings_exceeded;
854 }
855
856 if (cmb) {
857 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
858 } else {
859 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
860 }
861 }
862
863 if (!(sg->flags & NVME_SG_DMA)) {
864 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
865 }
866
867 if (sg->qsg.nsg + 1 > IOV_MAX) {
868 goto max_mappings_exceeded;
869 }
870
871 qemu_sglist_add(&sg->qsg, addr, len);
872
873 return NVME_SUCCESS;
874
875 max_mappings_exceeded:
876 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
877 "number of mappings exceed 1024");
878 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
879 }
880
static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
882 {
883 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
884 }
885
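/*
 * Map the data buffer described by a PRP pair. PRP1 may point anywhere into
 * a page; PRP2 is either the address of a second page (when the remainder
 * fits within one page) or the address of a PRP list. PRP list entries must
 * be page aligned, and the last entry of a full list may point to a further
 * PRP list when more entries are needed.
 */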
static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
887 uint64_t prp2, uint32_t len)
888 {
889 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
890 trans_len = MIN(len, trans_len);
891 int num_prps = (len >> n->page_bits) + 1;
892 uint16_t status;
893 int ret;
894
895 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
896
897 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
898
899 status = nvme_map_addr(n, sg, prp1, trans_len);
900 if (status) {
901 goto unmap;
902 }
903
904 len -= trans_len;
905 if (len) {
906 if (len > n->page_size) {
907 g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
908 uint32_t nents, prp_trans;
909 int i = 0;
910
            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Hence, we need to calculate the number of entries
             * based on that offset.
             */
916 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
917 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
918 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
919 if (ret) {
920 trace_pci_nvme_err_addr_read(prp2);
921 status = NVME_DATA_TRAS_ERROR;
922 goto unmap;
923 }
924 while (len != 0) {
925 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
926
927 if (i == nents - 1 && len > n->page_size) {
928 if (unlikely(prp_ent & (n->page_size - 1))) {
929 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
930 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
931 goto unmap;
932 }
933
934 i = 0;
935 nents = (len + n->page_size - 1) >> n->page_bits;
936 nents = MIN(nents, n->max_prp_ents);
937 prp_trans = nents * sizeof(uint64_t);
938 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
939 prp_trans);
940 if (ret) {
941 trace_pci_nvme_err_addr_read(prp_ent);
942 status = NVME_DATA_TRAS_ERROR;
943 goto unmap;
944 }
945 prp_ent = le64_to_cpu(prp_list[i]);
946 }
947
948 if (unlikely(prp_ent & (n->page_size - 1))) {
949 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
950 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
951 goto unmap;
952 }
953
954 trans_len = MIN(len, n->page_size);
955 status = nvme_map_addr(n, sg, prp_ent, trans_len);
956 if (status) {
957 goto unmap;
958 }
959
960 len -= trans_len;
961 i++;
962 }
963 } else {
964 if (unlikely(prp2 & (n->page_size - 1))) {
965 trace_pci_nvme_err_invalid_prp2_align(prp2);
966 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
967 goto unmap;
968 }
969 status = nvme_map_addr(n, sg, prp2, len);
970 if (status) {
971 goto unmap;
972 }
973 }
974 }
975
976 return NVME_SUCCESS;
977
978 unmap:
979 nvme_sg_unmap(sg);
980 return status;
981 }
982
/*
 * Map 'nsgld' data descriptors from 'segment'. The number of bytes mapped is
 * subtracted from *len.
 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
988 NvmeSglDescriptor *segment, uint64_t nsgld,
989 size_t *len, NvmeCmd *cmd)
990 {
991 dma_addr_t addr, trans_len;
992 uint32_t dlen;
993 uint16_t status;
994
995 for (int i = 0; i < nsgld; i++) {
996 uint8_t type = NVME_SGL_TYPE(segment[i].type);
997
998 switch (type) {
999 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
1000 break;
1001 case NVME_SGL_DESCR_TYPE_SEGMENT:
1002 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1003 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
1004 default:
1005 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
1006 }
1007
1008 dlen = le32_to_cpu(segment[i].len);
1009
1010 if (!dlen) {
1011 continue;
1012 }
1013
1014 if (*len == 0) {
1015 /*
1016 * All data has been mapped, but the SGL contains additional
1017 * segments and/or descriptors. The controller might accept
1018 * ignoring the rest of the SGL.
1019 */
1020 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1021 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1022 break;
1023 }
1024
1025 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1026 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1027 }
1028
1029 trans_len = MIN(*len, dlen);
1030
1031 addr = le64_to_cpu(segment[i].addr);
1032
1033 if (UINT64_MAX - addr < dlen) {
1034 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1035 }
1036
1037 status = nvme_map_addr(n, sg, addr, trans_len);
1038 if (status) {
1039 return status;
1040 }
1041
1042 *len -= trans_len;
1043 }
1044
1045 return NVME_SUCCESS;
1046 }
1047
static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1049 size_t len, NvmeCmd *cmd)
1050 {
1051 /*
1052 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1053 * dynamically allocating a potentially huge SGL. The spec allows the SGL
1054 * to be larger (as in number of bytes required to describe the SGL
1055 * descriptors and segment chain) than the command transfer size, so it is
1056 * not bounded by MDTS.
1057 */
1058 #define SEG_CHUNK_SIZE 256
1059
1060 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1061 uint64_t nsgld;
1062 uint32_t seg_len;
1063 uint16_t status;
1064 hwaddr addr;
1065 int ret;
1066
1067 sgld = &sgl;
1068 addr = le64_to_cpu(sgl.addr);
1069
1070 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1071
1072 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1073
1074 /*
1075 * If the entire transfer can be described with a single data block it can
1076 * be mapped directly.
1077 */
1078 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1079 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1080 if (status) {
1081 goto unmap;
1082 }
1083
1084 goto out;
1085 }
1086
1087 for (;;) {
1088 switch (NVME_SGL_TYPE(sgld->type)) {
1089 case NVME_SGL_DESCR_TYPE_SEGMENT:
1090 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1091 break;
1092 default:
1093 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1094 }
1095
1096 seg_len = le32_to_cpu(sgld->len);
1097
1098 /* check the length of the (Last) Segment descriptor */
1099 if (!seg_len || seg_len & 0xf) {
1100 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1101 }
1102
1103 if (UINT64_MAX - addr < seg_len) {
1104 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1105 }
1106
1107 nsgld = seg_len / sizeof(NvmeSglDescriptor);
1108
1109 while (nsgld > SEG_CHUNK_SIZE) {
1110 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1111 trace_pci_nvme_err_addr_read(addr);
1112 status = NVME_DATA_TRAS_ERROR;
1113 goto unmap;
1114 }
1115
1116 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1117 &len, cmd);
1118 if (status) {
1119 goto unmap;
1120 }
1121
1122 nsgld -= SEG_CHUNK_SIZE;
1123 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1124 }
1125
1126 ret = nvme_addr_read(n, addr, segment, nsgld *
1127 sizeof(NvmeSglDescriptor));
1128 if (ret) {
1129 trace_pci_nvme_err_addr_read(addr);
1130 status = NVME_DATA_TRAS_ERROR;
1131 goto unmap;
1132 }
1133
1134 last_sgld = &segment[nsgld - 1];
1135
1136 /*
1137 * If the segment ends with a Data Block, then we are done.
1138 */
1139 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1140 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1141 if (status) {
1142 goto unmap;
1143 }
1144
1145 goto out;
1146 }
1147
1148 /*
1149 * If the last descriptor was not a Data Block, then the current
1150 * segment must not be a Last Segment.
1151 */
1152 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1153 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1154 goto unmap;
1155 }
1156
1157 sgld = last_sgld;
1158 addr = le64_to_cpu(sgld->addr);
1159
1160 /*
1161 * Do not map the last descriptor; it will be a Segment or Last Segment
1162 * descriptor and is handled by the next iteration.
1163 */
1164 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1165 if (status) {
1166 goto unmap;
1167 }
1168 }
1169
1170 out:
1171 /* if there is any residual left in len, the SGL was too short */
1172 if (len) {
1173 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1174 goto unmap;
1175 }
1176
1177 return NVME_SUCCESS;
1178
1179 unmap:
1180 nvme_sg_unmap(sg);
1181 return status;
1182 }
1183
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1185 NvmeCmd *cmd)
1186 {
1187 uint64_t prp1, prp2;
1188
1189 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1190 case NVME_PSDT_PRP:
1191 prp1 = le64_to_cpu(cmd->dptr.prp1);
1192 prp2 = le64_to_cpu(cmd->dptr.prp2);
1193
1194 return nvme_map_prp(n, sg, prp1, prp2, len);
1195 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1196 case NVME_PSDT_SGL_MPTR_SGL:
1197 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1198 default:
1199 return NVME_INVALID_FIELD;
1200 }
1201 }
1202
static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1204 NvmeCmd *cmd)
1205 {
1206 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1207 hwaddr mptr = le64_to_cpu(cmd->mptr);
1208 uint16_t status;
1209
1210 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1211 NvmeSglDescriptor sgl;
1212
1213 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1214 return NVME_DATA_TRAS_ERROR;
1215 }
1216
1217 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1218 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1219 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1220 }
1221
1222 return status;
1223 }
1224
1225 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1226 status = nvme_map_addr(n, sg, mptr, len);
1227 if (status) {
1228 nvme_sg_unmap(sg);
1229 }
1230
1231 return status;
1232 }
1233
static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1235 {
1236 NvmeNamespace *ns = req->ns;
1237 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1238 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1239 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1240 size_t len = nvme_l2b(ns, nlb);
1241 uint16_t status;
1242
1243 if (nvme_ns_ext(ns) &&
1244 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1245 NvmeSg sg;
1246
1247 len += nvme_m2b(ns, nlb);
1248
1249 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1250 if (status) {
1251 return status;
1252 }
1253
1254 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1255 nvme_sg_split(&sg, ns, &req->sg, NULL);
1256 nvme_sg_unmap(&sg);
1257
1258 return NVME_SUCCESS;
1259 }
1260
1261 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1262 }
1263
static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1265 {
1266 NvmeNamespace *ns = req->ns;
1267 size_t len = nvme_m2b(ns, nlb);
1268 uint16_t status;
1269
1270 if (nvme_ns_ext(ns)) {
1271 NvmeSg sg;
1272
1273 len += nvme_l2b(ns, nlb);
1274
1275 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1276 if (status) {
1277 return status;
1278 }
1279
1280 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1281 nvme_sg_split(&sg, ns, NULL, &req->sg);
1282 nvme_sg_unmap(&sg);
1283
1284 return NVME_SUCCESS;
1285 }
1286
1287 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1288 }
1289
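/*
 * Copy to or from the guest in an interleaved fashion: starting at `offset`
 * into the mapped SGL/IOV, transfer chunks of `bytes` bytes, skipping
 * `skip_bytes` between chunks, until `len` bytes have been transferred. This
 * is used to pick out either the data or the metadata portion of an
 * extended-LBA (interleaved data and metadata) mapping.
 */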
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1291 uint32_t len, uint32_t bytes,
1292 int32_t skip_bytes, int64_t offset,
1293 NvmeTxDirection dir)
1294 {
1295 hwaddr addr;
1296 uint32_t trans_len, count = bytes;
1297 bool dma = sg->flags & NVME_SG_DMA;
1298 int64_t sge_len;
1299 int sg_idx = 0;
1300 int ret;
1301
1302 assert(sg->flags & NVME_SG_ALLOC);
1303
1304 while (len) {
1305 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1306
1307 if (sge_len - offset < 0) {
1308 offset -= sge_len;
1309 sg_idx++;
1310 continue;
1311 }
1312
1313 if (sge_len == offset) {
1314 offset = 0;
1315 sg_idx++;
1316 continue;
1317 }
1318
1319 trans_len = MIN(len, count);
1320 trans_len = MIN(trans_len, sge_len - offset);
1321
1322 if (dma) {
1323 addr = sg->qsg.sg[sg_idx].base + offset;
1324 } else {
1325 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1326 }
1327
1328 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1329 ret = nvme_addr_read(n, addr, ptr, trans_len);
1330 } else {
1331 ret = nvme_addr_write(n, addr, ptr, trans_len);
1332 }
1333
1334 if (ret) {
1335 return NVME_DATA_TRAS_ERROR;
1336 }
1337
1338 ptr += trans_len;
1339 len -= trans_len;
1340 count -= trans_len;
1341 offset += trans_len;
1342
1343 if (count == 0) {
1344 count = bytes;
1345 offset += skip_bytes;
1346 }
1347 }
1348
1349 return NVME_SUCCESS;
1350 }
1351
static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1353 NvmeTxDirection dir)
1354 {
1355 assert(sg->flags & NVME_SG_ALLOC);
1356
1357 if (sg->flags & NVME_SG_DMA) {
1358 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1359 dma_addr_t residual;
1360
1361 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1362 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1363 } else {
1364 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1365 }
1366
1367 if (unlikely(residual)) {
1368 trace_pci_nvme_err_invalid_dma();
1369 return NVME_INVALID_FIELD | NVME_DNR;
1370 }
1371 } else {
1372 size_t bytes;
1373
1374 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1375 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1376 } else {
1377 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1378 }
1379
1380 if (unlikely(bytes != len)) {
1381 trace_pci_nvme_err_invalid_dma();
1382 return NVME_INVALID_FIELD | NVME_DNR;
1383 }
1384 }
1385
1386 return NVME_SUCCESS;
1387 }
1388
static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1390 NvmeRequest *req)
1391 {
1392 uint16_t status;
1393
1394 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1395 if (status) {
1396 return status;
1397 }
1398
1399 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1400 }
1401
static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1403 NvmeRequest *req)
1404 {
1405 uint16_t status;
1406
1407 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1408 if (status) {
1409 return status;
1410 }
1411
1412 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1413 }
1414
uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1416 NvmeTxDirection dir, NvmeRequest *req)
1417 {
1418 NvmeNamespace *ns = req->ns;
1419 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1420 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1421 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1422
1423 if (nvme_ns_ext(ns) &&
1424 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1425 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1426 ns->lbaf.ms, 0, dir);
1427 }
1428
1429 return nvme_tx(n, &req->sg, ptr, len, dir);
1430 }
1431
uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1433 NvmeTxDirection dir, NvmeRequest *req)
1434 {
1435 NvmeNamespace *ns = req->ns;
1436 uint16_t status;
1437
1438 if (nvme_ns_ext(ns)) {
1439 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1440 ns->lbasz, ns->lbasz, dir);
1441 }
1442
1443 nvme_sg_unmap(&req->sg);
1444
1445 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1446 if (status) {
1447 return status;
1448 }
1449
1450 return nvme_tx(n, &req->sg, ptr, len, dir);
1451 }
1452
static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1454 uint32_t align, BlockCompletionFunc *cb,
1455 NvmeRequest *req)
1456 {
1457 assert(req->sg.flags & NVME_SG_ALLOC);
1458
1459 if (req->sg.flags & NVME_SG_DMA) {
1460 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1461 } else {
1462 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1463 }
1464 }
1465
static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1467 uint32_t align, BlockCompletionFunc *cb,
1468 NvmeRequest *req)
1469 {
1470 assert(req->sg.flags & NVME_SG_ALLOC);
1471
1472 if (req->sg.flags & NVME_SG_DMA) {
1473 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1474 } else {
1475 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1476 }
1477 }
1478
static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1480 {
1481 trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1482
1483 stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1484 MEMTXATTRS_UNSPECIFIED);
1485 }
1486
static void nvme_update_cq_head(NvmeCQueue *cq)
1488 {
1489 ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1490 MEMTXATTRS_UNSPECIFIED);
1491
1492 trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1493 }
1494
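/*
 * Post completion queue entries for finished requests on the completion
 * queue's req_list. With shadow doorbells enabled, the CQ head eventidx and
 * shadow value are refreshed first. Posting stops early if the CQ is full or
 * if writing a CQE fails (in which case CSTS.CFS is set). The interrupt is
 * (re)asserted as long as the CQ holds entries not yet consumed by the host.
 */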
static void nvme_post_cqes(void *opaque)
1496 {
1497 NvmeCQueue *cq = opaque;
1498 NvmeCtrl *n = cq->ctrl;
1499 NvmeRequest *req, *next;
1500 bool pending = cq->head != cq->tail;
1501 int ret;
1502
1503 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1504 NvmeSQueue *sq;
1505 hwaddr addr;
1506
1507 if (n->dbbuf_enabled) {
1508 nvme_update_cq_eventidx(cq);
1509 nvme_update_cq_head(cq);
1510 }
1511
1512 if (nvme_cq_full(cq)) {
1513 break;
1514 }
1515
1516 sq = req->sq;
1517 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1518 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1519 req->cqe.sq_head = cpu_to_le16(sq->head);
1520 addr = cq->dma_addr + (cq->tail << NVME_CQES);
1521 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1522 sizeof(req->cqe));
1523 if (ret) {
1524 trace_pci_nvme_err_addr_write(addr);
1525 trace_pci_nvme_err_cfs();
1526 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1527 break;
1528 }
1529
1530 QTAILQ_REMOVE(&cq->req_list, req, entry);
1531
1532 nvme_inc_cq_tail(cq);
1533 nvme_sg_unmap(&req->sg);
1534
1535 if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
1536 qemu_bh_schedule(sq->bh);
1537 }
1538
1539 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1540 }
1541 if (cq->tail != cq->head) {
1542 if (cq->irq_enabled && !pending) {
1543 n->cq_pending++;
1544 }
1545
1546 nvme_irq_assert(n, cq);
1547 }
1548 }
1549
static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1551 {
1552 assert(cq->cqid == req->sq->cqid);
1553 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1554 le32_to_cpu(req->cqe.result),
1555 le32_to_cpu(req->cqe.dw1),
1556 req->status);
1557
1558 if (req->status) {
1559 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1560 req->status, req->cmd.opcode);
1561 }
1562
1563 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1564 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1565
1566 qemu_bh_schedule(cq->bh);
1567 }
1568
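/*
 * Try to complete queued asynchronous events against outstanding AER
 * commands. An event is only posted if an AER is outstanding and its event
 * type is not currently masked; posting an event masks its type until the
 * host clears it again (by reading the corresponding log page).
 */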
static void nvme_process_aers(void *opaque)
1570 {
1571 NvmeCtrl *n = opaque;
1572 NvmeAsyncEvent *event, *next;
1573
1574 trace_pci_nvme_process_aers(n->aer_queued);
1575
1576 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1577 NvmeRequest *req;
1578 NvmeAerResult *result;
1579
1580 /* can't post cqe if there is nothing to complete */
1581 if (!n->outstanding_aers) {
1582 trace_pci_nvme_no_outstanding_aers();
1583 break;
1584 }
1585
1586 /* ignore if masked (cqe posted, but event not cleared) */
1587 if (n->aer_mask & (1 << event->result.event_type)) {
1588 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1589 continue;
1590 }
1591
1592 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1593 n->aer_queued--;
1594
1595 n->aer_mask |= 1 << event->result.event_type;
1596 n->outstanding_aers--;
1597
1598 req = n->aer_reqs[n->outstanding_aers];
1599
1600 result = (NvmeAerResult *) &req->cqe.result;
1601 result->event_type = event->result.event_type;
1602 result->event_info = event->result.event_info;
1603 result->log_page = event->result.log_page;
1604 g_free(event);
1605
1606 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1607 result->log_page);
1608
1609 nvme_enqueue_req_completion(&n->admin_cq, req);
1610 }
1611 }
1612
static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1614 uint8_t event_info, uint8_t log_page)
1615 {
1616 NvmeAsyncEvent *event;
1617
1618 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1619
1620 if (n->aer_queued == n->params.aer_max_queued) {
1621 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1622 return;
1623 }
1624
1625 event = g_new(NvmeAsyncEvent, 1);
1626 event->result = (NvmeAerResult) {
1627 .event_type = event_type,
1628 .event_info = event_info,
1629 .log_page = log_page,
1630 };
1631
1632 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1633 n->aer_queued++;
1634
1635 nvme_process_aers(n);
1636 }
1637
static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1639 {
1640 uint8_t aer_info;
1641
    /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1643 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1644 return;
1645 }
1646
1647 switch (event) {
1648 case NVME_SMART_SPARE:
1649 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1650 break;
1651 case NVME_SMART_TEMPERATURE:
1652 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1653 break;
1654 case NVME_SMART_RELIABILITY:
1655 case NVME_SMART_MEDIA_READ_ONLY:
1656 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1657 case NVME_SMART_PMR_UNRELIABLE:
1658 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1659 break;
1660 default:
1661 return;
1662 }
1663
1664 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1665 }
1666
static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1668 {
1669 NvmeAsyncEvent *event, *next;
1670
1671 n->aer_mask &= ~(1 << event_type);
1672
1673 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1674 if (event->result.event_type == event_type) {
1675 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1676 n->aer_queued--;
1677 g_free(event);
1678 }
1679 }
1680 }
1681
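/*
 * Check a transfer length against MDTS. For example, with the default mdts
 * of 7 and a 4 KiB page size, transfers of up to 4 KiB << 7 = 512 KiB are
 * allowed; an mdts of 0 disables the check.
 */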
static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1683 {
1684 uint8_t mdts = n->params.mdts;
1685
1686 if (mdts && len > n->page_size << mdts) {
1687 trace_pci_nvme_err_mdts(len);
1688 return NVME_INVALID_FIELD | NVME_DNR;
1689 }
1690
1691 return NVME_SUCCESS;
1692 }
1693
static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1695 uint32_t nlb)
1696 {
1697 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1698
1699 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1700 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1701 return NVME_LBA_RANGE | NVME_DNR;
1702 }
1703
1704 return NVME_SUCCESS;
1705 }
1706
static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1708 uint32_t nlb, int flags)
1709 {
1710 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1711
1712 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1713 int64_t offset = nvme_l2b(ns, slba);
1714 int ret;
1715
    /*
     * `pnum` holds the number of bytes after offset that share the same
     * allocation status as the byte at offset. If `pnum` is different from
     * `bytes`, we should check the allocation status of the next range and
     * continue this until all bytes have been checked.
     */
1722 do {
1723 bytes -= pnum;
1724
1725 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1726 if (ret < 0) {
1727 return ret;
1728 }
1729
1730
1731 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1732 !!(ret & BDRV_BLOCK_ZERO));
1733
1734 if (!(ret & flags)) {
1735 return 1;
1736 }
1737
1738 offset += pnum;
1739 } while (pnum != bytes);
1740
1741 return 0;
1742 }
1743
static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1745 uint32_t nlb)
1746 {
1747 int ret;
1748 Error *err = NULL;
1749
1750 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1751 if (ret) {
1752 if (ret < 0) {
1753 error_setg_errno(&err, -ret, "unable to get block status");
1754 error_report_err(err);
1755
1756 return NVME_INTERNAL_DEV_ERROR;
1757 }
1758
1759 return NVME_DULB;
1760 }
1761
1762 return NVME_SUCCESS;
1763 }
1764
static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1766 {
1767 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1768 slba / ns->zone_size;
1769 }
1770
static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1772 {
1773 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1774
1775 if (zone_idx >= ns->num_zones) {
1776 return NULL;
1777 }
1778
1779 return &ns->zone_array[zone_idx];
1780 }
1781
static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1783 {
1784 uint64_t zslba = zone->d.zslba;
1785
1786 switch (nvme_get_zone_state(zone)) {
1787 case NVME_ZONE_STATE_EMPTY:
1788 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1789 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1790 case NVME_ZONE_STATE_CLOSED:
1791 return NVME_SUCCESS;
1792 case NVME_ZONE_STATE_FULL:
1793 trace_pci_nvme_err_zone_is_full(zslba);
1794 return NVME_ZONE_FULL;
1795 case NVME_ZONE_STATE_OFFLINE:
1796 trace_pci_nvme_err_zone_is_offline(zslba);
1797 return NVME_ZONE_OFFLINE;
1798 case NVME_ZONE_STATE_READ_ONLY:
1799 trace_pci_nvme_err_zone_is_read_only(zslba);
1800 return NVME_ZONE_READ_ONLY;
1801 default:
1802 g_assert_not_reached();
1803 }
1804
1805 return NVME_INTERNAL_DEV_ERROR;
1806 }
1807
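/*
 * Check that a write to a zone is allowed. Without a ZRWA, writes must start
 * exactly at the zone's write pointer; with a valid ZRWA, the write may start
 * at or after the write pointer as long as it stays within the window implied
 * by the ZRWA size. In either case the write must not cross the writable
 * (capacity) boundary of the zone.
 */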
static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1809 uint64_t slba, uint32_t nlb)
1810 {
1811 uint64_t zcap = nvme_zone_wr_boundary(zone);
1812 uint16_t status;
1813
1814 status = nvme_check_zone_state_for_write(zone);
1815 if (status) {
1816 return status;
1817 }
1818
1819 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1820 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1821
1822 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1823 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1824 return NVME_ZONE_INVALID_WRITE;
1825 }
1826 } else {
1827 if (unlikely(slba != zone->w_ptr)) {
1828 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1829 zone->w_ptr);
1830 return NVME_ZONE_INVALID_WRITE;
1831 }
1832 }
1833
1834 if (unlikely((slba + nlb) > zcap)) {
1835 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1836 return NVME_ZONE_BOUNDARY_ERROR;
1837 }
1838
1839 return NVME_SUCCESS;
1840 }
1841
1842 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1843 {
1844 switch (nvme_get_zone_state(zone)) {
1845 case NVME_ZONE_STATE_EMPTY:
1846 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1847 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1848 case NVME_ZONE_STATE_FULL:
1849 case NVME_ZONE_STATE_CLOSED:
1850 case NVME_ZONE_STATE_READ_ONLY:
1851 return NVME_SUCCESS;
1852 case NVME_ZONE_STATE_OFFLINE:
1853 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1854 return NVME_ZONE_OFFLINE;
1855 default:
1856 g_assert_not_reached();
1857 }
1858
1859 return NVME_INTERNAL_DEV_ERROR;
1860 }
1861
1862 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1863 uint32_t nlb)
1864 {
1865 NvmeZone *zone;
1866 uint64_t bndry, end;
1867 uint16_t status;
1868
1869 zone = nvme_get_zone_by_slba(ns, slba);
1870 assert(zone);
1871
1872 bndry = nvme_zone_rd_boundary(ns, zone);
1873 end = slba + nlb;
1874
1875 status = nvme_check_zone_state_for_read(zone);
1876 if (status) {
1877 ;
1878 } else if (unlikely(end > bndry)) {
1879 if (!ns->params.cross_zone_read) {
1880 status = NVME_ZONE_BOUNDARY_ERROR;
1881 } else {
1882 /*
1883 * Read across zone boundary - check that all subsequent
1884 * zones that are being read have an appropriate state.
1885 */
1886 do {
1887 zone++;
1888 status = nvme_check_zone_state_for_read(zone);
1889 if (status) {
1890 break;
1891 }
1892 } while (end > nvme_zone_rd_boundary(ns, zone));
1893 }
1894 }
1895
1896 return status;
1897 }
1898
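/*
 * Zone Resource Management: transition a zone to the Full state, releasing
 * its open/active resource accounting and any allocated ZRWA.
 */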
1899 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1900 {
1901 switch (nvme_get_zone_state(zone)) {
1902 case NVME_ZONE_STATE_FULL:
1903 return NVME_SUCCESS;
1904
1905 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1906 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1907 nvme_aor_dec_open(ns);
1908 /* fallthrough */
1909 case NVME_ZONE_STATE_CLOSED:
1910 nvme_aor_dec_active(ns);
1911
1912 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1913 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1914 if (ns->params.numzrwa) {
1915 ns->zns.numzrwa++;
1916 }
1917 }
1918
1919 /* fallthrough */
1920 case NVME_ZONE_STATE_EMPTY:
1921 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1922 return NVME_SUCCESS;
1923
1924 default:
1925 return NVME_ZONE_INVAL_TRANSITION;
1926 }
1927 }
1928
1929 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1930 {
1931 switch (nvme_get_zone_state(zone)) {
1932 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1933 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1934 nvme_aor_dec_open(ns);
1935 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1936 /* fall through */
1937 case NVME_ZONE_STATE_CLOSED:
1938 return NVME_SUCCESS;
1939
1940 default:
1941 return NVME_ZONE_INVAL_TRANSITION;
1942 }
1943 }
1944
1945 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1946 {
1947 switch (nvme_get_zone_state(zone)) {
1948 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1949 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1950 nvme_aor_dec_open(ns);
1951 /* fallthrough */
1952 case NVME_ZONE_STATE_CLOSED:
1953 nvme_aor_dec_active(ns);
1954
1955 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1956 if (ns->params.numzrwa) {
1957 ns->zns.numzrwa++;
1958 }
1959 }
1960
1961 /* fallthrough */
1962 case NVME_ZONE_STATE_FULL:
1963 zone->w_ptr = zone->d.zslba;
1964 zone->d.wp = zone->w_ptr;
1965 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1966 /* fallthrough */
1967 case NVME_ZONE_STATE_EMPTY:
1968 return NVME_SUCCESS;
1969
1970 default:
1971 return NVME_ZONE_INVAL_TRANSITION;
1972 }
1973 }
1974
1975 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1976 {
1977 NvmeZone *zone;
1978
1979 if (ns->params.max_open_zones &&
1980 ns->nr_open_zones == ns->params.max_open_zones) {
1981 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1982 if (zone) {
1983 /*
1984 * Automatically close this implicitly open zone.
1985 */
1986 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1987 nvme_zrm_close(ns, zone);
1988 }
1989 }
1990 }
1991
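/*
 * Flags for nvme_zrm_open_flags(): NVME_ZRM_AUTO requests an implicit
 * (automatic) open, NVME_ZRM_ZRWA additionally allocates a Zone Random
 * Write Area for the zone.
 */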
1992 enum {
1993 NVME_ZRM_AUTO = 1 << 0,
1994 NVME_ZRM_ZRWA = 1 << 1,
1995 };
1996
1997 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1998 NvmeZone *zone, int flags)
1999 {
2000 int act = 0;
2001 uint16_t status;
2002
2003 switch (nvme_get_zone_state(zone)) {
2004 case NVME_ZONE_STATE_EMPTY:
2005 act = 1;
2006
2007 /* fallthrough */
2008
2009 case NVME_ZONE_STATE_CLOSED:
2010 if (n->params.auto_transition_zones) {
2011 nvme_zrm_auto_transition_zone(ns);
2012 }
2013 status = nvme_zns_check_resources(ns, act, 1,
2014 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2015 if (status) {
2016 return status;
2017 }
2018
2019 if (act) {
2020 nvme_aor_inc_active(ns);
2021 }
2022
2023 nvme_aor_inc_open(ns);
2024
2025 if (flags & NVME_ZRM_AUTO) {
2026 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2027 return NVME_SUCCESS;
2028 }
2029
2030 /* fallthrough */
2031
2032 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2033 if (flags & NVME_ZRM_AUTO) {
2034 return NVME_SUCCESS;
2035 }
2036
2037 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2038
2039 /* fallthrough */
2040
2041 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2042 if (flags & NVME_ZRM_ZRWA) {
2043 ns->zns.numzrwa--;
2044
2045 zone->d.za |= NVME_ZA_ZRWA_VALID;
2046 }
2047
2048 return NVME_SUCCESS;
2049
2050 default:
2051 return NVME_ZONE_INVAL_TRANSITION;
2052 }
2053 }
2054
2055 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2056 NvmeZone *zone)
2057 {
2058 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2059 }
2060
2061 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2062 uint32_t nlb)
2063 {
2064 zone->d.wp += nlb;
2065
2066 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2067 nvme_zrm_finish(ns, zone);
2068 }
2069 }
2070
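/*
 * Implicitly flush part of the ZRWA: round the block count up to the ZRWA
 * flush granularity, then advance both the internal write pointer and the
 * reported zone write pointer by that amount.
 */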
2071 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2072 uint32_t nlbc)
2073 {
2074 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2075
2076 nlbc = nzrwafgs * ns->zns.zrwafg;
2077
2078 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2079
2080 zone->w_ptr += nlbc;
2081
2082 nvme_advance_zone_wp(ns, zone, nlbc);
2083 }
2084
2085 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2086 {
2087 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2088 NvmeZone *zone;
2089 uint64_t slba;
2090 uint32_t nlb;
2091
2092 slba = le64_to_cpu(rw->slba);
2093 nlb = le16_to_cpu(rw->nlb) + 1;
2094 zone = nvme_get_zone_by_slba(ns, slba);
2095 assert(zone);
2096
2097 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2098 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2099 uint64_t elba = slba + nlb - 1;
2100
2101 if (elba > ezrwa) {
2102 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2103 }
2104
2105 return;
2106 }
2107
2108 nvme_advance_zone_wp(ns, zone, nlb);
2109 }
2110
2111 static inline bool nvme_is_write(NvmeRequest *req)
2112 {
2113 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2114
2115 return rw->opcode == NVME_CMD_WRITE ||
2116 rw->opcode == NVME_CMD_ZONE_APPEND ||
2117 rw->opcode == NVME_CMD_WRITE_ZEROES;
2118 }
2119
2120 static void nvme_misc_cb(void *opaque, int ret)
2121 {
2122 NvmeRequest *req = opaque;
2123 uint16_t cid = nvme_cid(req);
2124
2125 trace_pci_nvme_misc_cb(cid);
2126
2127 if (ret) {
2128 if (!req->status) {
2129 req->status = NVME_INTERNAL_DEV_ERROR;
2130 }
2131
2132 trace_pci_nvme_err_aio(cid, strerror(-ret), req->status);
2133 }
2134
2135 nvme_enqueue_req_completion(nvme_cq(req), req);
2136 }
2137
2138 void nvme_rw_complete_cb(void *opaque, int ret)
2139 {
2140 NvmeRequest *req = opaque;
2141 NvmeNamespace *ns = req->ns;
2142 BlockBackend *blk = ns->blkconf.blk;
2143 BlockAcctCookie *acct = &req->acct;
2144 BlockAcctStats *stats = blk_get_stats(blk);
2145
2146 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2147
2148 if (ret) {
2149 Error *err = NULL;
2150
2151 block_acct_failed(stats, acct);
2152
2153 switch (req->cmd.opcode) {
2154 case NVME_CMD_READ:
2155 req->status = NVME_UNRECOVERED_READ;
2156 break;
2157
2158 case NVME_CMD_WRITE:
2159 case NVME_CMD_WRITE_ZEROES:
2160 case NVME_CMD_ZONE_APPEND:
2161 req->status = NVME_WRITE_FAULT;
2162 break;
2163
2164 default:
2165 req->status = NVME_INTERNAL_DEV_ERROR;
2166 break;
2167 }
2168
2169 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2170
2171 error_setg_errno(&err, -ret, "aio failed");
2172 error_report_err(err);
2173 } else {
2174 block_acct_done(stats, acct);
2175 }
2176
2177 if (ns->params.zoned && nvme_is_write(req)) {
2178 nvme_finalize_zoned_write(ns, req);
2179 }
2180
2181 nvme_enqueue_req_completion(nvme_cq(req), req);
2182 }
2183
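/*
 * Completion callback for the data portion of a read/write. If the
 * namespace carries metadata, issue the metadata transfer where required
 * (or zero the metadata region for Write Zeroes) before completing the
 * request in nvme_rw_complete_cb().
 */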
2184 static void nvme_rw_cb(void *opaque, int ret)
2185 {
2186 NvmeRequest *req = opaque;
2187 NvmeNamespace *ns = req->ns;
2188
2189 BlockBackend *blk = ns->blkconf.blk;
2190
2191 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2192
2193 if (ret) {
2194 goto out;
2195 }
2196
2197 if (ns->lbaf.ms) {
2198 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2199 uint64_t slba = le64_to_cpu(rw->slba);
2200 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2201 uint64_t offset = nvme_moff(ns, slba);
2202
2203 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2204 size_t mlen = nvme_m2b(ns, nlb);
2205
2206 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2207 BDRV_REQ_MAY_UNMAP,
2208 nvme_rw_complete_cb, req);
2209 return;
2210 }
2211
2212 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2213 uint16_t status;
2214
2215 nvme_sg_unmap(&req->sg);
2216 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2217 if (status) {
2218 ret = -EFAULT;
2219 goto out;
2220 }
2221
2222 if (req->cmd.opcode == NVME_CMD_READ) {
2223 return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2224 }
2225
2226 return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2227 }
2228 }
2229
2230 out:
2231 nvme_rw_complete_cb(req, ret);
2232 }
2233
2234 static void nvme_verify_cb(void *opaque, int ret)
2235 {
2236 NvmeBounceContext *ctx = opaque;
2237 NvmeRequest *req = ctx->req;
2238 NvmeNamespace *ns = req->ns;
2239 BlockBackend *blk = ns->blkconf.blk;
2240 BlockAcctCookie *acct = &req->acct;
2241 BlockAcctStats *stats = blk_get_stats(blk);
2242 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2243 uint64_t slba = le64_to_cpu(rw->slba);
2244 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2245 uint16_t apptag = le16_to_cpu(rw->apptag);
2246 uint16_t appmask = le16_to_cpu(rw->appmask);
2247 uint64_t reftag = le32_to_cpu(rw->reftag);
2248 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2249 uint16_t status;
2250
2251 reftag |= cdw3 << 32;
2252
2253 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2254
2255 if (ret) {
2256 block_acct_failed(stats, acct);
2257 req->status = NVME_UNRECOVERED_READ;
2258
2259 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2260
2261 goto out;
2262 }
2263
2264 block_acct_done(stats, acct);
2265
2266 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2267 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2268 ctx->mdata.iov.size, slba);
2269 if (status) {
2270 req->status = status;
2271 goto out;
2272 }
2273
2274 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2275 ctx->mdata.bounce, ctx->mdata.iov.size,
2276 prinfo, slba, apptag, appmask, &reftag);
2277 }
2278
2279 out:
2280 qemu_iovec_destroy(&ctx->data.iov);
2281 g_free(ctx->data.bounce);
2282
2283 qemu_iovec_destroy(&ctx->mdata.iov);
2284 g_free(ctx->mdata.bounce);
2285
2286 g_free(ctx);
2287
2288 nvme_enqueue_req_completion(nvme_cq(req), req);
2289 }
2290
2292 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2293 {
2294 NvmeBounceContext *ctx = opaque;
2295 NvmeRequest *req = ctx->req;
2296 NvmeNamespace *ns = req->ns;
2297 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2298 uint64_t slba = le64_to_cpu(rw->slba);
2299 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2300 size_t mlen = nvme_m2b(ns, nlb);
2301 uint64_t offset = nvme_moff(ns, slba);
2302 BlockBackend *blk = ns->blkconf.blk;
2303
2304 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2305
2306 if (ret) {
2307 goto out;
2308 }
2309
2310 ctx->mdata.bounce = g_malloc(mlen);
2311
2312 qemu_iovec_reset(&ctx->mdata.iov);
2313 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2314
2315 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2316 nvme_verify_cb, ctx);
2317 return;
2318
2319 out:
2320 nvme_verify_cb(ctx, ret);
2321 }
2322
2323 struct nvme_compare_ctx {
2324 struct {
2325 QEMUIOVector iov;
2326 uint8_t *bounce;
2327 } data;
2328
2329 struct {
2330 QEMUIOVector iov;
2331 uint8_t *bounce;
2332 } mdata;
2333 };
2334
2335 static void nvme_compare_mdata_cb(void *opaque, int ret)
2336 {
2337 NvmeRequest *req = opaque;
2338 NvmeNamespace *ns = req->ns;
2339 NvmeCtrl *n = nvme_ctrl(req);
2340 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2341 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2342 uint16_t apptag = le16_to_cpu(rw->apptag);
2343 uint16_t appmask = le16_to_cpu(rw->appmask);
2344 uint64_t reftag = le32_to_cpu(rw->reftag);
2345 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2346 struct nvme_compare_ctx *ctx = req->opaque;
2347 g_autofree uint8_t *buf = NULL;
2348 BlockBackend *blk = ns->blkconf.blk;
2349 BlockAcctCookie *acct = &req->acct;
2350 BlockAcctStats *stats = blk_get_stats(blk);
2351 uint16_t status = NVME_SUCCESS;
2352
2353 reftag |= cdw3 << 32;
2354
2355 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2356
2357 if (ret) {
2358 block_acct_failed(stats, acct);
2359 req->status = NVME_UNRECOVERED_READ;
2360
2361 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2362
2363 goto out;
2364 }
2365
2366 buf = g_malloc(ctx->mdata.iov.size);
2367
2368 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2369 NVME_TX_DIRECTION_TO_DEVICE, req);
2370 if (status) {
2371 req->status = status;
2372 goto out;
2373 }
2374
2375 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2376 uint64_t slba = le64_to_cpu(rw->slba);
2377 uint8_t *bufp;
2378 uint8_t *mbufp = ctx->mdata.bounce;
2379 uint8_t *end = mbufp + ctx->mdata.iov.size;
2380 int16_t pil = 0;
2381
2382 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2383 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2384 slba, apptag, appmask, &reftag);
2385 if (status) {
2386 req->status = status;
2387 goto out;
2388 }
2389
2390 /*
2391 * When formatted with protection information, do not compare the DIF
2392 * tuple.
2393 */
2394 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2395 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2396 }
2397
2398 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2399 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2400 req->status = NVME_CMP_FAILURE | NVME_DNR;
2401 goto out;
2402 }
2403 }
2404
2405 goto out;
2406 }
2407
2408 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2409 req->status = NVME_CMP_FAILURE | NVME_DNR;
2410 goto out;
2411 }
2412
2413 block_acct_done(stats, acct);
2414
2415 out:
2416 qemu_iovec_destroy(&ctx->data.iov);
2417 g_free(ctx->data.bounce);
2418
2419 qemu_iovec_destroy(&ctx->mdata.iov);
2420 g_free(ctx->mdata.bounce);
2421
2422 g_free(ctx);
2423
2424 nvme_enqueue_req_completion(nvme_cq(req), req);
2425 }
2426
2427 static void nvme_compare_data_cb(void *opaque, int ret)
2428 {
2429 NvmeRequest *req = opaque;
2430 NvmeCtrl *n = nvme_ctrl(req);
2431 NvmeNamespace *ns = req->ns;
2432 BlockBackend *blk = ns->blkconf.blk;
2433 BlockAcctCookie *acct = &req->acct;
2434 BlockAcctStats *stats = blk_get_stats(blk);
2435
2436 struct nvme_compare_ctx *ctx = req->opaque;
2437 g_autofree uint8_t *buf = NULL;
2438 uint16_t status;
2439
2440 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2441
2442 if (ret) {
2443 block_acct_failed(stats, acct);
2444 req->status = NVME_UNRECOVERED_READ;
2445
2446 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), req->status);
2447
2448 goto out;
2449 }
2450
2451 buf = g_malloc(ctx->data.iov.size);
2452
2453 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2454 NVME_TX_DIRECTION_TO_DEVICE, req);
2455 if (status) {
2456 req->status = status;
2457 goto out;
2458 }
2459
2460 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2461 req->status = NVME_CMP_FAILURE | NVME_DNR;
2462 goto out;
2463 }
2464
2465 if (ns->lbaf.ms) {
2466 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2467 uint64_t slba = le64_to_cpu(rw->slba);
2468 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2469 size_t mlen = nvme_m2b(ns, nlb);
2470 uint64_t offset = nvme_moff(ns, slba);
2471
2472 ctx->mdata.bounce = g_malloc(mlen);
2473
2474 qemu_iovec_init(&ctx->mdata.iov, 1);
2475 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2476
2477 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2478 nvme_compare_mdata_cb, req);
2479 return;
2480 }
2481
2482 block_acct_done(stats, acct);
2483
2484 out:
2485 qemu_iovec_destroy(&ctx->data.iov);
2486 g_free(ctx->data.bounce);
2487 g_free(ctx);
2488
2489 nvme_enqueue_req_completion(nvme_cq(req), req);
2490 }
2491
2492 typedef struct NvmeDSMAIOCB {
2493 BlockAIOCB common;
2494 BlockAIOCB *aiocb;
2495 NvmeRequest *req;
2496 int ret;
2497
2498 NvmeDsmRange *range;
2499 unsigned int nr;
2500 unsigned int idx;
2501 } NvmeDSMAIOCB;
2502
2503 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2504 {
2505 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2506
2507 /* break nvme_dsm_cb loop */
2508 iocb->idx = iocb->nr;
2509 iocb->ret = -ECANCELED;
2510
2511 if (iocb->aiocb) {
2512 blk_aio_cancel_async(iocb->aiocb);
2513 iocb->aiocb = NULL;
2514 } else {
2515 /*
2516 * We only reach this if nvme_dsm_cancel() has already been called or
2517 * the command ran to completion.
2518 */
2519 assert(iocb->idx == iocb->nr);
2520 }
2521 }
2522
2523 static const AIOCBInfo nvme_dsm_aiocb_info = {
2524 .aiocb_size = sizeof(NvmeDSMAIOCB),
2525 .cancel_async = nvme_dsm_cancel,
2526 };
2527
2528 static void nvme_dsm_cb(void *opaque, int ret);
2529
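/*
 * Called after a DSM range has been discarded: if the namespace has
 * metadata and all data blocks in the range read back as zeroes, zero the
 * corresponding metadata region before moving on to the next range.
 */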
2530 static void nvme_dsm_md_cb(void *opaque, int ret)
2531 {
2532 NvmeDSMAIOCB *iocb = opaque;
2533 NvmeRequest *req = iocb->req;
2534 NvmeNamespace *ns = req->ns;
2535 NvmeDsmRange *range;
2536 uint64_t slba;
2537 uint32_t nlb;
2538
2539 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2540 goto done;
2541 }
2542
2543 range = &iocb->range[iocb->idx - 1];
2544 slba = le64_to_cpu(range->slba);
2545 nlb = le32_to_cpu(range->nlb);
2546
2547 /*
2548      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2549 * the metadata.
2550 */
2551
2552 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2553 if (ret) {
2554 if (ret < 0) {
2555 goto done;
2556 }
2557
2558 nvme_dsm_cb(iocb, 0);
2559 return;
2560 }
2561
2562 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2563 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2564 nvme_dsm_cb, iocb);
2565 return;
2566
2567 done:
2568 nvme_dsm_cb(iocb, ret);
2569 }
2570
2571 static void nvme_dsm_cb(void *opaque, int ret)
2572 {
2573 NvmeDSMAIOCB *iocb = opaque;
2574 NvmeRequest *req = iocb->req;
2575 NvmeCtrl *n = nvme_ctrl(req);
2576 NvmeNamespace *ns = req->ns;
2577 NvmeDsmRange *range;
2578 uint64_t slba;
2579 uint32_t nlb;
2580
2581 if (iocb->ret < 0) {
2582 goto done;
2583 } else if (ret < 0) {
2584 iocb->ret = ret;
2585 goto done;
2586 }
2587
2588 next:
2589 if (iocb->idx == iocb->nr) {
2590 goto done;
2591 }
2592
2593 range = &iocb->range[iocb->idx++];
2594 slba = le64_to_cpu(range->slba);
2595 nlb = le32_to_cpu(range->nlb);
2596
2597 trace_pci_nvme_dsm_deallocate(slba, nlb);
2598
2599 if (nlb > n->dmrsl) {
2600 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2601 goto next;
2602 }
2603
2604 if (nvme_check_bounds(ns, slba, nlb)) {
2605 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2606 ns->id_ns.nsze);
2607 goto next;
2608 }
2609
2610 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2611 nvme_l2b(ns, nlb),
2612 nvme_dsm_md_cb, iocb);
2613 return;
2614
2615 done:
2616 iocb->aiocb = NULL;
2617 iocb->common.cb(iocb->common.opaque, iocb->ret);
2618 g_free(iocb->range);
2619 qemu_aio_unref(iocb);
2620 }
2621
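/*
 * Dataset Management. Only the Attribute - Deallocate (AD) bit is acted
 * upon: the source range list is copied from the host and each range is
 * discarded asynchronously via nvme_dsm_cb().
 */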
2622 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2623 {
2624 NvmeNamespace *ns = req->ns;
2625 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2626 uint32_t attr = le32_to_cpu(dsm->attributes);
2627 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2628 uint16_t status = NVME_SUCCESS;
2629
2630 trace_pci_nvme_dsm(nr, attr);
2631
2632 if (attr & NVME_DSMGMT_AD) {
2633 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2634 nvme_misc_cb, req);
2635
2636 iocb->req = req;
2637 iocb->ret = 0;
2638 iocb->range = g_new(NvmeDsmRange, nr);
2639 iocb->nr = nr;
2640 iocb->idx = 0;
2641
2642 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2643 req);
2644 if (status) {
2645 g_free(iocb->range);
2646 qemu_aio_unref(iocb);
2647
2648 return status;
2649 }
2650
2651 req->aiocb = &iocb->common;
2652 nvme_dsm_cb(iocb, 0);
2653
2654 return NVME_NO_COMPLETE;
2655 }
2656
2657 return status;
2658 }
2659
2660 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2661 {
2662 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2663 NvmeNamespace *ns = req->ns;
2664 BlockBackend *blk = ns->blkconf.blk;
2665 uint64_t slba = le64_to_cpu(rw->slba);
2666 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2667 size_t len = nvme_l2b(ns, nlb);
2668 size_t data_len = len;
2669 int64_t offset = nvme_l2b(ns, slba);
2670 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2671 uint32_t reftag = le32_to_cpu(rw->reftag);
2672 NvmeBounceContext *ctx = NULL;
2673 uint16_t status;
2674
2675 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2676
2677 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2678 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2679 if (status) {
2680 return status;
2681 }
2682
2683 if (prinfo & NVME_PRINFO_PRACT) {
2684 return NVME_INVALID_PROT_INFO | NVME_DNR;
2685 }
2686 }
2687
2688 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
2689 data_len += nvme_m2b(ns, nlb);
2690 }
2691
2692 if (data_len > (n->page_size << n->params.vsl)) {
2693 return NVME_INVALID_FIELD | NVME_DNR;
2694 }
2695
2696 status = nvme_check_bounds(ns, slba, nlb);
2697 if (status) {
2698 return status;
2699 }
2700
2701 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2702 status = nvme_check_dulbe(ns, slba, nlb);
2703 if (status) {
2704 return status;
2705 }
2706 }
2707
2708 ctx = g_new0(NvmeBounceContext, 1);
2709 ctx->req = req;
2710
2711 ctx->data.bounce = g_malloc(len);
2712
2713 qemu_iovec_init(&ctx->data.iov, 1);
2714 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2715
2716 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2717 BLOCK_ACCT_READ);
2718
2719 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2720 nvme_verify_mdata_in_cb, ctx);
2721 return NVME_NO_COMPLETE;
2722 }
2723
2724 typedef struct NvmeCopyAIOCB {
2725 BlockAIOCB common;
2726 BlockAIOCB *aiocb;
2727 NvmeRequest *req;
2728 NvmeCtrl *n;
2729 int ret;
2730
2731 void *ranges;
2732 unsigned int format;
2733 int nr;
2734 int idx;
2735
2736 uint8_t *bounce;
2737 QEMUIOVector iov;
2738 struct {
2739 BlockAcctCookie read;
2740 BlockAcctCookie write;
2741 } acct;
2742
2743 uint64_t reftag;
2744 uint64_t slba;
2745
2746 NvmeZone *zone;
2747 NvmeNamespace *sns;
2748 uint32_t tcl;
2749 } NvmeCopyAIOCB;
2750
2751 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2752 {
2753 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2754
2755 iocb->ret = -ECANCELED;
2756
2757 if (iocb->aiocb) {
2758 blk_aio_cancel_async(iocb->aiocb);
2759 iocb->aiocb = NULL;
2760 }
2761 }
2762
2763 static const AIOCBInfo nvme_copy_aiocb_info = {
2764 .aiocb_size = sizeof(NvmeCopyAIOCB),
2765 .cancel_async = nvme_copy_cancel,
2766 };
2767
2768 static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2769 {
2770 NvmeRequest *req = iocb->req;
2771 NvmeNamespace *ns = req->ns;
2772 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2773
2774 if (iocb->idx != iocb->nr) {
2775 req->cqe.result = cpu_to_le32(iocb->idx);
2776 }
2777
2778 qemu_iovec_destroy(&iocb->iov);
2779 g_free(iocb->bounce);
2780
2781 if (iocb->ret < 0) {
2782 block_acct_failed(stats, &iocb->acct.read);
2783 block_acct_failed(stats, &iocb->acct.write);
2784 } else {
2785 block_acct_done(stats, &iocb->acct.read);
2786 block_acct_done(stats, &iocb->acct.write);
2787 }
2788
2789 iocb->common.cb(iocb->common.opaque, iocb->ret);
2790 qemu_aio_unref(iocb);
2791 }
2792
2793 static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2794
2795 static void nvme_copy_source_range_parse_format0_2(void *ranges,
2796 int idx, uint64_t *slba,
2797 uint32_t *nlb,
2798 uint32_t *snsid,
2799 uint16_t *apptag,
2800 uint16_t *appmask,
2801 uint64_t *reftag)
2802 {
2803 NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2804
2805 if (snsid) {
2806 *snsid = le32_to_cpu(_ranges[idx].sparams);
2807 }
2808
2809 if (slba) {
2810 *slba = le64_to_cpu(_ranges[idx].slba);
2811 }
2812
2813 if (nlb) {
2814 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2815 }
2816
2817 if (apptag) {
2818 *apptag = le16_to_cpu(_ranges[idx].apptag);
2819 }
2820
2821 if (appmask) {
2822 *appmask = le16_to_cpu(_ranges[idx].appmask);
2823 }
2824
2825 if (reftag) {
2826 *reftag = le32_to_cpu(_ranges[idx].reftag);
2827 }
2828 }
2829
2830 static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2831 uint64_t *slba,
2832 uint32_t *nlb,
2833 uint32_t *snsid,
2834 uint16_t *apptag,
2835 uint16_t *appmask,
2836 uint64_t *reftag)
2837 {
2838 NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2839
2840 if (snsid) {
2841 *snsid = le32_to_cpu(_ranges[idx].sparams);
2842 }
2843
2844 if (slba) {
2845 *slba = le64_to_cpu(_ranges[idx].slba);
2846 }
2847
2848 if (nlb) {
2849 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2850 }
2851
2852 if (apptag) {
2853 *apptag = le16_to_cpu(_ranges[idx].apptag);
2854 }
2855
2856 if (appmask) {
2857 *appmask = le16_to_cpu(_ranges[idx].appmask);
2858 }
2859
2860 if (reftag) {
2861 *reftag = 0;
2862
2863 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2864 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2865 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2866 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2867 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2868 *reftag |= (uint64_t)_ranges[idx].sr[9];
2869 }
2870 }
2871
2872 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2873 uint64_t *slba, uint32_t *nlb,
2874 uint32_t *snsid, uint16_t *apptag,
2875 uint16_t *appmask, uint64_t *reftag)
2876 {
2877 switch (format) {
2878 case NVME_COPY_FORMAT_0:
2879 case NVME_COPY_FORMAT_2:
2880 nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2881 apptag, appmask, reftag);
2882 break;
2883
2884 case NVME_COPY_FORMAT_1:
2885 case NVME_COPY_FORMAT_3:
2886 nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2887 apptag, appmask, reftag);
2888 break;
2889
2890 default:
2891 abort();
2892 }
2893 }
2894
2895 static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2896 NvmeCopyAIOCB *iocb, uint16_t nr)
2897 {
2898 uint32_t copy_len = 0;
2899
2900 for (int idx = 0; idx < nr; idx++) {
2901 uint32_t nlb;
2902 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2903 &nlb, NULL, NULL, NULL, NULL);
2904 copy_len += nlb;
2905 }
2906 iocb->tcl = copy_len;
2907 if (copy_len > ns->id_ns.mcl) {
2908 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2909 }
2910
2911 return NVME_SUCCESS;
2912 }
2913
2914 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2915 {
2916 NvmeCopyAIOCB *iocb = opaque;
2917 NvmeRequest *req = iocb->req;
2918 NvmeNamespace *dns = req->ns;
2919 uint32_t nlb;
2920
2921 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2922 &nlb, NULL, NULL, NULL, NULL);
2923
2924 if (ret < 0) {
2925 iocb->ret = ret;
2926 req->status = NVME_WRITE_FAULT;
2927 goto out;
2928 } else if (iocb->ret < 0) {
2929 goto out;
2930 }
2931
2932 if (dns->params.zoned) {
2933 nvme_advance_zone_wp(dns, iocb->zone, nlb);
2934 }
2935
2936 iocb->idx++;
2937 iocb->slba += nlb;
2938 out:
2939 nvme_do_copy(iocb);
2940 }
2941
2942 static void nvme_copy_out_cb(void *opaque, int ret)
2943 {
2944 NvmeCopyAIOCB *iocb = opaque;
2945 NvmeRequest *req = iocb->req;
2946 NvmeNamespace *dns = req->ns;
2947 uint32_t nlb;
2948 size_t mlen;
2949 uint8_t *mbounce;
2950
2951 if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2952 goto out;
2953 }
2954
2955 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2956 &nlb, NULL, NULL, NULL, NULL);
2957
2958 mlen = nvme_m2b(dns, nlb);
2959 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2960
2961 qemu_iovec_reset(&iocb->iov);
2962 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2963
2964 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2965 &iocb->iov, 0, nvme_copy_out_completed_cb,
2966 iocb);
2967
2968 return;
2969
2970 out:
2971 nvme_copy_out_completed_cb(iocb, ret);
2972 }
2973
2974 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2975 {
2976 NvmeCopyAIOCB *iocb = opaque;
2977 NvmeRequest *req = iocb->req;
2978 NvmeNamespace *sns = iocb->sns;
2979 NvmeNamespace *dns = req->ns;
2980 NvmeCopyCmd *copy = NULL;
2981 uint8_t *mbounce = NULL;
2982 uint32_t nlb;
2983 uint64_t slba;
2984 uint16_t apptag, appmask;
2985 uint64_t reftag;
2986 size_t len, mlen;
2987 uint16_t status;
2988
2989 if (ret < 0) {
2990 iocb->ret = ret;
2991 req->status = NVME_UNRECOVERED_READ;
2992 goto out;
2993 } else if (iocb->ret < 0) {
2994 goto out;
2995 }
2996
2997 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2998 &nlb, NULL, &apptag, &appmask, &reftag);
2999
3000 trace_pci_nvme_copy_out(iocb->slba, nlb);
3001
3002 len = nvme_l2b(sns, nlb);
3003
3004 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
3005 copy = (NvmeCopyCmd *)&req->cmd;
3006
3007 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3008
3009 mlen = nvme_m2b(sns, nlb);
3010 mbounce = iocb->bounce + nvme_l2b(sns, nlb);
3011
3012 status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
3013 if (status) {
3014 goto invalid;
3015 }
3016 status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
3017 slba, apptag, appmask, &reftag);
3018 if (status) {
3019 goto invalid;
3020 }
3021 }
3022
3023 if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3024 copy = (NvmeCopyCmd *)&req->cmd;
3025 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3026
3027 mlen = nvme_m2b(dns, nlb);
3028 mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3029
3030 apptag = le16_to_cpu(copy->apptag);
3031 appmask = le16_to_cpu(copy->appmask);
3032
3033 if (prinfow & NVME_PRINFO_PRACT) {
3034 status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3035 if (status) {
3036 goto invalid;
3037 }
3038
3039 nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3040 apptag, &iocb->reftag);
3041 } else {
3042 status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3043 prinfow, iocb->slba, apptag, appmask,
3044 &iocb->reftag);
3045 if (status) {
3046 goto invalid;
3047 }
3048 }
3049 }
3050
3051 status = nvme_check_bounds(dns, iocb->slba, nlb);
3052 if (status) {
3053 goto invalid;
3054 }
3055
3056 if (dns->params.zoned) {
3057 status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3058 if (status) {
3059 goto invalid;
3060 }
3061
3062 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3063 iocb->zone->w_ptr += nlb;
3064 }
3065 }
3066
3067 qemu_iovec_reset(&iocb->iov);
3068 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3069
3070 block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3071 BLOCK_ACCT_WRITE);
3072
3073 iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3074 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3075
3076 return;
3077
3078 invalid:
3079 req->status = status;
3080 iocb->ret = -1;
3081 out:
3082 nvme_do_copy(iocb);
3083 }
3084
3085 static void nvme_copy_in_cb(void *opaque, int ret)
3086 {
3087 NvmeCopyAIOCB *iocb = opaque;
3088 NvmeNamespace *sns = iocb->sns;
3089 uint64_t slba;
3090 uint32_t nlb;
3091
3092 if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3093 goto out;
3094 }
3095
3096 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3097 &nlb, NULL, NULL, NULL, NULL);
3098
3099 qemu_iovec_reset(&iocb->iov);
3100 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3101 nvme_m2b(sns, nlb));
3102
3103 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3104 &iocb->iov, 0, nvme_copy_in_completed_cb,
3105 iocb);
3106 return;
3107
3108 out:
3109 nvme_copy_in_completed_cb(iocb, ret);
3110 }
3111
3112 static inline bool nvme_csi_supports_copy(uint8_t csi)
3113 {
3114 return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3115 }
3116
3117 static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3118 NvmeNamespace *dns)
3119 {
3120 return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3121 }
3122
3123 static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3124 bool pi_enable)
3125 {
3126 if (!nvme_csi_supports_copy(sns->csi) ||
3127 !nvme_csi_supports_copy(dns->csi)) {
3128 return false;
3129 }
3130
3131 if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3132 return false;
3133 }
3134
3135 if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3136 sns->id_ns.dps != dns->id_ns.dps)) {
3137 return false;
3138 }
3139
3140 return true;
3141 }
3142
3143 static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3144 NvmeNamespace *dns)
3145 {
3146 return sns->lbaf.ms == 0 &&
3147 ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3148 (dns->lbaf.ms == 16 && dns->pif == 1));
3149 }
3150
3151 static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3152 bool sns_pi_en)
3153 {
3154 if (!nvme_csi_supports_copy(sns->csi) ||
3155 !nvme_csi_supports_copy(dns->csi)) {
3156 return false;
3157 }
3158
3159 if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3160 return false;
3161 }
3162
3163 if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3164 return false;
3165 }
3166
3167 return true;
3168 }
3169
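/*
 * Drive the Copy state machine: parse the next source range, validate
 * source/destination format and protection information compatibility,
 * check bounds and zone state, then read the range into the bounce buffer
 * (continued in nvme_copy_in_cb()).
 */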
3170 static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3171 {
3172 NvmeRequest *req = iocb->req;
3173 NvmeNamespace *sns;
3174 NvmeNamespace *dns = req->ns;
3175 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3176 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3177 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3178 uint64_t slba;
3179 uint32_t nlb;
3180 size_t len;
3181 uint16_t status;
3182 uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3183 uint32_t snsid = dnsid;
3184
3185 if (iocb->ret < 0) {
3186 goto done;
3187 }
3188
3189 if (iocb->idx == iocb->nr) {
3190 goto done;
3191 }
3192
3193 if (iocb->format == 2 || iocb->format == 3) {
3194 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3195 &slba, &nlb, &snsid, NULL, NULL, NULL);
3196 if (snsid != dnsid) {
3197 if (snsid == NVME_NSID_BROADCAST ||
3198 !nvme_nsid_valid(iocb->n, snsid)) {
3199 status = NVME_INVALID_NSID | NVME_DNR;
3200 goto invalid;
3201 }
3202 iocb->sns = nvme_ns(iocb->n, snsid);
3203 if (unlikely(!iocb->sns)) {
3204 status = NVME_INVALID_FIELD | NVME_DNR;
3205 goto invalid;
3206 }
3207 } else {
3208 if (((slba + nlb) > iocb->slba) &&
3209 ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3210 status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3211 goto invalid;
3212 }
3213 }
3214 } else {
3215 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3216 &slba, &nlb, NULL, NULL, NULL, NULL);
3217 }
3218
3219 sns = iocb->sns;
3220 if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3221 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3222 status = NVME_INVALID_FIELD | NVME_DNR;
3223 goto invalid;
3224 } else if (snsid != dnsid) {
3225 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3226 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3227 if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3228 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3229 goto invalid;
3230 }
3231 }
3232 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3233 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3234 if ((prinfor & NVME_PRINFO_PRACT) !=
3235 (prinfow & NVME_PRINFO_PRACT)) {
3236 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3237 goto invalid;
3238 } else {
3239 if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3240 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3241 goto invalid;
3242 }
3243 }
3244 }
3245
3246 if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3247 NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3248 if (!(prinfow & NVME_PRINFO_PRACT)) {
3249 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3250 goto invalid;
3251 } else {
3252 if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3253 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3254 goto invalid;
3255 }
3256 }
3257 }
3258
3259 if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3260 !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3261 if (!(prinfor & NVME_PRINFO_PRACT)) {
3262 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3263 goto invalid;
3264 } else {
3265 if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3266 status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3267 goto invalid;
3268 }
3269 }
3270 }
3271 }
3272 len = nvme_l2b(sns, nlb);
3273
3274 trace_pci_nvme_copy_source_range(slba, nlb);
3275
3276 if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3277 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3278 goto invalid;
3279 }
3280
3281 status = nvme_check_bounds(sns, slba, nlb);
3282 if (status) {
3283 goto invalid;
3284 }
3285
3286 if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3287 status = nvme_check_dulbe(sns, slba, nlb);
3288 if (status) {
3289 goto invalid;
3290 }
3291 }
3292
3293 if (sns->params.zoned) {
3294 status = nvme_check_zone_read(sns, slba, nlb);
3295 if (status) {
3296 goto invalid;
3297 }
3298 }
3299
3300 g_free(iocb->bounce);
3301 iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3302 sns->lbasz + sns->lbaf.ms);
3303
3304 qemu_iovec_reset(&iocb->iov);
3305 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3306
3307 block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3308 BLOCK_ACCT_READ);
3309
3310 iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3311 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3312 return;
3313
3314 invalid:
3315 req->status = status;
3316 iocb->ret = -1;
3317 done:
3318 nvme_copy_done(iocb);
3319 }
3320
3321 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3322 {
3323 NvmeNamespace *ns = req->ns;
3324 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3325 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3326 nvme_misc_cb, req);
3327 uint16_t nr = copy->nr + 1;
3328 uint8_t format = copy->control[0] & 0xf;
3329 size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3330
3331 uint16_t status;
3332
3333 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3334
3335 iocb->ranges = NULL;
3336 iocb->zone = NULL;
3337
3338 if (!(n->id_ctrl.ocfs & (1 << format)) ||
3339 ((format == 2 || format == 3) &&
3340 !(n->features.hbs.cdfe & (1 << format)))) {
3341 trace_pci_nvme_err_copy_invalid_format(format);
3342 status = NVME_INVALID_FIELD | NVME_DNR;
3343 goto invalid;
3344 }
3345
3346 if (nr > ns->id_ns.msrc + 1) {
3347 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3348 goto invalid;
3349 }
3350
3351 if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3352 (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3353 status = NVME_INVALID_FORMAT | NVME_DNR;
3354 goto invalid;
3355 }
3356
3357 if (ns->pif) {
3358 len = sizeof(NvmeCopySourceRangeFormat1_3);
3359 }
3360
3361 iocb->format = format;
3362 iocb->ranges = g_malloc_n(nr, len);
3363 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3364 if (status) {
3365 goto invalid;
3366 }
3367
3368 iocb->slba = le64_to_cpu(copy->sdlba);
3369
3370 if (ns->params.zoned) {
3371 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3372 if (!iocb->zone) {
3373 status = NVME_LBA_RANGE | NVME_DNR;
3374 goto invalid;
3375 }
3376
3377 status = nvme_zrm_auto(n, ns, iocb->zone);
3378 if (status) {
3379 goto invalid;
3380 }
3381 }
3382
3383 status = nvme_check_copy_mcl(ns, iocb, nr);
3384 if (status) {
3385 goto invalid;
3386 }
3387
3388 iocb->req = req;
3389 iocb->ret = 0;
3390 iocb->nr = nr;
3391 iocb->idx = 0;
3392 iocb->reftag = le32_to_cpu(copy->reftag);
3393 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3394
3395 qemu_iovec_init(&iocb->iov, 1);
3396
3397 req->aiocb = &iocb->common;
3398 iocb->sns = req->ns;
3399 iocb->n = n;
3400 iocb->bounce = NULL;
3401 nvme_do_copy(iocb);
3402
3403 return NVME_NO_COMPLETE;
3404
3405 invalid:
3406 g_free(iocb->ranges);
3407 qemu_aio_unref(iocb);
3408 return status;
3409 }
3410
3411 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3412 {
3413 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3414 NvmeNamespace *ns = req->ns;
3415 BlockBackend *blk = ns->blkconf.blk;
3416 uint64_t slba = le64_to_cpu(rw->slba);
3417 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3418 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3419 size_t data_len = nvme_l2b(ns, nlb);
3420 size_t len = data_len;
3421 int64_t offset = nvme_l2b(ns, slba);
3422 struct nvme_compare_ctx *ctx = NULL;
3423 uint16_t status;
3424
3425 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3426
3427 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3428 return NVME_INVALID_PROT_INFO | NVME_DNR;
3429 }
3430
3431 if (nvme_ns_ext(ns)) {
3432 len += nvme_m2b(ns, nlb);
3433 }
3434
3435 if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) {
3436 status = nvme_check_mdts(n, data_len);
3437 } else {
3438 status = nvme_check_mdts(n, len);
3439 }
3440 if (status) {
3441 return status;
3442 }
3443
3444 status = nvme_check_bounds(ns, slba, nlb);
3445 if (status) {
3446 return status;
3447 }
3448
3449 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3450 status = nvme_check_dulbe(ns, slba, nlb);
3451 if (status) {
3452 return status;
3453 }
3454 }
3455
3456 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3457 if (status) {
3458 return status;
3459 }
3460
3461 ctx = g_new(struct nvme_compare_ctx, 1);
3462 ctx->data.bounce = g_malloc(data_len);
3463
3464 req->opaque = ctx;
3465
3466 qemu_iovec_init(&ctx->data.iov, 1);
3467 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3468
3469 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3470 BLOCK_ACCT_READ);
3471 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3472 nvme_compare_data_cb, req);
3473
3474 return NVME_NO_COMPLETE;
3475 }
3476
3477 typedef struct NvmeFlushAIOCB {
3478 BlockAIOCB common;
3479 BlockAIOCB *aiocb;
3480 NvmeRequest *req;
3481 int ret;
3482
3483 NvmeNamespace *ns;
3484 uint32_t nsid;
3485 bool broadcast;
3486 } NvmeFlushAIOCB;
3487
3488 static void nvme_flush_cancel(BlockAIOCB *acb)
3489 {
3490 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3491
3492 iocb->ret = -ECANCELED;
3493
3494 if (iocb->aiocb) {
3495 blk_aio_cancel_async(iocb->aiocb);
3496 iocb->aiocb = NULL;
3497 }
3498 }
3499
3500 static const AIOCBInfo nvme_flush_aiocb_info = {
3501 .aiocb_size = sizeof(NvmeFlushAIOCB),
3502 .cancel_async = nvme_flush_cancel,
3503 };
3504
3505 static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3506
3507 static void nvme_flush_ns_cb(void *opaque, int ret)
3508 {
3509 NvmeFlushAIOCB *iocb = opaque;
3510 NvmeNamespace *ns = iocb->ns;
3511
3512 if (ret < 0) {
3513 iocb->ret = ret;
3514 iocb->req->status = NVME_WRITE_FAULT;
3515 goto out;
3516 } else if (iocb->ret < 0) {
3517 goto out;
3518 }
3519
3520 if (ns) {
3521 trace_pci_nvme_flush_ns(iocb->nsid);
3522
3523 iocb->ns = NULL;
3524 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3525 return;
3526 }
3527
3528 out:
3529 nvme_do_flush(iocb);
3530 }
3531
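/*
 * Flush the targeted namespace, or - for a broadcast flush - walk the
 * attached namespaces and flush them one by one, completing once no
 * namespace is left or an error has been recorded.
 */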
3532 static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3533 {
3534 NvmeRequest *req = iocb->req;
3535 NvmeCtrl *n = nvme_ctrl(req);
3536 int i;
3537
3538 if (iocb->ret < 0) {
3539 goto done;
3540 }
3541
3542 if (iocb->broadcast) {
3543 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3544 iocb->ns = nvme_ns(n, i);
3545 if (iocb->ns) {
3546 iocb->nsid = i;
3547 break;
3548 }
3549 }
3550 }
3551
3552 if (!iocb->ns) {
3553 goto done;
3554 }
3555
3556 nvme_flush_ns_cb(iocb, 0);
3557 return;
3558
3559 done:
3560 iocb->common.cb(iocb->common.opaque, iocb->ret);
3561 qemu_aio_unref(iocb);
3562 }
3563
3564 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3565 {
3566 NvmeFlushAIOCB *iocb;
3567 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3568 uint16_t status;
3569
3570 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3571
3572 iocb->req = req;
3573 iocb->ret = 0;
3574 iocb->ns = NULL;
3575 iocb->nsid = 0;
3576 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3577
3578 if (!iocb->broadcast) {
3579 if (!nvme_nsid_valid(n, nsid)) {
3580 status = NVME_INVALID_NSID | NVME_DNR;
3581 goto out;
3582 }
3583
3584 iocb->ns = nvme_ns(n, nsid);
3585 if (!iocb->ns) {
3586 status = NVME_INVALID_FIELD | NVME_DNR;
3587 goto out;
3588 }
3589
3590 iocb->nsid = nsid;
3591 }
3592
3593 req->aiocb = &iocb->common;
3594 nvme_do_flush(iocb);
3595
3596 return NVME_NO_COMPLETE;
3597
3598 out:
3599 qemu_aio_unref(iocb);
3600
3601 return status;
3602 }
3603
3604 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3605 {
3606 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3607 NvmeNamespace *ns = req->ns;
3608 uint64_t slba = le64_to_cpu(rw->slba);
3609 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3610 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3611 uint64_t data_size = nvme_l2b(ns, nlb);
3612 uint64_t mapped_size = data_size;
3613 uint64_t data_offset;
3614 BlockBackend *blk = ns->blkconf.blk;
3615 uint16_t status;
3616
3617 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3618 mapped_size += nvme_m2b(ns, nlb);
3619
3620 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3621 bool pract = prinfo & NVME_PRINFO_PRACT;
3622
3623 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3624 mapped_size = data_size;
3625 }
3626 }
3627 }
3628
3629 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3630
3631 status = nvme_check_mdts(n, mapped_size);
3632 if (status) {
3633 goto invalid;
3634 }
3635
3636 status = nvme_check_bounds(ns, slba, nlb);
3637 if (status) {
3638 goto invalid;
3639 }
3640
3641 if (ns->params.zoned) {
3642 status = nvme_check_zone_read(ns, slba, nlb);
3643 if (status) {
3644 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3645 goto invalid;
3646 }
3647 }
3648
3649 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3650 status = nvme_check_dulbe(ns, slba, nlb);
3651 if (status) {
3652 goto invalid;
3653 }
3654 }
3655
3656 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3657 return nvme_dif_rw(n, req);
3658 }
3659
3660 status = nvme_map_data(n, nlb, req);
3661 if (status) {
3662 goto invalid;
3663 }
3664
3665 data_offset = nvme_l2b(ns, slba);
3666
3667 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3668 BLOCK_ACCT_READ);
3669 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3670 return NVME_NO_COMPLETE;
3671
3672 invalid:
3673 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3674 return status | NVME_DNR;
3675 }
3676
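/*
 * Flexible Data Placement accounting: resolve the reclaim unit from the
 * placement identifier (falling back to placement handle 0), update the
 * host/media bytes-written statistics and consume reclaim unit capacity,
 * switching to a new reclaim unit (nvme_update_ruh()) when the current
 * one is filled.
 */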
3677 static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3678 uint32_t nlb)
3679 {
3680 NvmeNamespace *ns = req->ns;
3681 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3682 uint64_t data_size = nvme_l2b(ns, nlb);
3683 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3684 uint8_t dtype = (dw12 >> 20) & 0xf;
3685 uint16_t pid = le16_to_cpu(rw->dspec);
3686 uint16_t ph, rg, ruhid;
3687 NvmeReclaimUnit *ru;
3688
3689 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3690 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3691 ph = 0;
3692 rg = 0;
3693 }
3694
3695 ruhid = ns->fdp.phs[ph];
3696 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3697
3698 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3699 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3700
3701 while (nlb) {
3702 if (nlb < ru->ruamw) {
3703 ru->ruamw -= nlb;
3704 break;
3705 }
3706
3707 nlb -= ru->ruamw;
3708 nvme_update_ruh(n, ns, pid);
3709 }
3710 }
3711
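/*
 * Common write path for Write, Write Zeroes and Zone Append: check MDTS
 * and bounds, apply zoned write-pointer/append semantics or FDP
 * accounting, then submit the data (or write-zeroes) I/O.
 */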
3712 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3713 bool wrz)
3714 {
3715 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3716 NvmeNamespace *ns = req->ns;
3717 uint64_t slba = le64_to_cpu(rw->slba);
3718 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3719 uint16_t ctrl = le16_to_cpu(rw->control);
3720 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3721 uint64_t data_size = nvme_l2b(ns, nlb);
3722 uint64_t mapped_size = data_size;
3723 uint64_t data_offset;
3724 NvmeZone *zone;
3725 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3726 BlockBackend *blk = ns->blkconf.blk;
3727 uint16_t status;
3728
3729 if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3730 mapped_size += nvme_m2b(ns, nlb);
3731
3732 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3733 bool pract = prinfo & NVME_PRINFO_PRACT;
3734
3735 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3736 mapped_size -= nvme_m2b(ns, nlb);
3737 }
3738 }
3739 }
3740
3741 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3742 nvme_nsid(ns), nlb, mapped_size, slba);
3743
3744 if (!wrz) {
3745 status = nvme_check_mdts(n, mapped_size);
3746 if (status) {
3747 goto invalid;
3748 }
3749 }
3750
3751 status = nvme_check_bounds(ns, slba, nlb);
3752 if (status) {
3753 goto invalid;
3754 }
3755
3756 if (ns->params.zoned) {
3757 zone = nvme_get_zone_by_slba(ns, slba);
3758 assert(zone);
3759
3760 if (append) {
3761 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3762
3763 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3764 return NVME_INVALID_ZONE_OP | NVME_DNR;
3765 }
3766
3767 if (unlikely(slba != zone->d.zslba)) {
3768 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3769 status = NVME_INVALID_FIELD;
3770 goto invalid;
3771 }
3772
3773 if (n->params.zasl &&
3774 data_size > (uint64_t)n->page_size << n->params.zasl) {
3775 trace_pci_nvme_err_zasl(data_size);
3776 return NVME_INVALID_FIELD | NVME_DNR;
3777 }
3778
3779 slba = zone->w_ptr;
3780 rw->slba = cpu_to_le64(slba);
3781 res->slba = cpu_to_le64(slba);
3782
3783 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3784 case NVME_ID_NS_DPS_TYPE_1:
3785 if (!piremap) {
3786 return NVME_INVALID_PROT_INFO | NVME_DNR;
3787 }
3788
3789 /* fallthrough */
3790
3791 case NVME_ID_NS_DPS_TYPE_2:
3792 if (piremap) {
3793 uint32_t reftag = le32_to_cpu(rw->reftag);
3794 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3795 }
3796
3797 break;
3798
3799 case NVME_ID_NS_DPS_TYPE_3:
3800 if (piremap) {
3801 return NVME_INVALID_PROT_INFO | NVME_DNR;
3802 }
3803
3804 break;
3805 }
3806 }
3807
3808 status = nvme_check_zone_write(ns, zone, slba, nlb);
3809 if (status) {
3810 goto invalid;
3811 }
3812
3813 status = nvme_zrm_auto(n, ns, zone);
3814 if (status) {
3815 goto invalid;
3816 }
3817
3818 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3819 zone->w_ptr += nlb;
3820 }
3821 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3822 nvme_do_write_fdp(n, req, slba, nlb);
3823 }
3824
3825 data_offset = nvme_l2b(ns, slba);
3826
3827 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3828 return nvme_dif_rw(n, req);
3829 }
3830
3831 if (!wrz) {
3832 status = nvme_map_data(n, nlb, req);
3833 if (status) {
3834 goto invalid;
3835 }
3836
3837 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3838 BLOCK_ACCT_WRITE);
3839 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3840 } else {
3841 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3842 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3843 req);
3844 }
3845
3846 return NVME_NO_COMPLETE;
3847
3848 invalid:
3849 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3850 return status | NVME_DNR;
3851 }
3852
3853 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3854 {
3855 return nvme_do_write(n, req, false, false);
3856 }
3857
3858 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3859 {
3860 return nvme_do_write(n, req, false, true);
3861 }
3862
3863 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3864 {
3865 return nvme_do_write(n, req, true, false);
3866 }
3867
3868 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3869 uint64_t *slba, uint32_t *zone_idx)
3870 {
3871 uint32_t dw10 = le32_to_cpu(c->cdw10);
3872 uint32_t dw11 = le32_to_cpu(c->cdw11);
3873
3874 if (!ns->params.zoned) {
3875 trace_pci_nvme_err_invalid_opc(c->opcode);
3876 return NVME_INVALID_OPCODE | NVME_DNR;
3877 }
3878
3879 *slba = ((uint64_t)dw11) << 32 | dw10;
3880 if (unlikely(*slba >= ns->id_ns.nsze)) {
3881 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3882 *slba = 0;
3883 return NVME_LBA_RANGE | NVME_DNR;
3884 }
3885
3886 *zone_idx = nvme_zone_idx(ns, *slba);
3887 assert(*zone_idx < ns->num_zones);
3888
3889 return NVME_SUCCESS;
3890 }
3891
3892 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3893 NvmeRequest *);
3894
3895 enum NvmeZoneProcessingMask {
3896 NVME_PROC_CURRENT_ZONE = 0,
3897 NVME_PROC_OPENED_ZONES = 1 << 0,
3898 NVME_PROC_CLOSED_ZONES = 1 << 1,
3899 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3900 NVME_PROC_FULL_ZONES = 1 << 3,
3901 };
3902
3903 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3904 NvmeZoneState state, NvmeRequest *req)
3905 {
3906 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3907 int flags = 0;
3908
3909 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3910 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3911
3912 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3913 return NVME_INVALID_ZONE_OP | NVME_DNR;
3914 }
3915
3916 if (zone->w_ptr % ns->zns.zrwafg) {
3917 return NVME_NOZRWA | NVME_DNR;
3918 }
3919
3920 flags = NVME_ZRM_ZRWA;
3921 }
3922
3923 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3924 }
3925
3926 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3927 NvmeZoneState state, NvmeRequest *req)
3928 {
3929 return nvme_zrm_close(ns, zone);
3930 }
3931
3932 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3933 NvmeZoneState state, NvmeRequest *req)
3934 {
3935 return nvme_zrm_finish(ns, zone);
3936 }
3937
3938 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3939 NvmeZoneState state, NvmeRequest *req)
3940 {
3941 switch (state) {
3942 case NVME_ZONE_STATE_READ_ONLY:
3943 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3944 /* fall through */
3945 case NVME_ZONE_STATE_OFFLINE:
3946 return NVME_SUCCESS;
3947 default:
3948 return NVME_ZONE_INVAL_TRANSITION;
3949 }
3950 }
3951
3952 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3953 {
3954 uint16_t status;
3955 uint8_t state = nvme_get_zone_state(zone);
3956
3957 if (state == NVME_ZONE_STATE_EMPTY) {
3958 status = nvme_aor_check(ns, 1, 0);
3959 if (status) {
3960 return status;
3961 }
3962 nvme_aor_inc_active(ns);
3963 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3964 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3965 return NVME_SUCCESS;
3966 }
3967
3968 return NVME_ZONE_INVAL_TRANSITION;
3969 }
3970
3971 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3972 enum NvmeZoneProcessingMask proc_mask,
3973 op_handler_t op_hndlr, NvmeRequest *req)
3974 {
3975 uint16_t status = NVME_SUCCESS;
3976 NvmeZoneState zs = nvme_get_zone_state(zone);
3977 bool proc_zone;
3978
3979 switch (zs) {
3980 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3981 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3982 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3983 break;
3984 case NVME_ZONE_STATE_CLOSED:
3985 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3986 break;
3987 case NVME_ZONE_STATE_READ_ONLY:
3988 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3989 break;
3990 case NVME_ZONE_STATE_FULL:
3991 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3992 break;
3993 default:
3994 proc_zone = false;
3995 }
3996
3997 if (proc_zone) {
3998 status = op_hndlr(ns, zone, zs, req);
3999 }
4000
4001 return status;
4002 }
4003
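/*
 * Apply a zone operation either to the single zone selected by the command
 * (proc_mask is empty) or, for Select All operations, to all zones in the
 * states selected by proc_mask. Processing stops at the first failing zone.
 */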
4004 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
4005 enum NvmeZoneProcessingMask proc_mask,
4006 op_handler_t op_hndlr, NvmeRequest *req)
4007 {
4008 NvmeZone *next;
4009 uint16_t status = NVME_SUCCESS;
4010 int i;
4011
4012 if (!proc_mask) {
4013 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
4014 } else {
4015 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
4016 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
4017 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4018 req);
4019 if (status && status != NVME_NO_COMPLETE) {
4020 goto out;
4021 }
4022 }
4023 }
4024 if (proc_mask & NVME_PROC_OPENED_ZONES) {
4025 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4026 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4027 req);
4028 if (status && status != NVME_NO_COMPLETE) {
4029 goto out;
4030 }
4031 }
4032
4033 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4034 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4035 req);
4036 if (status && status != NVME_NO_COMPLETE) {
4037 goto out;
4038 }
4039 }
4040 }
4041 if (proc_mask & NVME_PROC_FULL_ZONES) {
4042 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4043 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4044 req);
4045 if (status && status != NVME_NO_COMPLETE) {
4046 goto out;
4047 }
4048 }
4049 }
4050
4051 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4052 for (i = 0; i < ns->num_zones; i++, zone++) {
4053 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4054 req);
4055 if (status && status != NVME_NO_COMPLETE) {
4056 goto out;
4057 }
4058 }
4059 }
4060 }
4061
4062 out:
4063 return status;
4064 }
4065
4066 typedef struct NvmeZoneResetAIOCB {
4067 BlockAIOCB common;
4068 BlockAIOCB *aiocb;
4069 NvmeRequest *req;
4070 int ret;
4071
4072 bool all;
4073 int idx;
4074 NvmeZone *zone;
4075 } NvmeZoneResetAIOCB;
4076
4077 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4078 {
4079 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4080 NvmeRequest *req = iocb->req;
4081 NvmeNamespace *ns = req->ns;
4082
4083 iocb->idx = ns->num_zones;
4084
4085 iocb->ret = -ECANCELED;
4086
4087 if (iocb->aiocb) {
4088 blk_aio_cancel_async(iocb->aiocb);
4089 iocb->aiocb = NULL;
4090 }
4091 }
4092
4093 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4094 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4095 .cancel_async = nvme_zone_reset_cancel,
4096 };
4097
4098 static void nvme_zone_reset_cb(void *opaque, int ret);
4099
4100 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4101 {
4102 NvmeZoneResetAIOCB *iocb = opaque;
4103 NvmeRequest *req = iocb->req;
4104 NvmeNamespace *ns = req->ns;
4105 int64_t moff;
4106 int count;
4107
4108 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4109 goto out;
4110 }
4111
4112 moff = nvme_moff(ns, iocb->zone->d.zslba);
4113 count = nvme_m2b(ns, ns->zone_size);
4114
4115 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4116 BDRV_REQ_MAY_UNMAP,
4117 nvme_zone_reset_cb, iocb);
4118 return;
4119
4120 out:
4121 nvme_zone_reset_cb(iocb, ret);
4122 }
4123
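/*
 * Asynchronous zone reset state machine. Each invocation either finalizes
 * the zone that was just zeroed, picks the next resettable zone and issues
 * a write zeroes for its data (the epilogue callback then zeroes the zone
 * metadata, if the namespace has any), or completes the request once no
 * zones remain.
 */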
4124 static void nvme_zone_reset_cb(void *opaque, int ret)
4125 {
4126 NvmeZoneResetAIOCB *iocb = opaque;
4127 NvmeRequest *req = iocb->req;
4128 NvmeNamespace *ns = req->ns;
4129
4130 if (iocb->ret < 0) {
4131 goto done;
4132 } else if (ret < 0) {
4133 iocb->ret = ret;
4134 goto done;
4135 }
4136
4137 if (iocb->zone) {
4138 nvme_zrm_reset(ns, iocb->zone);
4139
4140 if (!iocb->all) {
4141 goto done;
4142 }
4143 }
4144
4145 while (iocb->idx < ns->num_zones) {
4146 NvmeZone *zone = &ns->zone_array[iocb->idx++];
4147
4148 switch (nvme_get_zone_state(zone)) {
4149 case NVME_ZONE_STATE_EMPTY:
4150 if (!iocb->all) {
4151 goto done;
4152 }
4153
4154 continue;
4155
4156 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4157 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4158 case NVME_ZONE_STATE_CLOSED:
4159 case NVME_ZONE_STATE_FULL:
4160 iocb->zone = zone;
4161 break;
4162
4163 default:
4164 continue;
4165 }
4166
4167 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4168
4169 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4170 nvme_l2b(ns, zone->d.zslba),
4171 nvme_l2b(ns, ns->zone_size),
4172 BDRV_REQ_MAY_UNMAP,
4173 nvme_zone_reset_epilogue_cb,
4174 iocb);
4175 return;
4176 }
4177
4178 done:
4179 iocb->aiocb = NULL;
4180
4181 iocb->common.cb(iocb->common.opaque, iocb->ret);
4182 qemu_aio_unref(iocb);
4183 }
4184
4185 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4186 uint64_t elba, NvmeRequest *req)
4187 {
4188 NvmeNamespace *ns = req->ns;
4189 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4190 uint64_t wp = zone->d.wp;
4191 uint32_t nlb = elba - wp + 1;
4192 uint16_t status;
4193
4194
4195 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4196 return NVME_INVALID_ZONE_OP | NVME_DNR;
4197 }
4198
4199 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4200 return NVME_INVALID_FIELD | NVME_DNR;
4201 }
4202
4203 if (elba < wp || elba > wp + ns->zns.zrwas) {
4204 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4205 }
4206
4207 if (nlb % ns->zns.zrwafg) {
4208 return NVME_INVALID_FIELD | NVME_DNR;
4209 }
4210
4211 status = nvme_zrm_auto(n, ns, zone);
4212 if (status) {
4213 return status;
4214 }
4215
4216 zone->w_ptr += nlb;
4217
4218 nvme_advance_zone_wp(ns, zone, nlb);
4219
4220 return NVME_SUCCESS;
4221 }
4222
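/*
 * Zone Management Send: dispatch the open, close, finish, reset, offline,
 * set zone descriptor extension and ZRWA flush actions, expanding the
 * Select All flag into the set of zone states each action operates on.
 * Reset runs asynchronously through NvmeZoneResetAIOCB.
 */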
4223 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4224 {
4225 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4226 NvmeNamespace *ns = req->ns;
4227 NvmeZone *zone;
4228 NvmeZoneResetAIOCB *iocb;
4229 uint8_t *zd_ext;
4230 uint64_t slba = 0;
4231 uint32_t zone_idx = 0;
4232 uint16_t status;
4233 uint8_t action = cmd->zsa;
4234 bool all;
4235 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4236
4237 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4238
4239 req->status = NVME_SUCCESS;
4240
4241 if (!all) {
4242 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4243 if (status) {
4244 return status;
4245 }
4246 }
4247
4248 zone = &ns->zone_array[zone_idx];
4249 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4250 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4251 return NVME_INVALID_FIELD | NVME_DNR;
4252 }
4253
4254 switch (action) {
4255
4256 case NVME_ZONE_ACTION_OPEN:
4257 if (all) {
4258 proc_mask = NVME_PROC_CLOSED_ZONES;
4259 }
4260 trace_pci_nvme_open_zone(slba, zone_idx, all);
4261 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4262 break;
4263
4264 case NVME_ZONE_ACTION_CLOSE:
4265 if (all) {
4266 proc_mask = NVME_PROC_OPENED_ZONES;
4267 }
4268 trace_pci_nvme_close_zone(slba, zone_idx, all);
4269 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4270 break;
4271
4272 case NVME_ZONE_ACTION_FINISH:
4273 if (all) {
4274 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4275 }
4276 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4277 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4278 break;
4279
4280 case NVME_ZONE_ACTION_RESET:
4281 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4282
4283 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4284 nvme_misc_cb, req);
4285
4286 iocb->req = req;
4287 iocb->ret = 0;
4288 iocb->all = all;
4289 iocb->idx = zone_idx;
4290 iocb->zone = NULL;
4291
4292 req->aiocb = &iocb->common;
4293 nvme_zone_reset_cb(iocb, 0);
4294
4295 return NVME_NO_COMPLETE;
4296
4297 case NVME_ZONE_ACTION_OFFLINE:
4298 if (all) {
4299 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4300 }
4301 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4302 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4303 break;
4304
4305 case NVME_ZONE_ACTION_SET_ZD_EXT:
4306 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4307 if (all || !ns->params.zd_extension_size) {
4308 return NVME_INVALID_FIELD | NVME_DNR;
4309 }
4310 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4311 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4312 if (status) {
4313 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4314 return status;
4315 }
4316
4317 status = nvme_set_zd_ext(ns, zone);
4318 if (status == NVME_SUCCESS) {
4319 trace_pci_nvme_zd_extension_set(zone_idx);
4320 return status;
4321 }
4322 break;
4323
4324 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4325 if (all) {
4326 return NVME_INVALID_FIELD | NVME_DNR;
4327 }
4328
4329 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4330
4331 default:
4332 trace_pci_nvme_err_invalid_mgmt_action(action);
4333 status = NVME_INVALID_FIELD;
4334 }
4335
4336 if (status == NVME_ZONE_INVAL_TRANSITION) {
4337 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4338 zone->d.za);
4339 }
4340 if (status) {
4341 status |= NVME_DNR;
4342 }
4343
4344 return status;
4345 }
4346
4347 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4348 {
4349 NvmeZoneState zs = nvme_get_zone_state(zl);
4350
4351 switch (zafs) {
4352 case NVME_ZONE_REPORT_ALL:
4353 return true;
4354 case NVME_ZONE_REPORT_EMPTY:
4355 return zs == NVME_ZONE_STATE_EMPTY;
4356 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4357 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4358 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4359 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4360 case NVME_ZONE_REPORT_CLOSED:
4361 return zs == NVME_ZONE_STATE_CLOSED;
4362 case NVME_ZONE_REPORT_FULL:
4363 return zs == NVME_ZONE_STATE_FULL;
4364 case NVME_ZONE_REPORT_READ_ONLY:
4365 return zs == NVME_ZONE_STATE_READ_ONLY;
4366 case NVME_ZONE_REPORT_OFFLINE:
4367 return zs == NVME_ZONE_STATE_OFFLINE;
4368 default:
4369 return false;
4370 }
4371 }
4372
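/*
 * Zone Management Receive: build a zone report (optionally with zone
 * descriptor extensions) for zones starting at the given SLBA that match
 * the requested state filter, and copy it to the host buffer.
 */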
4373 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4374 {
4375 NvmeCmd *cmd = &req->cmd;
4376 NvmeNamespace *ns = req->ns;
4377 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4378 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4379 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4380 uint32_t zone_idx, zra, zrasf, partial;
4381 uint64_t max_zones, nr_zones = 0;
4382 uint16_t status;
4383 uint64_t slba;
4384 NvmeZoneDescr *z;
4385 NvmeZone *zone;
4386 NvmeZoneReportHeader *header;
4387 void *buf, *buf_p;
4388 size_t zone_entry_sz;
4389 int i;
4390
4391 req->status = NVME_SUCCESS;
4392
4393 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4394 if (status) {
4395 return status;
4396 }
4397
4398 zra = dw13 & 0xff;
4399 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4400 return NVME_INVALID_FIELD | NVME_DNR;
4401 }
4402 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4403 return NVME_INVALID_FIELD | NVME_DNR;
4404 }
4405
4406 zrasf = (dw13 >> 8) & 0xff;
4407 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4408 return NVME_INVALID_FIELD | NVME_DNR;
4409 }
4410
4411 if (data_size < sizeof(NvmeZoneReportHeader)) {
4412 return NVME_INVALID_FIELD | NVME_DNR;
4413 }
4414
4415 status = nvme_check_mdts(n, data_size);
4416 if (status) {
4417 return status;
4418 }
4419
4420 partial = (dw13 >> 16) & 0x01;
4421
4422 zone_entry_sz = sizeof(NvmeZoneDescr);
4423 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4424 zone_entry_sz += ns->params.zd_extension_size;
4425 }
4426
4427 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4428 buf = g_malloc0(data_size);
4429
4430 zone = &ns->zone_array[zone_idx];
4431 for (i = zone_idx; i < ns->num_zones; i++) {
4432 if (partial && nr_zones >= max_zones) {
4433 break;
4434 }
4435 if (nvme_zone_matches_filter(zrasf, zone++)) {
4436 nr_zones++;
4437 }
4438 }
4439 header = buf;
4440 header->nr_zones = cpu_to_le64(nr_zones);
4441
4442 buf_p = buf + sizeof(NvmeZoneReportHeader);
4443 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4444 zone = &ns->zone_array[zone_idx];
4445 if (nvme_zone_matches_filter(zrasf, zone)) {
4446 z = buf_p;
4447 buf_p += sizeof(NvmeZoneDescr);
4448
4449 z->zt = zone->d.zt;
4450 z->zs = zone->d.zs;
4451 z->zcap = cpu_to_le64(zone->d.zcap);
4452 z->zslba = cpu_to_le64(zone->d.zslba);
4453 z->za = zone->d.za;
4454
4455 if (nvme_wp_is_valid(zone)) {
4456 z->wp = cpu_to_le64(zone->d.wp);
4457 } else {
4458 z->wp = cpu_to_le64(~0ULL);
4459 }
4460
4461 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4462 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4463 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4464 ns->params.zd_extension_size);
4465 }
4466 buf_p += ns->params.zd_extension_size;
4467 }
4468
4469 max_zones--;
4470 }
4471 }
4472
4473 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4474
4475 g_free(buf);
4476
4477 return status;
4478 }
4479
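/*
 * I/O Management Receive, Reclaim Unit Handle Status: report one status
 * descriptor per (placement handle, reclaim group) pair of the namespace.
 */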
4480 static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4481 size_t len)
4482 {
4483 NvmeNamespace *ns = req->ns;
4484 NvmeEnduranceGroup *endgrp;
4485 NvmeRuhStatus *hdr;
4486 NvmeRuhStatusDescr *ruhsd;
4487 unsigned int nruhsd;
4488 uint16_t rg, ph, *ruhid;
4489 size_t trans_len;
4490 g_autofree uint8_t *buf = NULL;
4491
4492 if (!n->subsys) {
4493 return NVME_INVALID_FIELD | NVME_DNR;
4494 }
4495
4496 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4497 return NVME_INVALID_NSID | NVME_DNR;
4498 }
4499
4500 if (!n->subsys->endgrp.fdp.enabled) {
4501 return NVME_FDP_DISABLED | NVME_DNR;
4502 }
4503
4504 endgrp = ns->endgrp;
4505
4506 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4507 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4508 buf = g_malloc0(trans_len);
4509
4510 trans_len = MIN(trans_len, len);
4511
4512 hdr = (NvmeRuhStatus *)buf;
4513 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4514
4515 hdr->nruhsd = cpu_to_le16(nruhsd);
4516
4517 ruhid = ns->fdp.phs;
4518
4519 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4520 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4521
4522 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4523 uint16_t pid = nvme_make_pid(ns, rg, ph);
4524
4525 ruhsd->pid = cpu_to_le16(pid);
4526 ruhsd->ruhid = *ruhid;
4527 ruhsd->earutr = 0;
4528 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4529 }
4530 }
4531
4532 return nvme_c2h(n, buf, trans_len, req);
4533 }
4534
4535 static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4536 {
4537 NvmeCmd *cmd = &req->cmd;
4538 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4539 uint32_t numd = le32_to_cpu(cmd->cdw11);
4540 uint8_t mo = (cdw10 & 0xff);
4541 size_t len = (numd + 1) << 2;
4542
4543 switch (mo) {
4544 case NVME_IOMR_MO_NOP:
4545 return 0;
4546 case NVME_IOMR_MO_RUH_STATUS:
4547 return nvme_io_mgmt_recv_ruhs(n, req, len);
4548 default:
4549 return NVME_INVALID_FIELD | NVME_DNR;
4550 }
4551 }
4552
4553 static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4554 {
4555 NvmeCmd *cmd = &req->cmd;
4556 NvmeNamespace *ns = req->ns;
4557 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4558 uint16_t ret = NVME_SUCCESS;
4559 uint32_t npid = (cdw10 >> 16) + 1;
4560 unsigned int i = 0;
4561 g_autofree uint16_t *pids = NULL;
4562 uint32_t maxnpid;
4563
4564 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4565 return NVME_FDP_DISABLED | NVME_DNR;
4566 }
4567
4568 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4569
4570 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4571 return NVME_INVALID_FIELD | NVME_DNR;
4572 }
4573
4574 pids = g_new(uint16_t, npid);
4575
4576 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4577 if (ret) {
4578 return ret;
4579 }
4580
4581 for (; i < npid; i++) {
4582 if (!nvme_update_ruh(n, ns, pids[i])) {
4583 return NVME_INVALID_FIELD | NVME_DNR;
4584 }
4585 }
4586
4587 return ret;
4588 }
4589
4590 static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4591 {
4592 NvmeCmd *cmd = &req->cmd;
4593 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4594 uint8_t mo = (cdw10 & 0xff);
4595
4596 switch (mo) {
4597 case NVME_IOMS_MO_NOP:
4598 return 0;
4599 case NVME_IOMS_MO_RUH_UPDATE:
4600 return nvme_io_mgmt_send_ruh_update(n, req);
4601 default:
4602 return NVME_INVALID_FIELD | NVME_DNR;
4603 }
4604 }
4605
4606 static uint16_t __nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req)
4607 {
4608 switch (req->cmd.opcode) {
4609 case NVME_CMD_WRITE:
4610 return nvme_write(n, req);
4611 case NVME_CMD_READ:
4612 return nvme_read(n, req);
4613 case NVME_CMD_COMPARE:
4614 return nvme_compare(n, req);
4615 case NVME_CMD_WRITE_ZEROES:
4616 return nvme_write_zeroes(n, req);
4617 case NVME_CMD_DSM:
4618 return nvme_dsm(n, req);
4619 case NVME_CMD_VERIFY:
4620 return nvme_verify(n, req);
4621 case NVME_CMD_COPY:
4622 return nvme_copy(n, req);
4623 case NVME_CMD_IO_MGMT_RECV:
4624 return nvme_io_mgmt_recv(n, req);
4625 case NVME_CMD_IO_MGMT_SEND:
4626 return nvme_io_mgmt_send(n, req);
4627 }
4628
4629 g_assert_not_reached();
4630 }
4631
4632 static uint16_t nvme_io_cmd_nvm(NvmeCtrl *n, NvmeRequest *req)
4633 {
4634 if (!(n->cse.iocs.nvm[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4635 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4636 return NVME_INVALID_OPCODE | NVME_DNR;
4637 }
4638
4639 return __nvme_io_cmd_nvm(n, req);
4640 }
4641
4642 static uint16_t nvme_io_cmd_zoned(NvmeCtrl *n, NvmeRequest *req)
4643 {
4644 if (!(n->cse.iocs.zoned[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4645 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4646 return NVME_INVALID_OPCODE | NVME_DNR;
4647 }
4648
4649 switch (req->cmd.opcode) {
4650 case NVME_CMD_ZONE_APPEND:
4651 return nvme_zone_append(n, req);
4652 case NVME_CMD_ZONE_MGMT_SEND:
4653 return nvme_zone_mgmt_send(n, req);
4654 case NVME_CMD_ZONE_MGMT_RECV:
4655 return nvme_zone_mgmt_recv(n, req);
4656 }
4657
4658 return __nvme_io_cmd_nvm(n, req);
4659 }
4660
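/*
 * Top-level I/O command dispatch: handle the cross-namespace Flush case,
 * validate the NSID and namespace state, then route the command to the NVM
 * or Zoned command set handler based on the namespace CSI.
 */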
4661 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4662 {
4663 NvmeNamespace *ns;
4664 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4665
4666 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4667 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4668
4669 /*
4670 * In the base NVM command set, Flush may apply to all namespaces
4671 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4672 * along with TP 4056 (Namespace Types), the semantics become ambiguous.
4673 *
4674 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4675 * opcode with a specific command since we cannot determine a unique I/O
4676 * command set. Opcode 0h could mean something entirely different from a
4677 * flush, with completely different semantics, in some other command set -
4678 * does an NSID of FFFFFFFFh then
4679 * mean "for all namespaces, apply whatever command set specific command
4680 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
4681 * whatever command that uses the 0h opcode if, and only if, it allows NSID
4682 * to be FFFFFFFFh"?
4683 *
4684 * Anyway (and luckily), for now, we do not care about this since the
4685 * device only supports namespace types that include the NVM Flush command
4686 * (NVM and Zoned), so always do an NVM Flush.
4687 */
4688
4689 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4690 return nvme_flush(n, req);
4691 }
4692
4693 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4694 return NVME_INVALID_NSID | NVME_DNR;
4695 }
4696
4697 ns = nvme_ns(n, nsid);
4698 if (unlikely(!ns)) {
4699 return NVME_INVALID_FIELD | NVME_DNR;
4700 }
4701
4702 if (ns->status) {
4703 return ns->status;
4704 }
4705
4706 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4707 return NVME_INVALID_FIELD;
4708 }
4709
4710 req->ns = ns;
4711
4712 switch (ns->csi) {
4713 case NVME_CSI_NVM:
4714 return nvme_io_cmd_nvm(n, req);
4715 case NVME_CSI_ZONED:
4716 return nvme_io_cmd_zoned(n, req);
4717 }
4718
4719 g_assert_not_reached();
4720 }
4721
4722 static void nvme_cq_notifier(EventNotifier *e)
4723 {
4724 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4725 NvmeCtrl *n = cq->ctrl;
4726
4727 if (!event_notifier_test_and_clear(e)) {
4728 return;
4729 }
4730
4731 nvme_update_cq_head(cq);
4732
4733 if (cq->tail == cq->head) {
4734 if (cq->irq_enabled) {
4735 n->cq_pending--;
4736 }
4737
4738 nvme_irq_deassert(n, cq);
4739 }
4740
4741 qemu_bh_schedule(cq->bh);
4742 }
4743
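/*
 * Register an ioeventfd for the completion queue head doorbell. With the
 * default doorbell stride, each queue pair occupies 8 bytes of doorbell
 * space starting at offset 1000h: the SQ tail doorbell at byte 0 and the
 * CQ head doorbell at byte 4 (hence the "+ (1 << 2)").
 */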
4744 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4745 {
4746 NvmeCtrl *n = cq->ctrl;
4747 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4748 int ret;
4749
4750 ret = event_notifier_init(&cq->notifier, 0);
4751 if (ret < 0) {
4752 return ret;
4753 }
4754
4755 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4756 memory_region_add_eventfd(&n->iomem,
4757 0x1000 + offset, 4, false, 0, &cq->notifier);
4758
4759 return 0;
4760 }
4761
4762 static void nvme_sq_notifier(EventNotifier *e)
4763 {
4764 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4765
4766 if (!event_notifier_test_and_clear(e)) {
4767 return;
4768 }
4769
4770 nvme_process_sq(sq);
4771 }
4772
4773 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4774 {
4775 NvmeCtrl *n = sq->ctrl;
4776 uint16_t offset = sq->sqid << 3;
4777 int ret;
4778
4779 ret = event_notifier_init(&sq->notifier, 0);
4780 if (ret < 0) {
4781 return ret;
4782 }
4783
4784 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4785 memory_region_add_eventfd(&n->iomem,
4786 0x1000 + offset, 4, false, 0, &sq->notifier);
4787
4788 return 0;
4789 }
4790
4791 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4792 {
4793 uint16_t offset = sq->sqid << 3;
4794
4795 n->sq[sq->sqid] = NULL;
4796 qemu_bh_delete(sq->bh);
4797 if (sq->ioeventfd_enabled) {
4798 memory_region_del_eventfd(&n->iomem,
4799 0x1000 + offset, 4, false, 0, &sq->notifier);
4800 event_notifier_set_handler(&sq->notifier, NULL);
4801 event_notifier_cleanup(&sq->notifier);
4802 }
4803 g_free(sq->io_req);
4804 if (sq->sqid) {
4805 g_free(sq);
4806 }
4807 }
4808
4809 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4810 {
4811 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4812 NvmeRequest *r, *next;
4813 NvmeSQueue *sq;
4814 NvmeCQueue *cq;
4815 uint16_t qid = le16_to_cpu(c->qid);
4816
4817 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4818 trace_pci_nvme_err_invalid_del_sq(qid);
4819 return NVME_INVALID_QID | NVME_DNR;
4820 }
4821
4822 trace_pci_nvme_del_sq(qid);
4823
4824 sq = n->sq[qid];
4825 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4826 r = QTAILQ_FIRST(&sq->out_req_list);
4827 assert(r->aiocb);
4828 r->status = NVME_CMD_ABORT_SQ_DEL;
4829 blk_aio_cancel(r->aiocb);
4830 }
4831
4832 assert(QTAILQ_EMPTY(&sq->out_req_list));
4833
4834 if (!nvme_check_cqid(n, sq->cqid)) {
4835 cq = n->cq[sq->cqid];
4836 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4837
4838 nvme_post_cqes(cq);
4839 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4840 if (r->sq == sq) {
4841 QTAILQ_REMOVE(&cq->req_list, r, entry);
4842 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4843 }
4844 }
4845 }
4846
4847 nvme_free_sq(sq, n);
4848 return NVME_SUCCESS;
4849 }
4850
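/*
 * Initialize a submission queue: allocate its request pool, hook it up to
 * its completion queue and, if the doorbell buffer config is in effect, set
 * up shadow doorbell addresses and an optional ioeventfd.
 */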
4851 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4852 uint16_t sqid, uint16_t cqid, uint16_t size)
4853 {
4854 int i;
4855 NvmeCQueue *cq;
4856
4857 sq->ctrl = n;
4858 sq->dma_addr = dma_addr;
4859 sq->sqid = sqid;
4860 sq->size = size;
4861 sq->cqid = cqid;
4862 sq->head = sq->tail = 0;
4863 sq->io_req = g_new0(NvmeRequest, sq->size);
4864
4865 QTAILQ_INIT(&sq->req_list);
4866 QTAILQ_INIT(&sq->out_req_list);
4867 for (i = 0; i < sq->size; i++) {
4868 sq->io_req[i].sq = sq;
4869 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4870 }
4871
4872 sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4873 &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4874
4875 if (n->dbbuf_enabled) {
4876 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4877 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4878
4879 if (n->params.ioeventfd && sq->sqid != 0) {
4880 if (!nvme_init_sq_ioeventfd(sq)) {
4881 sq->ioeventfd_enabled = true;
4882 }
4883 }
4884 }
4885
4886 assert(n->cq[cqid]);
4887 cq = n->cq[cqid];
4888 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4889 n->sq[sqid] = sq;
4890 }
4891
4892 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4893 {
4894 NvmeSQueue *sq;
4895 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4896
4897 uint16_t cqid = le16_to_cpu(c->cqid);
4898 uint16_t sqid = le16_to_cpu(c->sqid);
4899 uint16_t qsize = le16_to_cpu(c->qsize);
4900 uint16_t qflags = le16_to_cpu(c->sq_flags);
4901 uint64_t prp1 = le64_to_cpu(c->prp1);
4902
4903 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4904
4905 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4906 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4907 return NVME_INVALID_CQID | NVME_DNR;
4908 }
4909 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4910 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4911 return NVME_INVALID_QID | NVME_DNR;
4912 }
4913 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4914 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4915 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4916 }
4917 if (unlikely(prp1 & (n->page_size - 1))) {
4918 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4919 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4920 }
4921 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4922 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4923 return NVME_INVALID_FIELD | NVME_DNR;
4924 }
4925 sq = g_malloc0(sizeof(*sq));
4926 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4927 return NVME_SUCCESS;
4928 }
4929
4930 struct nvme_stats {
4931 uint64_t units_read;
4932 uint64_t units_written;
4933 uint64_t read_commands;
4934 uint64_t write_commands;
4935 };
4936
4937 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4938 {
4939 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4940
4941 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4942 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4943 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4944 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4945 }
4946
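/*
 * OCP extended SMART log (vendor specific log page, enabled with the `ocp`
 * device parameter): accumulate block statistics across all namespaces and
 * report them together with the OCP log page GUID and version (0005h).
 */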
4947 static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
4948 uint32_t buf_len, uint64_t off,
4949 NvmeRequest *req)
4950 {
4951 NvmeNamespace *ns = NULL;
4952 NvmeSmartLogExtended smart_l = { 0 };
4953 struct nvme_stats stats = { 0 };
4954 uint32_t trans_len;
4955
4956 if (off >= sizeof(smart_l)) {
4957 return NVME_INVALID_FIELD | NVME_DNR;
4958 }
4959
4960 /* accumulate all stats from all namespaces */
4961 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4962 ns = nvme_ns(n, i);
4963 if (ns) {
4964 nvme_set_blk_stats(ns, &stats);
4965 }
4966 }
4967
4968 smart_l.physical_media_units_written[0] = cpu_to_le64(stats.units_written);
4969 smart_l.physical_media_units_read[0] = cpu_to_le64(stats.units_read);
4970 smart_l.log_page_version = 0x0005;
4971
4972 static const uint8_t guid[16] = {
4973 0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
4974 0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
4975 };
4976 memcpy(smart_l.log_page_guid, guid, sizeof(smart_l.log_page_guid));
4977
4978 if (!rae) {
4979 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4980 }
4981
4982 trans_len = MIN(sizeof(smart_l) - off, buf_len);
4983 return nvme_c2h(n, (uint8_t *) &smart_l + off, trans_len, req);
4984 }
4985
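/*
 * SMART / Health Information log page. Data units are reported in units of
 * 1000 512-byte sectors (rounded up), accumulated either for a single
 * namespace or, for NSID FFFFFFFFh, across all namespaces.
 */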
4986 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4987 uint64_t off, NvmeRequest *req)
4988 {
4989 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4990 struct nvme_stats stats = { 0 };
4991 NvmeSmartLog smart = { 0 };
4992 uint32_t trans_len;
4993 NvmeNamespace *ns;
4994 time_t current_ms;
4995 uint64_t u_read, u_written;
4996
4997 if (off >= sizeof(smart)) {
4998 return NVME_INVALID_FIELD | NVME_DNR;
4999 }
5000
5001 if (nsid != 0xffffffff) {
5002 ns = nvme_ns(n, nsid);
5003 if (!ns) {
5004 return NVME_INVALID_NSID | NVME_DNR;
5005 }
5006 nvme_set_blk_stats(ns, &stats);
5007 } else {
5008 int i;
5009
5010 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5011 ns = nvme_ns(n, i);
5012 if (!ns) {
5013 continue;
5014 }
5015 nvme_set_blk_stats(ns, &stats);
5016 }
5017 }
5018
5019 trans_len = MIN(sizeof(smart) - off, buf_len);
5020 smart.critical_warning = n->smart_critical_warning;
5021
5022 u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
5023 u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
5024
5025 smart.data_units_read[0] = cpu_to_le64(u_read);
5026 smart.data_units_written[0] = cpu_to_le64(u_written);
5027 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5028 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5029
5030 smart.temperature = cpu_to_le16(n->temperature);
5031
5032 if ((n->temperature >= n->features.temp_thresh_hi) ||
5033 (n->temperature <= n->features.temp_thresh_low)) {
5034 smart.critical_warning |= NVME_SMART_TEMPERATURE;
5035 }
5036
5037 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5038 smart.power_on_hours[0] =
5039 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
5040
5041 if (!rae) {
5042 nvme_clear_events(n, NVME_AER_TYPE_SMART);
5043 }
5044
5045 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
5046 }
5047
5048 static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5049 uint64_t off, NvmeRequest *req)
5050 {
5051 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
5052 uint16_t endgrpid = (dw11 >> 16) & 0xffff;
5053 struct nvme_stats stats = {};
5054 NvmeEndGrpLog info = {};
5055 int i;
5056
5057 if (!n->subsys || endgrpid != 0x1) {
5058 return NVME_INVALID_FIELD | NVME_DNR;
5059 }
5060
5061 if (off >= sizeof(info)) {
5062 return NVME_INVALID_FIELD | NVME_DNR;
5063 }
5064
5065 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5066 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
5067 if (!ns) {
5068 continue;
5069 }
5070
5071 nvme_set_blk_stats(ns, &stats);
5072 }
5073
5074 info.data_units_read[0] =
5075 cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
5076 info.data_units_written[0] =
5077 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5078 info.media_units_written[0] =
5079 cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5080
5081 info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5082 info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5083
5084 buf_len = MIN(sizeof(info) - off, buf_len);
5085
5086 return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
5087 }
5088
5089
5090 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
5091 NvmeRequest *req)
5092 {
5093 uint32_t trans_len;
5094 NvmeFwSlotInfoLog fw_log = {
5095 .afi = 0x1,
5096 };
5097
5098 if (off >= sizeof(fw_log)) {
5099 return NVME_INVALID_FIELD | NVME_DNR;
5100 }
5101
5102 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5103 trans_len = MIN(sizeof(fw_log) - off, buf_len);
5104
5105 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5106 }
5107
5108 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5109 uint64_t off, NvmeRequest *req)
5110 {
5111 uint32_t trans_len;
5112 NvmeErrorLog errlog;
5113
5114 if (off >= sizeof(errlog)) {
5115 return NVME_INVALID_FIELD | NVME_DNR;
5116 }
5117
5118 if (!rae) {
5119 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5120 }
5121
5122 memset(&errlog, 0x0, sizeof(errlog));
5123 trans_len = MIN(sizeof(errlog) - off, buf_len);
5124
5125 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5126 }
5127
5128 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5129 uint64_t off, NvmeRequest *req)
5130 {
5131 uint32_t nslist[1024];
5132 uint32_t trans_len;
5133 int i = 0;
5134 uint32_t nsid;
5135
5136 if (off >= sizeof(nslist)) {
5137 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5138 return NVME_INVALID_FIELD | NVME_DNR;
5139 }
5140
5141 memset(nslist, 0x0, sizeof(nslist));
5142 trans_len = MIN(sizeof(nslist) - off, buf_len);
5143
5144 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5145 NVME_CHANGED_NSID_SIZE) {
5146 /*
5147 * If more than 1024 namespaces have changed, the first entry in the log
5148 * page is set to FFFFFFFFh and the others to 0, as per the spec.
5149 */
5150 if (i == ARRAY_SIZE(nslist)) {
5151 memset(nslist, 0x0, sizeof(nslist));
5152 nslist[0] = 0xffffffff;
5153 break;
5154 }
5155
5156 nslist[i++] = nsid;
5157 clear_bit(nsid, n->changed_nsids);
5158 }
5159
5160 /*
5161 * Clear all remaining changed namespace IDs in case we broke out of the
5162 * loop above because more than 1024 namespaces have changed.
5163 */
5164 if (nslist[0] == 0xffffffff) {
5165 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5166 }
5167
5168 if (!rae) {
5169 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5170 }
5171
5172 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5173 }
5174
5175 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5176 uint64_t off, NvmeRequest *req)
5177 {
5178 NvmeEffectsLog log = {};
5179 const uint32_t *iocs = NULL;
5180 uint32_t trans_len;
5181
5182 if (off >= sizeof(log)) {
5183 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5184 return NVME_INVALID_FIELD | NVME_DNR;
5185 }
5186
5187 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5188 case NVME_CC_CSS_NVM:
5189 iocs = n->cse.iocs.nvm;
5190 break;
5191
5192 case NVME_CC_CSS_ALL:
5193 switch (csi) {
5194 case NVME_CSI_NVM:
5195 iocs = n->cse.iocs.nvm;
5196 break;
5197 case NVME_CSI_ZONED:
5198 iocs = n->cse.iocs.zoned;
5199 break;
5200 }
5201
5202 break;
5203 }
5204
5205 memcpy(log.acs, n->cse.acs, sizeof(log.acs));
5206
5207 if (iocs) {
5208 memcpy(log.iocs, iocs, sizeof(log.iocs));
5209 }
5210
5211 trans_len = MIN(sizeof(log) - off, buf_len);
5212
5213 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5214 }
5215
5216 static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae,
5217 uint32_t buf_len, uint64_t off,
5218 NvmeRequest *req, uint8_t lid)
5219 {
5220 switch (lid) {
5221 case NVME_OCP_EXTENDED_SMART_INFO:
5222 if (n->params.ocp) {
5223 return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
5224 }
5225 break;
5226 /* add a case for each additional vendor specific log id */
5227 }
5228
5229 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5230 return NVME_INVALID_FIELD | NVME_DNR;
5231 }
5232
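/*
 * Size of a single FDP configuration descriptor: header, one reclaim unit
 * handle descriptor per RUH plus vendor specific bytes, rounded up to an
 * 8-byte multiple.
 */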
5233 static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5234 {
5235 size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5236 + vss;
5237 return ROUND_UP(entry_siz, 8);
5238 }
5239
5240 static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5241 uint64_t off, NvmeRequest *req)
5242 {
5243 uint32_t log_size, trans_len;
5244 g_autofree uint8_t *buf = NULL;
5245 NvmeFdpDescrHdr *hdr;
5246 NvmeRuhDescr *ruhd;
5247 NvmeEnduranceGroup *endgrp;
5248 NvmeFdpConfsHdr *log;
5249 size_t nruh, fdp_descr_size;
5250 int i;
5251
5252 if (endgrpid != 1 || !n->subsys) {
5253 return NVME_INVALID_FIELD | NVME_DNR;
5254 }
5255
5256 endgrp = &n->subsys->endgrp;
5257
5258 if (endgrp->fdp.enabled) {
5259 nruh = endgrp->fdp.nruh;
5260 } else {
5261 nruh = 1;
5262 }
5263
5264 fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5265 log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5266
5267 if (off >= log_size) {
5268 return NVME_INVALID_FIELD | NVME_DNR;
5269 }
5270
5271 trans_len = MIN(log_size - off, buf_len);
5272
5273 buf = g_malloc0(log_size);
5274 log = (NvmeFdpConfsHdr *)buf;
5275 hdr = (NvmeFdpDescrHdr *)(log + 1);
5276 ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5277
5278 log->num_confs = cpu_to_le16(0);
5279 log->size = cpu_to_le32(log_size);
5280
5281 hdr->descr_size = cpu_to_le16(fdp_descr_size);
5282 if (endgrp->fdp.enabled) {
5283 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5284 hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5285 hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5286 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5287 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5288 hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5289 hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5290
5291 for (i = 0; i < nruh; i++) {
5292 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5293 ruhd++;
5294 }
5295 } else {
5296 /* 1 bit for RUH in PIF -> 2 RUHs max. */
5297 hdr->nrg = cpu_to_le16(1);
5298 hdr->nruh = cpu_to_le16(1);
5299 hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5300 hdr->nnss = cpu_to_le32(1);
5301 hdr->runs = cpu_to_le64(96 * MiB);
5302
5303 ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5304 }
5305
5306 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5307 }
5308
5309 static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5310 uint32_t dw10, uint32_t dw12,
5311 uint32_t buf_len, uint64_t off,
5312 NvmeRequest *req)
5313 {
5314 NvmeRuHandle *ruh;
5315 NvmeRuhuLog *hdr;
5316 NvmeRuhuDescr *ruhud;
5317 NvmeEnduranceGroup *endgrp;
5318 g_autofree uint8_t *buf = NULL;
5319 uint32_t log_size, trans_len;
5320 uint16_t i;
5321
5322 if (endgrpid != 1 || !n->subsys) {
5323 return NVME_INVALID_FIELD | NVME_DNR;
5324 }
5325
5326 endgrp = &n->subsys->endgrp;
5327
5328 if (!endgrp->fdp.enabled) {
5329 return NVME_FDP_DISABLED | NVME_DNR;
5330 }
5331
5332 log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5333
5334 if (off >= log_size) {
5335 return NVME_INVALID_FIELD | NVME_DNR;
5336 }
5337
5338 trans_len = MIN(log_size - off, buf_len);
5339
5340 buf = g_malloc0(log_size);
5341 hdr = (NvmeRuhuLog *)buf;
5342 ruhud = (NvmeRuhuDescr *)(hdr + 1);
5343
5344 ruh = endgrp->fdp.ruhs;
5345 hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5346
5347 for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5348 ruhud->ruha = ruh->ruha;
5349 }
5350
5351 return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5352 }
5353
5354 static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5355 uint64_t off, NvmeRequest *req)
5356 {
5357 NvmeEnduranceGroup *endgrp;
5358 NvmeFdpStatsLog log = {};
5359 uint32_t trans_len;
5360
5361 if (off >= sizeof(NvmeFdpStatsLog)) {
5362 return NVME_INVALID_FIELD | NVME_DNR;
5363 }
5364
5365 if (endgrpid != 1 || !n->subsys) {
5366 return NVME_INVALID_FIELD | NVME_DNR;
5367 }
5368
5369 if (!n->subsys->endgrp.fdp.enabled) {
5370 return NVME_FDP_DISABLED | NVME_DNR;
5371 }
5372
5373 endgrp = &n->subsys->endgrp;
5374
5375 trans_len = MIN(sizeof(log) - off, buf_len);
5376
5377 /* spec value is 128 bit, we only use 64 bit */
5378 log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5379 log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5380 log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5381
5382 return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5383 }
5384
5385 static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5386 uint32_t buf_len, uint64_t off,
5387 NvmeRequest *req)
5388 {
5389 NvmeEnduranceGroup *endgrp;
5390 NvmeCmd *cmd = &req->cmd;
5391 bool host_events = (cmd->cdw10 >> 8) & 0x1;
5392 uint32_t log_size, trans_len;
5393 NvmeFdpEventBuffer *ebuf;
5394 g_autofree NvmeFdpEventsLog *elog = NULL;
5395 NvmeFdpEvent *event;
5396
5397 if (endgrpid != 1 || !n->subsys) {
5398 return NVME_INVALID_FIELD | NVME_DNR;
5399 }
5400
5401 endgrp = &n->subsys->endgrp;
5402
5403 if (!endgrp->fdp.enabled) {
5404 return NVME_FDP_DISABLED | NVME_DNR;
5405 }
5406
5407 if (host_events) {
5408 ebuf = &endgrp->fdp.host_events;
5409 } else {
5410 ebuf = &endgrp->fdp.ctrl_events;
5411 }
5412
5413 log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5414
5415 if (off >= log_size) {
5416 return NVME_INVALID_FIELD | NVME_DNR;
5417 }
5418
5419 trans_len = MIN(log_size - off, buf_len);
5420 elog = g_malloc0(log_size);
5421 elog->num_events = cpu_to_le32(ebuf->nelems);
5422 event = (NvmeFdpEvent *)(elog + 1);
5423
5424 if (ebuf->nelems && ebuf->start == ebuf->next) {
5425 unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5426 /* wrap over, copy [start;NVME_FDP_MAX_EVENTS[ and [0; next[ */
5427 memcpy(event, &ebuf->events[ebuf->start],
5428 sizeof(NvmeFdpEvent) * nelems);
5429 memcpy(event + nelems, ebuf->events,
5430 sizeof(NvmeFdpEvent) * ebuf->next);
5431 } else if (ebuf->start < ebuf->next) {
5432 memcpy(event, &ebuf->events[ebuf->start],
5433 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5434 }
5435
5436 return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5437 }
5438
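/*
 * Get Log Page. The transfer length is assembled from NUMDL/NUMDU as a
 * zero-based dword count and the offset from LPOL/LPOU; both are validated
 * before dispatching on the log page identifier.
 */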
5439 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5440 {
5441 NvmeCmd *cmd = &req->cmd;
5442
5443 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5444 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5445 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5446 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5447 uint8_t lid = dw10 & 0xff;
5448 uint8_t lsp = (dw10 >> 8) & 0xf;
5449 uint8_t rae = (dw10 >> 15) & 0x1;
5450 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
5451 uint32_t numdl, numdu, lspi;
5452 uint64_t off, lpol, lpou;
5453 size_t len;
5454 uint16_t status;
5455
5456 numdl = (dw10 >> 16);
5457 numdu = (dw11 & 0xffff);
5458 lspi = (dw11 >> 16);
5459 lpol = dw12;
5460 lpou = dw13;
5461
5462 len = (((numdu << 16) | numdl) + 1) << 2;
5463 off = (lpou << 32ULL) | lpol;
5464
5465 if (off & 0x3) {
5466 return NVME_INVALID_FIELD | NVME_DNR;
5467 }
5468
5469 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5470
5471 status = nvme_check_mdts(n, len);
5472 if (status) {
5473 return status;
5474 }
5475
5476 switch (lid) {
5477 case NVME_LOG_ERROR_INFO:
5478 return nvme_error_info(n, rae, len, off, req);
5479 case NVME_LOG_SMART_INFO:
5480 return nvme_smart_info(n, rae, len, off, req);
5481 case NVME_LOG_FW_SLOT_INFO:
5482 return nvme_fw_log_info(n, len, off, req);
5483 case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
5484 return nvme_vendor_specific_log(n, rae, len, off, req, lid);
5485 case NVME_LOG_CHANGED_NSLIST:
5486 return nvme_changed_nslist(n, rae, len, off, req);
5487 case NVME_LOG_CMD_EFFECTS:
5488 return nvme_cmd_effects(n, csi, len, off, req);
5489 case NVME_LOG_ENDGRP:
5490 return nvme_endgrp_info(n, rae, len, off, req);
5491 case NVME_LOG_FDP_CONFS:
5492 return nvme_fdp_confs(n, lspi, len, off, req);
5493 case NVME_LOG_FDP_RUH_USAGE:
5494 return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5495 case NVME_LOG_FDP_STATS:
5496 return nvme_fdp_stats(n, lspi, len, off, req);
5497 case NVME_LOG_FDP_EVENTS:
5498 return nvme_fdp_events(n, lspi, len, off, req);
5499 default:
5500 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5501 return NVME_INVALID_FIELD | NVME_DNR;
5502 }
5503 }
5504
5505 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5506 {
5507 PCIDevice *pci = PCI_DEVICE(n);
5508 uint16_t offset = (cq->cqid << 3) + (1 << 2);
5509
5510 n->cq[cq->cqid] = NULL;
5511 qemu_bh_delete(cq->bh);
5512 if (cq->ioeventfd_enabled) {
5513 memory_region_del_eventfd(&n->iomem,
5514 0x1000 + offset, 4, false, 0, &cq->notifier);
5515 event_notifier_set_handler(&cq->notifier, NULL);
5516 event_notifier_cleanup(&cq->notifier);
5517 }
5518 if (msix_enabled(pci) && cq->irq_enabled) {
5519 msix_vector_unuse(pci, cq->vector);
5520 }
5521 if (cq->cqid) {
5522 g_free(cq);
5523 }
5524 }
5525
5526 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5527 {
5528 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5529 NvmeCQueue *cq;
5530 uint16_t qid = le16_to_cpu(c->qid);
5531
5532 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5533 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5534 return NVME_INVALID_CQID | NVME_DNR;
5535 }
5536
5537 cq = n->cq[qid];
5538 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5539 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5540 return NVME_INVALID_QUEUE_DEL;
5541 }
5542
5543 if (cq->irq_enabled && cq->tail != cq->head) {
5544 n->cq_pending--;
5545 }
5546
5547 nvme_irq_deassert(n, cq);
5548 trace_pci_nvme_del_cq(qid);
5549 nvme_free_cq(cq, n);
5550 return NVME_SUCCESS;
5551 }
5552
5553 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5554 uint16_t cqid, uint16_t vector, uint16_t size,
5555 uint16_t irq_enabled)
5556 {
5557 PCIDevice *pci = PCI_DEVICE(n);
5558
5559 if (msix_enabled(pci) && irq_enabled) {
5560 msix_vector_use(pci, vector);
5561 }
5562
5563 cq->ctrl = n;
5564 cq->cqid = cqid;
5565 cq->size = size;
5566 cq->dma_addr = dma_addr;
5567 cq->phase = 1;
5568 cq->irq_enabled = irq_enabled;
5569 cq->vector = vector;
5570 cq->head = cq->tail = 0;
5571 QTAILQ_INIT(&cq->req_list);
5572 QTAILQ_INIT(&cq->sq_list);
5573 if (n->dbbuf_enabled) {
5574 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5575 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5576
5577 if (n->params.ioeventfd && cqid != 0) {
5578 if (!nvme_init_cq_ioeventfd(cq)) {
5579 cq->ioeventfd_enabled = true;
5580 }
5581 }
5582 }
5583 n->cq[cqid] = cq;
5584 cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5585 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5586 }
5587
5588 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5589 {
5590 NvmeCQueue *cq;
5591 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5592 uint16_t cqid = le16_to_cpu(c->cqid);
5593 uint16_t vector = le16_to_cpu(c->irq_vector);
5594 uint16_t qsize = le16_to_cpu(c->qsize);
5595 uint16_t qflags = le16_to_cpu(c->cq_flags);
5596 uint64_t prp1 = le64_to_cpu(c->prp1);
5597 uint32_t cc = ldq_le_p(&n->bar.cc);
5598 uint8_t iocqes = NVME_CC_IOCQES(cc);
5599 uint8_t iosqes = NVME_CC_IOSQES(cc);
5600
5601 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5602 NVME_CQ_FLAGS_IEN(qflags) != 0);
5603
5604 if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5605 trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5606 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5607 }
5608
5609 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5610 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5611 return NVME_INVALID_QID | NVME_DNR;
5612 }
5613 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5614 trace_pci_nvme_err_invalid_create_cq_size(qsize);
5615 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5616 }
5617 if (unlikely(prp1 & (n->page_size - 1))) {
5618 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5619 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5620 }
5621 if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5622 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5623 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5624 }
5625 if (unlikely(vector >= n->conf_msix_qsize)) {
5626 trace_pci_nvme_err_invalid_create_cq_vector(vector);
5627 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5628 }
5629 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5630 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5631 return NVME_INVALID_FIELD | NVME_DNR;
5632 }
5633
5634 cq = g_malloc0(sizeof(*cq));
5635 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5636 NVME_CQ_FLAGS_IEN(qflags));
5637
5638 /*
5639 * It is only required to set qs_created when creating a completion queue;
5640 * creating a submission queue without a matching completion queue will
5641 * fail.
5642 */
5643 n->qs_created = true;
5644 return NVME_SUCCESS;
5645 }
5646
5647 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5648 {
5649 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5650
5651 return nvme_c2h(n, id, sizeof(id), req);
5652 }
5653
5654 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5655 {
5656 trace_pci_nvme_identify_ctrl();
5657
5658 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5659 }
5660
5661 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5662 {
5663 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5664 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5665 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5666
5667 trace_pci_nvme_identify_ctrl_csi(c->csi);
5668
5669 switch (c->csi) {
5670 case NVME_CSI_NVM:
5671 id_nvm->vsl = n->params.vsl;
5672 id_nvm->dmrl = NVME_ID_CTRL_NVM_DMRL_MAX;
5673 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5674 id_nvm->dmsl = NVME_ID_CTRL_NVM_DMRL_MAX * n->dmrsl;
5675 break;
5676
5677 case NVME_CSI_ZONED:
5678 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5679 break;
5680
5681 default:
5682 return NVME_INVALID_FIELD | NVME_DNR;
5683 }
5684
5685 return nvme_c2h(n, id, sizeof(id), req);
5686 }
5687
5688 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5689 {
5690 NvmeNamespace *ns;
5691 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5692 uint32_t nsid = le32_to_cpu(c->nsid);
5693
5694 trace_pci_nvme_identify_ns(nsid);
5695
5696 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5697 return NVME_INVALID_NSID | NVME_DNR;
5698 }
5699
5700 ns = nvme_ns(n, nsid);
5701 if (unlikely(!ns)) {
5702 if (!active) {
5703 ns = nvme_subsys_ns(n->subsys, nsid);
5704 if (!ns) {
5705 return nvme_rpt_empty_id_struct(n, req);
5706 }
5707 } else {
5708 return nvme_rpt_empty_id_struct(n, req);
5709 }
5710 }
5711
5712 if (active || ns->csi == NVME_CSI_NVM) {
5713 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5714 }
5715
5716 return NVME_INVALID_IOCS | NVME_DNR;
5717 }
5718
5719 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5720 bool attached)
5721 {
5722 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5723 uint32_t nsid = le32_to_cpu(c->nsid);
5724 uint16_t min_id = le16_to_cpu(c->ctrlid);
5725 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5726 uint16_t *ids = &list[1];
5727 NvmeNamespace *ns;
5728 NvmeCtrl *ctrl;
5729 int cntlid, nr_ids = 0;
5730
5731 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5732
5733 if (!n->subsys) {
5734 return NVME_INVALID_FIELD | NVME_DNR;
5735 }
5736
5737 if (attached) {
5738 if (nsid == NVME_NSID_BROADCAST) {
5739 return NVME_INVALID_FIELD | NVME_DNR;
5740 }
5741
5742 ns = nvme_subsys_ns(n->subsys, nsid);
5743 if (!ns) {
5744 return NVME_INVALID_FIELD | NVME_DNR;
5745 }
5746 }
5747
5748 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5749 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5750 if (!ctrl) {
5751 continue;
5752 }
5753
5754 if (attached && !nvme_ns(ctrl, nsid)) {
5755 continue;
5756 }
5757
5758 ids[nr_ids++] = cntlid;
5759 }
5760
5761 list[0] = nr_ids;
5762
5763 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5764 }
5765
5766 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5767 {
5768 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5769
5770 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5771 sizeof(NvmePriCtrlCap), req);
5772 }
5773
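/*
 * Identify Secondary Controller List: return up to 127 secondary controller
 * entries with an SCID greater than or equal to the CNTID in the command.
 */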
5774 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5775 {
5776 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5777 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5778 uint16_t min_id = le16_to_cpu(c->ctrlid);
5779 uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5780 NvmeSecCtrlList list = {0};
5781 uint8_t i;
5782
5783 for (i = 0; i < num_sec_ctrl; i++) {
5784 if (n->sec_ctrl_list[i].scid >= min_id) {
5785 list.numcntl = MIN(num_sec_ctrl - i, 127);
5786 memcpy(&list.sec, n->sec_ctrl_list + i,
5787 list.numcntl * sizeof(NvmeSecCtrlEntry));
5788 break;
5789 }
5790 }
5791
5792 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5793
5794 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5795 }
5796
5797 static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc)
5798 {
5799 NvmeNamespace *ns;
5800 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5801 uint32_t nsid = le32_to_cpu(c->nsid);
5802
5803 trace_pci_nvme_identify_ns_ind(nsid);
5804
5805 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5806 return NVME_INVALID_NSID | NVME_DNR;
5807 }
5808
5809 ns = nvme_ns(n, nsid);
5810 if (unlikely(!ns)) {
5811 if (alloc) {
5812 ns = nvme_subsys_ns(n->subsys, nsid);
5813 if (!ns) {
5814 return nvme_rpt_empty_id_struct(n, req);
5815 }
5816 } else {
5817 return nvme_rpt_empty_id_struct(n, req);
5818 }
5819 }
5820
5821 return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req);
5822 }
5823
5824 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5825 bool active)
5826 {
5827 NvmeNamespace *ns;
5828 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5829 uint32_t nsid = le32_to_cpu(c->nsid);
5830
5831 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5832
5833 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5834 return NVME_INVALID_NSID | NVME_DNR;
5835 }
5836
5837 ns = nvme_ns(n, nsid);
5838 if (unlikely(!ns)) {
5839 if (!active) {
5840 ns = nvme_subsys_ns(n->subsys, nsid);
5841 if (!ns) {
5842 return nvme_rpt_empty_id_struct(n, req);
5843 }
5844 } else {
5845 return nvme_rpt_empty_id_struct(n, req);
5846 }
5847 }
5848
5849 if (c->csi == NVME_CSI_NVM) {
5850 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5851 req);
5852 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5853 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5854 req);
5855 }
5856
5857 return NVME_INVALID_FIELD | NVME_DNR;
5858 }
5859
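/*
 * Active (or, with `active` false, allocated) Namespace ID list. Returns up
 * to NVME_IDENTIFY_DATA_SIZE / 4 namespace identifiers greater than
 * CDW1.NSID in ascending order; a starting NSID of FFFFFFFEh or FFFFFFFFh
 * is invalid since no higher NSID can exist.
 */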
5860 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5861 bool active)
5862 {
5863 NvmeNamespace *ns;
5864 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5865 uint32_t min_nsid = le32_to_cpu(c->nsid);
5866 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5867 static const int data_len = sizeof(list);
5868 uint32_t *list_ptr = (uint32_t *)list;
5869 int i, j = 0;
5870
5871 trace_pci_nvme_identify_nslist(min_nsid);
5872
5873 /*
5874 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5875 * since the Active Namespace ID List should return namespaces with ids
5876 * *higher* than the NSID specified in the command. This is also specified
5877 * in the spec (NVM Express v1.3d, Section 5.15.4).
5878 */
5879 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5880 return NVME_INVALID_NSID | NVME_DNR;
5881 }
5882
5883 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5884 ns = nvme_ns(n, i);
5885 if (!ns) {
5886 if (!active) {
5887 ns = nvme_subsys_ns(n->subsys, i);
5888 if (!ns) {
5889 continue;
5890 }
5891 } else {
5892 continue;
5893 }
5894 }
5895 if (ns->params.nsid <= min_nsid) {
5896 continue;
5897 }
5898 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5899 if (j == data_len / sizeof(uint32_t)) {
5900 break;
5901 }
5902 }
5903
5904 return nvme_c2h(n, list, data_len, req);
5905 }
5906
5907 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5908 bool active)
5909 {
5910 NvmeNamespace *ns;
5911 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5912 uint32_t min_nsid = le32_to_cpu(c->nsid);
5913 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5914 static const int data_len = sizeof(list);
5915 uint32_t *list_ptr = (uint32_t *)list;
5916 int i, j = 0;
5917
5918 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5919
5920 /*
5921 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5922 */
5923 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5924 return NVME_INVALID_NSID | NVME_DNR;
5925 }
5926
5927 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5928 return NVME_INVALID_FIELD | NVME_DNR;
5929 }
5930
5931 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5932 ns = nvme_ns(n, i);
5933 if (!ns) {
5934 if (!active) {
5935 ns = nvme_subsys_ns(n->subsys, i);
5936 if (!ns) {
5937 continue;
5938 }
5939 } else {
5940 continue;
5941 }
5942 }
5943 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5944 continue;
5945 }
5946 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5947 if (j == data_len / sizeof(uint32_t)) {
5948 break;
5949 }
5950 }
5951
5952 return nvme_c2h(n, list, data_len, req);
5953 }
5954
5955 static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5956 {
5957 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5958 uint16_t *nr_ids = &list[0];
5959 uint16_t *ids = &list[1];
5960 uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5961
5962 /*
5963 * The current nvme-subsys only supports Endurance Group #1.
5964 */
5965 if (!endgid) {
5966 *nr_ids = 1;
5967 ids[0] = 1;
5968 } else {
5969 *nr_ids = 0;
5970 }
5971
5972 return nvme_c2h(n, list, sizeof(list), req);
5973 }
5974
5975 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5976 {
5977 NvmeNamespace *ns;
5978 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5979 uint32_t nsid = le32_to_cpu(c->nsid);
5980 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5981 uint8_t *pos = list;
5982 struct {
5983 NvmeIdNsDescr hdr;
5984 uint8_t v[NVME_NIDL_UUID];
5985 } QEMU_PACKED uuid = {};
5986 struct {
5987 NvmeIdNsDescr hdr;
5988 uint8_t v[NVME_NIDL_NGUID];
5989 } QEMU_PACKED nguid = {};
5990 struct {
5991 NvmeIdNsDescr hdr;
5992 uint64_t v;
5993 } QEMU_PACKED eui64 = {};
5994 struct {
5995 NvmeIdNsDescr hdr;
5996 uint8_t v;
5997 } QEMU_PACKED csi = {};
5998
5999 trace_pci_nvme_identify_ns_descr_list(nsid);
6000
6001 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6002 return NVME_INVALID_NSID | NVME_DNR;
6003 }
6004
6005 ns = nvme_ns(n, nsid);
6006 if (unlikely(!ns)) {
6007 return NVME_INVALID_FIELD | NVME_DNR;
6008 }
6009
6010 if (!qemu_uuid_is_null(&ns->params.uuid)) {
6011 uuid.hdr.nidt = NVME_NIDT_UUID;
6012 uuid.hdr.nidl = NVME_NIDL_UUID;
6013 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
6014 memcpy(pos, &uuid, sizeof(uuid));
6015 pos += sizeof(uuid);
6016 }
6017
6018 if (!nvme_nguid_is_null(&ns->params.nguid)) {
6019 nguid.hdr.nidt = NVME_NIDT_NGUID;
6020 nguid.hdr.nidl = NVME_NIDL_NGUID;
6021 memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
6022 memcpy(pos, &nguid, sizeof(nguid));
6023 pos += sizeof(nguid);
6024 }
6025
6026 if (ns->params.eui64) {
6027 eui64.hdr.nidt = NVME_NIDT_EUI64;
6028 eui64.hdr.nidl = NVME_NIDL_EUI64;
6029 eui64.v = cpu_to_be64(ns->params.eui64);
6030 memcpy(pos, &eui64, sizeof(eui64));
6031 pos += sizeof(eui64);
6032 }
6033
6034 csi.hdr.nidt = NVME_NIDT_CSI;
6035 csi.hdr.nidl = NVME_NIDL_CSI;
6036 csi.v = ns->csi;
6037 memcpy(pos, &csi, sizeof(csi));
6038 pos += sizeof(csi);
6039
6040 return nvme_c2h(n, list, sizeof(list), req);
6041 }
6042
6043 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
6044 {
6045 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
6046 static const int data_len = sizeof(list);
6047
6048 trace_pci_nvme_identify_cmd_set();
6049
6050 NVME_SET_CSI(*list, NVME_CSI_NVM);
6051 NVME_SET_CSI(*list, NVME_CSI_ZONED);
6052
6053 return nvme_c2h(n, list, data_len, req);
6054 }
6055
6056 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
6057 {
6058 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
6059
6060 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
6061 c->csi);
6062
6063 switch (c->cns) {
6064 case NVME_ID_CNS_NS:
6065 return nvme_identify_ns(n, req, true);
6066 case NVME_ID_CNS_NS_PRESENT:
6067 return nvme_identify_ns(n, req, false);
6068 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
6069 return nvme_identify_ctrl_list(n, req, true);
6070 case NVME_ID_CNS_CTRL_LIST:
6071 return nvme_identify_ctrl_list(n, req, false);
6072 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
6073 return nvme_identify_pri_ctrl_cap(n, req);
6074 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
6075 return nvme_identify_sec_ctrl_list(n, req);
6076 case NVME_ID_CNS_CS_NS:
6077 return nvme_identify_ns_csi(n, req, true);
6078 case NVME_ID_CNS_CS_IND_NS:
6079 return nvme_identify_ns_ind(n, req, false);
6080 case NVME_ID_CNS_CS_IND_NS_ALLOCATED:
6081 return nvme_identify_ns_ind(n, req, true);
6082 case NVME_ID_CNS_CS_NS_PRESENT:
6083 return nvme_identify_ns_csi(n, req, false);
6084 case NVME_ID_CNS_CTRL:
6085 return nvme_identify_ctrl(n, req);
6086 case NVME_ID_CNS_CS_CTRL:
6087 return nvme_identify_ctrl_csi(n, req);
6088 case NVME_ID_CNS_NS_ACTIVE_LIST:
6089 return nvme_identify_nslist(n, req, true);
6090 case NVME_ID_CNS_NS_PRESENT_LIST:
6091 return nvme_identify_nslist(n, req, false);
6092 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
6093 return nvme_identify_nslist_csi(n, req, true);
6094 case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
6095 return nvme_endurance_group_list(n, req);
6096 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
6097 return nvme_identify_nslist_csi(n, req, false);
6098 case NVME_ID_CNS_NS_DESCR_LIST:
6099 return nvme_identify_ns_descr_list(n, req);
6100 case NVME_ID_CNS_IO_COMMAND_SET:
6101 return nvme_identify_cmd_set(n, req);
6102 default:
6103 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
6104 return NVME_INVALID_FIELD | NVME_DNR;
6105 }
6106 }
6107
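/*
 * Abort command. Dword 0 of the completion entry is initialized to 1
 * ("command not aborted") and cleared only when an outstanding Asynchronous
 * Event Request on the admin queue is aborted here. For I/O commands the
 * abort is best effort: a matching outstanding request has its block AIO
 * cancelled asynchronously and completes with Command Abort Requested
 * status.
 */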
6108 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
6109 {
6110 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
6111 uint16_t cid = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
6112 NvmeSQueue *sq = n->sq[sqid];
6113 NvmeRequest *r, *next;
6114 int i;
6115
6116 req->cqe.result = 1;
6117 if (nvme_check_sqid(n, sqid)) {
6118 return NVME_INVALID_FIELD | NVME_DNR;
6119 }
6120
6121 if (sqid == 0) {
6122 for (i = 0; i < n->outstanding_aers; i++) {
6123 NvmeRequest *re = n->aer_reqs[i];
6124 if (re->cqe.cid == cid) {
6125 memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
6126 (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
6127 n->outstanding_aers--;
6128 re->status = NVME_CMD_ABORT_REQ;
6129 req->cqe.result = 0;
6130 nvme_enqueue_req_completion(&n->admin_cq, re);
6131 return NVME_SUCCESS;
6132 }
6133 }
6134 }
6135
6136 QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
6137 if (r->cqe.cid == cid) {
6138 if (r->aiocb) {
6139 r->status = NVME_CMD_ABORT_REQ;
6140 blk_aio_cancel_async(r->aiocb);
6141 }
6142 break;
6143 }
6144 }
6145
6146 return NVME_SUCCESS;
6147 }
6148
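/*
 * Timestamp feature support. The value programmed by the host is stored
 * together with a snapshot of the QEMU virtual clock in milliseconds; reads
 * return the stored value plus the milliseconds elapsed since it was set,
 * with the origin field set whenever the stored host timestamp is non-zero.
 */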
6149 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6150 {
6151 trace_pci_nvme_setfeat_timestamp(ts);
6152
6153 n->host_timestamp = le64_to_cpu(ts);
6154 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6155 }
6156
6157 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6158 {
6159 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6160 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6161
6162 union nvme_timestamp {
6163 struct {
6164 uint64_t timestamp:48;
6165 uint64_t sync:1;
6166 uint64_t origin:3;
6167 uint64_t rsvd1:12;
6168 };
6169 uint64_t all;
6170 };
6171
6172 union nvme_timestamp ts;
6173 ts.all = 0;
6174 ts.timestamp = n->host_timestamp + elapsed_time;
6175
6176 /* If the host timestamp is non-zero, set the timestamp origin */
6177 ts.origin = n->host_timestamp ? 0x01 : 0x00;
6178
6179 trace_pci_nvme_getfeat_timestamp(ts.all);
6180
6181 return cpu_to_le64(ts.all);
6182 }
6183
6184 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6185 {
6186 uint64_t timestamp = nvme_get_timestamp(n);
6187
6188 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6189 }
6190
6191 static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6192 uint32_t *result)
6193 {
6194 *result = 0;
6195
6196 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6197 return NVME_INVALID_FIELD | NVME_DNR;
6198 }
6199
6200 *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6201 *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6202
6203 return NVME_SUCCESS;
6204 }
6205
6206 static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6207 NvmeRequest *req, uint32_t *result)
6208 {
6209 NvmeCmd *cmd = &req->cmd;
6210 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6211 uint16_t ph = cdw11 & 0xffff;
6212 uint8_t noet = (cdw11 >> 16) & 0xff;
6213 uint16_t ruhid, ret;
6214 uint32_t nentries = 0;
6215 uint8_t s_events_ndx = 0;
6216 size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6217 g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6218 NvmeRuHandle *ruh;
6219 NvmeFdpEventDescr *s_event;
6220
6221 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6222 return NVME_FDP_DISABLED | NVME_DNR;
6223 }
6224
6225 if (!nvme_ph_valid(ns, ph)) {
6226 return NVME_INVALID_FIELD | NVME_DNR;
6227 }
6228
6229 ruhid = ns->fdp.phs[ph];
6230 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6231
6232 assert(ruh);
6233
6234 if (unlikely(noet == 0)) {
6235 return NVME_INVALID_FIELD | NVME_DNR;
6236 }
6237
6238 for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6239 uint8_t shift = nvme_fdp_evf_shifts[event_type];
6240 if (!shift && event_type) {
6241 /*
6242 * Only the first entry (event_type == 0) has a shift value of 0;
6243 * other entries are simply unpopulated.
6244 */
6245 continue;
6246 }
6247
6248 nentries++;
6249
6250 s_event = &s_events[s_events_ndx];
6251 s_event->evt = event_type;
6252 s_event->evta = (ruh->event_filter >> shift) & 0x1;
6253
6254 /* break if all `noet` entries are filled */
6255 if ((++s_events_ndx) == noet) {
6256 break;
6257 }
6258 }
6259
6260 ret = nvme_c2h(n, s_events, s_events_siz, req);
6261 if (ret) {
6262 return ret;
6263 }
6264
6265 *result = nentries;
6266 return NVME_SUCCESS;
6267 }
6268
6269 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6270 {
6271 NvmeCmd *cmd = &req->cmd;
6272 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6273 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6274 uint32_t nsid = le32_to_cpu(cmd->nsid);
6275 uint32_t result = 0;
6276 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6277 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6278 uint16_t iv;
6279 NvmeNamespace *ns;
6280 int i;
6281 uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6282
6283 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6284 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6285 };
6286
6287 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6288
6289 if (!nvme_feature_support[fid]) {
6290 return NVME_INVALID_FIELD | NVME_DNR;
6291 }
6292
6293 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6294 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6295 /*
6296 * The Reservation Notification Mask and Reservation Persistence
6297 * features require a status code of Invalid Field in Command when
6298 * NSID is FFFFFFFFh. Since the device does not support those
6299 * features we can always return Invalid Namespace or Format as we
6300 * should do for all other features.
6301 */
6302 return NVME_INVALID_NSID | NVME_DNR;
6303 }
6304
6305 if (!nvme_ns(n, nsid)) {
6306 return NVME_INVALID_FIELD | NVME_DNR;
6307 }
6308 }
6309
6310 switch (sel) {
6311 case NVME_GETFEAT_SELECT_CURRENT:
6312 break;
6313 case NVME_GETFEAT_SELECT_SAVED:
6314 /* no features are saveable by the controller; fallthrough */
6315 case NVME_GETFEAT_SELECT_DEFAULT:
6316 goto defaults;
6317 case NVME_GETFEAT_SELECT_CAP:
6318 result = nvme_feature_cap[fid];
6319 goto out;
6320 }
6321
6322 switch (fid) {
6323 case NVME_TEMPERATURE_THRESHOLD:
6324 result = 0;
6325
6326 /*
6327 * The controller only implements the Composite Temperature sensor, so
6328 * return 0 for all other sensors.
6329 */
6330 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6331 goto out;
6332 }
6333
6334 switch (NVME_TEMP_THSEL(dw11)) {
6335 case NVME_TEMP_THSEL_OVER:
6336 result = n->features.temp_thresh_hi;
6337 goto out;
6338 case NVME_TEMP_THSEL_UNDER:
6339 result = n->features.temp_thresh_low;
6340 goto out;
6341 }
6342
6343 return NVME_INVALID_FIELD | NVME_DNR;
6344 case NVME_ERROR_RECOVERY:
6345 if (!nvme_nsid_valid(n, nsid)) {
6346 return NVME_INVALID_NSID | NVME_DNR;
6347 }
6348
6349 ns = nvme_ns(n, nsid);
6350 if (unlikely(!ns)) {
6351 return NVME_INVALID_FIELD | NVME_DNR;
6352 }
6353
6354 result = ns->features.err_rec;
6355 goto out;
6356 case NVME_VOLATILE_WRITE_CACHE:
6357 result = 0;
6358 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6359 ns = nvme_ns(n, i);
6360 if (!ns) {
6361 continue;
6362 }
6363
6364 result = blk_enable_write_cache(ns->blkconf.blk);
6365 if (result) {
6366 break;
6367 }
6368 }
6369 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6370 goto out;
6371 case NVME_ASYNCHRONOUS_EVENT_CONF:
6372 result = n->features.async_config;
6373 goto out;
6374 case NVME_TIMESTAMP:
6375 return nvme_get_feature_timestamp(n, req);
6376 case NVME_HOST_BEHAVIOR_SUPPORT:
6377 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6378 sizeof(n->features.hbs), req);
6379 case NVME_FDP_MODE:
6380 endgrpid = dw11 & 0xff;
6381
6382 if (endgrpid != 0x1) {
6383 return NVME_INVALID_FIELD | NVME_DNR;
6384 }
6385
6386 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6387 if (ret) {
6388 return ret;
6389 }
6390 goto out;
6391 case NVME_FDP_EVENTS:
6392 if (!nvme_nsid_valid(n, nsid)) {
6393 return NVME_INVALID_NSID | NVME_DNR;
6394 }
6395
6396 ns = nvme_ns(n, nsid);
6397 if (unlikely(!ns)) {
6398 return NVME_INVALID_FIELD | NVME_DNR;
6399 }
6400
6401 ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6402 if (ret) {
6403 return ret;
6404 }
6405 goto out;
6406 default:
6407 break;
6408 }
6409
6410 defaults:
6411 switch (fid) {
6412 case NVME_TEMPERATURE_THRESHOLD:
6413 result = 0;
6414
6415 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6416 break;
6417 }
6418
6419 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6420 result = NVME_TEMPERATURE_WARNING;
6421 }
6422
6423 break;
6424 case NVME_NUMBER_OF_QUEUES:
6425 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6426 trace_pci_nvme_getfeat_numq(result);
6427 break;
6428 case NVME_INTERRUPT_VECTOR_CONF:
6429 iv = dw11 & 0xffff;
6430 if (iv >= n->conf_ioqpairs + 1) {
6431 return NVME_INVALID_FIELD | NVME_DNR;
6432 }
6433
6434 result = iv;
6435 if (iv == n->admin_cq.vector) {
6436 result |= NVME_INTVC_NOCOALESCING;
6437 }
6438 break;
6439 case NVME_FDP_MODE:
6440 endgrpid = dw11 & 0xff;
6441
6442 if (endgrpid != 0x1) {
6443 return NVME_INVALID_FIELD | NVME_DNR;
6444 }
6445
6446 ret = nvme_get_feature_fdp(n, endgrpid, &result);
6447 if (ret) {
6448 return ret;
6449 }
6450 break;
6451
6452 case NVME_WRITE_ATOMICITY:
6453 result = n->dn;
6454 break;
6455 default:
6456 result = nvme_feature_default[fid];
6457 break;
6458 }
6459
6460 out:
6461 req->cqe.result = cpu_to_le32(result);
6462 return ret;
6463 }
6464
6465 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6466 {
6467 uint16_t ret;
6468 uint64_t timestamp;
6469
6470 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6471 if (ret) {
6472 return ret;
6473 }
6474
6475 nvme_set_timestamp(n, timestamp);
6476
6477 return NVME_SUCCESS;
6478 }
6479
6480 static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6481 NvmeRequest *req)
6482 {
6483 NvmeCmd *cmd = &req->cmd;
6484 uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6485 uint16_t ph = cdw11 & 0xffff;
6486 uint8_t noet = (cdw11 >> 16) & 0xff;
6487 uint16_t ret, ruhid;
6488 uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6489 uint8_t event_mask = 0;
6490 unsigned int i;
6491 g_autofree uint8_t *events = g_malloc0(noet);
6492 NvmeRuHandle *ruh = NULL;
6493
6494 assert(ns);
6495
6496 if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6497 return NVME_FDP_DISABLED | NVME_DNR;
6498 }
6499
6500 if (!nvme_ph_valid(ns, ph)) {
6501 return NVME_INVALID_FIELD | NVME_DNR;
6502 }
6503
6504 ruhid = ns->fdp.phs[ph];
6505 ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6506
6507 ret = nvme_h2c(n, events, noet, req);
6508 if (ret) {
6509 return ret;
6510 }
6511
6512 for (i = 0; i < noet; i++) {
6513 event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6514 }
6515
6516 if (enable) {
6517 ruh->event_filter |= event_mask;
6518 } else {
6519 ruh->event_filter = ruh->event_filter & ~event_mask;
6520 }
6521
6522 return NVME_SUCCESS;
6523 }
6524
6525 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6526 {
6527 NvmeNamespace *ns = NULL;
6528
6529 NvmeCmd *cmd = &req->cmd;
6530 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6531 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6532 uint32_t nsid = le32_to_cpu(cmd->nsid);
6533 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6534 uint8_t save = NVME_SETFEAT_SAVE(dw10);
6535 uint16_t status;
6536 int i;
6537 NvmeIdCtrl *id = &n->id_ctrl;
6538 NvmeAtomic *atomic = &n->atomic;
6539
6540 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6541
6542 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6543 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6544 }
6545
6546 if (!nvme_feature_support[fid]) {
6547 return NVME_INVALID_FIELD | NVME_DNR;
6548 }
6549
6550 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6551 if (nsid != NVME_NSID_BROADCAST) {
6552 if (!nvme_nsid_valid(n, nsid)) {
6553 return NVME_INVALID_NSID | NVME_DNR;
6554 }
6555
6556 ns = nvme_ns(n, nsid);
6557 if (unlikely(!ns)) {
6558 return NVME_INVALID_FIELD | NVME_DNR;
6559 }
6560 }
6561 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6562 if (!nvme_nsid_valid(n, nsid)) {
6563 return NVME_INVALID_NSID | NVME_DNR;
6564 }
6565
6566 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6567 }
6568
6569 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6570 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6571 }
6572
6573 switch (fid) {
6574 case NVME_TEMPERATURE_THRESHOLD:
6575 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6576 break;
6577 }
6578
6579 switch (NVME_TEMP_THSEL(dw11)) {
6580 case NVME_TEMP_THSEL_OVER:
6581 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6582 break;
6583 case NVME_TEMP_THSEL_UNDER:
6584 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6585 break;
6586 default:
6587 return NVME_INVALID_FIELD | NVME_DNR;
6588 }
6589
6590 if ((n->temperature >= n->features.temp_thresh_hi) ||
6591 (n->temperature <= n->features.temp_thresh_low)) {
6592 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6593 }
6594
6595 break;
6596 case NVME_ERROR_RECOVERY:
6597 if (nsid == NVME_NSID_BROADCAST) {
6598 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6599 ns = nvme_ns(n, i);
6600
6601 if (!ns) {
6602 continue;
6603 }
6604
6605 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6606 ns->features.err_rec = dw11;
6607 }
6608 }
6609
6610 break;
6611 }
6612
6613 assert(ns);
6614 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6615 ns->features.err_rec = dw11;
6616 }
6617 break;
6618 case NVME_VOLATILE_WRITE_CACHE:
6619 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6620 ns = nvme_ns(n, i);
6621 if (!ns) {
6622 continue;
6623 }
6624
6625 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6626 blk_flush(ns->blkconf.blk);
6627 }
6628
6629 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6630 }
6631
6632 break;
6633
6634 case NVME_NUMBER_OF_QUEUES:
6635 if (n->qs_created) {
6636 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6637 }
6638
6639 /*
6640 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6641 * and NSQR.
6642 */
6643 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6644 return NVME_INVALID_FIELD | NVME_DNR;
6645 }
6646
6647 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6648 ((dw11 >> 16) & 0xffff) + 1,
6649 n->conf_ioqpairs,
6650 n->conf_ioqpairs);
6651 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6652 ((n->conf_ioqpairs - 1) << 16));
6653 break;
6654 case NVME_ASYNCHRONOUS_EVENT_CONF:
6655 n->features.async_config = dw11;
6656 break;
6657 case NVME_TIMESTAMP:
6658 return nvme_set_feature_timestamp(n, req);
6659 case NVME_HOST_BEHAVIOR_SUPPORT:
6660 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6661 sizeof(n->features.hbs), req);
6662 if (status) {
6663 return status;
6664 }
6665
6666 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6667 ns = nvme_ns(n, i);
6668
6669 if (!ns) {
6670 continue;
6671 }
6672
6673 ns->id_ns.nlbaf = ns->nlbaf - 1;
6674 if (!n->features.hbs.lbafee) {
6675 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6676 }
6677 }
6678
6679 return status;
6680 case NVME_COMMAND_SET_PROFILE:
6681 if (dw11 & 0x1ff) {
6682 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6683 return NVME_IOCS_COMBINATION_REJECTED | NVME_DNR;
6684 }
6685 break;
6686 case NVME_FDP_MODE:
6687 /* spec: abort with cmd seq err if there are one or more namespaces in the endgrp */
6688 return NVME_CMD_SEQ_ERROR | NVME_DNR;
6689 case NVME_FDP_EVENTS:
6690 return nvme_set_feature_fdp_events(n, ns, req);
6691 case NVME_WRITE_ATOMICITY:
6692
6693 n->dn = 0x1 & dw11;
6694
6695 if (n->dn) {
6696 atomic->atomic_max_write_size = le16_to_cpu(id->awupf) + 1;
6697 } else {
6698 atomic->atomic_max_write_size = le16_to_cpu(id->awun) + 1;
6699 }
6700
6701 if (atomic->atomic_max_write_size == 1) {
6702 atomic->atomic_writes = 0;
6703 } else {
6704 atomic->atomic_writes = 1;
6705 }
6706 break;
6707 default:
6708 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6709 }
6710 return NVME_SUCCESS;
6711 }
6712
6713 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6714 {
6715 trace_pci_nvme_aer(nvme_cid(req));
6716
6717 if (n->outstanding_aers > n->params.aerl) {
6718 trace_pci_nvme_aer_aerl_exceeded();
6719 return NVME_AER_LIMIT_EXCEEDED;
6720 }
6721
6722 n->aer_reqs[n->outstanding_aers] = req;
6723 n->outstanding_aers++;
6724
6725 if (!QTAILQ_EMPTY(&n->aer_queue)) {
6726 nvme_process_aers(n);
6727 }
6728
6729 return NVME_NO_COMPLETE;
6730 }
6731
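/*
 * Fold namespace block sizes into DMRSL (Dataset Management Range Size
 * Limit, in logical blocks). When a namespace is given only that namespace
 * is taken into account (used on attach); with ns == NULL every attached
 * namespace is folded in. The limit is capped such that a single range
 * never exceeds BDRV_REQUEST_MAX_BYTES.
 */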
6732 static void nvme_update_dsm_limits(NvmeCtrl *n, NvmeNamespace *ns)
6733 {
6734 if (ns) {
6735 n->dmrsl =
6736 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6737
6738 return;
6739 }
6740
6741 for (uint32_t nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6742 ns = nvme_ns(n, nsid);
6743 if (!ns) {
6744 continue;
6745 }
6746
6747 n->dmrsl =
6748 MIN_NON_ZERO(n->dmrsl, BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6749 }
6750 }
6751
6752 static bool nvme_csi_supported(NvmeCtrl *n, uint8_t csi)
6753 {
6754 uint32_t cc;
6755
6756 switch (csi) {
6757 case NVME_CSI_NVM:
6758 return true;
6759
6760 case NVME_CSI_ZONED:
6761 cc = ldl_le_p(&n->bar.cc);
6762
6763 return NVME_CC_CSS(cc) == NVME_CC_CSS_ALL;
6764 }
6765
6766 g_assert_not_reached();
6767 }
6768
6769 static void nvme_detach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6770 {
6771 assert(ns->attached > 0);
6772
6773 n->namespaces[ns->params.nsid] = NULL;
6774 ns->attached--;
6775 }
6776
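/*
 * Namespace Attachment command. The host transfers a 4 KiB controller list
 * (entry 0 holds the number of identifiers that follow, clamped to
 * NVME_CONTROLLER_LIST_SIZE - 1); the namespace given in CDW1.NSID is then
 * attached to or detached from each listed controller according to
 * CDW10.SEL. Private namespaces may only be attached to a single
 * controller, and attaching requires the target controller to support the
 * namespace's command set. A Namespace Attribute Changed event is queued
 * for each affected controller unless one is already pending for that
 * namespace.
 */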
6777 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6778 {
6779 NvmeNamespace *ns;
6780 NvmeCtrl *ctrl;
6781 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6782 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6783 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6784 uint8_t sel = dw10 & 0xf;
6785 uint16_t *nr_ids = &list[0];
6786 uint16_t *ids = &list[1];
6787 uint16_t ret;
6788 int i;
6789
6790 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6791
6792 if (!nvme_nsid_valid(n, nsid)) {
6793 return NVME_INVALID_NSID | NVME_DNR;
6794 }
6795
6796 ns = nvme_subsys_ns(n->subsys, nsid);
6797 if (!ns) {
6798 return NVME_INVALID_FIELD | NVME_DNR;
6799 }
6800
6801 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6802 if (ret) {
6803 return ret;
6804 }
6805
6806 if (!*nr_ids) {
6807 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6808 }
6809
6810 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6811 for (i = 0; i < *nr_ids; i++) {
6812 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6813 if (!ctrl) {
6814 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6815 }
6816
6817 switch (sel) {
6818 case NVME_NS_ATTACHMENT_ATTACH:
6819 if (nvme_ns(n, nsid)) {
6820 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6821 }
6822
6823 if (ns->attached && !ns->params.shared) {
6824 return NVME_NS_PRIVATE | NVME_DNR;
6825 }
6826
6827 if (!nvme_csi_supported(n, ns->csi)) {
6828 return NVME_IOCS_NOT_SUPPORTED | NVME_DNR;
6829 }
6830
6831 nvme_attach_ns(ctrl, ns);
6832 nvme_update_dsm_limits(ctrl, ns);
6833
6834 break;
6835
6836 case NVME_NS_ATTACHMENT_DETACH:
6837 nvme_detach_ns(ctrl, ns);
6838 nvme_update_dsm_limits(ctrl, NULL);
6839
6840 break;
6841
6842 default:
6843 return NVME_INVALID_FIELD | NVME_DNR;
6844 }
6845
6846 /*
6847 * Add the namespace ID to the changed namespace list so that the event
6848 * can be cleared via the Get Log Page command.
6849 */
6850 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6851 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6852 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6853 NVME_LOG_CHANGED_NSLIST);
6854 }
6855 }
6856
6857 return NVME_SUCCESS;
6858 }
6859
6860 typedef struct NvmeFormatAIOCB {
6861 BlockAIOCB common;
6862 BlockAIOCB *aiocb;
6863 NvmeRequest *req;
6864 int ret;
6865
6866 NvmeNamespace *ns;
6867 uint32_t nsid;
6868 bool broadcast;
6869 int64_t offset;
6870
6871 uint8_t lbaf;
6872 uint8_t mset;
6873 uint8_t pi;
6874 uint8_t pil;
6875 } NvmeFormatAIOCB;
6876
6877 static void nvme_format_cancel(BlockAIOCB *aiocb)
6878 {
6879 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6880
6881 iocb->ret = -ECANCELED;
6882
6883 if (iocb->aiocb) {
6884 blk_aio_cancel_async(iocb->aiocb);
6885 iocb->aiocb = NULL;
6886 }
6887 }
6888
6889 static const AIOCBInfo nvme_format_aiocb_info = {
6890 .aiocb_size = sizeof(NvmeFormatAIOCB),
6891 .cancel_async = nvme_format_cancel,
6892 };
6893
6894 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6895 uint8_t pi, uint8_t pil)
6896 {
6897 uint8_t lbafl = lbaf & 0xf;
6898 uint8_t lbafu = lbaf >> 4;
6899
6900 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6901
6902 ns->id_ns.dps = (pil << 3) | pi;
6903 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6904
6905 nvme_ns_init_format(ns);
6906 }
6907
6908 static void nvme_do_format(NvmeFormatAIOCB *iocb);
6909
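/*
 * Format NVM is implemented as an asynchronous state machine:
 * nvme_do_format() selects the (next) namespace to format - iterating over
 * all namespaces for a broadcast NSID - and validates the requested format,
 * while nvme_format_ns_cb() zeroes the backing device in
 * BDRV_REQUEST_MAX_BYTES sized chunks before applying the new LBA format,
 * metadata and protection information settings via nvme_format_set().
 */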
6910 static void nvme_format_ns_cb(void *opaque, int ret)
6911 {
6912 NvmeFormatAIOCB *iocb = opaque;
6913 NvmeNamespace *ns = iocb->ns;
6914 int bytes;
6915
6916 if (iocb->ret < 0) {
6917 goto done;
6918 } else if (ret < 0) {
6919 iocb->ret = ret;
6920 goto done;
6921 }
6922
6923 assert(ns);
6924
6925 if (iocb->offset < ns->size) {
6926 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6927
6928 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6929 bytes, BDRV_REQ_MAY_UNMAP,
6930 nvme_format_ns_cb, iocb);
6931
6932 iocb->offset += bytes;
6933 return;
6934 }
6935
6936 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6937 ns->status = 0x0;
6938 iocb->ns = NULL;
6939 iocb->offset = 0;
6940
6941 done:
6942 nvme_do_format(iocb);
6943 }
6944
6945 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6946 {
6947 if (ns->params.zoned) {
6948 return NVME_INVALID_FORMAT | NVME_DNR;
6949 }
6950
6951 if (lbaf > ns->id_ns.nlbaf) {
6952 return NVME_INVALID_FORMAT | NVME_DNR;
6953 }
6954
6955 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6956 return NVME_INVALID_FORMAT | NVME_DNR;
6957 }
6958
6959 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6960 return NVME_INVALID_FIELD | NVME_DNR;
6961 }
6962
6963 return NVME_SUCCESS;
6964 }
6965
6966 static void nvme_do_format(NvmeFormatAIOCB *iocb)
6967 {
6968 NvmeRequest *req = iocb->req;
6969 NvmeCtrl *n = nvme_ctrl(req);
6970 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6971 uint8_t lbaf = dw10 & 0xf;
6972 uint8_t pi = (dw10 >> 5) & 0x7;
6973 uint16_t status;
6974 int i;
6975
6976 if (iocb->ret < 0) {
6977 goto done;
6978 }
6979
6980 if (iocb->broadcast) {
6981 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6982 iocb->ns = nvme_ns(n, i);
6983 if (iocb->ns) {
6984 iocb->nsid = i;
6985 break;
6986 }
6987 }
6988 }
6989
6990 if (!iocb->ns) {
6991 goto done;
6992 }
6993
6994 status = nvme_format_check(iocb->ns, lbaf, pi);
6995 if (status) {
6996 req->status = status;
6997 goto done;
6998 }
6999
7000 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
7001 nvme_format_ns_cb(iocb, 0);
7002 return;
7003
7004 done:
7005 iocb->common.cb(iocb->common.opaque, iocb->ret);
7006 qemu_aio_unref(iocb);
7007 }
7008
7009 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
7010 {
7011 NvmeFormatAIOCB *iocb;
7012 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7013 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7014 uint8_t lbaf = dw10 & 0xf;
7015 uint8_t mset = (dw10 >> 4) & 0x1;
7016 uint8_t pi = (dw10 >> 5) & 0x7;
7017 uint8_t pil = (dw10 >> 8) & 0x1;
7018 uint8_t lbafu = (dw10 >> 12) & 0x3;
7019 uint16_t status;
7020
7021 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
7022
7023 iocb->req = req;
7024 iocb->ret = 0;
7025 iocb->ns = NULL;
7026 iocb->nsid = 0;
7027 iocb->lbaf = lbaf;
7028 iocb->mset = mset;
7029 iocb->pi = pi;
7030 iocb->pil = pil;
7031 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
7032 iocb->offset = 0;
7033
7034 if (n->features.hbs.lbafee) {
7035 iocb->lbaf |= lbafu << 4;
7036 }
7037
7038 if (!iocb->broadcast) {
7039 if (!nvme_nsid_valid(n, nsid)) {
7040 status = NVME_INVALID_NSID | NVME_DNR;
7041 goto out;
7042 }
7043
7044 iocb->ns = nvme_ns(n, nsid);
7045 if (!iocb->ns) {
7046 status = NVME_INVALID_FIELD | NVME_DNR;
7047 goto out;
7048 }
7049 }
7050
7051 req->aiocb = &iocb->common;
7052 nvme_do_format(iocb);
7053
7054 return NVME_NO_COMPLETE;
7055
7056 out:
7057 qemu_aio_unref(iocb);
7058
7059 return status;
7060 }
7061
7062 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
7063 int *num_prim, int *num_sec)
7064 {
7065 *num_total = le32_to_cpu(rt ?
7066 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
7067 *num_prim = le16_to_cpu(rt ?
7068 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
7069 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
7070 }
7071
7072 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
7073 uint16_t cntlid, uint8_t rt,
7074 int nr)
7075 {
7076 int num_total, num_prim, num_sec;
7077
7078 if (cntlid != n->cntlid) {
7079 return NVME_INVALID_CTRL_ID | NVME_DNR;
7080 }
7081
7082 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7083
7084 if (nr > num_total) {
7085 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7086 }
7087
7088 if (nr > num_total - num_sec) {
7089 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7090 }
7091
7092 if (rt) {
7093 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
7094 } else {
7095 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
7096 }
7097
7098 req->cqe.result = cpu_to_le32(nr);
7099 return req->status;
7100 }
7101
7102 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
7103 uint8_t rt, int nr)
7104 {
7105 int prev_nr, prev_total;
7106
7107 if (rt) {
7108 prev_nr = le16_to_cpu(sctrl->nvi);
7109 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
7110 sctrl->nvi = cpu_to_le16(nr);
7111 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
7112 } else {
7113 prev_nr = le16_to_cpu(sctrl->nvq);
7114 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
7115 sctrl->nvq = cpu_to_le16(nr);
7116 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
7117 }
7118 }
7119
7120 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
7121 uint16_t cntlid, uint8_t rt, int nr)
7122 {
7123 int num_total, num_prim, num_sec, num_free, diff, limit;
7124 NvmeSecCtrlEntry *sctrl;
7125
7126 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7127 if (!sctrl) {
7128 return NVME_INVALID_CTRL_ID | NVME_DNR;
7129 }
7130
7131 if (sctrl->scs) {
7132 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7133 }
7134
7135 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
7136 if (nr > limit) {
7137 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7138 }
7139
7140 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7141 num_free = num_total - num_prim - num_sec;
7142 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
7143
7144 if (diff > num_free) {
7145 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7146 }
7147
7148 nvme_update_virt_res(n, sctrl, rt, nr);
7149 req->cqe.result = cpu_to_le32(nr);
7150
7151 return req->status;
7152 }
7153
7154 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
7155 {
7156 PCIDevice *pci = PCI_DEVICE(n);
7157 NvmeCtrl *sn = NULL;
7158 NvmeSecCtrlEntry *sctrl;
7159 int vf_index;
7160
7161 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7162 if (!sctrl) {
7163 return NVME_INVALID_CTRL_ID | NVME_DNR;
7164 }
7165
7166 if (!pci_is_vf(pci)) {
7167 vf_index = le16_to_cpu(sctrl->vfn) - 1;
7168 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
7169 }
7170
7171 if (online) {
7172 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
7173 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7174 }
7175
7176 if (!sctrl->scs) {
7177 sctrl->scs = 0x1;
7178 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7179 }
7180 } else {
7181 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7182 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7183
7184 if (sctrl->scs) {
7185 sctrl->scs = 0x0;
7186 if (sn) {
7187 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7188 }
7189 }
7190 }
7191
7192 return NVME_SUCCESS;
7193 }
7194
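/*
 * Virtualization Management command. Flexible queue (VQ) and interrupt (VI)
 * resources can be allocated to the primary controller (taking effect on
 * the next reset) or assigned to an offline secondary controller, and
 * secondary controllers can be brought online or taken offline. Onlining
 * requires that the secondary controller has at least one VI and at least
 * two VQ resources assigned.
 */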
7195 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7196 {
7197 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7198 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7199 uint8_t act = dw10 & 0xf;
7200 uint8_t rt = (dw10 >> 8) & 0x7;
7201 uint16_t cntlid = (dw10 >> 16) & 0xffff;
7202 int nr = dw11 & 0xffff;
7203
7204 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7205
7206 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7207 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7208 }
7209
7210 switch (act) {
7211 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7212 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7213 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7214 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7215 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7216 return nvme_virt_set_state(n, cntlid, true);
7217 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7218 return nvme_virt_set_state(n, cntlid, false);
7219 default:
7220 return NVME_INVALID_FIELD | NVME_DNR;
7221 }
7222 }
7223
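/*
 * Doorbell Buffer Config. PRP1 points to the shadow doorbell buffer and
 * PRP2 to the EventIdx buffer; both must be page aligned. Since CAP.DSTRD
 * is 0, the i-th submission queue doorbell lives at offset (i << 3) and the
 * corresponding completion queue doorbell at (i << 3) + (1 << 2) within
 * each buffer. Existing queues are updated immediately and, when ioeventfd
 * is enabled, switched over to ioeventfd notification.
 */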
7224 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7225 {
7226 PCIDevice *pci = PCI_DEVICE(n);
7227 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7228 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7229 int i;
7230
7231 /* Address should be page aligned */
7232 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7233 return NVME_INVALID_FIELD | NVME_DNR;
7234 }
7235
7236 /* Save shadow buffer base addr for use during queue creation */
7237 n->dbbuf_dbs = dbs_addr;
7238 n->dbbuf_eis = eis_addr;
7239 n->dbbuf_enabled = true;
7240
7241 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7242 NvmeSQueue *sq = n->sq[i];
7243 NvmeCQueue *cq = n->cq[i];
7244
7245 if (sq) {
7246 /*
7247 * CAP.DSTRD is 0, so the offset of the ith SQ db_addr is (i << 3).
7248 * nvme_process_db() uses this hard-coded way to calculate doorbell
7249 * offsets; be consistent with that here.
7250 */
7251 sq->db_addr = dbs_addr + (i << 3);
7252 sq->ei_addr = eis_addr + (i << 3);
7253 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7254
7255 if (n->params.ioeventfd && sq->sqid != 0) {
7256 if (!nvme_init_sq_ioeventfd(sq)) {
7257 sq->ioeventfd_enabled = true;
7258 }
7259 }
7260 }
7261
7262 if (cq) {
7263 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7264 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7265 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7266 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7267
7268 if (n->params.ioeventfd && cq->cqid != 0) {
7269 if (!nvme_init_cq_ioeventfd(cq)) {
7270 cq->ioeventfd_enabled = true;
7271 }
7272 }
7273 }
7274 }
7275
7276 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7277
7278 return NVME_SUCCESS;
7279 }
7280
7281 static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7282 {
7283 return NVME_INVALID_FIELD | NVME_DNR;
7284 }
7285
7286 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7287 {
7288 NvmeNamespace *ns;
7289 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7290 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7291 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7292 uint8_t doper, dtype;
7293 uint32_t numd, trans_len;
7294 NvmeDirectiveIdentify id = {
7295 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7296 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7297 };
7298
7299 numd = dw10 + 1;
7300 doper = dw11 & 0xff;
7301 dtype = (dw11 >> 8) & 0xff;
7302
7303 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7304
7305 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7306 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7307 return NVME_INVALID_FIELD | NVME_DNR;
7308 }
7309
7310 ns = nvme_ns(n, nsid);
7311 if (!ns) {
7312 return NVME_INVALID_FIELD | NVME_DNR;
7313 }
7314
7315 switch (dtype) {
7316 case NVME_DIRECTIVE_IDENTIFY:
7317 switch (doper) {
7318 case NVME_DIRECTIVE_RETURN_PARAMS:
7319 if (ns->endgrp && ns->endgrp->fdp.enabled) {
7320 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7321 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7322 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7323 }
7324
7325 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7326
7327 default:
7328 return NVME_INVALID_FIELD | NVME_DNR;
7329 }
7330
7331 default:
7332 return NVME_INVALID_FIELD;
7333 }
7334 }
7335
7336 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7337 {
7338 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7339 nvme_adm_opc_str(req->cmd.opcode));
7340
7341 if (!(n->cse.acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7342 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7343 return NVME_INVALID_OPCODE | NVME_DNR;
7344 }
7345
7346 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7347 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7348 return NVME_INVALID_FIELD | NVME_DNR;
7349 }
7350
7351 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7352 return NVME_INVALID_FIELD;
7353 }
7354
7355 switch (req->cmd.opcode) {
7356 case NVME_ADM_CMD_DELETE_SQ:
7357 return nvme_del_sq(n, req);
7358 case NVME_ADM_CMD_CREATE_SQ:
7359 return nvme_create_sq(n, req);
7360 case NVME_ADM_CMD_GET_LOG_PAGE:
7361 return nvme_get_log(n, req);
7362 case NVME_ADM_CMD_DELETE_CQ:
7363 return nvme_del_cq(n, req);
7364 case NVME_ADM_CMD_CREATE_CQ:
7365 return nvme_create_cq(n, req);
7366 case NVME_ADM_CMD_IDENTIFY:
7367 return nvme_identify(n, req);
7368 case NVME_ADM_CMD_ABORT:
7369 return nvme_abort(n, req);
7370 case NVME_ADM_CMD_SET_FEATURES:
7371 return nvme_set_feature(n, req);
7372 case NVME_ADM_CMD_GET_FEATURES:
7373 return nvme_get_feature(n, req);
7374 case NVME_ADM_CMD_ASYNC_EV_REQ:
7375 return nvme_aer(n, req);
7376 case NVME_ADM_CMD_NS_ATTACHMENT:
7377 return nvme_ns_attachment(n, req);
7378 case NVME_ADM_CMD_VIRT_MNGMT:
7379 return nvme_virt_mngmt(n, req);
7380 case NVME_ADM_CMD_DBBUF_CONFIG:
7381 return nvme_dbbuf_config(n, req);
7382 case NVME_ADM_CMD_FORMAT_NVM:
7383 return nvme_format(n, req);
7384 case NVME_ADM_CMD_DIRECTIVE_SEND:
7385 return nvme_directive_send(n, req);
7386 case NVME_ADM_CMD_DIRECTIVE_RECV:
7387 return nvme_directive_receive(n, req);
7388 default:
7389 g_assert_not_reached();
7390 }
7391
7392 return NVME_INVALID_OPCODE | NVME_DNR;
7393 }
7394
7395 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7396 {
7397 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7398
7399 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7400 MEMTXATTRS_UNSPECIFIED);
7401 }
7402
7403 static void nvme_update_sq_tail(NvmeSQueue *sq)
7404 {
7405 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7406 MEMTXATTRS_UNSPECIFIED);
7407
7408 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7409 }
7410
7411 #define NVME_ATOMIC_NO_START 0
7412 #define NVME_ATOMIC_START_ATOMIC 1
7413 #define NVME_ATOMIC_START_NONATOMIC 2
7414
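/*
 * Decide whether a command may start with respect to atomic writes. A write
 * whose length fits within the current atomic write size limit is treated
 * as atomic; it may not start while any outstanding read or write on the
 * same namespace overlaps its LBA range. A non-atomic command may not start
 * while it overlaps an outstanding atomic write. NVME_ATOMIC_NO_START tells
 * the caller to leave the command on the submission queue and retry later.
 */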
7415 static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd,
7416 NvmeAtomic *atomic)
7417 {
7418 NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
7419 uint64_t slba = le64_to_cpu(rw->slba);
7420 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
7421 uint64_t elba = slba + nlb;
7422 bool cmd_atomic_wr = true;
7423 int i;
7424
7425 if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
7426 ((rw->nlb + 1) > atomic->atomic_max_write_size))) {
7427 cmd_atomic_wr = false;
7428 }
7429
7430 /*
7431 * Walk the queues to see if there are any atomic conflicts.
7432 */
7433 for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
7434 NvmeSQueue *sq;
7435 NvmeRequest *req;
7436 NvmeRwCmd *req_rw;
7437 uint64_t req_slba;
7438 uint32_t req_nlb;
7439 uint64_t req_elba;
7440
7441 sq = n->sq[i];
7442 if (!sq) {
7443 continue;
7444 }
7445
7446 /*
7447 * Walk all the requests on a given queue.
7448 */
7449 QTAILQ_FOREACH(req, &sq->out_req_list, entry) {
7450 req_rw = (NvmeRwCmd *)&req->cmd;
7451
7452 if (((req_rw->opcode == NVME_CMD_WRITE) ||
7453 (req_rw->opcode == NVME_CMD_READ)) &&
7454 (cmd->nsid == req->ns->params.nsid)) {
7455 req_slba = le64_to_cpu(req_rw->slba);
7456 req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
7457 req_elba = req_slba + req_nlb;
7458
7459 if (cmd_atomic_wr) {
7460 if ((elba >= req_slba) && (slba <= req_elba)) {
7461 return NVME_ATOMIC_NO_START;
7462 }
7463 } else {
7464 if (req->atomic_write && ((elba >= req_slba) &&
7465 (slba <= req_elba))) {
7466 return NVME_ATOMIC_NO_START;
7467 }
7468 }
7469 }
7470 }
7471 }
7472 if (cmd_atomic_wr) {
7473 return NVME_ATOMIC_START_ATOMIC;
7474 }
7475 return NVME_ATOMIC_START_NONATOMIC;
7476 }
7477
7478 static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd)
7479 {
7480 if (n->atomic.atomic_writes) {
7481 return &n->atomic;
7482 }
7483 return NULL;
7484 }
7485
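/*
 * Bottom half for submission queue processing. Commands are fetched from
 * the queue head, checked for atomic write conflicts (rescheduling the
 * queue and returning if a conflict exists) and dispatched to
 * nvme_admin_cmd() or nvme_io_cmd(). Completions are posted immediately
 * unless the command returns NVME_NO_COMPLETE. With shadow doorbells
 * enabled, the EventIdx is updated and the tail re-read after each command.
 */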
7486 static void nvme_process_sq(void *opaque)
7487 {
7488 NvmeSQueue *sq = opaque;
7489 NvmeCtrl *n = sq->ctrl;
7490 NvmeCQueue *cq = n->cq[sq->cqid];
7491
7492 uint16_t status;
7493 hwaddr addr;
7494 NvmeCmd cmd;
7495 NvmeRequest *req;
7496
7497 if (n->dbbuf_enabled) {
7498 nvme_update_sq_tail(sq);
7499 }
7500
7501 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7502 NvmeAtomic *atomic;
7503 bool cmd_is_atomic;
7504
7505 addr = sq->dma_addr + (sq->head << NVME_SQES);
7506 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7507 trace_pci_nvme_err_addr_read(addr);
7508 trace_pci_nvme_err_cfs();
7509 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7510 break;
7511 }
7512
7513 atomic = nvme_get_atomic(n, &cmd);
7514
7515 cmd_is_atomic = false;
7516 if (sq->sqid && atomic) {
7517 int ret;
7518
7519 ret = nvme_atomic_write_check(n, &cmd, atomic);
7520 switch (ret) {
7521 case NVME_ATOMIC_NO_START:
7522 qemu_bh_schedule(sq->bh);
7523 return;
7524 case NVME_ATOMIC_START_ATOMIC:
7525 cmd_is_atomic = true;
7526 break;
7527 case NVME_ATOMIC_START_NONATOMIC:
7528 default:
7529 break;
7530 }
7531 }
7532 nvme_inc_sq_head(sq);
7533
7534 req = QTAILQ_FIRST(&sq->req_list);
7535 QTAILQ_REMOVE(&sq->req_list, req, entry);
7536 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7537 nvme_req_clear(req);
7538 req->cqe.cid = cmd.cid;
7539 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7540
7541 if (sq->sqid && atomic) {
7542 req->atomic_write = cmd_is_atomic;
7543 }
7544
7545 status = sq->sqid ? nvme_io_cmd(n, req) :
7546 nvme_admin_cmd(n, req);
7547 if (status != NVME_NO_COMPLETE) {
7548 req->status = status;
7549 nvme_enqueue_req_completion(cq, req);
7550 }
7551
7552 if (n->dbbuf_enabled) {
7553 nvme_update_sq_eventidx(sq);
7554 nvme_update_sq_tail(sq);
7555 }
7556 }
7557 }
7558
7559 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7560 {
7561 uint8_t *config;
7562
7563 if (!msix_present(pci_dev)) {
7564 return;
7565 }
7566
7567 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7568
7569 config = pci_dev->config + pci_dev->msix_cap;
7570 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7571 table_size - 1);
7572 }
7573
7574 static void nvme_activate_virt_res(NvmeCtrl *n)
7575 {
7576 PCIDevice *pci_dev = PCI_DEVICE(n);
7577 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7578 NvmeSecCtrlEntry *sctrl;
7579
7580 /* -1 to account for the admin queue */
7581 if (pci_is_vf(pci_dev)) {
7582 sctrl = nvme_sctrl(n);
7583 cap->vqprt = sctrl->nvq;
7584 cap->viprt = sctrl->nvi;
7585 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7586 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7587 } else {
7588 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7589 cap->virfap = n->next_pri_ctrl_cap.virfap;
7590 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7591 le16_to_cpu(cap->vqrfap) - 1;
7592 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7593 le16_to_cpu(cap->virfap);
7594 }
7595 }
7596
7597 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7598 {
7599 PCIDevice *pci_dev = PCI_DEVICE(n);
7600 NvmeSecCtrlEntry *sctrl;
7601 NvmeNamespace *ns;
7602 int i;
7603
7604 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7605 ns = nvme_ns(n, i);
7606 if (!ns) {
7607 continue;
7608 }
7609
7610 nvme_ns_drain(ns);
7611 }
7612
7613 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7614 if (n->sq[i] != NULL) {
7615 nvme_free_sq(n->sq[i], n);
7616 }
7617 }
7618 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7619 if (n->cq[i] != NULL) {
7620 nvme_free_cq(n->cq[i], n);
7621 }
7622 }
7623
7624 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7625 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7626 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7627 g_free(event);
7628 }
7629
7630 if (n->params.sriov_max_vfs) {
7631 if (!pci_is_vf(pci_dev)) {
7632 for (i = 0; i < n->nr_sec_ctrls; i++) {
7633 sctrl = &n->sec_ctrl_list[i];
7634 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7635 }
7636 }
7637
7638 if (rst != NVME_RESET_CONTROLLER) {
7639 nvme_activate_virt_res(n);
7640 }
7641 }
7642
7643 n->aer_queued = 0;
7644 n->aer_mask = 0;
7645 n->outstanding_aers = 0;
7646 n->qs_created = false;
7647
7648 n->dn = n->params.atomic_dn; /* Set Disable Normal */
7649
7650 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7651
7652 if (pci_is_vf(pci_dev)) {
7653 sctrl = nvme_sctrl(n);
7654
7655 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7656 } else {
7657 stl_le_p(&n->bar.csts, 0);
7658 }
7659
7660 stl_le_p(&n->bar.intms, 0);
7661 stl_le_p(&n->bar.intmc, 0);
7662 stl_le_p(&n->bar.cc, 0);
7663
7664 n->dbbuf_dbs = 0;
7665 n->dbbuf_eis = 0;
7666 n->dbbuf_enabled = false;
7667 }
7668
7669 static void nvme_ctrl_shutdown(NvmeCtrl *n)
7670 {
7671 NvmeNamespace *ns;
7672 int i;
7673
7674 if (n->pmr.dev) {
7675 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7676 }
7677
7678 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7679 ns = nvme_ns(n, i);
7680 if (!ns) {
7681 continue;
7682 }
7683
7684 nvme_ns_shutdown(ns);
7685 }
7686 }
7687
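/*
 * Controller enable (CC.EN 0 -> 1). Validates the admin queue configuration
 * (ASQ/ACQ page alignment, non-zero AQA sizes), the selected command set
 * and the memory page size against CAP before creating the admin queues,
 * resetting the timestamp and attaching the namespaces whose command sets
 * the controller supports.
 */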
7688 static int nvme_start_ctrl(NvmeCtrl *n)
7689 {
7690 uint64_t cap = ldq_le_p(&n->bar.cap);
7691 uint32_t cc = ldl_le_p(&n->bar.cc);
7692 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7693 uint64_t asq = ldq_le_p(&n->bar.asq);
7694 uint64_t acq = ldq_le_p(&n->bar.acq);
7695 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7696 uint32_t page_size = 1 << page_bits;
7697 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7698
7699 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7700 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7701 le16_to_cpu(sctrl->nvq));
7702 return -1;
7703 }
7704 if (unlikely(n->cq[0])) {
7705 trace_pci_nvme_err_startfail_cq();
7706 return -1;
7707 }
7708 if (unlikely(n->sq[0])) {
7709 trace_pci_nvme_err_startfail_sq();
7710 return -1;
7711 }
7712 if (unlikely(asq & (page_size - 1))) {
7713 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7714 return -1;
7715 }
7716 if (unlikely(acq & (page_size - 1))) {
7717 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7718 return -1;
7719 }
7720 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7721 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7722 return -1;
7723 }
7724 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7725 trace_pci_nvme_err_startfail_page_too_small(
7726 NVME_CC_MPS(cc),
7727 NVME_CAP_MPSMIN(cap));
7728 return -1;
7729 }
7730 if (unlikely(NVME_CC_MPS(cc) >
7731 NVME_CAP_MPSMAX(cap))) {
7732 trace_pci_nvme_err_startfail_page_too_large(
7733 NVME_CC_MPS(cc),
7734 NVME_CAP_MPSMAX(cap));
7735 return -1;
7736 }
7737 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7738 trace_pci_nvme_err_startfail_asqent_sz_zero();
7739 return -1;
7740 }
7741 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7742 trace_pci_nvme_err_startfail_acqent_sz_zero();
7743 return -1;
7744 }
7745
7746 n->page_bits = page_bits;
7747 n->page_size = page_size;
7748 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7749 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7750 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7751
7752 nvme_set_timestamp(n, 0ULL);
7753
7754 /* verify that the command sets of attached namespaces are supported */
7755 for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7756 NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
7757
7758 if (!ns || (!ns->params.shared && ns->ctrl != n)) {
7759 continue;
7760 }
7761
7762 if (nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
7763 if (!ns->attached || ns->params.shared) {
7764 nvme_attach_ns(n, ns);
7765 }
7766 }
7767 }
7768
7769 nvme_update_dsm_limits(n, NULL);
7770
7771 return 0;
7772 }
7773
7774 static void nvme_cmb_enable_regs(NvmeCtrl *n)
7775 {
7776 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7777 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7778
7779 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7780 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7781 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7782 stl_le_p(&n->bar.cmbloc, cmbloc);
7783
7784 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7785 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7786 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7787 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7788 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7789 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7790 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7791 stl_le_p(&n->bar.cmbsz, cmbsz);
7792 }
7793
7794 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7795 unsigned size)
7796 {
7797 PCIDevice *pci = PCI_DEVICE(n);
7798 uint64_t cap = ldq_le_p(&n->bar.cap);
7799 uint32_t cc = ldl_le_p(&n->bar.cc);
7800 uint32_t intms = ldl_le_p(&n->bar.intms);
7801 uint32_t csts = ldl_le_p(&n->bar.csts);
7802 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7803
7804 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7805 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7806 "MMIO write not 32-bit aligned,"
7807 " offset=0x%"PRIx64"", offset);
7808 /* should be ignored, fall through for now */
7809 }
7810
7811 if (unlikely(size < sizeof(uint32_t))) {
7812 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7813 "MMIO write smaller than 32-bits,"
7814 " offset=0x%"PRIx64", size=%u",
7815 offset, size);
7816 /* should be ignored, fall through for now */
7817 }
7818
7819 switch (offset) {
7820 case NVME_REG_INTMS:
7821 if (unlikely(msix_enabled(pci))) {
7822 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7823 "undefined access to interrupt mask set"
7824 " when MSI-X is enabled");
7825 /* should be ignored, fall through for now */
7826 }
7827 intms |= data;
7828 stl_le_p(&n->bar.intms, intms);
7829 n->bar.intmc = n->bar.intms;
7830 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7831 nvme_irq_check(n);
7832 break;
7833 case NVME_REG_INTMC:
7834 if (unlikely(msix_enabled(pci))) {
7835 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7836 "undefined access to interrupt mask clr"
7837 " when MSI-X is enabled");
7838 /* should be ignored, fall through for now */
7839 }
7840 intms &= ~data;
7841 stl_le_p(&n->bar.intms, intms);
7842 n->bar.intmc = n->bar.intms;
7843 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7844 nvme_irq_check(n);
7845 break;
7846 case NVME_REG_CC:
7847 stl_le_p(&n->bar.cc, data);
7848
7849 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7850
7851 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7852 trace_pci_nvme_mmio_shutdown_set();
7853 nvme_ctrl_shutdown(n);
7854 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7855 csts |= NVME_CSTS_SHST_COMPLETE;
7856 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7857 trace_pci_nvme_mmio_shutdown_cleared();
7858 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7859 }
7860
7861 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7862 if (unlikely(nvme_start_ctrl(n))) {
7863 trace_pci_nvme_err_startfail();
7864 csts = NVME_CSTS_FAILED;
7865 } else {
7866 trace_pci_nvme_mmio_start_success();
7867 csts = NVME_CSTS_READY;
7868 }
7869 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7870 trace_pci_nvme_mmio_stopped();
7871 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7872
7873 break;
7874 }
7875
7876 stl_le_p(&n->bar.csts, csts);
7877
7878 break;
7879 case NVME_REG_CSTS:
7880 if (data & (1 << 4)) {
7881 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7882 "attempted to W1C CSTS.NSSRO"
7883 " but CAP.NSSRS is zero (not supported)");
7884 } else if (data != 0) {
7885 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7886 "attempted to set a read only bit"
7887 " of controller status");
7888 }
7889 break;
7890 case NVME_REG_NSSR:
7891 if (data == 0x4e564d65) {
7892 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7893 } else {
7894 /* The spec says that writes of other values have no effect */
7895 return;
7896 }
7897 break;
7898 case NVME_REG_AQA:
7899 stl_le_p(&n->bar.aqa, data);
7900 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7901 break;
7902 case NVME_REG_ASQ:
7903 stn_le_p(&n->bar.asq, size, data);
7904 trace_pci_nvme_mmio_asqaddr(data);
7905 break;
7906 case NVME_REG_ASQ + 4:
7907 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7908 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7909 break;
7910 case NVME_REG_ACQ:
7911 trace_pci_nvme_mmio_acqaddr(data);
7912 stn_le_p(&n->bar.acq, size, data);
7913 break;
7914 case NVME_REG_ACQ + 4:
7915 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7916 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7917 break;
7918 case NVME_REG_CMBLOC:
7919 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7920 "invalid write to reserved CMBLOC"
7921 " when CMBSZ is zero, ignored");
7922 return;
7923 case NVME_REG_CMBSZ:
7924 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7925 "invalid write to read only CMBSZ, ignored");
7926 return;
7927 case NVME_REG_CMBMSC:
7928 if (!NVME_CAP_CMBS(cap)) {
7929 return;
7930 }
7931
7932 stn_le_p(&n->bar.cmbmsc, size, data);
7933 n->cmb.cmse = false;
7934
7935 if (NVME_CMBMSC_CRE(data)) {
7936 nvme_cmb_enable_regs(n);
7937
7938 if (NVME_CMBMSC_CMSE(data)) {
7939 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7940 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7941 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7942 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7943 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7944 stl_le_p(&n->bar.cmbsts, cmbsts);
7945 return;
7946 }
7947
7948 n->cmb.cba = cba;
7949 n->cmb.cmse = true;
7950 }
7951 } else {
7952 n->bar.cmbsz = 0;
7953 n->bar.cmbloc = 0;
7954 }
7955
7956 return;
7957 case NVME_REG_CMBMSC + 4:
7958 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7959 return;
7960
7961 case NVME_REG_PMRCAP:
7962 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7963 "invalid write to PMRCAP register, ignored");
7964 return;
7965 case NVME_REG_PMRCTL:
7966 if (!NVME_CAP_PMRS(cap)) {
7967 return;
7968 }
7969
7970 stl_le_p(&n->bar.pmrctl, data);
7971 if (NVME_PMRCTL_EN(data)) {
7972 memory_region_set_enabled(&n->pmr.dev->mr, true);
7973 pmrsts = 0;
7974 } else {
7975 memory_region_set_enabled(&n->pmr.dev->mr, false);
7976 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7977 n->pmr.cmse = false;
7978 }
7979 stl_le_p(&n->bar.pmrsts, pmrsts);
7980 return;
7981 case NVME_REG_PMRSTS:
7982 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7983 "invalid write to PMRSTS register, ignored");
7984 return;
7985 case NVME_REG_PMREBS:
7986 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7987 "invalid write to PMREBS register, ignored");
7988 return;
7989 case NVME_REG_PMRSWTP:
7990 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7991 "invalid write to PMRSWTP register, ignored");
7992 return;
7993 case NVME_REG_PMRMSCL:
7994 if (!NVME_CAP_PMRS(cap)) {
7995 return;
7996 }
7997
7998 stl_le_p(&n->bar.pmrmscl, data);
7999 n->pmr.cmse = false;
8000
8001 if (NVME_PMRMSCL_CMSE(data)) {
8002 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
8003 hwaddr cba = pmrmscu << 32 |
8004 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
8005 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
8006 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
8007 stl_le_p(&n->bar.pmrsts, pmrsts);
8008 return;
8009 }
8010
8011 n->pmr.cmse = true;
8012 n->pmr.cba = cba;
8013 }
8014
8015 return;
8016 case NVME_REG_PMRMSCU:
8017 if (!NVME_CAP_PMRS(cap)) {
8018 return;
8019 }
8020
8021 stl_le_p(&n->bar.pmrmscu, data);
8022 return;
8023 default:
8024 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
8025 "invalid MMIO write,"
8026 " offset=0x%"PRIx64", data=%"PRIx64"",
8027 offset, data);
8028 break;
8029 }
8030 }
8031
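/*
 * MMIO reads of the register space. Misaligned or too-small accesses
 * should read-as-zero per the spec but currently fall through; reads
 * beyond the last register return 0, and an offline VF only exposes CSTS.
 */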
8032 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
8033 {
8034 NvmeCtrl *n = (NvmeCtrl *)opaque;
8035 uint8_t *ptr = (uint8_t *)&n->bar;
8036
8037 trace_pci_nvme_mmio_read(addr, size);
8038
8039 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
8040 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
8041 "MMIO read not 32-bit aligned,"
8042 " offset=0x%"PRIx64"", addr);
8043 /* should RAZ, fall through for now */
8044 } else if (unlikely(size < sizeof(uint32_t))) {
8045 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
8046 "MMIO read smaller than 32-bits,"
8047 " offset=0x%"PRIx64"", addr);
8048 /* should RAZ, fall through for now */
8049 }
8050
8051 if (addr > sizeof(n->bar) - size) {
8052 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
8053 "MMIO read beyond last register,"
8054 " offset=0x%"PRIx64", returning 0", addr);
8055
8056 return 0;
8057 }
8058
8059 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8060 addr != NVME_REG_CSTS) {
8061 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8062 return 0;
8063 }
8064
8065 /*
8066 * When PMRWBM bit 1 is set, a read from PMRSTS should ensure
8067 * that prior writes have made it to persistent
8068 * media
8069 */
8070 if (addr == NVME_REG_PMRSTS &&
8071 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
8072 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
8073 }
8074
8075 return ldn_le_p(ptr + addr, size);
8076 }
8077
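/*
 * Doorbell writes. Doorbell registers start at offset 1000h with the
 * 4-byte stride used here: SQ y tail lives at 1000h + (2 * y) * 4 and
 * CQ y head at 1000h + (2 * y + 1) * 4, which is why bit 0 of
 * ((addr - 0x1000) >> 2) selects between submission and completion
 * queue handling below.
 */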
8078 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
8079 {
8080 PCIDevice *pci = PCI_DEVICE(n);
8081 uint32_t qid;
8082
8083 if (unlikely(addr & ((1 << 2) - 1))) {
8084 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
8085 "doorbell write not 32-bit aligned,"
8086 " offset=0x%"PRIx64", ignoring", addr);
8087 return;
8088 }
8089
8090 if (((addr - 0x1000) >> 2) & 1) {
8091 /* Completion queue doorbell write */
8092
8093 uint16_t new_head = val & 0xffff;
8094 NvmeCQueue *cq;
8095
8096 qid = (addr - (0x1000 + (1 << 2))) >> 3;
8097 if (unlikely(nvme_check_cqid(n, qid))) {
8098 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
8099 "completion queue doorbell write"
8100 " for nonexistent queue,"
8101 " cqid=%"PRIu32", ignoring", qid);
8102
8103 /*
8104 * NVM Express v1.3d, Section 4.1 states: "If host software writes
8105 * an invalid value to the Submission Queue Tail Doorbell or
8106 * Completion Queue Head Doorbell register and an Asynchronous Event
8107 * Request command is outstanding, then an asynchronous event is
8108 * posted to the Admin Completion Queue with a status code of
8109 * Invalid Doorbell Write Value."
8110 *
8111 * Also note that the spec includes the "Invalid Doorbell Register"
8112 * status code, but nowhere does it specify when to use it.
8113 * However, it seems reasonable to use it here in a similar
8114 * fashion.
8115 */
8116 if (n->outstanding_aers) {
8117 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8118 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8119 NVME_LOG_ERROR_INFO);
8120 }
8121
8122 return;
8123 }
8124
8125 cq = n->cq[qid];
8126 if (unlikely(new_head >= cq->size)) {
8127 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
8128 "completion queue doorbell write value"
8129 " beyond queue size, cqid=%"PRIu32","
8130 " new_head=%"PRIu16", ignoring",
8131 qid, new_head);
8132
8133 if (n->outstanding_aers) {
8134 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8135 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8136 NVME_LOG_ERROR_INFO);
8137 }
8138
8139 return;
8140 }
8141
8142 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
8143
8144 /* schedule deferred cqe posting if the queue was previously full */
8145 if (nvme_cq_full(cq)) {
8146 qemu_bh_schedule(cq->bh);
8147 }
8148
8149 cq->head = new_head;
8150 if (!qid && n->dbbuf_enabled) {
8151 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
8152 }
8153
8154 if (cq->tail == cq->head) {
8155 if (cq->irq_enabled) {
8156 n->cq_pending--;
8157 }
8158
8159 nvme_irq_deassert(n, cq);
8160 }
8161 } else {
8162 /* Submission queue doorbell write */
8163
8164 uint16_t new_tail = val & 0xffff;
8165 NvmeSQueue *sq;
8166
8167 qid = (addr - 0x1000) >> 3;
8168 if (unlikely(nvme_check_sqid(n, qid))) {
8169 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
8170 "submission queue doorbell write"
8171 " for nonexistent queue,"
8172 " sqid=%"PRIu32", ignoring", qid);
8173
8174 if (n->outstanding_aers) {
8175 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8176 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8177 NVME_LOG_ERROR_INFO);
8178 }
8179
8180 return;
8181 }
8182
8183 sq = n->sq[qid];
8184 if (unlikely(new_tail >= sq->size)) {
8185 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
8186 "submission queue doorbell write value"
8187 " beyond queue size, sqid=%"PRIu32","
8188 " new_tail=%"PRIu16", ignoring",
8189 qid, new_tail);
8190
8191 if (n->outstanding_aers) {
8192 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8193 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8194 NVME_LOG_ERROR_INFO);
8195 }
8196
8197 return;
8198 }
8199
8200 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
8201
8202 sq->tail = new_tail;
8203 if (!qid && n->dbbuf_enabled) {
8204 /*
8205 * The spec states "the host shall also update the controller's
8206 * corresponding doorbell property to match the value of that entry
8207 * in the Shadow Doorbell buffer."
8208 *
8209 * Since this context is currently a VM trap, we can safely enforce
8210 * the requirement from the device side in case the host is
8211 * misbehaving.
8212 *
8213 * Note, we shouldn't have to do this, but various drivers,
8214 * including ones that run on Linux, do not update the Admin Queue's
8215 * shadow doorbell entry, so we can't trust it for an appropriate sq tail.
8216 */
8217 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
8218 }
8219
8220 qemu_bh_schedule(sq->bh);
8221 }
8222 }
8223
8224 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
8225 unsigned size)
8226 {
8227 NvmeCtrl *n = (NvmeCtrl *)opaque;
8228
8229 trace_pci_nvme_mmio_write(addr, data, size);
8230
8231 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8232 addr != NVME_REG_CSTS) {
8233 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8234 return;
8235 }
8236
8237 if (addr < sizeof(n->bar)) {
8238 nvme_write_bar(n, addr, data, size);
8239 } else {
8240 nvme_process_db(n, addr, data);
8241 }
8242 }
8243
8244 static const MemoryRegionOps nvme_mmio_ops = {
8245 .read = nvme_mmio_read,
8246 .write = nvme_mmio_write,
8247 .endianness = DEVICE_LITTLE_ENDIAN,
8248 .impl = {
8249 .min_access_size = 2,
8250 .max_access_size = 8,
8251 },
8252 };
8253
8254 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
8255 unsigned size)
8256 {
8257 NvmeCtrl *n = (NvmeCtrl *)opaque;
8258 stn_le_p(&n->cmb.buf[addr], size, data);
8259 }
8260
8261 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
8262 {
8263 NvmeCtrl *n = (NvmeCtrl *)opaque;
8264 return ldn_le_p(&n->cmb.buf[addr], size);
8265 }
8266
8267 static const MemoryRegionOps nvme_cmb_ops = {
8268 .read = nvme_cmb_read,
8269 .write = nvme_cmb_write,
8270 .endianness = DEVICE_LITTLE_ENDIAN,
8271 .impl = {
8272 .min_access_size = 1,
8273 .max_access_size = 8,
8274 },
8275 };
8276
8277 static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8278 {
8279 NvmeParams *params = &n->params;
8280
8281 if (params->num_queues) {
8282 warn_report("num_queues is deprecated; please use max_ioqpairs "
8283 "instead");
8284
8285 params->max_ioqpairs = params->num_queues - 1;
8286 }
8287
8288 if (n->namespace.blkconf.blk && n->subsys) {
8289 error_setg(errp, "subsystem support is unavailable with legacy "
8290 "namespace ('drive' property)");
8291 return false;
8292 }
8293
8294 if (params->max_ioqpairs < 1 ||
8295 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8296 error_setg(errp, "max_ioqpairs must be between 1 and %d",
8297 NVME_MAX_IOQPAIRS);
8298 return false;
8299 }
8300
8301 if (params->msix_qsize < 1 ||
8302 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8303 error_setg(errp, "msix_qsize must be between 1 and %d",
8304 PCI_MSIX_FLAGS_QSIZE + 1);
8305 return false;
8306 }
8307
8308 if (!params->serial) {
8309 error_setg(errp, "serial property not set");
8310 return false;
8311 }
8312
8313 if (params->mqes < 1) {
8314 error_setg(errp, "mqes property cannot be less than 1");
8315 return false;
8316 }
8317
8318 if (n->pmr.dev) {
8319 if (params->msix_exclusive_bar) {
8320 error_setg(errp, "not enough BARs available to enable PMR");
8321 return false;
8322 }
8323
8324 if (host_memory_backend_is_mapped(n->pmr.dev)) {
8325 error_setg(errp, "can't use already busy memdev: %s",
8326 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8327 return false;
8328 }
8329
8330 if (!is_power_of_2(n->pmr.dev->size)) {
8331 error_setg(errp, "pmr backend size needs to be a power of 2");
8332 return false;
8333 }
8334
8335 host_memory_backend_set_mapped(n->pmr.dev, true);
8336 }
8337
8338 if (n->params.zasl > n->params.mdts) {
8339 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8340 "than or equal to mdts (Maximum Data Transfer Size)");
8341 return false;
8342 }
8343
8344 if (!n->params.vsl) {
8345 error_setg(errp, "vsl must be non-zero");
8346 return false;
8347 }
8348
8349 if (params->sriov_max_vfs) {
8350 if (!n->subsys) {
8351 error_setg(errp, "subsystem is required for the use of SR-IOV");
8352 return false;
8353 }
8354
8355 if (params->cmb_size_mb) {
8356 error_setg(errp, "CMB is not supported with SR-IOV");
8357 return false;
8358 }
8359
8360 if (n->pmr.dev) {
8361 error_setg(errp, "PMR is not supported with SR-IOV");
8362 return false;
8363 }
8364
8365 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8366 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8367 " must be set for the use of SR-IOV");
8368 return false;
8369 }
8370
8371 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8372 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8373 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8374 return false;
8375 }
8376
8377 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8378 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8379 " greater than or equal to 2");
8380 return false;
8381 }
8382
8383 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8384 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8385 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8386 return false;
8387 }
8388
8389 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8390 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8391 " greater than or equal to 1");
8392 return false;
8393 }
8394
8395 if (params->sriov_max_vi_per_vf &&
8396 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8397 error_setg(errp, "sriov_max_vi_per_vf must meet:"
8398 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8399 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8400 return false;
8401 }
8402
8403 if (params->sriov_max_vq_per_vf &&
8404 (params->sriov_max_vq_per_vf < 2 ||
8405 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8406 error_setg(errp, "sriov_max_vq_per_vf must meet:"
8407 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8408 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8409 return false;
8410 }
8411 }
8412
8413 return true;
8414 }
8415
8416 static void nvme_init_state(NvmeCtrl *n)
8417 {
8418 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8419 NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8420 NvmeSecCtrlEntry *sctrl;
8421 PCIDevice *pci = PCI_DEVICE(n);
8422 NvmeAtomic *atomic = &n->atomic;
8423 NvmeIdCtrl *id = &n->id_ctrl;
8424 uint8_t max_vfs;
8425 int i;
8426
8427 if (pci_is_vf(pci)) {
8428 sctrl = nvme_sctrl(n);
8429 max_vfs = 0;
8430 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8431 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8432 } else {
8433 max_vfs = n->params.sriov_max_vfs;
8434 n->conf_ioqpairs = n->params.max_ioqpairs;
8435 n->conf_msix_qsize = n->params.msix_qsize;
8436 }
8437
8438 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8439 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8440 n->temperature = NVME_TEMPERATURE;
8441 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8442 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8443 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8444 QTAILQ_INIT(&n->aer_queue);
8445
8446 n->nr_sec_ctrls = max_vfs;
8447 for (i = 0; i < max_vfs; i++) {
8448 sctrl = &list[i];
8449 sctrl->pcid = cpu_to_le16(n->cntlid);
8450 sctrl->vfn = cpu_to_le16(i + 1);
8451 }
8452
8453 cap->cntlid = cpu_to_le16(n->cntlid);
8454 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8455
8456 if (pci_is_vf(pci)) {
8457 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8458 } else {
8459 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8460 n->params.sriov_vq_flexible);
8461 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8462 cap->vqrfap = cap->vqfrt;
8463 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8464 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8465 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8466 cap->vqfrt / MAX(max_vfs, 1);
8467 }
8468
8469 if (pci_is_vf(pci)) {
8470 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8471 } else {
8472 cap->viprt = cpu_to_le16(n->params.msix_qsize -
8473 n->params.sriov_vi_flexible);
8474 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8475 cap->virfap = cap->vifrt;
8476 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8477 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8478 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8479 cap->vifrt / MAX(max_vfs, 1);
8480 }
8481
8482 /* Atomic Write */
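/*
 * AWUN/AWUPF are 0's based values, so the effective maximum atomic write
 * size below is the reported value plus one; with Disable Normal (DN)
 * set, only the power fail limit (AWUPF) applies.
 */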
8483 id->awun = cpu_to_le16(n->params.atomic_awun);
8484 id->awupf = cpu_to_le16(n->params.atomic_awupf);
8485 n->dn = n->params.atomic_dn;
8486
8487 if (id->awun || id->awupf) {
8488 if (id->awupf > id->awun) {
8489 id->awupf = 0;
8490 }
8491
8492 if (n->dn) {
8493 atomic->atomic_max_write_size = id->awupf + 1;
8494 } else {
8495 atomic->atomic_max_write_size = id->awun + 1;
8496 }
8497
8498 if (atomic->atomic_max_write_size == 1) {
8499 atomic->atomic_writes = 0;
8500 } else {
8501 atomic->atomic_writes = 1;
8502 }
8503 }
8504 }
8505
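/*
 * Controller Memory Buffer initialization: back the CMB with a host
 * buffer of cmb_size_mb MiB, register it as a 64-bit prefetchable BAR at
 * NVME_CMB_BIR and advertise CAP.CMBS. With `legacy-cmb`, the v1.3
 * CMBLOC/CMBSZ registers are enabled up front instead of on CMBMSC.CRE.
 */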
8506 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8507 {
8508 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8509 uint64_t cap = ldq_le_p(&n->bar.cap);
8510
8511 n->cmb.buf = g_malloc0(cmb_size);
8512 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8513 "nvme-cmb", cmb_size);
8514 pci_register_bar(pci_dev, NVME_CMB_BIR,
8515 PCI_BASE_ADDRESS_SPACE_MEMORY |
8516 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8517 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8518
8519 NVME_CAP_SET_CMBS(cap, 1);
8520 stq_le_p(&n->bar.cap, cap);
8521
8522 if (n->params.legacy_cmb) {
8523 nvme_cmb_enable_regs(n);
8524 n->cmb.cmse = true;
8525 }
8526 }
8527
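/*
 * Persistent Memory Region initialization: advertise read/write data
 * support and PMRWBM bit 1 in PMRCAP, expose the backing memory region
 * on BAR NVME_PMR_BIR and keep it disabled until the host sets PMRCTL.EN.
 */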
8528 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8529 {
8530 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8531
8532 NVME_PMRCAP_SET_RDS(pmrcap, 1);
8533 NVME_PMRCAP_SET_WDS(pmrcap, 1);
8534 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8535 /* Turn on PMRWBM bit 1: reads from PMRSTS ensure prior writes are persisted */
8536 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8537 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8538 stl_le_p(&n->bar.pmrcap, pmrcap);
8539
8540 pci_register_bar(pci_dev, NVME_PMR_BIR,
8541 PCI_BASE_ADDRESS_SPACE_MEMORY |
8542 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8543 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8544
8545 memory_region_set_enabled(&n->pmr.dev->mr, false);
8546 }
8547
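/*
 * Compute the size of the memory BAR: the fixed register block plus two
 * doorbells (NVME_DB_SIZE bytes each) per queue, then, if MSI-X is used,
 * a 4 KiB aligned MSI-X table of PCI_MSIX_ENTRY_SIZE bytes per vector and
 * a 4 KiB aligned PBA of one bit per vector, rounded up to a power of two.
 */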
8548 static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8549 unsigned *msix_table_offset,
8550 unsigned *msix_pba_offset)
8551 {
8552 uint64_t bar_size, msix_table_size;
8553
8554 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8555
8556 if (total_irqs == 0) {
8557 goto out;
8558 }
8559
8560 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8561
8562 if (msix_table_offset) {
8563 *msix_table_offset = bar_size;
8564 }
8565
8566 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8567 bar_size += msix_table_size;
8568 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8569
8570 if (msix_pba_offset) {
8571 *msix_pba_offset = bar_size;
8572 }
8573
8574 bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8575
8576 out:
8577 return pow2ceil(bar_size);
8578 }
8579
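/*
 * SR-IOV physical function setup: VFs use the Intel or Red Hat NVMe
 * device ID to match the PF, and each VF BAR0 is sized from the per-VF
 * resource maxima (VQFRSM queues, VIFRSM interrupt vectors).
 */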
8580 static bool nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset,
8581 Error **errp)
8582 {
8583 uint16_t vf_dev_id = n->params.use_intel_id ?
8584 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8585 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8586 uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8587 le16_to_cpu(cap->vifrsm),
8588 NULL, NULL);
8589
8590 if (!pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8591 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8592 NVME_VF_OFFSET, NVME_VF_STRIDE, errp)) {
8593 return false;
8594 }
8595
8596 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8597 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8598
8599 return true;
8600 }
8601
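/*
 * Add the PCI Power Management capability: PM spec version 1.2 with
 * No_Soft_Reset set, and a guest-writable PowerState field.
 */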
8602 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8603 {
8604 Error *err = NULL;
8605 int ret;
8606
8607 ret = pci_pm_init(pci_dev, offset, &err);
8608 if (err) {
8609 error_report_err(err);
8610 return ret;
8611 }
8612
8613 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8614 PCI_PM_CAP_VER_1_2);
8615 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8616 PCI_PM_CTRL_NO_SOFT_RESET);
8617 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8618 PCI_PM_CTRL_STATE_MASK);
8619
8620 return 0;
8621 }
8622
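/*
 * Forward a DOE mailbox object to the external SPDM responder. Lengths
 * are dword based: the request length comes from the DOE object header,
 * and the received response length is added to the read mailbox in
 * dwords. Returns false if nothing was received.
 */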
8623 static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8624 {
8625 void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8626 uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8627 void *rsp = doe_cap->read_mbox;
8628 uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8629
8630 uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8631 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8632 req, req_len, rsp, rsp_len);
8633 doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8634
8635 return recvd != 0;
8636 }
8637
8638 static DOEProtocol doe_spdm_prot[] = {
8639 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8640 { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8641 { }
8642 };
8643
8644 static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8645 {
8646 ERRP_GUARD();
8647 uint8_t *pci_conf = pci_dev->config;
8648 uint64_t bar_size;
8649 unsigned msix_table_offset = 0, msix_pba_offset = 0;
8650 unsigned nr_vectors;
8651 int ret;
8652
8653 pci_conf[PCI_INTERRUPT_PIN] = pci_is_vf(pci_dev) ? 0 : 1;
8654 pci_config_set_prog_interface(pci_conf, 0x2);
8655
8656 if (n->params.use_intel_id) {
8657 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8658 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8659 } else {
8660 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8661 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8662 }
8663
8664 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8665 nvme_add_pm_capability(pci_dev, 0x60);
8666 pcie_endpoint_cap_init(pci_dev, 0x80);
8667 pcie_cap_flr_init(pci_dev);
8668 if (n->params.sriov_max_vfs) {
8669 pcie_ari_init(pci_dev, 0x100);
8670 }
8671
8672 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8673 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8674 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8675 bar_size);
8676 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8677 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8678 ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8679 } else {
8680 assert(n->params.msix_qsize >= 1);
8681
8682 /* add one to max_ioqpairs to account for the admin queue pair */
8683 if (!pci_is_vf(pci_dev)) {
8684 nr_vectors = n->params.msix_qsize;
8685 bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8686 nr_vectors, &msix_table_offset,
8687 &msix_pba_offset);
8688 } else {
8689 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8690 NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8691
8692 nr_vectors = le16_to_cpu(cap->vifrsm);
8693 bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8694 &msix_table_offset, &msix_pba_offset);
8695 }
8696
8697 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8698 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8699 msix_table_offset);
8700 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8701
8702 if (pci_is_vf(pci_dev)) {
8703 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8704 } else {
8705 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8706 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8707 }
8708
8709 ret = msix_init(pci_dev, nr_vectors,
8710 &n->bar0, 0, msix_table_offset,
8711 &n->bar0, 0, msix_pba_offset, 0, errp);
8712 }
8713
8714 if (ret == -ENOTSUP) {
8715 /* report that msix is not supported, but do not error out */
8716 warn_report_err(*errp);
8717 *errp = NULL;
8718 } else if (ret < 0) {
8719 /* propagate error to caller */
8720 return false;
8721 }
8722
8723 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs &&
8724 !nvme_init_sriov(n, pci_dev, 0x120, errp)) {
8725 return false;
8726 }
8727
8728 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8729
8730 pcie_cap_deverr_init(pci_dev);
8731
8732 /* DOE Initialisation */
8733 if (pci_dev->spdm_port) {
8734 uint16_t doe_offset = n->params.sriov_max_vfs ?
8735 PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8736 : PCI_CONFIG_SPACE_SIZE;
8737
8738 pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8739 doe_spdm_prot, true, 0);
8740
8741 pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8742 errp);
8743
8744 if (pci_dev->doe_spdm.spdm_socket < 0) {
8745 return false;
8746 }
8747 }
8748
8749 if (n->params.cmb_size_mb) {
8750 nvme_init_cmb(n, pci_dev);
8751 }
8752
8753 if (n->pmr.dev) {
8754 nvme_init_pmr(n, pci_dev);
8755 }
8756
8757 return true;
8758 }
8759
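/*
 * Without a subsystem, the reported SUBNQN is derived from the serial as
 * "nqn.2019-08.org.qemu:<serial>"; otherwise the subsystem's NQN is
 * copied verbatim.
 */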
8760 static void nvme_init_subnqn(NvmeCtrl *n)
8761 {
8762 NvmeSubsystem *subsys = n->subsys;
8763 NvmeIdCtrl *id = &n->id_ctrl;
8764
8765 if (!subsys) {
8766 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8767 "nqn.2019-08.org.qemu:%s", n->params.serial);
8768 } else {
8769 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8770 }
8771 }
8772
8773 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8774 {
8775 NvmeIdCtrl *id = &n->id_ctrl;
8776 uint8_t *pci_conf = pci_dev->config;
8777 uint64_t cap = ldq_le_p(&n->bar.cap);
8778 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8779 uint32_t ctratt;
8780 uint16_t oacs;
8781
8782 memcpy(n->cse.acs, nvme_cse_acs_default, sizeof(n->cse.acs));
8783 memcpy(n->cse.iocs.nvm, nvme_cse_iocs_nvm_default, sizeof(n->cse.iocs.nvm));
8784 memcpy(n->cse.iocs.zoned, nvme_cse_iocs_zoned_default,
8785 sizeof(n->cse.iocs.zoned));
8786
8787 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8788 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8789 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8790 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8791 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8792
8793 id->cntlid = cpu_to_le16(n->cntlid);
8794
8795 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8796
8797 ctratt = NVME_CTRATT_ELBAS;
8798 if (n->params.ctratt.mem) {
8799 ctratt |= NVME_CTRATT_MEM;
8800 }
8801
8802 id->rab = 6;
8803
8804 if (n->params.use_intel_id) {
8805 id->ieee[0] = 0xb3;
8806 id->ieee[1] = 0x02;
8807 id->ieee[2] = 0x00;
8808 } else {
8809 id->ieee[0] = 0x00;
8810 id->ieee[1] = 0x54;
8811 id->ieee[2] = 0x52;
8812 }
8813
8814 id->mdts = n->params.mdts;
8815 id->ver = cpu_to_le32(NVME_SPEC_VER);
8816
8817 oacs = NVME_OACS_NMS | NVME_OACS_FORMAT | NVME_OACS_DIRECTIVES;
8818
8819 if (n->params.dbcs) {
8820 oacs |= NVME_OACS_DBCS;
8821
8822 n->cse.acs[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP;
8823 }
8824
8825 if (n->params.sriov_max_vfs) {
8826 oacs |= NVME_OACS_VMS;
8827
8828 n->cse.acs[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP;
8829 }
8830
8831 id->oacs = cpu_to_le16(oacs);
8832
8833 id->cntrltype = 0x1;
8834
8835 /*
8836 * Because the controller always completes the Abort command immediately,
8837 * there can never be more than one concurrently executing Abort command,
8838 * so this value is never used for anything. Note that there can easily be
8839 * many Abort commands in the queues, but they are not considered
8840 * "executing" until processed by nvme_abort.
8841 *
8842 * The specification recommends a value of 3 for Abort Command Limit (four
8843 * concurrently outstanding Abort commands), so let's use that, though it is
8844 * inconsequential.
8845 */
8846 id->acl = 3;
8847 id->aerl = n->params.aerl;
8848 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8849 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8850
8851 /* recommended default value (~70 C) */
8852 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8853 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8854
8855 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8856 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8857 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8858 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8859 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8860 NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8861 NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8862
8863 /*
8864 * NOTE: If this device ever supports a command set that does NOT use 0x0
8865 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8866 * should probably be removed.
8867 *
8868 * See comment in nvme_io_cmd.
8869 */
8870 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8871
8872 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8873 NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8874 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
8875 NVME_CTRL_SGLS_MPTR_SGL);
8876
8877 nvme_init_subnqn(n);
8878
8879 id->psd[0].mp = cpu_to_le16(0x9c4);
8880 id->psd[0].enlat = cpu_to_le32(0x10);
8881 id->psd[0].exlat = cpu_to_le32(0x4);
8882
8883 id->cmic |= NVME_CMIC_MULTI_CTRL;
8884 ctratt |= NVME_CTRATT_ENDGRPS;
8885
8886 id->endgidmax = cpu_to_le16(0x1);
8887
8888 if (n->subsys->endgrp.fdp.enabled) {
8889 ctratt |= NVME_CTRATT_FDPS;
8890 }
8891
8892 id->ctratt = cpu_to_le32(ctratt);
8893
8894 NVME_CAP_SET_MQES(cap, n->params.mqes);
8895 NVME_CAP_SET_CQR(cap, 1);
8896 NVME_CAP_SET_TO(cap, 0xf);
8897 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NCSS);
8898 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_IOCSS);
8899 NVME_CAP_SET_MPSMAX(cap, 4);
8900 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8901 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8902 stq_le_p(&n->bar.cap, cap);
8903
8904 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8905 n->bar.intmc = n->bar.intms = 0;
8906
8907 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8908 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8909 }
8910 }
8911
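/*
 * If no `subsys` link was given, create an implicit nvme-subsys device
 * whose NQN defaults to the controller serial, then register this
 * controller with the subsystem to obtain its controller ID.
 */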
8912 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8913 {
8914 int cntlid;
8915
8916 if (!n->subsys) {
8917 DeviceState *dev = qdev_new(TYPE_NVME_SUBSYS);
8918
8919 qdev_prop_set_string(dev, "nqn", n->params.serial);
8920
8921 if (!qdev_realize(dev, NULL, errp)) {
8922 return -1;
8923 }
8924
8925 n->subsys = NVME_SUBSYS(dev);
8926 }
8927
8928 cntlid = nvme_subsys_register_ctrl(n, errp);
8929 if (cntlid < 0) {
8930 return -1;
8931 }
8932
8933 n->cntlid = cntlid;
8934
8935 return 0;
8936 }
8937
8938 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8939 {
8940 uint32_t nsid = ns->params.nsid;
8941 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8942
8943 n->namespaces[nsid] = ns;
8944 ns->attached++;
8945 }
8946
8947 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8948 {
8949 NvmeCtrl *n = NVME(pci_dev);
8950 DeviceState *dev = DEVICE(pci_dev);
8951 NvmeNamespace *ns;
8952 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8953
8954 if (pci_is_vf(pci_dev)) {
8955 /*
8956 * VFs derive settings from the parent. The PF's lifespan exceeds
8957 * that of its VFs.
8958 */
8959 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8960
8961 /*
8962 * Copy the PF's serial into newly allocated string memory so that the
8963 * PF's 'serial' property object is not released when a VF is removed.
8964 */
8965 n->params.serial = g_strdup(pn->params.serial);
8966 n->subsys = pn->subsys;
8967
8968 /*
8969 * Assigning this link (strong link) causes an `object_unref` later in
8970 * `object_release_link_property`. Increment the refcount to balance
8971 * this out.
8972 */
8973 object_ref(OBJECT(pn->subsys));
8974 }
8975
8976 if (!nvme_check_params(n, errp)) {
8977 return;
8978 }
8979
8980 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8981
8982 if (nvme_init_subsys(n, errp)) {
8983 return;
8984 }
8985 nvme_init_state(n);
8986 if (!nvme_init_pci(n, pci_dev, errp)) {
8987 return;
8988 }
8989 nvme_init_ctrl(n, pci_dev);
8990
8991 /* set up a namespace if the controller drive property was given */
8992 if (n->namespace.blkconf.blk) {
8993 ns = &n->namespace;
8994 ns->params.nsid = 1;
8995 ns->ctrl = n;
8996
8997 if (nvme_ns_setup(ns, errp)) {
8998 return;
8999 }
9000
9001 n->subsys->namespaces[ns->params.nsid] = ns;
9002 }
9003 }
9004
9005 static void nvme_exit(PCIDevice *pci_dev)
9006 {
9007 NvmeCtrl *n = NVME(pci_dev);
9008 NvmeNamespace *ns;
9009 int i;
9010
9011 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9012
9013 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
9014 ns = nvme_ns(n, i);
9015 if (ns) {
9016 ns->attached--;
9017 }
9018 }
9019
9020 nvme_subsys_unregister_ctrl(n->subsys, n);
9021
9022 g_free(n->cq);
9023 g_free(n->sq);
9024 g_free(n->aer_reqs);
9025
9026 if (n->params.cmb_size_mb) {
9027 g_free(n->cmb.buf);
9028 }
9029
9030 if (pci_dev->doe_spdm.spdm_socket > 0) {
9031 spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
9032 SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
9033 }
9034
9035 if (n->pmr.dev) {
9036 host_memory_backend_set_mapped(n->pmr.dev, false);
9037 }
9038
9039 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
9040 pcie_sriov_pf_exit(pci_dev);
9041 }
9042
9043 if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
9044 msix_uninit_exclusive_bar(pci_dev);
9045 } else {
9046 msix_uninit(pci_dev, &n->bar0, &n->bar0);
9047 }
9048
9049 memory_region_del_subregion(&n->bar0, &n->iomem);
9050 }
9051
9052 static const Property nvme_props[] = {
9053 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
9054 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
9055 HostMemoryBackend *),
9056 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
9057 NvmeSubsystem *),
9058 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
9059 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
9060 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
9061 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
9062 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
9063 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
9064 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
9065 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
9066 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
9067 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
9068 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
9069 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
9070 DEFINE_PROP_BOOL("dbcs", NvmeCtrl, params.dbcs, true),
9071 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
9072 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
9073 params.auto_transition_zones, true),
9074 DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
9075 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
9076 params.sriov_vq_flexible, 0),
9077 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
9078 params.sriov_vi_flexible, 0),
9079 DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
9080 params.sriov_max_vi_per_vf, 0),
9081 DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
9082 params.sriov_max_vq_per_vf, 0),
9083 DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
9084 false),
9085 DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
9086 DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
9087 DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
9088 DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
9089 DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
9090 DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
9091 DEFINE_PROP_BOOL("ocp", NvmeCtrl, params.ocp, false),
9092 };
9093
9094 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
9095 void *opaque, Error **errp)
9096 {
9097 NvmeCtrl *n = NVME(obj);
9098 uint8_t value = n->smart_critical_warning;
9099
9100 visit_type_uint8(v, name, &value, errp);
9101 }
9102
9103 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
9104 void *opaque, Error **errp)
9105 {
9106 NvmeCtrl *n = NVME(obj);
9107 uint8_t value, old_value, cap = 0, index, event;
9108
9109 if (!visit_type_uint8(v, name, &value, errp)) {
9110 return;
9111 }
9112
9113 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
9114 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
9115 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
9116 cap |= NVME_SMART_PMR_UNRELIABLE;
9117 }
9118
9119 if ((value & cap) != value) {
9120 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
9121 value & ~cap);
9122 return;
9123 }
9124
9125 old_value = n->smart_critical_warning;
9126 n->smart_critical_warning = value;
9127
9128 /* only inject new bits of smart critical warning */
9129 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
9130 event = 1 << index;
9131 if (value & ~old_value & event)
9132 nvme_smart_event(n, event);
9133 }
9134 }
9135
9136 static void nvme_pci_reset(DeviceState *qdev)
9137 {
9138 PCIDevice *pci_dev = PCI_DEVICE(qdev);
9139 NvmeCtrl *n = NVME(pci_dev);
9140
9141 trace_pci_nvme_pci_reset();
9142 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9143 }
9144
9145 static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
9146 {
9147 NvmeCtrl *n = NVME(dev);
9148 NvmeSecCtrlEntry *sctrl;
9149 int i;
9150
9151 for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
9152 sctrl = &n->sec_ctrl_list[i];
9153 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
9154 }
9155 }
9156
9157 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
9158 uint32_t val, int len)
9159 {
9160 uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
9161
9162 if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9163 pcie_doe_write_config(&dev->doe_spdm, address, val, len);
9164 }
9165 pci_default_write_config(dev, address, val, len);
9166 pcie_cap_flr_write_config(dev, address, val, len);
9167 nvme_sriov_post_write_config(dev, old_num_vfs);
9168 }
9169
9170 static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
9171 {
9172 uint32_t val;
9173 if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9174 if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
9175 return val;
9176 }
9177 }
9178 return pci_default_read_config(dev, address, len);
9179 }
9180
9181 static const VMStateDescription nvme_vmstate = {
9182 .name = "nvme",
9183 .unmigratable = 1,
9184 };
9185
9186 static void nvme_class_init(ObjectClass *oc, const void *data)
9187 {
9188 DeviceClass *dc = DEVICE_CLASS(oc);
9189 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
9190
9191 pc->realize = nvme_realize;
9192 pc->config_write = nvme_pci_write_config;
9193 pc->config_read = nvme_pci_read_config;
9194 pc->exit = nvme_exit;
9195 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
9196 pc->revision = 2;
9197
9198 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
9199 dc->desc = "Non-Volatile Memory Express";
9200 device_class_set_props(dc, nvme_props);
9201 dc->vmsd = &nvme_vmstate;
9202 device_class_set_legacy_reset(dc, nvme_pci_reset);
9203 }
9204
9205 static void nvme_instance_init(Object *obj)
9206 {
9207 NvmeCtrl *n = NVME(obj);
9208
9209 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
9210 "bootindex", "/namespace@1,0",
9211 DEVICE(obj));
9212
9213 object_property_add(obj, "smart_critical_warning", "uint8",
9214 nvme_get_smart_warning,
9215 nvme_set_smart_warning, NULL, NULL);
9216 }
9217
9218 static const TypeInfo nvme_info = {
9219 .name = TYPE_NVME,
9220 .parent = TYPE_PCI_DEVICE,
9221 .instance_size = sizeof(NvmeCtrl),
9222 .instance_init = nvme_instance_init,
9223 .class_init = nvme_class_init,
9224 .interfaces = (const InterfaceInfo[]) {
9225 { INTERFACE_PCIE_DEVICE },
9226 { }
9227 },
9228 };
9229
9230 static const TypeInfo nvme_bus_info = {
9231 .name = TYPE_NVME_BUS,
9232 .parent = TYPE_BUS,
9233 .instance_size = sizeof(NvmeBus),
9234 };
9235
9236 static void nvme_register_types(void)
9237 {
9238 type_register_static(&nvme_info);
9239 type_register_static(&nvme_bus_info);
9240 }
9241
9242 type_init(nvme_register_types)
9243