xref: /qemu/hw/ppc/spapr_nvdimm.c (revision 06b40d250ecfa1633209c2e431a7a38acfd03a98)
1 /*
2  * QEMU PAPR Storage Class Memory Interfaces
3  *
4  * Copyright (c) 2019-2020, IBM Corporation.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include "qemu/osdep.h"
25 #include "qemu/cutils.h"
26 #include "qapi/error.h"
27 #include "hw/ppc/spapr_drc.h"
28 #include "hw/ppc/spapr_nvdimm.h"
29 #include "hw/mem/nvdimm.h"
30 #include "qemu/nvdimm-utils.h"
31 #include "hw/ppc/fdt.h"
32 #include "qemu/range.h"
33 #include "hw/ppc/spapr_numa.h"
34 #include "block/thread-pool.h"
35 #include "migration/vmstate.h"
36 #include "qemu/pmem.h"
37 #include "hw/qdev-properties.h"
38 
39 /* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
40 /* SCM device is unable to persist memory contents */
41 #define PAPR_PMEM_UNARMED PPC_BIT(0)
42 
43 /*
44  * The nvdimm size should be aligned to SCM block size.
45  * The SCM block size should be aligned to SPAPR_MEMORY_BLOCK_SIZE
46  * in order to have SCM regions not to overlap with dimm memory regions.
47  * The SCM devices can have variable block sizes. For now, fixing the
48  * block size to the minimum value.
49  */
50 #define SPAPR_MINIMUM_SCM_BLOCK_SIZE SPAPR_MEMORY_BLOCK_SIZE
51 
52 /* Have an explicit check for alignment */
53 QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
54 
55 #define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
56 OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
57 
/* Class for the spapr-nvdimm device type */
struct SPAPRNVDIMMClass {
    /* private */
    NVDIMMClass parent_class;

    /* public */

    /* realize/unrealize hooks for the spapr-specific NVDIMM flavour */
    void (*realize)(NVDIMMDevice *dimm, Error **errp);
    void (*unrealize)(NVDIMMDevice *dimm, Error **errp);
};
66 
spapr_nvdimm_validate(HotplugHandler * hotplug_dev,NVDIMMDevice * nvdimm,uint64_t size,Error ** errp)67 bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
68                            uint64_t size, Error **errp)
69 {
70     const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev);
71     const MachineState *ms = MACHINE(hotplug_dev);
72     PCDIMMDevice *dimm = PC_DIMM(nvdimm);
73     MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
74     g_autofree char *uuidstr = NULL;
75     QemuUUID uuid;
76     int ret;
77 
78     if (!mc->nvdimm_supported) {
79         error_setg(errp, "NVDIMM hotplug not supported for this machine");
80         return false;
81     }
82 
83     if (!ms->nvdimms_state->is_enabled) {
84         error_setg(errp, "nvdimm device found but 'nvdimm=off' was set");
85         return false;
86     }
87 
88     if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP,
89                                 &error_abort) == 0) {
90         error_setg(errp, "PAPR requires NVDIMM devices to have label-size set");
91         return false;
92     }
93 
94     if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
95         error_setg(errp, "PAPR requires NVDIMM memory size (excluding label)"
96                    " to be a multiple of %" PRIu64 "MB",
97                    SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
98         return false;
99     }
100 
101     uuidstr = object_property_get_str(OBJECT(nvdimm), NVDIMM_UUID_PROP,
102                                       &error_abort);
103     ret = qemu_uuid_parse(uuidstr, &uuid);
104     g_assert(!ret);
105 
106     if (qemu_uuid_is_null(&uuid)) {
107         error_setg(errp, "NVDIMM device requires the uuid to be set");
108         return false;
109     }
110 
111     if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM) &&
112         (memory_region_get_fd(mr) < 0)) {
113         error_setg(errp, "spapr-nvdimm device requires the "
114                    "memdev %s to be of memory-backend-file type",
115                    object_get_canonical_path_component(OBJECT(dimm->hostmem)));
116         return false;
117     }
118 
119     return true;
120 }
121 
122 
/*
 * Attach a (pre-validated) NVDIMM to the PMEM DRC matching @slot and,
 * on hotplug, notify the guest about the new device.
 */
void spapr_add_nvdimm(DeviceState *dev, uint64_t slot)
{
    bool is_hotplug = spapr_drc_hotplugged(dev);
    SpaprDrc *drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);

    g_assert(drc);

    /*
     * The slot came from pc_dimm_get_free_slot() at pre-plug, so the
     * matching DRC must be free and attachable.
     */
    spapr_drc_attach(drc, dev);

    if (is_hotplug) {
        spapr_hotplug_req_add_by_index(drc);
    }
}
141 
/*
 * Build the "ibm,pmemory@<drc-index>" device tree node describing one
 * NVDIMM under @parent_offset.  Returns the offset of the new node.
 */
static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt,
                           int parent_offset, NVDIMMDevice *nvdimm)
{
    int child_offset;
    char *buf;
    SpaprDrc *drc;
    uint32_t drc_idx;
    uint32_t node = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_NODE_PROP,
                                             &error_abort);
    uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP,
                                             &error_abort);
    uint64_t lsize = nvdimm->label_size;
    uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                            NULL);

    /* The PMEM DRC for this device is keyed by its slot number */
    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_PMEM, slot);
    g_assert(drc);

    drc_idx = spapr_drc_index(drc);

    buf = g_strdup_printf("ibm,pmemory@%x", drc_idx);
    child_offset = fdt_add_subnode(fdt, parent_offset, buf);
    g_free(buf);

    _FDT(child_offset);

    _FDT((fdt_setprop_cell(fdt, child_offset, "reg", drc_idx)));
    _FDT((fdt_setprop_string(fdt, child_offset, "compatible", "ibm,pmemory")));
    _FDT((fdt_setprop_string(fdt, child_offset, "device_type", "ibm,pmemory")));

    /* NUMA associativity for the node the DIMM is assigned to */
    spapr_numa_write_associativity_dt(spapr, fdt, child_offset, node);

    buf = qemu_uuid_unparse_strdup(&nvdimm->uuid);
    _FDT((fdt_setprop_string(fdt, child_offset, "ibm,unit-guid", buf)));
    g_free(buf);

    _FDT((fdt_setprop_cell(fdt, child_offset, "ibm,my-drc-index", drc_idx)));

    /* Geometry: fixed SCM block size, device size in blocks, label size */
    _FDT((fdt_setprop_u64(fdt, child_offset, "ibm,block-size",
                          SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
    _FDT((fdt_setprop_u64(fdt, child_offset, "ibm,number-of-blocks",
                          size / SPAPR_MINIMUM_SCM_BLOCK_SIZE)));
    _FDT((fdt_setprop_cell(fdt, child_offset, "ibm,metadata-size", lsize)));

    _FDT((fdt_setprop_string(fdt, child_offset, "ibm,pmem-application",
                             "operating-system")));
    _FDT(fdt_setprop(fdt, child_offset, "ibm,cache-flush-required", NULL, 0));

    if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
        bool is_pmem = false, pmem_override = false;
        PCDIMMDevice *dimm = PC_DIMM(nvdimm);
        HostMemoryBackend *hostmem = dimm->hostmem;

        is_pmem = object_property_get_bool(OBJECT(hostmem), "pmem", NULL);
        pmem_override = object_property_get_bool(OBJECT(nvdimm),
                                                 "pmem-override", NULL);
        /* Tell the guest to use H_SCM_FLUSH unless backed by real pmem */
        if (!is_pmem || pmem_override) {
            _FDT(fdt_setprop(fdt, child_offset, "ibm,hcall-flush-required",
                             NULL, 0));
        }
    }

    return child_offset;
}
206 
/*
 * DRC dt-populate hook for PMEM connectors: emit the node for the
 * attached NVDIMM at the top of @fdt and report its start offset.
 * Always succeeds (returns 0).
 */
int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
                           void *fdt, int *fdt_start_offset, Error **errp)
{
    *fdt_start_offset = spapr_dt_nvdimm(spapr, fdt, 0, NVDIMM(drc->dev));

    return 0;
}
216 
/*
 * Populate the "ibm,persistent-memory" container node with one child
 * per cold plugged NVDIMM device, creating the container if needed.
 */
void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt)
{
    GSList *nvdimms = nvdimm_get_device_list();
    GSList *list;
    int offset = fdt_subnode_offset(fdt, 0, "ibm,persistent-memory");

    /* First caller creates the container node */
    if (offset < 0) {
        offset = fdt_add_subnode(fdt, 0, "ibm,persistent-memory");
        _FDT(offset);
        _FDT((fdt_setprop_cell(fdt, offset, "#address-cells", 0x1)));
        _FDT((fdt_setprop_cell(fdt, offset, "#size-cells", 0x0)));
        _FDT((fdt_setprop_string(fdt, offset, "device_type",
                                 "ibm,persistent-memory")));
    }

    /* One child node per cold plugged NVDIMM device */
    for (list = nvdimms; list; list = list->next) {
        spapr_dt_nvdimm(spapr, fdt, offset, list->data);
    }

    g_slist_free(nvdimms);
}
239 
/*
 * H_SCM_READ_METADATA
 * Input: drc_index, offset, len (1, 2, 4 or 8 bytes)
 * Out: the value read, big-endian, in args[0]
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P3
 *
 * Reads @len bytes from the NVDIMM label area at @offset.
 */
static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
                                        SpaprMachineState *spapr,
                                        target_ulong opcode,
                                        target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t offset = args[1];
    uint64_t len = args[2];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    NVDIMMClass *ddc;
    uint8_t buf[8] = { 0 };
    uint64_t val = 0;

    /* The DRC must exist, be plugged, and be a PMEM connector */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* Only naturally sized accesses are supported */
    if (len != 1 && len != 2 && len != 4 && len != 8) {
        return H_P3;
    }

    nvdimm = NVDIMM(drc->dev);

    /* Reject wrapping offsets and accesses beyond the label area */
    if (offset + len < offset || len + offset > nvdimm->label_size) {
        return H_P2;
    }

    ddc = NVDIMM_GET_CLASS(nvdimm);
    ddc->read_label_data(nvdimm, buf, len, offset);

    /* Widen the buffer contents, multi-byte values are big-endian */
    switch (len) {
    case 1:
        val = ldub_p(buf);
        break;
    case 2:
        val = lduw_be_p(buf);
        break;
    case 4:
        val = ldl_be_p(buf);
        break;
    case 8:
        val = ldq_be_p(buf);
        break;
    default:
        g_assert_not_reached();
    }

    args[0] = val;

    return H_SUCCESS;
}
294 
/*
 * H_SCM_WRITE_METADATA
 * Input: drc_index, offset, data, len (1, 2, 4 or 8 bytes)
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P4
 *
 * Writes @len bytes of @data to the NVDIMM label area at @offset.
 */
static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
                                         SpaprMachineState *spapr,
                                         target_ulong opcode,
                                         target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t offset = args[1];
    uint64_t data = args[2];
    uint64_t len = args[3];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    NVDIMMClass *ddc;
    uint8_t buf[8] = { 0 };

    /* The DRC must exist, be plugged, and be a PMEM connector */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* Only naturally sized accesses are supported */
    if (len != 1 && len != 2 && len != 4 && len != 8) {
        return H_P4;
    }

    nvdimm = NVDIMM(drc->dev);

    /* Reject wrapping offsets, out-of-label accesses, read-only devices */
    if (offset + len < offset || len + offset > nvdimm->label_size ||
        nvdimm->readonly) {
        return H_P2;
    }

    /* The value must fit into the requested access size */
    if (len < 8 && (data >> (len * 8)) != 0) {
        return H_P2;
    }

    /* Serialize into the scratch buffer; multi-byte values big-endian */
    switch (len) {
    case 1:
        stb_p(buf, data);
        break;
    case 2:
        stw_be_p(buf, data);
        break;
    case 4:
        stl_be_p(buf, data);
        break;
    case 8:
        stq_be_p(buf, data);
        break;
    default:
        g_assert_not_reached();
    }

    ddc = NVDIMM_GET_CLASS(nvdimm);
    ddc->write_label_data(nvdimm, buf, len, offset);

    return H_SUCCESS;
}
357 
/*
 * H_SCM_BIND_MEM
 * Input: drc_index, starting_idx, no_of_scm_blocks_to_bind,
 *        target_logical_mem_addr, continue_token
 * Out: continue_token, target logical address, blocks bound
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P3, H_P5, H_OVERLAP
 *
 * QEMU binds the whole device at plug time, so this hcall only
 * validates its arguments and reports the address it already chose.
 */
static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                   target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t starting_idx = args[1];
    uint64_t no_of_scm_blocks_to_bind = args[2];
    uint64_t target_logical_mem_addr = args[3];
    uint64_t continue_token = args[4];
    uint64_t size;
    uint64_t total_no_of_scm_blocks;
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    hwaddr addr;
    NVDIMMDevice *nvdimm;

    /* The DRC must exist, be plugged, and be a PMEM connector */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /*
     * The continue token must be zero: QEMU has already bound
     * everything, so this hcall never returns H_BUSY.
     */
    if (continue_token > 0) {
        return H_P5;
    }

    /* Currently qemu assigns the address; the guest may not pick one */
    if (target_logical_mem_addr != 0xffffffffffffffff) {
        return H_OVERLAP;
    }

    nvdimm = NVDIMM(drc->dev);

    size = object_property_get_uint(OBJECT(nvdimm),
                                    PC_DIMM_SIZE_PROP, &error_abort);

    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;

    if (starting_idx > total_no_of_scm_blocks) {
        return H_P2;
    }

    /* Guard against index + count wrapping or exceeding the device */
    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
        ((starting_idx + no_of_scm_blocks_to_bind) > total_no_of_scm_blocks)) {
        return H_P3;
    }

    addr = object_property_get_uint(OBJECT(nvdimm),
                                    PC_DIMM_ADDR_PROP, &error_abort);

    addr += starting_idx * SPAPR_MINIMUM_SCM_BLOCK_SIZE;

    /* Already bound, Return target logical address in R5 */
    args[1] = addr;
    args[2] = no_of_scm_blocks_to_bind;

    return H_SUCCESS;
}
417 
/* Tracks one in-flight or completed H_SCM_FLUSH request */
typedef struct SpaprNVDIMMDeviceFlushState {
    uint64_t continue_token;    /* token handed back to the guest */
    int64_t hcall_ret;          /* worker result, valid once completed */
    uint32_t drcidx;            /* DRC index of the device being flushed */

    QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
} SpaprNVDIMMDeviceFlushState;
425 
typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
struct SpaprNVDIMMDevice {
    /* private */
    NVDIMMDevice parent_obj;

    /* True when the guest must use H_SCM_FLUSH for persistence */
    bool hcall_flush_required;
    /* Last token issued; tokens start at 1, 0 means "no job" */
    uint64_t nvdimm_flush_token;
    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
    QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;

    /* public */

    /*
     * When 'on', this property forces QEMU to enable the hcall flush
     * for the nvdimm device even if the backend is a pmem.
     */
    bool pmem_override;
};
444 
/*
 * Thread-pool worker: persist the contents of the NVDIMM backend.
 *
 * For a pmem backend the mapping is flushed with pmem_persist();
 * otherwise the backing file is fdatasync'ed.  Returns H_SUCCESS or
 * H_HARDWARE, which the completion callback records for the guest.
 */
static int flush_worker_cb(void *opaque)
{
    SpaprNVDIMMDeviceFlushState *state = opaque;
    SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
    PCDIMMDevice *dimm;
    HostMemoryBackend *backend;
    int backend_fd;

    g_assert(drc != NULL);

    dimm = PC_DIMM(drc->dev);
    backend = MEMORY_BACKEND(dimm->hostmem);
    backend_fd = memory_region_get_fd(&backend->mr);

    if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
        MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
        void *ptr = memory_region_get_ram_ptr(mr);
        size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
                                               NULL);

        /* flush pmem backend */
        pmem_persist(ptr, size);
    } else {
        /* flush raw backing image */
        if (qemu_fdatasync(backend_fd) < 0) {
            error_report("papr_scm: Could not sync nvdimm to backend file: %s",
                         strerror(errno));
            return H_HARDWARE;
        }
    }

    return H_SUCCESS;
}
478 
spapr_nvdimm_flush_completion_cb(void * opaque,int hcall_ret)479 static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
480 {
481     SpaprNVDIMMDeviceFlushState *state = opaque;
482     SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
483     SpaprNVDIMMDevice *s_nvdimm;
484 
485     g_assert(drc != NULL);
486 
487     s_nvdimm = SPAPR_NVDIMM(drc->dev);
488 
489     state->hcall_ret = hcall_ret;
490     QLIST_REMOVE(state, node);
491     QLIST_INSERT_HEAD(&s_nvdimm->completed_nvdimm_flush_states, state, node);
492 }
493 
spapr_nvdimm_flush_post_load(void * opaque,int version_id)494 static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
495 {
496     SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
497     SpaprNVDIMMDeviceFlushState *state;
498     HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(s_nvdimm)->hostmem);
499     bool is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
500     bool pmem_override = object_property_get_bool(OBJECT(s_nvdimm),
501                                                   "pmem-override", NULL);
502     bool dest_hcall_flush_required = pmem_override || !is_pmem;
503 
504     if (!s_nvdimm->hcall_flush_required && dest_hcall_flush_required) {
505         error_report("The file backend for the spapr-nvdimm device %s at "
506                      "source is a pmem, use pmem=on and pmem-override=off to "
507                      "continue.", DEVICE(s_nvdimm)->id);
508         return -EINVAL;
509     }
510     if (s_nvdimm->hcall_flush_required && !dest_hcall_flush_required) {
511         error_report("The guest expects hcall-flush support for the "
512                      "spapr-nvdimm device %s, use pmem_override=on to "
513                      "continue.", DEVICE(s_nvdimm)->id);
514         return -EINVAL;
515     }
516 
517     QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
518         thread_pool_submit_aio(flush_worker_cb, state,
519                                spapr_nvdimm_flush_completion_cb, state);
520     }
521 
522     return 0;
523 }
524 
/* Migration format of a single flush request */
static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
     .name = "spapr_nvdimm_flush_state",
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (const VMStateField[]) {
         VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
         VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
         VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
         VMSTATE_END_OF_LIST()
     },
};
536 
/* Per-device flush bookkeeping, including both flush state lists */
const VMStateDescription vmstate_spapr_nvdimm_states = {
    .name = "spapr_nvdimm_states",
    .version_id = 1,
    .minimum_version_id = 1,
    .post_load = spapr_nvdimm_flush_post_load,
    .fields = (const VMStateField[]) {
        VMSTATE_BOOL(hcall_flush_required, SpaprNVDIMMDevice),
        VMSTATE_UINT64(nvdimm_flush_token, SpaprNVDIMMDevice),
        VMSTATE_QLIST_V(completed_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
                        vmstate_spapr_nvdimm_flush_state,
                        SpaprNVDIMMDeviceFlushState, node),
        VMSTATE_QLIST_V(pending_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
                        vmstate_spapr_nvdimm_flush_state,
                        SpaprNVDIMMDeviceFlushState, node),
        VMSTATE_END_OF_LIST()
    },
};
554 
555 /*
556  * Assign a token and reserve it for the new flush state.
557  */
spapr_nvdimm_init_new_flush_state(SpaprNVDIMMDevice * spapr_nvdimm)558 static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
559                                                 SpaprNVDIMMDevice *spapr_nvdimm)
560 {
561     SpaprNVDIMMDeviceFlushState *state;
562 
563     state = g_malloc0(sizeof(*state));
564 
565     spapr_nvdimm->nvdimm_flush_token++;
566     /* Token zero is presumed as no job pending. Assert on overflow to zero */
567     g_assert(spapr_nvdimm->nvdimm_flush_token != 0);
568 
569     state->continue_token = spapr_nvdimm->nvdimm_flush_token;
570 
571     QLIST_INSERT_HEAD(&spapr_nvdimm->pending_nvdimm_flush_states, state, node);
572 
573     return state;
574 }
575 
576 /*
577  * spapr_nvdimm_finish_flushes
578  *      Waits for all pending flush requests to complete
579  *      their execution and free the states
580  */
spapr_nvdimm_finish_flushes(void)581 void spapr_nvdimm_finish_flushes(void)
582 {
583     SpaprNVDIMMDeviceFlushState *state, *next;
584     GSList *list, *nvdimms;
585 
586     /*
587      * Called on reset path, the main loop thread which calls
588      * the pending BHs has gotten out running in the reset path,
589      * finally reaching here. Other code path being guest
590      * h_client_architecture_support, that's early boot up.
591      */
592     nvdimms = nvdimm_get_device_list();
593     for (list = nvdimms; list; list = list->next) {
594         NVDIMMDevice *nvdimm = list->data;
595         if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
596             SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(nvdimm);
597             while (!QLIST_EMPTY(&s_nvdimm->pending_nvdimm_flush_states)) {
598                 aio_poll(qemu_get_aio_context(), true);
599             }
600 
601             QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
602                                node, next) {
603                 QLIST_REMOVE(state, node);
604                 g_free(state);
605             }
606         }
607     }
608     g_slist_free(nvdimms);
609 }
610 
611 /*
612  * spapr_nvdimm_get_flush_status
613  *      Fetches the status of the hcall worker and returns
614  *      H_LONG_BUSY_ORDER_10_MSEC if the worker is still running.
615  */
spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice * s_nvdimm,uint64_t token)616 static int spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice *s_nvdimm,
617                                          uint64_t token)
618 {
619     SpaprNVDIMMDeviceFlushState *state, *node;
620 
621     QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
622         if (state->continue_token == token) {
623             return H_LONG_BUSY_ORDER_10_MSEC;
624         }
625     }
626 
627     QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
628                        node, node) {
629         if (state->continue_token == token) {
630             int ret = state->hcall_ret;
631             QLIST_REMOVE(state, node);
632             g_free(state);
633             return ret;
634         }
635     }
636 
637     /* If not found in complete list too, invalid token */
638     return H_P2;
639 }
640 
641 /*
642  * H_SCM_FLUSH
643  * Input: drc_index, continue-token
644  * Out: continue-token
645  * Return Value: H_SUCCESS, H_Parameter, H_P2, H_LONG_BUSY_ORDER_10_MSEC,
646  *               H_UNSUPPORTED
647  *
648  * Given a DRC Index Flush the data to backend NVDIMM device. The hcall returns
649  * H_LONG_BUSY_ORDER_10_MSEC when the flush takes longer time and the hcall
650  * needs to be issued multiple times in order to be completely serviced. The
651  * continue-token from the output to be passed in the argument list of
652  * subsequent hcalls until the hcall is completely serviced at which point
653  * H_SUCCESS or other error is returned.
654  */
h_scm_flush(PowerPCCPU * cpu,SpaprMachineState * spapr,target_ulong opcode,target_ulong * args)655 static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
656                                 target_ulong opcode, target_ulong *args)
657 {
658     int ret;
659     uint32_t drc_index = args[0];
660     uint64_t continue_token = args[1];
661     SpaprDrc *drc = spapr_drc_by_index(drc_index);
662     PCDIMMDevice *dimm;
663     HostMemoryBackend *backend = NULL;
664     SpaprNVDIMMDeviceFlushState *state;
665     int fd;
666 
667     if (!drc || !drc->dev ||
668         spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
669         return H_PARAMETER;
670     }
671 
672     dimm = PC_DIMM(drc->dev);
673     if (!object_dynamic_cast(OBJECT(dimm), TYPE_SPAPR_NVDIMM)) {
674         return H_PARAMETER;
675     }
676     if (continue_token == 0) {
677         bool is_pmem = false, pmem_override = false;
678         backend = MEMORY_BACKEND(dimm->hostmem);
679         fd = memory_region_get_fd(&backend->mr);
680 
681         if (fd < 0) {
682             return H_UNSUPPORTED;
683         }
684 
685         is_pmem = object_property_get_bool(OBJECT(backend), "pmem", NULL);
686         pmem_override = object_property_get_bool(OBJECT(dimm),
687                                                 "pmem-override", NULL);
688         if (is_pmem && !pmem_override) {
689             return H_UNSUPPORTED;
690         }
691 
692         state = spapr_nvdimm_init_new_flush_state(SPAPR_NVDIMM(dimm));
693         if (!state) {
694             return H_HARDWARE;
695         }
696 
697         state->drcidx = drc_index;
698 
699         thread_pool_submit_aio(flush_worker_cb, state,
700                                spapr_nvdimm_flush_completion_cb, state);
701 
702         continue_token = state->continue_token;
703     }
704 
705     ret = spapr_nvdimm_get_flush_status(SPAPR_NVDIMM(dimm), continue_token);
706     if (H_IS_LONG_BUSY(ret)) {
707         args[0] = continue_token;
708     }
709 
710     return ret;
711 }
712 
/*
 * H_SCM_UNBIND_MEM
 * Input: drc_index, starting_scm_logical_addr,
 *        no_of_scm_blocks_to_unbind, continue_token
 * Out: blocks unbound
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P3, H_P4
 *
 * Validates the requested range against the device; the actual unbind
 * is left to the unplug path.
 */
static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                     target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    uint64_t starting_scm_logical_addr = args[1];
    uint64_t no_of_scm_blocks_to_unbind = args[2];
    uint64_t continue_token = args[3];
    uint64_t size_to_unbind;
    Range blockrange = range_empty;
    Range nvdimmrange = range_empty;
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    uint64_t size, addr;

    /* The DRC must exist, be plugged, and be a PMEM connector */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
    if (continue_token > 0) {
        return H_P4;
    }

    /* Check if starting_scm_logical_addr is block aligned */
    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
        return H_P2;
    }

    /* Reject zero-length requests and multiplication overflow */
    size_to_unbind = no_of_scm_blocks_to_unbind * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
    if (no_of_scm_blocks_to_unbind == 0 || no_of_scm_blocks_to_unbind !=
                               size_to_unbind / SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
        return H_P3;
    }

    nvdimm = NVDIMM(drc->dev);
    size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                   &error_abort);
    addr = object_property_get_int(OBJECT(nvdimm), PC_DIMM_ADDR_PROP,
                                   &error_abort);

    /* The requested range must lie entirely within the device */
    range_init_nofail(&nvdimmrange, addr, size);
    range_init_nofail(&blockrange, starting_scm_logical_addr, size_to_unbind);

    if (!range_contains_range(&nvdimmrange, &blockrange)) {
        return H_P3;
    }

    args[1] = no_of_scm_blocks_to_unbind;

    /* let unplug take care of actual unbind */
    return H_SUCCESS;
}
767 
768 #define H_UNBIND_SCOPE_ALL 0x1
769 #define H_UNBIND_SCOPE_DRC 0x2
770 
/*
 * H_SCM_UNBIND_ALL
 * Input: target_scope (ALL or a single DRC), drc_index, continue_token
 * Out: blocks unbound
 * Return Value: H_SUCCESS, H_PARAMETER, H_P2, H_P4
 *
 * Reports how many SCM blocks would be unbound for the given scope;
 * the actual unbind is left to the unplug path.
 */
static target_ulong h_scm_unbind_all(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                     target_ulong opcode, target_ulong *args)
{
    uint64_t target_scope = args[0];
    uint32_t drc_index = args[1];
    uint64_t continue_token = args[2];
    NVDIMMDevice *nvdimm;
    uint64_t size;
    uint64_t no_of_scm_blocks_unbound = 0;

    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
    if (continue_token > 0) {
        return H_P4;
    }

    if (target_scope == H_UNBIND_SCOPE_DRC) {
        /* Single device: count its blocks */
        SpaprDrc *drc = spapr_drc_by_index(drc_index);

        if (!drc || !drc->dev ||
            spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
            return H_P2;
        }

        nvdimm = NVDIMM(drc->dev);
        size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                       &error_abort);

        no_of_scm_blocks_unbound = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
    } else if (target_scope ==  H_UNBIND_SCOPE_ALL) {
        /* All NVDIMMs: sum the blocks of every device */
        GSList *list, *nvdimms;

        nvdimms = nvdimm_get_device_list();
        for (list = nvdimms; list; list = list->next) {
            nvdimm = list->data;
            size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
                                           &error_abort);

            no_of_scm_blocks_unbound += size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
        }
        g_slist_free(nvdimms);
    } else {
        return H_PARAMETER;
    }

    args[1] = no_of_scm_blocks_unbound;

    /* let unplug take care of actual unbind */
    return H_SUCCESS;
}
820 
/*
 * H_SCM_HEALTH
 * Input: drc_index
 * Out: health bitmap, bitmap mask
 * Return Value: H_SUCCESS, H_PARAMETER
 *
 * Reports the device health; the only bit implemented is
 * PAPR_PMEM_UNARMED, driven by the NVDIMM "unarmed" property.
 */
static target_ulong h_scm_health(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                 target_ulong opcode, target_ulong *args)
{
    uint32_t drc_index = args[0];
    SpaprDrc *drc = spapr_drc_by_index(drc_index);
    NVDIMMDevice *nvdimm;
    uint64_t hbitmap = 0;

    /* Ensure that the drc is valid & is valid PMEM dimm and is plugged in */
    if (!drc || !drc->dev ||
        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
        return H_PARAMETER;
    }

    nvdimm = NVDIMM(drc->dev);

    /* Report the unarmed state of the nvdimm via the health bitmap */
    if (object_property_get_bool(OBJECT(nvdimm), NVDIMM_UNARMED_PROP, NULL)) {
        hbitmap |= PAPR_PMEM_UNARMED;
    }

    /* Health bitmap in R4, mask of the bits qemu implements in R5 */
    args[0] = hbitmap;
    args[1] = PAPR_PMEM_UNARMED;

    return H_SUCCESS;
}
851 
/* Register the PAPR SCM hypercall handlers */
static void spapr_scm_register_types(void)
{
    /* qemu/scm specific hcalls */
    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
    spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
    spapr_register_hypercall(H_SCM_HEALTH, h_scm_health);
    spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush);
}

type_init(spapr_scm_register_types)
865 
866 static void spapr_nvdimm_realize(NVDIMMDevice *dimm, Error **errp)
867 {
868     SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(dimm);
869     HostMemoryBackend *backend = MEMORY_BACKEND(PC_DIMM(dimm)->hostmem);
870     bool is_pmem = object_property_get_bool(OBJECT(backend),  "pmem", NULL);
871     bool pmem_override = object_property_get_bool(OBJECT(dimm), "pmem-override",
872                                              NULL);
873     if (!is_pmem || pmem_override) {
874         s_nvdimm->hcall_flush_required = true;
875     }
876 
877     vmstate_register_any(NULL, &vmstate_spapr_nvdimm_states, dimm);
878 }
879 
/* Unrealize hook: drop the migration handler registered at realize */
static void spapr_nvdimm_unrealize(NVDIMMDevice *dimm)
{
    vmstate_unregister(NULL, &vmstate_spapr_nvdimm_states, dimm);
}
884 
#ifdef CONFIG_LIBPMEM
/* "pmem-override" only makes sense when pmem backends are available */
static const Property spapr_nvdimm_properties[] = {
    DEFINE_PROP_BOOL("pmem-override", SpaprNVDIMMDevice, pmem_override, false),
};
#endif
890 
/* Class init: install the spapr-specific NVDIMM hooks and properties */
static void spapr_nvdimm_class_init(ObjectClass *oc, const void *data)
{
    NVDIMMClass *nvdimm_class = NVDIMM_CLASS(oc);

    nvdimm_class->realize = spapr_nvdimm_realize;
    nvdimm_class->unrealize = spapr_nvdimm_unrealize;

#ifdef CONFIG_LIBPMEM
    device_class_set_props(DEVICE_CLASS(oc), spapr_nvdimm_properties);
#endif
}
902 
/* Instance init: no flush required yet, no flush jobs queued */
static void spapr_nvdimm_init(Object *obj)
{
    SpaprNVDIMMDevice *nvdimm = SPAPR_NVDIMM(obj);

    nvdimm->hcall_flush_required = false;
    QLIST_INIT(&nvdimm->pending_nvdimm_flush_states);
    QLIST_INIT(&nvdimm->completed_nvdimm_flush_states);
}
911 
912 static TypeInfo spapr_nvdimm_info = {
913     .name          = TYPE_SPAPR_NVDIMM,
914     .parent        = TYPE_NVDIMM,
915     .class_init    = spapr_nvdimm_class_init,
916     .class_size    = sizeof(SPAPRNVDIMMClass),
917     .instance_size = sizeof(SpaprNVDIMMDevice),
918     .instance_init = spapr_nvdimm_init,
919 };
920 
/* Register the spapr-nvdimm QOM type */
static void spapr_nvdimm_register_types(void)
{
    type_register_static(&spapr_nvdimm_info);
}

type_init(spapr_nvdimm_register_types)
927