xref: /qemu/hw/virtio/virtio-mem.c (revision 24c00b754121f3569ea9e68f5f188747cf5b8439)
/*
 * Virtio MEM device
 *
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Authors:
 *  David Hildenbrand <david@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/iov.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "system/numa.h"
#include "system/system.h"
#include "system/reset.h"
#include "system/runstate.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-mem.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "system/ram_addr.h"
#include "migration/misc.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include "hw/acpi/acpi.h"
#include "trace.h"

static const VMStateDescription vmstate_virtio_mem_device_early;

/*
 * We only had legacy x86 guests that did not support
 * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
 */
#if defined(TARGET_X86_64) || defined(TARGET_I386)
#define VIRTIO_MEM_HAS_LEGACY_GUESTS
#endif

/*
 * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
 * bitmap small.
 */
#define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))

static uint32_t virtio_mem_default_thp_size(void)
{
    uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;

#if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__)
    default_thp_size = 2 * MiB;
#elif defined(__aarch64__)
    if (qemu_real_host_page_size() == 4 * KiB) {
        default_thp_size = 2 * MiB;
    } else if (qemu_real_host_page_size() == 16 * KiB) {
        default_thp_size = 32 * MiB;
    } else if (qemu_real_host_page_size() == 64 * KiB) {
        default_thp_size = 512 * MiB;
    }
#elif defined(__s390x__)
    default_thp_size = 1 * MiB;
#endif

    return default_thp_size;
}

/*
 * The minimum memslot size depends on this setting ("sane default"), the
 * device block size, and the memory backend page size. The last (or single)
 * memslot might be smaller than this constant.
 */
#define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB)

/*
 * We want to have a reasonable default block size such that
 * 1. We avoid splitting THPs when unplugging memory, which degrades
 *    performance.
 * 2. We avoid placing THPs for plugged blocks that also cover unplugged
 *    blocks.
 *
 * The actual THP size might differ between Linux kernels, so we try to probe
 * it. In the future (if we ever run into issues regarding 2.), we might want
 * to disable THP in case we fail to properly probe the THP size, or if the
 * block size is configured smaller than the THP size.
 */
static uint32_t thp_size;

#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
#define HPAGE_PATH "/sys/kernel/mm/transparent_hugepage/"
static uint32_t virtio_mem_thp_size(void)
{
    gchar *content = NULL;
    const char *endptr;
    uint64_t tmp;

    if (thp_size) {
        return thp_size;
    }

    /* No THP -> no restrictions. */
    if (!g_file_test(HPAGE_PATH, G_FILE_TEST_EXISTS)) {
        thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
        return thp_size;
    }

    /*
     * Try to probe the actual THP size, fall back to (sane but possibly
     * incorrect) default sizes.
     */
    if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
        !qemu_strtou64(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        /* Sanity-check the value and fall back to something reasonable. */
        if (!tmp || !is_power_of_2(tmp)) {
            warn_report("Read unsupported THP size: %" PRIx64, tmp);
        } else {
            thp_size = tmp;
        }
    }

    if (!thp_size) {
        thp_size = virtio_mem_default_thp_size();
        warn_report("Could not detect THP size, falling back to %" PRIx64
                    " MiB.", thp_size / MiB);
    }

    g_free(content);
    return thp_size;
}

static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
{
    const uint64_t page_size = qemu_ram_pagesize(rb);

    /* We can have hugetlbfs with a page size smaller than the THP size. */
    if (page_size == qemu_real_host_page_size()) {
        return MAX(page_size, virtio_mem_thp_size());
    }
    return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
}

#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
{
    /*
     * We only have a guaranteed shared zeropage on ordinary MAP_PRIVATE
     * anonymous RAM. In any other case, reading unplugged *can* populate a
     * fresh page, consuming actual memory.
     */
    return !qemu_ram_is_shared(rb) && qemu_ram_get_fd(rb) < 0 &&
           qemu_ram_pagesize(rb) == qemu_real_host_page_size();
}
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */

/*
 * Size the usable region bigger than the requested size if possible.
 * Especially, Linux guests will only add (aligned) memory blocks in case they
 * fully fit into the usable region, but plug+online only a subset of the
 * pages. The memory block size corresponds mostly to the section size.
 *
 * This allows, e.g., adding 20 MiB with a section size of 128 MiB on x86_64,
 * and with a section size of 512 MiB on arm64 (as long as the start address
 * is properly aligned, similar to ordinary DIMMs).
 *
 * We can change this at any time and maybe even make it configurable if
 * necessary (as the section size can change). But it's more likely that the
 * section size will get smaller rather than bigger over time.
 */
#if defined(TARGET_X86_64) || defined(TARGET_I386) || defined(TARGET_S390X)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
#elif defined(TARGET_ARM)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
#else
#error VIRTIO_MEM_USABLE_EXTENT not defined
#endif

static bool virtio_mem_is_busy(void)
{
    /*
     * Postcopy cannot handle concurrent discards and we don't want to migrate
     * pages on-demand with stale content when plugging new blocks.
     *
     * For precopy, we don't want unplugged blocks in our migration stream, and
     * when plugging new blocks, the page content might differ between source
     * and destination (observable by the guest when not initializing pages
     * after plugging them) until we're running on the destination (as we didn't
     * migrate these blocks when they were unplugged).
     */
    return migration_in_incoming_postcopy() || migration_is_running();
}

typedef int (*virtio_mem_range_cb)(VirtIOMEM *vmem, void *arg,
                                   uint64_t offset, uint64_t size);

static int virtio_mem_for_each_unplugged_range(VirtIOMEM *vmem, void *arg,
                                               virtio_mem_range_cb cb)
{
    unsigned long first_zero_bit, last_zero_bit;
    uint64_t offset, size;
    int ret = 0;

    first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
    while (first_zero_bit < vmem->bitmap_size) {
        offset = first_zero_bit * vmem->block_size;
        last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_zero_bit + 1) - 1;
        size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;

        ret = cb(vmem, arg, offset, size);
        if (ret) {
            break;
        }
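        /*
         * The bit at last_zero_bit + 1 is known to be set (or past the end
         * of the bitmap), so the next unplugged range can only start at
         * last_zero_bit + 2 or later.
         */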
        first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                            last_zero_bit + 2);
    }
    return ret;
}

static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg,
                                             virtio_mem_range_cb cb)
{
    unsigned long first_bit, last_bit;
    uint64_t offset, size;
    int ret = 0;

    first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
    while (first_bit < vmem->bitmap_size) {
        offset = first_bit * vmem->block_size;
        last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_bit + 1) - 1;
        size = (last_bit - first_bit + 1) * vmem->block_size;

        ret = cb(vmem, arg, offset, size);
        if (ret) {
            break;
        }
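        /*
         * The bit at last_bit + 1 is known to be clear (or past the end of
         * the bitmap), so the next plugged range can only start at
         * last_bit + 2 or later.
         */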
        first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                  last_bit + 2);
    }
    return ret;
}

typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);

static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
                                               MemoryRegionSection *s,
                                               void *arg,
                                               virtio_mem_section_cb cb)
{
    unsigned long first_bit, last_bit;
    uint64_t offset, size;
    int ret = 0;

    first_bit = s->offset_within_region / vmem->block_size;
    first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
    while (first_bit < vmem->bitmap_size) {
        MemoryRegionSection tmp = *s;

        offset = first_bit * vmem->block_size;
        last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                      first_bit + 1) - 1;
        size = (last_bit - first_bit + 1) * vmem->block_size;

        if (!memory_region_section_intersect_range(&tmp, offset, size)) {
            break;
        }
        ret = cb(&tmp, arg);
        if (ret) {
            break;
        }
        first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                  last_bit + 2);
    }
    return ret;
}

static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
                                                 MemoryRegionSection *s,
                                                 void *arg,
                                                 virtio_mem_section_cb cb)
{
    unsigned long first_bit, last_bit;
    uint64_t offset, size;
    int ret = 0;

    first_bit = s->offset_within_region / vmem->block_size;
    first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
    while (first_bit < vmem->bitmap_size) {
        MemoryRegionSection tmp = *s;

        offset = first_bit * vmem->block_size;
        last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
                                 first_bit + 1) - 1;
        size = (last_bit - first_bit + 1) * vmem->block_size;

        if (!memory_region_section_intersect_range(&tmp, offset, size)) {
            break;
        }
        ret = cb(&tmp, arg);
        if (ret) {
            break;
        }
        first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
                                       last_bit + 2);
    }
    return ret;
}

static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
{
    RamDiscardListener *rdl = arg;

    return rdl->notify_populate(rdl, s);
}

static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
{
    RamDiscardListener *rdl = arg;

    rdl->notify_discard(rdl, s);
    return 0;
}

static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
                                     uint64_t size)
{
    RamDiscardListener *rdl;

    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        MemoryRegionSection tmp = *rdl->section;

        if (!memory_region_section_intersect_range(&tmp, offset, size)) {
            continue;
        }
        rdl->notify_discard(rdl, &tmp);
    }
}

static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
                                  uint64_t size)
{
    RamDiscardListener *rdl, *rdl2;
    int ret = 0;

    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        MemoryRegionSection tmp = *rdl->section;

        if (!memory_region_section_intersect_range(&tmp, offset, size)) {
            continue;
        }
        ret = rdl->notify_populate(rdl, &tmp);
        if (ret) {
            break;
        }
    }

    if (ret) {
        /* Roll back: notify all already-notified listeners about the discard. */
        QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
            MemoryRegionSection tmp = *rdl2->section;

            if (rdl2 == rdl) {
                break;
            }
            if (!memory_region_section_intersect_range(&tmp, offset, size)) {
                continue;
            }
            rdl2->notify_discard(rdl2, &tmp);
        }
    }
    return ret;
}

static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
{
    RamDiscardListener *rdl;

    if (!vmem->size) {
        return;
    }

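    /*
     * Listeners that support double discards can simply be notified about the
     * whole section; all others are only notified about the parts that are
     * actually plugged.
     */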
    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        if (rdl->double_discard_supported) {
            rdl->notify_discard(rdl, rdl->section);
        } else {
            virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
                                                virtio_mem_notify_discard_cb);
        }
    }
}

static bool virtio_mem_is_range_plugged(const VirtIOMEM *vmem,
                                        uint64_t start_gpa, uint64_t size)
{
    const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
    unsigned long found_bit;

    /* We fake a shorter bitmap to avoid searching too far. */
    found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
    return found_bit > last_bit;
}

static bool virtio_mem_is_range_unplugged(const VirtIOMEM *vmem,
                                          uint64_t start_gpa, uint64_t size)
{
    const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
    unsigned long found_bit;

    /* We fake a shorter bitmap to avoid searching too far. */
    found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
    return found_bit > last_bit;
}

static void virtio_mem_set_range_plugged(VirtIOMEM *vmem, uint64_t start_gpa,
                                         uint64_t size)
{
    const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long nbits = size / vmem->block_size;

    bitmap_set(vmem->bitmap, bit, nbits);
}

static void virtio_mem_set_range_unplugged(VirtIOMEM *vmem, uint64_t start_gpa,
                                           uint64_t size)
{
    const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
    const unsigned long nbits = size / vmem->block_size;

    bitmap_clear(vmem->bitmap, bit, nbits);
}

static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
                                     struct virtio_mem_resp *resp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
    VirtQueue *vq = vmem->vq;

    trace_virtio_mem_send_response(le16_to_cpu(resp->type));
    iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));

    virtqueue_push(vq, elem, sizeof(*resp));
    virtio_notify(vdev, vq);
}

static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
                                            VirtQueueElement *elem,
                                            uint16_t type)
{
    struct virtio_mem_resp resp = {
        .type = cpu_to_le16(type),
    };

    virtio_mem_send_response(vmem, elem, &resp);
}

static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
                                   uint64_t size)
{
    if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
        return false;
    }
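    /* Reject empty ranges and ranges that wrap around the address space. */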
    if (gpa + size < gpa || !size) {
        return false;
    }
    if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
        return false;
    }
    if (gpa + size > vmem->addr + vmem->usable_region_size) {
        return false;
    }
    return true;
}

static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx)
{
    const uint64_t memslot_offset = idx * vmem->memslot_size;

    assert(vmem->memslots);

    /*
     * Instead of enabling/disabling memslots, we add/remove them. This should
     * make address space updates faster, because we don't have to loop over
     * many disabled subregions.
     */
    if (memory_region_is_mapped(&vmem->memslots[idx])) {
        return;
    }
    memory_region_add_subregion(vmem->mr, memslot_offset, &vmem->memslots[idx]);
}

static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx)
{
    assert(vmem->memslots);

    if (!memory_region_is_mapped(&vmem->memslots[idx])) {
        return;
    }
    memory_region_del_subregion(vmem->mr, &vmem->memslots[idx]);
}

static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem,
                                                 uint64_t offset, uint64_t size)
{
    const unsigned int start_idx = offset / vmem->memslot_size;
    const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
                                 vmem->memslot_size;
    unsigned int idx;

    assert(vmem->dynamic_memslots);

    /* Activate all involved memslots in a single transaction. */
    memory_region_transaction_begin();
    for (idx = start_idx; idx < end_idx; idx++) {
        virtio_mem_activate_memslot(vmem, idx);
    }
    memory_region_transaction_commit();
}

static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem,
                                                     uint64_t offset,
                                                     uint64_t size)
{
    const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
    const unsigned int start_idx = offset / vmem->memslot_size;
    const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
                                 vmem->memslot_size;
    unsigned int idx;

    assert(vmem->dynamic_memslots);

    /* Deactivate all memslots with unplugged blocks in a single transaction. */
    memory_region_transaction_begin();
    for (idx = start_idx; idx < end_idx; idx++) {
        const uint64_t memslot_offset = idx * vmem->memslot_size;
        uint64_t memslot_size = vmem->memslot_size;

        /* The size of the last memslot might be smaller. */
        if (idx == vmem->nb_memslots - 1) {
            memslot_size = region_size - memslot_offset;
        }

        /*
         * Partially covered memslots might still have some blocks plugged and
         * have to remain active if that's the case.
         */
        if (offset > memslot_offset ||
            offset + size < memslot_offset + memslot_size) {
            const uint64_t gpa = vmem->addr + memslot_offset;

            if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) {
                continue;
            }
        }

        virtio_mem_deactivate_memslot(vmem, idx);
    }
    memory_region_transaction_commit();
}

static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
                                      uint64_t size, bool plug)
{
    const uint64_t offset = start_gpa - vmem->addr;
    RAMBlock *rb = vmem->memdev->mr.ram_block;
    int ret = 0;

    if (virtio_mem_is_busy()) {
        return -EBUSY;
    }

    if (!plug) {
        if (ram_block_discard_range(rb, offset, size)) {
            return -EBUSY;
        }
        virtio_mem_notify_unplug(vmem, offset, size);
        virtio_mem_set_range_unplugged(vmem, start_gpa, size);
        /* Deactivate completely unplugged memslots after updating the state. */
        if (vmem->dynamic_memslots) {
            virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
        }
        return 0;
    }

    if (vmem->prealloc) {
        void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
        int fd = memory_region_get_fd(&vmem->memdev->mr);
        Error *local_err = NULL;

        if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
            static bool warned;

            /*
             * Warn only once; we don't want to fill the log with these
             * warnings.
             */
            if (!warned) {
                warn_report_err(local_err);
                warned = true;
            } else {
                error_free(local_err);
            }
            ret = -EBUSY;
        }
    }

    if (!ret) {
        /*
         * Activate before notifying and roll back in case of any errors.
         *
         * When activating a yet inactive memslot, memory notifiers will get
         * notified about the added memory region and can register with the
         * RamDiscardManager; this will traverse all plugged blocks and skip the
         * blocks we are plugging here. The following notification will inform
         * registered listeners about the blocks we're plugging.
         */
        if (vmem->dynamic_memslots) {
            virtio_mem_activate_memslots_to_plug(vmem, offset, size);
        }
        ret = virtio_mem_notify_plug(vmem, offset, size);
        if (ret && vmem->dynamic_memslots) {
            virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
        }
    }
    if (ret) {
        /* Could be the preallocation or a notifier that populated memory. */
        ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
        return -EBUSY;
    }

    virtio_mem_set_range_plugged(vmem, start_gpa, size);
    return 0;
}

static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
                                           uint16_t nb_blocks, bool plug)
{
    const uint64_t size = nb_blocks * vmem->block_size;
    int ret;

    if (!virtio_mem_valid_range(vmem, gpa, size)) {
        return VIRTIO_MEM_RESP_ERROR;
    }

    if (plug && (vmem->size + size > vmem->requested_size)) {
        return VIRTIO_MEM_RESP_NACK;
    }

    /* test if really all blocks are in the opposite state */
    if ((plug && !virtio_mem_is_range_unplugged(vmem, gpa, size)) ||
        (!plug && !virtio_mem_is_range_plugged(vmem, gpa, size))) {
        return VIRTIO_MEM_RESP_ERROR;
    }

    ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
    if (ret) {
        return VIRTIO_MEM_RESP_BUSY;
    }
    if (plug) {
        vmem->size += size;
    } else {
        vmem->size -= size;
    }
    notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
    return VIRTIO_MEM_RESP_ACK;
}

static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                    struct virtio_mem_req *req)
{
    const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
    const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
    uint16_t type;

    trace_virtio_mem_plug_request(gpa, nb_blocks);
    type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
    virtio_mem_send_response_simple(vmem, elem, type);
}

static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                      struct virtio_mem_req *req)
{
    const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
    const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
    uint16_t type;

    trace_virtio_mem_unplug_request(gpa, nb_blocks);
    type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
    virtio_mem_send_response_simple(vmem, elem, type);
}

static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
                                            uint64_t requested_size,
                                            bool can_shrink)
{
    uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
                           requested_size + VIRTIO_MEM_USABLE_EXTENT);

    /* The usable region size always has to be a multiple of the block size. */
    newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);

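    /* A requested size of 0 always results in an empty usable region. */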
    if (!requested_size) {
        newsize = 0;
    }

    if (newsize < vmem->usable_region_size && !can_shrink) {
        return;
    }

    trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
    vmem->usable_region_size = newsize;
}

static int virtio_mem_unplug_all(VirtIOMEM *vmem)
{
    const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
    RAMBlock *rb = vmem->memdev->mr.ram_block;

    if (vmem->size) {
        if (virtio_mem_is_busy()) {
            return -EBUSY;
        }
        if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
            return -EBUSY;
        }
        virtio_mem_notify_unplug_all(vmem);

        bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
        vmem->size = 0;
        notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);

        /* Deactivate all memslots after updating the state. */
        if (vmem->dynamic_memslots) {
            virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size);
        }
    }

    trace_virtio_mem_unplugged_all();
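    /* The usable region may shrink again now that everything is unplugged. */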
    virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
    return 0;
}

static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
                                          VirtQueueElement *elem)
{
    trace_virtio_mem_unplug_all_request();
    if (virtio_mem_unplug_all(vmem)) {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
    } else {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
    }
}

static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
                                     struct virtio_mem_req *req)
{
    const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
    const uint64_t gpa = le64_to_cpu(req->u.state.addr);
    const uint64_t size = nb_blocks * vmem->block_size;
    struct virtio_mem_resp resp = {
        .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
    };

    trace_virtio_mem_state_request(gpa, nb_blocks);
    if (!virtio_mem_valid_range(vmem, gpa, size)) {
        virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
        return;
    }

    if (virtio_mem_is_range_plugged(vmem, gpa, size)) {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
    } else if (virtio_mem_is_range_unplugged(vmem, gpa, size)) {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
    } else {
        resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
    }
    trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
    virtio_mem_send_response(vmem, elem, &resp);
}

static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
{
    const int len = sizeof(struct virtio_mem_req);
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);
    VirtQueueElement *elem;
    struct virtio_mem_req req;
    uint16_t type;

    while (true) {
        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            return;
        }

        if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
            virtio_error(vdev, "virtio-mem protocol violation: invalid request"
                         " size: %d", len);
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        if (iov_size(elem->in_sg, elem->in_num) <
            sizeof(struct virtio_mem_resp)) {
            virtio_error(vdev, "virtio-mem protocol violation: not enough space"
                         " for response: %zu",
                         iov_size(elem->in_sg, elem->in_num));
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        type = le16_to_cpu(req.type);
        switch (type) {
        case VIRTIO_MEM_REQ_PLUG:
            virtio_mem_plug_request(vmem, elem, &req);
            break;
        case VIRTIO_MEM_REQ_UNPLUG:
            virtio_mem_unplug_request(vmem, elem, &req);
            break;
        case VIRTIO_MEM_REQ_UNPLUG_ALL:
            virtio_mem_unplug_all_request(vmem, elem);
            break;
        case VIRTIO_MEM_REQ_STATE:
            virtio_mem_state_request(vmem, elem, &req);
            break;
        default:
            virtio_error(vdev, "virtio-mem protocol violation: unknown request"
                         " type: %d", type);
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            return;
        }

        g_free(elem);
    }
}

static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);
    struct virtio_mem_config *config = (void *) config_data;

    config->block_size = cpu_to_le64(vmem->block_size);
    config->node_id = cpu_to_le16(vmem->node);
    config->requested_size = cpu_to_le64(vmem->requested_size);
    config->plugged_size = cpu_to_le64(vmem->size);
    config->addr = cpu_to_le64(vmem->addr);
    config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
    config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
}

static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    VirtIOMEM *vmem = VIRTIO_MEM(vdev);

    if (ms->numa_state && acpi_builtin()) {
        virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
    }
    assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO);
    if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) {
        virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE);
    }
    if (qemu_wakeup_suspend_enabled()) {
        virtio_add_feature(&features, VIRTIO_MEM_F_PERSISTENT_SUSPEND);
    }
    return features;
}

static int virtio_mem_validate_features(VirtIODevice *vdev)
{
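    /*
     * Fail the feature negotiation if we offered
     * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE but the driver did not accept it.
     */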
    if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) &&
        !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) {
        return -EFAULT;
    }
    return 0;
}

static void virtio_mem_prepare_mr(VirtIOMEM *vmem)
{
    const uint64_t region_size = memory_region_size(&vmem->memdev->mr);

    assert(!vmem->mr && vmem->dynamic_memslots);
    vmem->mr = g_new0(MemoryRegion, 1);
    memory_region_init(vmem->mr, OBJECT(vmem), "virtio-mem",
                       region_size);
    vmem->mr->align = memory_region_get_alignment(&vmem->memdev->mr);
}

static void virtio_mem_prepare_memslots(VirtIOMEM *vmem)
{
    const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
    unsigned int idx;

    g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots);
    vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots);

    /* Initialize our memslots, but don't map them yet. */
    for (idx = 0; idx < vmem->nb_memslots; idx++) {
        const uint64_t memslot_offset = idx * vmem->memslot_size;
        uint64_t memslot_size = vmem->memslot_size;
        char name[20];

        /* The size of the last memslot might be smaller. */
        if (idx == vmem->nb_memslots - 1) {
            memslot_size = region_size - memslot_offset;
        }

        snprintf(name, sizeof(name), "memslot-%u", idx);
        memory_region_init_alias(&vmem->memslots[idx], OBJECT(vmem), name,
                                 &vmem->memdev->mr, memslot_offset,
                                 memslot_size);
        /*
         * We want to be able to atomically and efficiently activate/deactivate
         * individual memslots without affecting adjacent memslots in memory
         * notifiers.
         */
        memory_region_set_unmergeable(&vmem->memslots[idx], true);
    }
}

static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOMEM *vmem = VIRTIO_MEM(dev);
    uint64_t page_size;
    RAMBlock *rb;
    Object *obj;
    int ret;

    if (!vmem->memdev) {
        error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
        return;
    } else if (host_memory_backend_is_mapped(vmem->memdev)) {
        error_setg(errp, "'%s' property specifies a busy memdev: %s",
                   VIRTIO_MEM_MEMDEV_PROP,
                   object_get_canonical_path_component(OBJECT(vmem->memdev)));
        return;
    } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
        memory_region_is_rom(&vmem->memdev->mr) ||
        !vmem->memdev->mr.ram_block) {
        error_setg(errp, "'%s' property specifies an unsupported memdev",
                   VIRTIO_MEM_MEMDEV_PROP);
        return;
    } else if (vmem->memdev->prealloc) {
        error_setg(errp, "'%s' property specifies a memdev with preallocation"
                   " enabled: %s. Instead, specify 'prealloc=on' for the"
                   " virtio-mem device.", VIRTIO_MEM_MEMDEV_PROP,
                   object_get_canonical_path_component(OBJECT(vmem->memdev)));
        return;
    }

    if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
        (!nb_numa_nodes && vmem->node)) {
        error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
                   " the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
                   vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
        return;
    }

    if (should_mlock(mlock_state)) {
        error_setg(errp, "Incompatible with mlock");
        return;
    }

    rb = vmem->memdev->mr.ram_block;
    page_size = qemu_ram_pagesize(rb);

#if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
    switch (vmem->unplugged_inaccessible) {
    case ON_OFF_AUTO_AUTO:
        if (virtio_mem_has_shared_zeropage(rb)) {
            vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF;
        } else {
            vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
        }
        break;
    case ON_OFF_AUTO_OFF:
        if (!virtio_mem_has_shared_zeropage(rb)) {
            warn_report("'%s' property set to 'off' with a memdev that does"
                        " not support the shared zeropage.",
                        VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
        }
        break;
    default:
        break;
    }
#else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
    vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */

    if (vmem->dynamic_memslots &&
        vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) {
        error_setg(errp, "'%s' property set to 'on' requires '%s' to be 'on'",
                   VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP,
                   VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
        return;
    }

    /*
     * If the block size wasn't configured by the user, use a sane default. This
     * allows using hugetlbfs backends of any page size without manual
     * intervention.
     */
    if (!vmem->block_size) {
        vmem->block_size = virtio_mem_default_block_size(rb);
    }

    if (vmem->block_size < page_size) {
        error_setg(errp, "'%s' property has to be at least the page size (0x%"
                   PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
        return;
    } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
        warn_report("'%s' property is smaller than the default block size (%"
                    PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
                    virtio_mem_default_block_size(rb) / MiB);
    }
    if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
        error_setg(errp, "'%s' property has to be a multiple of '%s' (0x%"
                   PRIx64 ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
        error_setg(errp, "'%s' property has to be a multiple of '%s' (0x%"
                   PRIx64 ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP,
                   vmem->block_size);
        return;
    } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
                                vmem->block_size)) {
        error_setg(errp, "'%s' property memdev size has to be a multiple of"
                   " '%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
                   VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
        return;
    }

    if (ram_block_coordinated_discard_require(true)) {
        error_setg(errp, "Discarding RAM is disabled");
        return;
    }

    /*
     * Set ourselves as RamDiscardManager before the plug handler maps the
     * memory region and exposes it via an address space.
     */
    if (memory_region_set_ram_discard_manager(&vmem->memdev->mr,
                                              RAM_DISCARD_MANAGER(vmem))) {
        error_setg(errp, "Failed to set RamDiscardManager");
        ram_block_coordinated_discard_require(false);
        return;
    }

    /*
     * We don't know at this point whether shared RAM is migrated using
     * QEMU or migrated using the file content. "x-ignore-shared" will be
     * configured after realizing the device. So in case we have an
     * incoming migration, simply always skip the discard step.
     *
     * Otherwise, make sure that we start with a clean slate: either the
     * memory backend might get reused or the shared file might still have
     * memory allocated.
     */
    if (!runstate_check(RUN_STATE_INMIGRATE)) {
        ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
        if (ret) {
            error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
            memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
            ram_block_coordinated_discard_require(false);
            return;
        }
    }

    virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);

    vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
                        vmem->block_size;
    vmem->bitmap = bitmap_new(vmem->bitmap_size);

    virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
    vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);

    /*
     * With "dynamic-memslots=off" (old behavior) we always map the whole
     * RAM memory region directly.
     */
    if (vmem->dynamic_memslots) {
        if (!vmem->mr) {
            virtio_mem_prepare_mr(vmem);
        }
        if (vmem->nb_memslots <= 1) {
            vmem->nb_memslots = 1;
            vmem->memslot_size = memory_region_size(&vmem->memdev->mr);
        }
        if (!vmem->memslots) {
            virtio_mem_prepare_memslots(vmem);
        }
    } else {
        assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots);
    }

    host_memory_backend_set_mapped(vmem->memdev, true);
    vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
    if (vmem->early_migration) {
        vmstate_register_any(VMSTATE_IF(vmem),
                             &vmstate_virtio_mem_device_early, vmem);
    }

    /*
     * We only want to unplug all memory to start with a clean slate when
     * it is safe for the guest -- during system resets that call
     * qemu_devices_reset().
     *
     * We'll filter out selected qemu_devices_reset() calls used for other
     * purposes, like resetting all devices during wakeup from suspend on
     * x86 based on the reset type passed to qemu_devices_reset().
     *
     * Unplugging all memory during simple device resets can result in the VM
     * unexpectedly losing RAM, corrupting VM state.
     *
     * Simple device resets (or resets triggered by getting a parent device
     * reset) must not change the state of plugged memory blocks. Therefore,
     * we need a dedicated reset object that only gets called during
     * qemu_devices_reset().
     */
    obj = object_new(TYPE_VIRTIO_MEM_SYSTEM_RESET);
    vmem->system_reset = VIRTIO_MEM_SYSTEM_RESET(obj);
    vmem->system_reset->vmem = vmem;
    qemu_register_resettable(obj);
}

static void virtio_mem_device_unrealize(DeviceState *dev)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOMEM *vmem = VIRTIO_MEM(dev);

    qemu_unregister_resettable(OBJECT(vmem->system_reset));
    object_unref(OBJECT(vmem->system_reset));

    if (vmem->early_migration) {
        vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
                           vmem);
    }
    vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
    host_memory_backend_set_mapped(vmem->memdev, false);
    virtio_del_queue(vdev, 0);
    virtio_cleanup(vdev);
    g_free(vmem->bitmap);
    /*
     * The unplug handler unmapped the memory region; it cannot be
     * found via an address space anymore. Unset ourselves.
     */
    memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
    ram_block_coordinated_discard_require(false);
}

static int virtio_mem_discard_range_cb(VirtIOMEM *vmem, void *arg,
                                       uint64_t offset, uint64_t size)
{
    RAMBlock *rb = vmem->memdev->mr.ram_block;

    return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
}

static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
{
    /* Make sure all memory is really discarded after migration. */
    return virtio_mem_for_each_unplugged_range(vmem, NULL,
                                               virtio_mem_discard_range_cb);
}

static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg,
                                                uint64_t offset, uint64_t size)
{
    virtio_mem_activate_memslots_to_plug(vmem, offset, size);
    return 0;
}

static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem)
{
    RamDiscardListener *rdl;
    int ret;

    /*
     * We restored the bitmap and updated the requested size; activate all
     * memslots (so listeners register) before notifying about plugged blocks.
     */
    if (vmem->dynamic_memslots) {
        /*
         * We don't expect any active memslots at this point to deactivate: no
         * memory was plugged on the migration destination.
         */
        virtio_mem_for_each_plugged_range(vmem, NULL,
                                          virtio_mem_activate_memslot_range_cb);
    }

    /*
     * We started out with all memory discarded and our memory region is mapped
     * into an address space. Replay, now that we updated the bitmap.
     */
    QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
        ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
                                                 virtio_mem_notify_populate_cb);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

static int virtio_mem_post_load(void *opaque, int version_id)
{
    VirtIOMEM *vmem = VIRTIO_MEM(opaque);
    int ret;

    if (!vmem->early_migration) {
        ret = virtio_mem_post_load_bitmap(vmem);
        if (ret) {
            return ret;
        }
    }

    /*
     * If shared RAM is migrated using the file content and not using QEMU,
     * don't mess with preallocation and postcopy.
     */
    if (migrate_ram_is_ignored(vmem->memdev->mr.ram_block)) {
        return 0;
    }

    if (vmem->prealloc && !vmem->early_migration) {
        warn_report("Proper preallocation with migration requires a newer QEMU machine");
    }

    if (migration_in_incoming_postcopy()) {
        return 0;
    }

    return virtio_mem_restore_unplugged(vmem);
}

static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
                                        uint64_t offset, uint64_t size)
{
    void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
    int fd = memory_region_get_fd(&vmem->memdev->mr);
    Error *local_err = NULL;

    if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
        error_report_err(local_err);
        return -ENOMEM;
    }
    return 0;
}

static int virtio_mem_post_load_early(void *opaque, int version_id)
{
    VirtIOMEM *vmem = VIRTIO_MEM(opaque);
    RAMBlock *rb = vmem->memdev->mr.ram_block;
    int ret;

    if (!vmem->prealloc) {
        goto post_load_bitmap;
    }

    /*
     * If shared RAM is migrated using the file content and not using QEMU,
     * don't mess with preallocation and postcopy.
     */
    if (migrate_ram_is_ignored(rb)) {
        goto post_load_bitmap;
    }

    /*
     * We restored the bitmap and verified that the basic properties
     * match on source and destination, so we can go ahead and preallocate
     * memory for all plugged memory blocks, before actual RAM migration starts
     * touching this memory.
     */
    ret = virtio_mem_for_each_plugged_range(vmem, NULL,
                                            virtio_mem_prealloc_range_cb);
    if (ret) {
        return ret;
    }

    /*
     * This is tricky: postcopy wants to start with a clean slate. On
     * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
     * preallocated) RAM such that postcopy will work as expected later.
     *
     * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
     * RAM migration. So let's discard all memory again. This looks like an
     * expensive NOP, but actually serves a purpose: we made sure that we
     * were able to allocate all required backend memory once. We cannot
     * guarantee that the backend memory we will free will remain free
     * until we need it during postcopy, but at least we can catch the
     * obvious setup issues this way.
     */
    if (migration_incoming_postcopy_advised()) {
        if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
            return -EBUSY;
        }
    }

post_load_bitmap:
    /* Finally, update any other state to be consistent with the new bitmap. */
    return virtio_mem_post_load_bitmap(vmem);
}

1317 typedef struct VirtIOMEMMigSanityChecks {
1318     VirtIOMEM *parent;
1319     uint64_t addr;
1320     uint64_t region_size;
1321     uint64_t block_size;
1322     uint32_t node;
1323 } VirtIOMEMMigSanityChecks;
1324 
virtio_mem_mig_sanity_checks_pre_save(void * opaque)1325 static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
1326 {
1327     VirtIOMEMMigSanityChecks *tmp = opaque;
1328     VirtIOMEM *vmem = tmp->parent;
1329 
1330     tmp->addr = vmem->addr;
1331     tmp->region_size = memory_region_size(&vmem->memdev->mr);
1332     tmp->block_size = vmem->block_size;
1333     tmp->node = vmem->node;
1334     return 0;
1335 }
1336 
virtio_mem_mig_sanity_checks_post_load(void * opaque,int version_id)1337 static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
1338 {
1339     VirtIOMEMMigSanityChecks *tmp = opaque;
1340     VirtIOMEM *vmem = tmp->parent;
1341     const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);
1342 
1343     if (tmp->addr != vmem->addr) {
1344         error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1345                      VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
1346         return -EINVAL;
1347     }
1348     /*
1349      * Note: Preparation for resizable memory regions. The maximum size
1350      * of the memory region must not change during migration.
1351      */
1352     if (tmp->region_size != new_region_size) {
1353         error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
1354                      PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
1355                      new_region_size);
1356         return -EINVAL;
1357     }
1358     if (tmp->block_size != vmem->block_size) {
1359         error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1360                      VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
1361                      vmem->block_size);
1362         return -EINVAL;
1363     }
1364     if (tmp->node != vmem->node) {
1365         error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
1366                      VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
1367         return -EINVAL;
1368     }
1369     return 0;
1370 }
1371 
1372 static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
1373     .name = "virtio-mem-device/sanity-checks",
1374     .pre_save = virtio_mem_mig_sanity_checks_pre_save,
1375     .post_load = virtio_mem_mig_sanity_checks_post_load,
1376     .fields = (const VMStateField[]) {
1377         VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
1378         VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
1379         VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
1380         VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
1381         VMSTATE_END_OF_LIST(),
1382     },
1383 };
1384 
1385 static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
1386 {
1387     const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1388 
1389     /* With early migration, these fields were already migrated. */
1390     return !vmem->early_migration;
1391 }
1392 
1393 static const VMStateDescription vmstate_virtio_mem_device = {
1394     .name = "virtio-mem-device",
1395     .minimum_version_id = 1,
1396     .version_id = 1,
1397     .priority = MIG_PRI_VIRTIO_MEM,
1398     .post_load = virtio_mem_post_load,
1399     .fields = (const VMStateField[]) {
1400         VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
1401                               VirtIOMEMMigSanityChecks,
1402                               vmstate_virtio_mem_sanity_checks),
1403         VMSTATE_UINT64(usable_region_size, VirtIOMEM),
1404         VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
1405         VMSTATE_UINT64(requested_size, VirtIOMEM),
1406         VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
1407                             0, bitmap_size),
1408         VMSTATE_END_OF_LIST()
1409     },
1410 };
1411 
1412 /*
1413  * Transfer properties that are immutable while migration is active early,
1414  * such that we have this information around before migrating any RAM
1415  * content.
1416  *
1417  * Note that virtio_mem_is_busy() makes sure these properties can no longer
1418  * change on the migration source until migration has completed.
1419  *
1420  * With QEMU compat machines, we transmit these properties later, via
1421  * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
1422  */
1423 static const VMStateDescription vmstate_virtio_mem_device_early = {
1424     .name = "virtio-mem-device-early",
1425     .minimum_version_id = 1,
1426     .version_id = 1,
1427     .early_setup = true,
1428     .post_load = virtio_mem_post_load_early,
1429     .fields = (const VMStateField[]) {
1430         VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
1431                          vmstate_virtio_mem_sanity_checks),
1432         VMSTATE_UINT64(size, VirtIOMEM),
1433         VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
1434         VMSTATE_END_OF_LIST()
1435     },
1436 };
1437 
1438 static const VMStateDescription vmstate_virtio_mem = {
1439     .name = "virtio-mem",
1440     .minimum_version_id = 1,
1441     .version_id = 1,
1442     .fields = (const VMStateField[]) {
1443         VMSTATE_VIRTIO_DEVICE,
1444         VMSTATE_END_OF_LIST()
1445     },
1446 };
1447 
1448 static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
1449                                         VirtioMEMDeviceInfo *vi)
1450 {
1451     vi->memaddr = vmem->addr;
1452     vi->node = vmem->node;
1453     vi->requested_size = vmem->requested_size;
1454     vi->size = vmem->size;
1455     vi->max_size = memory_region_size(&vmem->memdev->mr);
1456     vi->block_size = vmem->block_size;
1457     vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
1458 }
1459 
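/*
 * Editor's note (illustrative, based on virtio_mem_prepare_mr() usage below):
 * with the "dynamic-memslots" property enabled, callers get the container
 * memory region (vmem->mr) into which portions of the backing memdev are
 * mapped as individual memslots on demand; otherwise the memdev's RAM region
 * is exposed directly.
 */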
1460 static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
1461 {
1462     if (!vmem->memdev) {
1463         error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
1464         return NULL;
1465     } else if (vmem->dynamic_memslots) {
1466         if (!vmem->mr) {
1467             virtio_mem_prepare_mr(vmem);
1468         }
1469         return vmem->mr;
1470     }
1471 
1472     return &vmem->memdev->mr;
1473 }
1474 
1475 static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit)
1476 {
1477     uint64_t region_size, memslot_size, min_memslot_size;
1478     unsigned int memslots;
1479     RAMBlock *rb;
1480 
1481     if (!vmem->dynamic_memslots) {
1482         return;
1483     }
1484 
1485     /* We're called exactly once, before realizing the device. */
1486     assert(!vmem->nb_memslots);
1487 
1488     /* If realizing the device will fail, just assume a single memslot. */
1489     if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) {
1490         vmem->nb_memslots = 1;
1491         return;
1492     }
1493 
1494     rb = vmem->memdev->mr.ram_block;
1495     region_size = memory_region_size(&vmem->memdev->mr);
1496 
1497     /*
1498      * Determine the default block size now, so we can derive the minimum memslot
1499      * size. We want the minimum slot size to be at least the device block size.
1500      */
1501     if (!vmem->block_size) {
1502         vmem->block_size = virtio_mem_default_block_size(rb);
1503     }
1504     /* If realizing the device will fail, just assume a single memslot. */
1505     if (vmem->block_size < qemu_ram_pagesize(rb) ||
1506         !QEMU_IS_ALIGNED(region_size, vmem->block_size)) {
1507         vmem->nb_memslots = 1;
1508         return;
1509     }
1510 
1511     /*
1512      * All memslots except the last one have a reasonable minimum size, and
1513      * all memslot sizes are aligned to the device block size.
1514      */
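    /*
     * Illustrative example (editor's note, not from the original source): with
     * a 32 GiB memdev, limit = 16 and a 2 MiB device block size, memslot_size
     * becomes ALIGN_UP(32 GiB / 16, 2 MiB) = 2 GiB, which already exceeds the
     * 1 GiB minimum, resulting in 32 GiB / 2 GiB = 16 memslots.
     */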
1515     memslot_size = QEMU_ALIGN_UP(region_size / limit, vmem->block_size);
1516     min_memslot_size = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE);
1517     memslot_size = MAX(memslot_size, min_memslot_size);
1518 
1519     memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
1520     if (memslots != 1) {
1521         vmem->memslot_size = memslot_size;
1522     }
1523     vmem->nb_memslots = memslots;
1524 }
1525 
1526 static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem)
1527 {
1528     if (!vmem->dynamic_memslots) {
1529         /* Exactly one static RAM memory region. */
1530         return 1;
1531     }
1532 
1533     /* We're only called after having been instructed to make a decision. */
1534     g_assert(vmem->nb_memslots);
1535     return vmem->nb_memslots;
1536 }
1537 
1538 static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
1539                                                 Notifier *notifier)
1540 {
1541     notifier_list_add(&vmem->size_change_notifiers, notifier);
1542 }
1543 
1544 static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
1545                                                    Notifier *notifier)
1546 {
1547     notifier_remove(notifier);
1548 }
1549 
1550 static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
1551                                 void *opaque, Error **errp)
1552 {
1553     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1554     uint64_t value = vmem->size;
1555 
1556     visit_type_size(v, name, &value, errp);
1557 }
1558 
1559 static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
1560                                           const char *name, void *opaque,
1561                                           Error **errp)
1562 {
1563     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1564     uint64_t value = vmem->requested_size;
1565 
1566     visit_type_size(v, name, &value, errp);
1567 }
1568 
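/*
 * Editor's note: at runtime this setter is typically reached via QMP
 * "qom-set" on the device's "requested-size" property, e.g. (the device id
 * "vmem0" is made up):
 *
 *   { "execute": "qom-set",
 *     "arguments": { "path": "/machine/peripheral/vmem0",
 *                    "property": "requested-size", "value": 2147483648 } }
 *
 * requesting that 2 GiB of device memory be exposed to the guest.
 */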
1569 static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
1570                                           const char *name, void *opaque,
1571                                           Error **errp)
1572 {
1573     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1574     uint64_t value;
1575 
1576     if (!visit_type_size(v, name, &value, errp)) {
1577         return;
1578     }
1579 
1580     /*
1581      * The block size and memory backend are not fixed until the device is
1582      * realized. realize() will verify these properties then.
1583      */
1584     if (DEVICE(obj)->realized) {
1585         if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
1586             error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64
1587                        ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
1588                        vmem->block_size);
1589             return;
1590         } else if (value > memory_region_size(&vmem->memdev->mr)) {
1591             error_setg(errp, "'%s' cannot exceed the memory backend size"
1592                        "(0x%" PRIx64 ")", name,
1593                        memory_region_size(&vmem->memdev->mr));
1594             return;
1595         }
1596 
1597         if (value != vmem->requested_size) {
1598             virtio_mem_resize_usable_region(vmem, value, false);
1599             vmem->requested_size = value;
1600         }
1601         /*
1602          * Trigger a config update so the guest gets notified. We trigger
1603          * even if the size didn't change (especially helpful for debugging).
1604          */
1605         virtio_notify_config(VIRTIO_DEVICE(vmem));
1606     } else {
1607         vmem->requested_size = value;
1608     }
1609 }
1610 
1611 static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
1612                                       void *opaque, Error **errp)
1613 {
1614     const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1615     uint64_t value = vmem->block_size;
1616 
1617     /*
1618      * If not configured by the user (and we're not realized yet), use the
1619      * default block size we would use with the current memory backend.
1620      */
1621     if (!value) {
1622         if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
1623             value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
1624         } else {
1625             value = virtio_mem_thp_size();
1626         }
1627     }
1628 
1629     visit_type_size(v, name, &value, errp);
1630 }
1631 
1632 static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
1633                                       void *opaque, Error **errp)
1634 {
1635     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1636     uint64_t value;
1637 
1638     if (DEVICE(obj)->realized) {
1639         error_setg(errp, "'%s' cannot be changed", name);
1640         return;
1641     }
1642 
1643     if (!visit_type_size(v, name, &value, errp)) {
1644         return;
1645     }
1646 
1647     if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
1648         error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
1649                    VIRTIO_MEM_MIN_BLOCK_SIZE);
1650         return;
1651     } else if (!is_power_of_2(value)) {
1652         error_setg(errp, "'%s' property has to be a power of two", name);
1653         return;
1654     }
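    /*
     * Editor's note: backend-dependent constraints (e.g., the block size being
     * at least the memory backend's page size) can only be verified once the
     * memdev is known, during realize().
     */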
1655     vmem->block_size = value;
1656 }
1657 
1658 static void virtio_mem_instance_init(Object *obj)
1659 {
1660     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1661 
1662     notifier_list_init(&vmem->size_change_notifiers);
1663     QLIST_INIT(&vmem->rdl_list);
1664 
1665     object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
1666                         NULL, NULL, NULL);
1667     object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
1668                         virtio_mem_get_requested_size,
1669                         virtio_mem_set_requested_size, NULL, NULL);
1670     object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
1671                         virtio_mem_get_block_size, virtio_mem_set_block_size,
1672                         NULL, NULL);
1673 }
1674 
1675 static void virtio_mem_instance_finalize(Object *obj)
1676 {
1677     VirtIOMEM *vmem = VIRTIO_MEM(obj);
1678 
1679     /*
1680      * Note: the core already dropped the references on all memory regions
1681      * (it's passed as the owner to memory_region_init_*()) and finalized
1682      * these objects. We can simply free the memory.
1683      */
1684     g_free(vmem->memslots);
1685     vmem->memslots = NULL;
1686     g_free(vmem->mr);
1687     vmem->mr = NULL;
1688 }
1689 
1690 static const Property virtio_mem_properties[] = {
1691     DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
1692     DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
1693     DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false),
1694     DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
1695                      TYPE_MEMORY_BACKEND, HostMemoryBackend *),
1696 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
1697     DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
1698                             unplugged_inaccessible, ON_OFF_AUTO_ON),
1699 #endif
1700     DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
1701                      early_migration, true),
1702     DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,
1703                      dynamic_memslots, false),
1704 };
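/*
 * Editor's note, illustrative only (ids and sizes are made up): these
 * properties are typically set on one of the virtio-mem proxy devices paired
 * with a memory backend, e.g.:
 *
 *   -object memory-backend-ram,id=mem0,size=8G
 *   -device virtio-mem-pci,id=vmem0,memdev=mem0,node=0,requested-size=0
 */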
1705 
1706 static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
1707                                                    const MemoryRegion *mr)
1708 {
1709     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1710 
1711     g_assert(mr == &vmem->memdev->mr);
1712     return vmem->block_size;
1713 }
1714 
1715 static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
1716                                         const MemoryRegionSection *s)
1717 {
1718     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1719     uint64_t start_gpa = vmem->addr + s->offset_within_region;
1720     uint64_t end_gpa = start_gpa + int128_get64(s->size);
1721 
1722     g_assert(s->mr == &vmem->memdev->mr);
1723 
1724     start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
1725     end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);
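    /*
     * Editor's note: e.g., with a 2 MiB block size, a section covering
     * [1 MiB, 3 MiB) of the region is widened to [0 MiB, 4 MiB) and only
     * reported as populated if both touched blocks are plugged.
     */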
1726 
1727     if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
1728         return false;
1729     }
1730 
1731     return virtio_mem_is_range_plugged(vmem, start_gpa, end_gpa - start_gpa);
1732 }
1733 
1734 struct VirtIOMEMReplayData {
1735     ReplayRamDiscardState fn;
1736     void *opaque;
1737 };
1738 
1739 static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
1740 {
1741     struct VirtIOMEMReplayData *data = arg;
1742 
1743     return data->fn(s, data->opaque);
1744 }
1745 
1746 static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
1747                                            MemoryRegionSection *s,
1748                                            ReplayRamDiscardState replay_fn,
1749                                            void *opaque)
1750 {
1751     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1752     struct VirtIOMEMReplayData data = {
1753         .fn = replay_fn,
1754         .opaque = opaque,
1755     };
1756 
1757     g_assert(s->mr == &vmem->memdev->mr);
1758     return virtio_mem_for_each_plugged_section(vmem, s, &data,
1759                                             virtio_mem_rdm_replay_populated_cb);
1760 }
1761 
1762 static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
1763                                               void *arg)
1764 {
1765     struct VirtIOMEMReplayData *data = arg;
1766 
1767     return data->fn(s, data->opaque);
1768 }
1769 
1770 static int virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
1771                                            MemoryRegionSection *s,
1772                                            ReplayRamDiscardState replay_fn,
1773                                            void *opaque)
1774 {
1775     const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1776     struct VirtIOMEMReplayData data = {
1777         .fn = replay_fn,
1778         .opaque = opaque,
1779     };
1780 
1781     g_assert(s->mr == &vmem->memdev->mr);
1782     return virtio_mem_for_each_unplugged_section(vmem, s, &data,
1783                                                  virtio_mem_rdm_replay_discarded_cb);
1784 }
1785 
1786 static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
1787                                              RamDiscardListener *rdl,
1788                                              MemoryRegionSection *s)
1789 {
1790     VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1791     int ret;
1792 
1793     g_assert(s->mr == &vmem->memdev->mr);
1794     rdl->section = memory_region_section_new_copy(s);
1795 
1796     QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
1797     ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1798                                               virtio_mem_notify_populate_cb);
1799     if (ret) {
1800         error_report("%s: Replaying plugged ranges failed: %s", __func__,
1801                      strerror(-ret));
1802     }
1803 }
1804 
1805 static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
1806                                                RamDiscardListener *rdl)
1807 {
1808     VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1809 
1810     g_assert(rdl->section->mr == &vmem->memdev->mr);
1811     if (vmem->size) {
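        /*
         * Editor's note: listeners that support "double discards" can safely
         * be notified about ranges that might already be discarded, so a
         * single discard notification for the whole section suffices;
         * otherwise, only the currently plugged ranges are replayed as
         * discards.
         */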
1812         if (rdl->double_discard_supported) {
1813             rdl->notify_discard(rdl, rdl->section);
1814         } else {
1815             virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1816                                                 virtio_mem_notify_discard_cb);
1817         }
1818     }
1819 
1820     memory_region_section_free_copy(rdl->section);
1821     rdl->section = NULL;
1822     QLIST_REMOVE(rdl, next);
1823 }
1824 
1825 static void virtio_mem_unplug_request_check(VirtIOMEM *vmem, Error **errp)
1826 {
1827     if (vmem->unplugged_inaccessible == ON_OFF_AUTO_OFF) {
1828         /*
1829          * We could allow it with a usable region size of 0, but let's just
1830          * not care about that legacy setting.
1831          */
1832         error_setg(errp, "virtio-mem device cannot get unplugged while"
1833                    " '" VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "' != 'on'");
1834         return;
1835     }
1836 
1837     if (vmem->size) {
1838         error_setg(errp, "virtio-mem device cannot get unplugged while some"
1839                    " of its memory is still plugged");
1840         return;
1841     }
1842     if (vmem->requested_size) {
1843         error_setg(errp, "virtio-mem device cannot get unplugged while"
1844                    " '" VIRTIO_MEM_REQUESTED_SIZE_PROP "' != '0'");
1845         return;
1846     }
1847 }
1848 
1849 static void virtio_mem_class_init(ObjectClass *klass, const void *data)
1850 {
1851     DeviceClass *dc = DEVICE_CLASS(klass);
1852     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
1853     VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
1854     RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);
1855 
1856     device_class_set_props(dc, virtio_mem_properties);
1857     dc->vmsd = &vmstate_virtio_mem;
1858 
1859     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1860     vdc->realize = virtio_mem_device_realize;
1861     vdc->unrealize = virtio_mem_device_unrealize;
1862     vdc->get_config = virtio_mem_get_config;
1863     vdc->get_features = virtio_mem_get_features;
1864     vdc->validate_features = virtio_mem_validate_features;
1865     vdc->vmsd = &vmstate_virtio_mem_device;
1866 
1867     vmc->fill_device_info = virtio_mem_fill_device_info;
1868     vmc->get_memory_region = virtio_mem_get_memory_region;
1869     vmc->decide_memslots = virtio_mem_decide_memslots;
1870     vmc->get_memslots = virtio_mem_get_memslots;
1871     vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
1872     vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
1873     vmc->unplug_request_check = virtio_mem_unplug_request_check;
1874 
1875     rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
1876     rdmc->is_populated = virtio_mem_rdm_is_populated;
1877     rdmc->replay_populated = virtio_mem_rdm_replay_populated;
1878     rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
1879     rdmc->register_listener = virtio_mem_rdm_register_listener;
1880     rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
1881 }
1882 
1883 static const TypeInfo virtio_mem_info = {
1884     .name = TYPE_VIRTIO_MEM,
1885     .parent = TYPE_VIRTIO_DEVICE,
1886     .instance_size = sizeof(VirtIOMEM),
1887     .instance_init = virtio_mem_instance_init,
1888     .instance_finalize = virtio_mem_instance_finalize,
1889     .class_init = virtio_mem_class_init,
1890     .class_size = sizeof(VirtIOMEMClass),
1891     .interfaces = (const InterfaceInfo[]) {
1892         { TYPE_RAM_DISCARD_MANAGER },
1893         { }
1894     },
1895 };
1896 
1897 static void virtio_register_types(void)
1898 {
1899     type_register_static(&virtio_mem_info);
1900 }
1901 
1902 type_init(virtio_register_types)
1903 
1904 OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(VirtioMemSystemReset, virtio_mem_system_reset, VIRTIO_MEM_SYSTEM_RESET, OBJECT, { TYPE_RESETTABLE_INTERFACE }, { })
1905 
1906 static void virtio_mem_system_reset_init(Object *obj)
1907 {
1908 }
1909 
1910 static void virtio_mem_system_reset_finalize(Object *obj)
1911 {
1912 }
1913 
1914 static ResettableState *virtio_mem_system_reset_get_state(Object *obj)
1915 {
1916     VirtioMemSystemReset *vmem_reset = VIRTIO_MEM_SYSTEM_RESET(obj);
1917 
1918     return &vmem_reset->reset_state;
1919 }
1920 
1921 static void virtio_mem_system_reset_hold(Object *obj, ResetType type)
1922 {
1923     VirtioMemSystemReset *vmem_reset = VIRTIO_MEM_SYSTEM_RESET(obj);
1924     VirtIOMEM *vmem = vmem_reset->vmem;
1925 
1926     /*
1927      * When waking up from standby/suspend-to-ram, do not unplug any memory.
1928      */
1929     if (type == RESET_TYPE_WAKEUP) {
1930         return;
1931     }
1932 
1933     /*
1934      * During usual resets, we will unplug all memory and shrink the usable
1935      * region size. This is, however, not possible in all scenarios. In those
1936      * cases, the guest has to deal with this manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
1937      */
1938     virtio_mem_unplug_all(vmem);
1939 }
1940 
1941 static void virtio_mem_system_reset_class_init(ObjectClass *klass,
1942                                                const void *data)
1943 {
1944     ResettableClass *rc = RESETTABLE_CLASS(klass);
1945 
1946     rc->get_state = virtio_mem_system_reset_get_state;
1947     rc->phases.hold = virtio_mem_system_reset_hold;
1948 }
1949