1 /*
2 * Virtio MEM device
3 *
4 * Copyright (C) 2020 Red Hat, Inc.
5 *
6 * Authors:
7 * David Hildenbrand <david@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2.
10 * See the COPYING file in the top-level directory.
11 */
12
13 #include "qemu/osdep.h"
14 #include "qemu/iov.h"
15 #include "qemu/cutils.h"
16 #include "qemu/error-report.h"
17 #include "qemu/units.h"
18 #include "system/numa.h"
19 #include "system/system.h"
20 #include "system/reset.h"
21 #include "system/runstate.h"
22 #include "hw/virtio/virtio.h"
23 #include "hw/virtio/virtio-bus.h"
24 #include "hw/virtio/virtio-mem.h"
25 #include "qapi/error.h"
26 #include "qapi/visitor.h"
27 #include "system/ram_addr.h"
28 #include "migration/misc.h"
29 #include "hw/boards.h"
30 #include "hw/qdev-properties.h"
31 #include "hw/acpi/acpi.h"
32 #include "trace.h"
33
34 static const VMStateDescription vmstate_virtio_mem_device_early;
35
36 /*
37 * We only had legacy x86 guests that did not support
38 * VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE. Other targets don't have legacy guests.
39 */
40 #if defined(TARGET_X86_64) || defined(TARGET_I386)
41 #define VIRTIO_MEM_HAS_LEGACY_GUESTS
42 #endif
43
44 /*
45 * Let's not allow blocks smaller than 1 MiB, for example, to keep the tracking
46 * bitmap small.
47 */
48 #define VIRTIO_MEM_MIN_BLOCK_SIZE ((uint32_t)(1 * MiB))
49
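/* Architecture-specific default THP size, used when probing fails. */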
50 static uint32_t virtio_mem_default_thp_size(void)
51 {
52 uint32_t default_thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
53
54 #if defined(__x86_64__) || defined(__arm__) || defined(__powerpc64__)
55 default_thp_size = 2 * MiB;
56 #elif defined(__aarch64__)
57 if (qemu_real_host_page_size() == 4 * KiB) {
58 default_thp_size = 2 * MiB;
59 } else if (qemu_real_host_page_size() == 16 * KiB) {
60 default_thp_size = 32 * MiB;
61 } else if (qemu_real_host_page_size() == 64 * KiB) {
62 default_thp_size = 512 * MiB;
63 }
64 #elif defined(__s390x__)
65 default_thp_size = 1 * MiB;
66 #endif
67
68 return default_thp_size;
69 }
70
71 /*
72 * The minimum memslot size depends on this setting ("sane default"), the
73 * device block size, and the memory backend page size. The last (or single)
74 * memslot might be smaller than this constant.
75 */
76 #define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB)
77
78 /*
79 * We want to have a reasonable default block size such that
80 * 1. We avoid splitting THPs when unplugging memory, which degrades
81 * performance.
82 * 2. We avoid placing THPs for plugged blocks that also cover unplugged
83 * blocks.
84 *
85 * The actual THP size might differ between Linux kernels, so we try to probe
86 * it. In the future (if we ever run into issues regarding 2.), we might want
87 * to disable THP in case we fail to properly probe the THP size, or if the
88 * block size is configured smaller than the THP size.
89 */
90 static uint32_t thp_size;
91
92 #define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
93 #define HPAGE_PATH "/sys/kernel/mm/transparent_hugepage/"
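/* Probe the host THP (PMD) size via sysfs once and cache the result. */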
94 static uint32_t virtio_mem_thp_size(void)
95 {
96 gchar *content = NULL;
97 const char *endptr;
98 uint64_t tmp;
99
100 if (thp_size) {
101 return thp_size;
102 }
103
104 /* No THP -> no restrictions. */
105 if (!g_file_test(HPAGE_PATH, G_FILE_TEST_EXISTS)) {
106 thp_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
107 return thp_size;
108 }
109
110 /*
111 * Try to probe the actual THP size, falling back to sane (but possibly
112 * incorrect) default sizes.
113 */
114 if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) &&
115 !qemu_strtou64(content, &endptr, 0, &tmp) &&
116 (!endptr || *endptr == '\n')) {
117 /* Sanity-check the value and fall back to something reasonable. */
118 if (!tmp || !is_power_of_2(tmp)) {
119 warn_report("Read unsupported THP size: %" PRIx64, tmp);
120 } else {
121 thp_size = tmp;
122 }
123 }
124
125 if (!thp_size) {
126 thp_size = virtio_mem_default_thp_size();
127 warn_report("Could not detect THP size, falling back to %" PRIx64
128 " MiB.", thp_size / MiB);
129 }
130
131 g_free(content);
132 return thp_size;
133 }
134
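/* Default device block size for the given memory backend. */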
135 static uint64_t virtio_mem_default_block_size(RAMBlock *rb)
136 {
137 const uint64_t page_size = qemu_ram_pagesize(rb);
138
139 /* We can have hugetlbfs with a page size smaller than the THP size. */
140 if (page_size == qemu_real_host_page_size()) {
141 return MAX(page_size, virtio_mem_thp_size());
142 }
143 return MAX(page_size, VIRTIO_MEM_MIN_BLOCK_SIZE);
144 }
145
146 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
147 static bool virtio_mem_has_shared_zeropage(RAMBlock *rb)
148 {
149 /*
150 * We only have a guaranteed shared zeropage on ordinary MAP_PRIVATE
151 * anonymous RAM. In any other case, reading unplugged *can* populate a
152 * fresh page, consuming actual memory.
153 */
154 return !qemu_ram_is_shared(rb) && qemu_ram_get_fd(rb) < 0 &&
155 qemu_ram_pagesize(rb) == qemu_real_host_page_size();
156 }
157 #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
158
159 /*
160 * Size the usable region bigger than the requested size if possible. In
161 * particular, Linux guests will only add (aligned) memory blocks if they
162 * fully fit into the usable region, but plug+online only a subset of the
163 * pages. The memory block size corresponds mostly to the section size.
164 *
165 * This allows, for example, adding 20MB with a section size of 128MB on
166 * x86_64, and a section size of 512MB on arm64 (as long as the start address
167 * is properly aligned, similar to ordinary DIMMs).
168 *
169 * We can change this at any time and maybe even make it configurable if
170 * necessary (as the section size can change). But it's more likely that the
171 * section size will rather get smaller and not bigger over time.
172 */
173 #if defined(TARGET_X86_64) || defined(TARGET_I386) || defined(TARGET_S390X)
174 #define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
175 #elif defined(TARGET_ARM)
176 #define VIRTIO_MEM_USABLE_EXTENT (2 * (512 * MiB))
177 #else
178 #error VIRTIO_MEM_USABLE_EXTENT not defined
179 #endif
180
181 static bool virtio_mem_is_busy(void)
182 {
183 /*
184 * Postcopy cannot handle concurrent discards and we don't want to migrate
185 * pages on-demand with stale content when plugging new blocks.
186 *
187 * For precopy, we don't want unplugged blocks in our migration stream, and
188 * when plugging new blocks, the page content might differ between source
189 * and destination (observable by the guest when not initializing pages
190 * after plugging them) until we're running on the destination (as we didn't
191 * migrate these blocks when they were unplugged).
192 */
193 return migration_in_incoming_postcopy() || migration_is_running();
194 }
195
196 typedef int (*virtio_mem_range_cb)(VirtIOMEM *vmem, void *arg,
197 uint64_t offset, uint64_t size);
198
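/* Invoke the callback for each contiguous range of unplugged device blocks. */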
199 static int virtio_mem_for_each_unplugged_range(VirtIOMEM *vmem, void *arg,
200 virtio_mem_range_cb cb)
201 {
202 unsigned long first_zero_bit, last_zero_bit;
203 uint64_t offset, size;
204 int ret = 0;
205
206 first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
207 while (first_zero_bit < vmem->bitmap_size) {
208 offset = first_zero_bit * vmem->block_size;
209 last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
210 first_zero_bit + 1) - 1;
211 size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
212
213 ret = cb(vmem, arg, offset, size);
214 if (ret) {
215 break;
216 }
217 first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
218 last_zero_bit + 2);
219 }
220 return ret;
221 }
222
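/* Invoke the callback for each contiguous range of plugged device blocks. */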
223 static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg,
224 virtio_mem_range_cb cb)
225 {
226 unsigned long first_bit, last_bit;
227 uint64_t offset, size;
228 int ret = 0;
229
230 first_bit = find_first_bit(vmem->bitmap, vmem->bitmap_size);
231 while (first_bit < vmem->bitmap_size) {
232 offset = first_bit * vmem->block_size;
233 last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
234 first_bit + 1) - 1;
235 size = (last_bit - first_bit + 1) * vmem->block_size;
236
237 ret = cb(vmem, arg, offset, size);
238 if (ret) {
239 break;
240 }
241 first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
242 last_bit + 2);
243 }
244 return ret;
245 }
246
247 /*
248 * Adjust the memory section to cover the intersection with the given range.
249 *
250 * Returns false if the intersection is empty, otherwise returns true.
251 */
252 static bool virtio_mem_intersect_memory_section(MemoryRegionSection *s,
253 uint64_t offset, uint64_t size)
254 {
255 uint64_t start = MAX(s->offset_within_region, offset);
256 uint64_t end = MIN(s->offset_within_region + int128_get64(s->size),
257 offset + size);
258
259 if (end <= start) {
260 return false;
261 }
262
263 s->offset_within_address_space += start - s->offset_within_region;
264 s->offset_within_region = start;
265 s->size = int128_make64(end - start);
266 return true;
267 }
268
269 typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg);
270
271 static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
272 MemoryRegionSection *s,
273 void *arg,
274 virtio_mem_section_cb cb)
275 {
276 unsigned long first_bit, last_bit;
277 uint64_t offset, size;
278 int ret = 0;
279
280 first_bit = s->offset_within_region / vmem->block_size;
281 first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
282 while (first_bit < vmem->bitmap_size) {
283 MemoryRegionSection tmp = *s;
284
285 offset = first_bit * vmem->block_size;
286 last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
287 first_bit + 1) - 1;
288 size = (last_bit - first_bit + 1) * vmem->block_size;
289
290 if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
291 break;
292 }
293 ret = cb(&tmp, arg);
294 if (ret) {
295 break;
296 }
297 first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
298 last_bit + 2);
299 }
300 return ret;
301 }
302
303 static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
304 MemoryRegionSection *s,
305 void *arg,
306 virtio_mem_section_cb cb)
307 {
308 unsigned long first_bit, last_bit;
309 uint64_t offset, size;
310 int ret = 0;
311
312 first_bit = s->offset_within_region / vmem->block_size;
313 first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
314 while (first_bit < vmem->bitmap_size) {
315 MemoryRegionSection tmp = *s;
316
317 offset = first_bit * vmem->block_size;
318 last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
319 first_bit + 1) - 1;
320 size = (last_bit - first_bit + 1) * vmem->block_size;
321
322 if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
323 break;
324 }
325 ret = cb(&tmp, arg);
326 if (ret) {
327 break;
328 }
329 first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
330 last_bit + 2);
331 }
332 return ret;
333 }
334
335 static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
336 {
337 RamDiscardListener *rdl = arg;
338
339 return rdl->notify_populate(rdl, s);
340 }
341
342 static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg)
343 {
344 RamDiscardListener *rdl = arg;
345
346 rdl->notify_discard(rdl, s);
347 return 0;
348 }
349
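/* Notify all RamDiscardListeners intersecting the range about the discard. */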
350 static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset,
351 uint64_t size)
352 {
353 RamDiscardListener *rdl;
354
355 QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
356 MemoryRegionSection tmp = *rdl->section;
357
358 if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
359 continue;
360 }
361 rdl->notify_discard(rdl, &tmp);
362 }
363 }
364
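/*
 * Notify all RamDiscardListeners intersecting the range about newly plugged
 * blocks; on error, roll back listeners that were already notified.
 */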
365 static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset,
366 uint64_t size)
367 {
368 RamDiscardListener *rdl, *rdl2;
369 int ret = 0;
370
371 QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
372 MemoryRegionSection tmp = *rdl->section;
373
374 if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
375 continue;
376 }
377 ret = rdl->notify_populate(rdl, &tmp);
378 if (ret) {
379 break;
380 }
381 }
382
383 if (ret) {
384 /* Roll back: notify discard for all listeners already notified above. */
385 QLIST_FOREACH(rdl2, &vmem->rdl_list, next) {
386 MemoryRegionSection tmp = *rdl2->section;
387
388 if (rdl2 == rdl) {
389 break;
390 }
391 if (!virtio_mem_intersect_memory_section(&tmp, offset, size)) {
392 continue;
393 }
394 rdl2->notify_discard(rdl2, &tmp);
395 }
396 }
397 return ret;
398 }
399
400 static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem)
401 {
402 RamDiscardListener *rdl;
403
404 if (!vmem->size) {
405 return;
406 }
407
408 QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
409 if (rdl->double_discard_supported) {
410 rdl->notify_discard(rdl, rdl->section);
411 } else {
412 virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
413 virtio_mem_notify_discard_cb);
414 }
415 }
416 }
417
418 static bool virtio_mem_is_range_plugged(const VirtIOMEM *vmem,
419 uint64_t start_gpa, uint64_t size)
420 {
421 const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
422 const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
423 unsigned long found_bit;
424
425 /* We fake a shorter bitmap to avoid searching too far. */
426 found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
427 return found_bit > last_bit;
428 }
429
430 static bool virtio_mem_is_range_unplugged(const VirtIOMEM *vmem,
431 uint64_t start_gpa, uint64_t size)
432 {
433 const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
434 const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
435 unsigned long found_bit;
436
437 /* We fake a shorter bitmap to avoid searching too far. */
438 found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
439 return found_bit > last_bit;
440 }
441
442 static void virtio_mem_set_range_plugged(VirtIOMEM *vmem, uint64_t start_gpa,
443 uint64_t size)
444 {
445 const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
446 const unsigned long nbits = size / vmem->block_size;
447
448 bitmap_set(vmem->bitmap, bit, nbits);
449 }
450
451 static void virtio_mem_set_range_unplugged(VirtIOMEM *vmem, uint64_t start_gpa,
452 uint64_t size)
453 {
454 const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
455 const unsigned long nbits = size / vmem->block_size;
456
457 bitmap_clear(vmem->bitmap, bit, nbits);
458 }
459
460 static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
461 struct virtio_mem_resp *resp)
462 {
463 VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
464 VirtQueue *vq = vmem->vq;
465
466 trace_virtio_mem_send_response(le16_to_cpu(resp->type));
467 iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));
468
469 virtqueue_push(vq, elem, sizeof(*resp));
470 virtio_notify(vdev, vq);
471 }
472
473 static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
474 VirtQueueElement *elem,
475 uint16_t type)
476 {
477 struct virtio_mem_resp resp = {
478 .type = cpu_to_le16(type),
479 };
480
481 virtio_mem_send_response(vmem, elem, &resp);
482 }
483
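/*
 * Check that the GPA is block-aligned and that the range lies completely
 * within the usable region.
 */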
484 static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
485 uint64_t size)
486 {
487 if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
488 return false;
489 }
490 if (gpa + size < gpa || !size) {
491 return false;
492 }
493 if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
494 return false;
495 }
496 if (gpa + size > vmem->addr + vmem->usable_region_size) {
497 return false;
498 }
499 return true;
500 }
501
502 static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx)
503 {
504 const uint64_t memslot_offset = idx * vmem->memslot_size;
505
506 assert(vmem->memslots);
507
508 /*
509 * Instead of enabling/disabling memslots, we add/remove them. This should
510 * make address space updates faster, because we don't have to loop over
511 * many disabled subregions.
512 */
513 if (memory_region_is_mapped(&vmem->memslots[idx])) {
514 return;
515 }
516 memory_region_add_subregion(vmem->mr, memslot_offset, &vmem->memslots[idx]);
517 }
518
519 static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx)
520 {
521 assert(vmem->memslots);
522
523 if (!memory_region_is_mapped(&vmem->memslots[idx])) {
524 return;
525 }
526 memory_region_del_subregion(vmem->mr, &vmem->memslots[idx]);
527 }
528
529 static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem,
530 uint64_t offset, uint64_t size)
531 {
532 const unsigned int start_idx = offset / vmem->memslot_size;
533 const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
534 vmem->memslot_size;
535 unsigned int idx;
536
537 assert(vmem->dynamic_memslots);
538
539 /* Activate all involved memslots in a single transaction. */
540 memory_region_transaction_begin();
541 for (idx = start_idx; idx < end_idx; idx++) {
542 virtio_mem_activate_memslot(vmem, idx);
543 }
544 memory_region_transaction_commit();
545 }
546
547 static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem,
548 uint64_t offset,
549 uint64_t size)
550 {
551 const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
552 const unsigned int start_idx = offset / vmem->memslot_size;
553 const unsigned int end_idx = (offset + size + vmem->memslot_size - 1) /
554 vmem->memslot_size;
555 unsigned int idx;
556
557 assert(vmem->dynamic_memslots);
558
559 /* Deactivate all memslots with unplugged blocks in a single transaction. */
560 memory_region_transaction_begin();
561 for (idx = start_idx; idx < end_idx; idx++) {
562 const uint64_t memslot_offset = idx * vmem->memslot_size;
563 uint64_t memslot_size = vmem->memslot_size;
564
565 /* The size of the last memslot might be smaller. */
566 if (idx == vmem->nb_memslots - 1) {
567 memslot_size = region_size - memslot_offset;
568 }
569
570 /*
571 * Partially covered memslots might still have some blocks plugged and
572 * have to remain active if that's the case.
573 */
574 if (offset > memslot_offset ||
575 offset + size < memslot_offset + memslot_size) {
576 const uint64_t gpa = vmem->addr + memslot_offset;
577
578 if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) {
579 continue;
580 }
581 }
582
583 virtio_mem_deactivate_memslot(vmem, idx);
584 }
585 memory_region_transaction_commit();
586 }
587
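/*
 * Plug (preallocating if requested) or unplug (discard) the given range and
 * notify RamDiscardListeners accordingly.
 */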
588 static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
589 uint64_t size, bool plug)
590 {
591 const uint64_t offset = start_gpa - vmem->addr;
592 RAMBlock *rb = vmem->memdev->mr.ram_block;
593 int ret = 0;
594
595 if (virtio_mem_is_busy()) {
596 return -EBUSY;
597 }
598
599 if (!plug) {
600 if (ram_block_discard_range(rb, offset, size)) {
601 return -EBUSY;
602 }
603 virtio_mem_notify_unplug(vmem, offset, size);
604 virtio_mem_set_range_unplugged(vmem, start_gpa, size);
605 /* Deactivate completely unplugged memslots after updating the state. */
606 if (vmem->dynamic_memslots) {
607 virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
608 }
609 return 0;
610 }
611
612 if (vmem->prealloc) {
613 void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
614 int fd = memory_region_get_fd(&vmem->memdev->mr);
615 Error *local_err = NULL;
616
617 if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
618 static bool warned;
619
620 /*
621 * Warn only once, we don't want to fill the log with these
622 * warnings.
623 */
624 if (!warned) {
625 warn_report_err(local_err);
626 warned = true;
627 } else {
628 error_free(local_err);
629 }
630 ret = -EBUSY;
631 }
632 }
633
634 if (!ret) {
635 /*
636 * Activate before notifying and rollback in case of any errors.
637 *
638 * When activating a yet inactive memslot, memory notifiers will get
639 * notified about the added memory region and can register with the
640 * RamDiscardManager; this will traverse all plugged blocks and skip the
641 * blocks we are plugging here. The following notification will inform
642 * registered listeners about the blocks we're plugging.
643 */
644 if (vmem->dynamic_memslots) {
645 virtio_mem_activate_memslots_to_plug(vmem, offset, size);
646 }
647 ret = virtio_mem_notify_plug(vmem, offset, size);
648 if (ret && vmem->dynamic_memslots) {
649 virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
650 }
651 }
652 if (ret) {
653 /* Preallocation or a notifier might have populated memory; discard it. */
654 ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
655 return -EBUSY;
656 }
657
658 virtio_mem_set_range_plugged(vmem, start_gpa, size);
659 return 0;
660 }
661
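/* Validate and apply a guest (un)plug request; returns a VIRTIO_MEM_RESP_* code. */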
662 static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
663 uint16_t nb_blocks, bool plug)
664 {
665 const uint64_t size = nb_blocks * vmem->block_size;
666 int ret;
667
668 if (!virtio_mem_valid_range(vmem, gpa, size)) {
669 return VIRTIO_MEM_RESP_ERROR;
670 }
671
672 if (plug && (vmem->size + size > vmem->requested_size)) {
673 return VIRTIO_MEM_RESP_NACK;
674 }
675
676 /* Check that all blocks really are in the opposite state. */
677 if ((plug && !virtio_mem_is_range_unplugged(vmem, gpa, size)) ||
678 (!plug && !virtio_mem_is_range_plugged(vmem, gpa, size))) {
679 return VIRTIO_MEM_RESP_ERROR;
680 }
681
682 ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
683 if (ret) {
684 return VIRTIO_MEM_RESP_BUSY;
685 }
686 if (plug) {
687 vmem->size += size;
688 } else {
689 vmem->size -= size;
690 }
691 notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
692 return VIRTIO_MEM_RESP_ACK;
693 }
694
695 static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
696 struct virtio_mem_req *req)
697 {
698 const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
699 const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
700 uint16_t type;
701
702 trace_virtio_mem_plug_request(gpa, nb_blocks);
703 type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
704 virtio_mem_send_response_simple(vmem, elem, type);
705 }
706
707 static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
708 struct virtio_mem_req *req)
709 {
710 const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
711 const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
712 uint16_t type;
713
714 trace_virtio_mem_unplug_request(gpa, nb_blocks);
715 type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
716 virtio_mem_send_response_simple(vmem, elem, type);
717 }
718
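/*
 * Grow the usable region to cover the requested size plus some extra extent;
 * only shrink it when the caller explicitly allows shrinking.
 */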
719 static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
720 uint64_t requested_size,
721 bool can_shrink)
722 {
723 uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
724 requested_size + VIRTIO_MEM_USABLE_EXTENT);
725
726 /* The usable region size always has to be a multiple of the block size. */
727 newsize = QEMU_ALIGN_UP(newsize, vmem->block_size);
728
729 if (!requested_size) {
730 newsize = 0;
731 }
732
733 if (newsize < vmem->usable_region_size && !can_shrink) {
734 return;
735 }
736
737 trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
738 vmem->usable_region_size = newsize;
739 }
740
741 static int virtio_mem_unplug_all(VirtIOMEM *vmem)
742 {
743 const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
744 RAMBlock *rb = vmem->memdev->mr.ram_block;
745
746 if (vmem->size) {
747 if (virtio_mem_is_busy()) {
748 return -EBUSY;
749 }
750 if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
751 return -EBUSY;
752 }
753 virtio_mem_notify_unplug_all(vmem);
754
755 bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
756 vmem->size = 0;
757 notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
758
759 /* Deactivate all memslots after updating the state. */
760 if (vmem->dynamic_memslots) {
761 virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size);
762 }
763 }
764
765 trace_virtio_mem_unplugged_all();
766 virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
767 return 0;
768 }
769
770 static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
771 VirtQueueElement *elem)
772 {
773 trace_virtio_mem_unplug_all_request();
774 if (virtio_mem_unplug_all(vmem)) {
775 virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
776 } else {
777 virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
778 }
779 }
780
781 static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
782 struct virtio_mem_req *req)
783 {
784 const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
785 const uint64_t gpa = le64_to_cpu(req->u.state.addr);
786 const uint64_t size = nb_blocks * vmem->block_size;
787 struct virtio_mem_resp resp = {
788 .type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
789 };
790
791 trace_virtio_mem_state_request(gpa, nb_blocks);
792 if (!virtio_mem_valid_range(vmem, gpa, size)) {
793 virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
794 return;
795 }
796
797 if (virtio_mem_is_range_plugged(vmem, gpa, size)) {
798 resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
799 } else if (virtio_mem_is_range_unplugged(vmem, gpa, size)) {
800 resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
801 } else {
802 resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
803 }
804 trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
805 virtio_mem_send_response(vmem, elem, &resp);
806 }
807
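/* Virtqueue handler: process guest requests and send the responses. */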
808 static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
809 {
810 const int len = sizeof(struct virtio_mem_req);
811 VirtIOMEM *vmem = VIRTIO_MEM(vdev);
812 VirtQueueElement *elem;
813 struct virtio_mem_req req;
814 uint16_t type;
815
816 while (true) {
817 elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
818 if (!elem) {
819 return;
820 }
821
822 if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
823 virtio_error(vdev, "virtio-mem protocol violation: invalid request"
824 " size: %d", len);
825 virtqueue_detach_element(vq, elem, 0);
826 g_free(elem);
827 return;
828 }
829
830 if (iov_size(elem->in_sg, elem->in_num) <
831 sizeof(struct virtio_mem_resp)) {
832 virtio_error(vdev, "virtio-mem protocol violation: not enough space"
833 " for response: %zu",
834 iov_size(elem->in_sg, elem->in_num));
835 virtqueue_detach_element(vq, elem, 0);
836 g_free(elem);
837 return;
838 }
839
840 type = le16_to_cpu(req.type);
841 switch (type) {
842 case VIRTIO_MEM_REQ_PLUG:
843 virtio_mem_plug_request(vmem, elem, &req);
844 break;
845 case VIRTIO_MEM_REQ_UNPLUG:
846 virtio_mem_unplug_request(vmem, elem, &req);
847 break;
848 case VIRTIO_MEM_REQ_UNPLUG_ALL:
849 virtio_mem_unplug_all_request(vmem, elem);
850 break;
851 case VIRTIO_MEM_REQ_STATE:
852 virtio_mem_state_request(vmem, elem, &req);
853 break;
854 default:
855 virtio_error(vdev, "virtio-mem protocol violation: unknown request"
856 " type: %d", type);
857 virtqueue_detach_element(vq, elem, 0);
858 g_free(elem);
859 return;
860 }
861
862 g_free(elem);
863 }
864 }
865
866 static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
867 {
868 VirtIOMEM *vmem = VIRTIO_MEM(vdev);
869 struct virtio_mem_config *config = (void *) config_data;
870
871 config->block_size = cpu_to_le64(vmem->block_size);
872 config->node_id = cpu_to_le16(vmem->node);
873 config->requested_size = cpu_to_le64(vmem->requested_size);
874 config->plugged_size = cpu_to_le64(vmem->size);
875 config->addr = cpu_to_le64(vmem->addr);
876 config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
877 config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
878 }
879
880 static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
881 Error **errp)
882 {
883 MachineState *ms = MACHINE(qdev_get_machine());
884 VirtIOMEM *vmem = VIRTIO_MEM(vdev);
885
886 if (ms->numa_state && acpi_builtin()) {
887 virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
888 }
889 assert(vmem->unplugged_inaccessible != ON_OFF_AUTO_AUTO);
890 if (vmem->unplugged_inaccessible == ON_OFF_AUTO_ON) {
891 virtio_add_feature(&features, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE);
892 }
893 if (qemu_wakeup_suspend_enabled()) {
894 virtio_add_feature(&features, VIRTIO_MEM_F_PERSISTENT_SUSPEND);
895 }
896 return features;
897 }
898
899 static int virtio_mem_validate_features(VirtIODevice *vdev)
900 {
901 if (virtio_host_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE) &&
902 !virtio_vdev_has_feature(vdev, VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE)) {
903 return -EFAULT;
904 }
905 return 0;
906 }
907
908 static void virtio_mem_prepare_mr(VirtIOMEM *vmem)
909 {
910 const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
911
912 assert(!vmem->mr && vmem->dynamic_memslots);
913 vmem->mr = g_new0(MemoryRegion, 1);
914 memory_region_init(vmem->mr, OBJECT(vmem), "virtio-mem",
915 region_size);
916 vmem->mr->align = memory_region_get_alignment(&vmem->memdev->mr);
917 }
918
919 static void virtio_mem_prepare_memslots(VirtIOMEM *vmem)
920 {
921 const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
922 unsigned int idx;
923
924 g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots);
925 vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots);
926
927 /* Initialize our memslots, but don't map them yet. */
928 for (idx = 0; idx < vmem->nb_memslots; idx++) {
929 const uint64_t memslot_offset = idx * vmem->memslot_size;
930 uint64_t memslot_size = vmem->memslot_size;
931 char name[20];
932
933 /* The size of the last memslot might be smaller. */
934 if (idx == vmem->nb_memslots - 1) {
935 memslot_size = region_size - memslot_offset;
936 }
937
938 snprintf(name, sizeof(name), "memslot-%u", idx);
939 memory_region_init_alias(&vmem->memslots[idx], OBJECT(vmem), name,
940 &vmem->memdev->mr, memslot_offset,
941 memslot_size);
942 /*
943 * We want to be able to atomically and efficiently activate/deactivate
944 * individual memslots without affecting adjacent memslots in memory
945 * notifiers.
946 */
947 memory_region_set_unmergeable(&vmem->memslots[idx], true);
948 }
949 }
950
951 static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
952 {
953 MachineState *ms = MACHINE(qdev_get_machine());
954 int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
955 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
956 VirtIOMEM *vmem = VIRTIO_MEM(dev);
957 uint64_t page_size;
958 RAMBlock *rb;
959 Object *obj;
960 int ret;
961
962 if (!vmem->memdev) {
963 error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
964 return;
965 } else if (host_memory_backend_is_mapped(vmem->memdev)) {
966 error_setg(errp, "'%s' property specifies a busy memdev: %s",
967 VIRTIO_MEM_MEMDEV_PROP,
968 object_get_canonical_path_component(OBJECT(vmem->memdev)));
969 return;
970 } else if (!memory_region_is_ram(&vmem->memdev->mr) ||
971 memory_region_is_rom(&vmem->memdev->mr) ||
972 !vmem->memdev->mr.ram_block) {
973 error_setg(errp, "'%s' property specifies an unsupported memdev",
974 VIRTIO_MEM_MEMDEV_PROP);
975 return;
976 } else if (vmem->memdev->prealloc) {
977 error_setg(errp, "'%s' property specifies a memdev with preallocation"
978 " enabled: %s. Instead, specify 'prealloc=on' for the"
979 " virtio-mem device. ", VIRTIO_MEM_MEMDEV_PROP,
980 object_get_canonical_path_component(OBJECT(vmem->memdev)));
981 return;
982 }
983
984 if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
985 (!nb_numa_nodes && vmem->node)) {
986 error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
987 "the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
988 vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
989 return;
990 }
991
992 if (should_mlock(mlock_state)) {
993 error_setg(errp, "Incompatible with mlock");
994 return;
995 }
996
997 rb = vmem->memdev->mr.ram_block;
998 page_size = qemu_ram_pagesize(rb);
999
1000 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
1001 switch (vmem->unplugged_inaccessible) {
1002 case ON_OFF_AUTO_AUTO:
1003 if (virtio_mem_has_shared_zeropage(rb)) {
1004 vmem->unplugged_inaccessible = ON_OFF_AUTO_OFF;
1005 } else {
1006 vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
1007 }
1008 break;
1009 case ON_OFF_AUTO_OFF:
1010 if (!virtio_mem_has_shared_zeropage(rb)) {
1011 warn_report("'%s' property set to 'off' with a memdev that does"
1012 " not support the shared zeropage.",
1013 VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
1014 }
1015 break;
1016 default:
1017 break;
1018 }
1019 #else /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
1020 vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
1021 #endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
1022
1023 if (vmem->dynamic_memslots &&
1024 vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) {
1025 error_setg(errp, "'%s' property set to 'on' requires '%s' to be 'on'",
1026 VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP,
1027 VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
1028 return;
1029 }
1030
1031 /*
1032 * If the block size wasn't configured by the user, use a sane default. This
1033 * allows using hugetlbfs backends of any page size without manual
1034 * intervention.
1035 */
1036 if (!vmem->block_size) {
1037 vmem->block_size = virtio_mem_default_block_size(rb);
1038 }
1039
1040 if (vmem->block_size < page_size) {
1041 error_setg(errp, "'%s' property has to be at least the page size (0x%"
1042 PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
1043 return;
1044 } else if (vmem->block_size < virtio_mem_default_block_size(rb)) {
1045 warn_report("'%s' property is smaller than the default block size (%"
1046 PRIx64 " MiB)", VIRTIO_MEM_BLOCK_SIZE_PROP,
1047 virtio_mem_default_block_size(rb) / MiB);
1048 }
1049 if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
1050 error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
1051 ")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
1052 VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
1053 return;
1054 } else if (!QEMU_IS_ALIGNED(vmem->addr, vmem->block_size)) {
1055 error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
1056 ")", VIRTIO_MEM_ADDR_PROP, VIRTIO_MEM_BLOCK_SIZE_PROP,
1057 vmem->block_size);
1058 return;
1059 } else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
1060 vmem->block_size)) {
1061 error_setg(errp, "'%s' property memdev size has to be multiples of"
1062 "'%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
1063 VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
1064 return;
1065 }
1066
1067 if (ram_block_coordinated_discard_require(true)) {
1068 error_setg(errp, "Discarding RAM is disabled");
1069 return;
1070 }
1071
1072 /*
1073 * We don't know at this point whether shared RAM is migrated using
1074 * QEMU or migrated using the file content. "x-ignore-shared" will be
1075 * configured after realizing the device. So in case we have an
1076 * incoming migration, simply always skip the discard step.
1077 *
1078 * Otherwise, make sure that we start with a clean slate: either the
1079 * memory backend might get reused or the shared file might still have
1080 * memory allocated.
1081 */
1082 if (!runstate_check(RUN_STATE_INMIGRATE)) {
1083 ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
1084 if (ret) {
1085 error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
1086 ram_block_coordinated_discard_require(false);
1087 return;
1088 }
1089 }
1090
1091 virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
1092
1093 vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
1094 vmem->block_size;
1095 vmem->bitmap = bitmap_new(vmem->bitmap_size);
1096
1097 virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
1098 vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
1099
1100 /*
1101 * With "dynamic-memslots=off" (old behavior) we always map the whole
1102 * RAM memory region directly.
1103 */
1104 if (vmem->dynamic_memslots) {
1105 if (!vmem->mr) {
1106 virtio_mem_prepare_mr(vmem);
1107 }
1108 if (vmem->nb_memslots <= 1) {
1109 vmem->nb_memslots = 1;
1110 vmem->memslot_size = memory_region_size(&vmem->memdev->mr);
1111 }
1112 if (!vmem->memslots) {
1113 virtio_mem_prepare_memslots(vmem);
1114 }
1115 } else {
1116 assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots);
1117 }
1118
1119 host_memory_backend_set_mapped(vmem->memdev, true);
1120 vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
1121 if (vmem->early_migration) {
1122 vmstate_register_any(VMSTATE_IF(vmem),
1123 &vmstate_virtio_mem_device_early, vmem);
1124 }
1125
1126 /*
1127 * We only want to unplug all memory to start with a clean slate when
1128 * it is safe for the guest -- during system resets that call
1129 * qemu_devices_reset().
1130 *
1131 * We'll filter out selected qemu_devices_reset() calls used for other
1132 * purposes, like resetting all devices during wakeup from suspend on
1133 * x86 based on the reset type passed to qemu_devices_reset().
1134 *
1135 * Unplugging all memory during simple device resets can result in the VM
1136 * unexpectedly losing RAM, corrupting VM state.
1137 *
1138 * Simple device resets (or resets triggered by getting a parent device
1139 * reset) must not change the state of plugged memory blocks. Therefore,
1140 * we need a dedicated reset object that only gets called during
1141 * qemu_devices_reset().
1142 */
1143 obj = object_new(TYPE_VIRTIO_MEM_SYSTEM_RESET);
1144 vmem->system_reset = VIRTIO_MEM_SYSTEM_RESET(obj);
1145 vmem->system_reset->vmem = vmem;
1146 qemu_register_resettable(obj);
1147
1148 /*
1149 * Set ourselves as RamDiscardManager before the plug handler maps the
1150 * memory region and exposes it via an address space.
1151 */
1152 memory_region_set_ram_discard_manager(&vmem->memdev->mr,
1153 RAM_DISCARD_MANAGER(vmem));
1154 }
1155
1156 static void virtio_mem_device_unrealize(DeviceState *dev)
1157 {
1158 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
1159 VirtIOMEM *vmem = VIRTIO_MEM(dev);
1160
1161 /*
1162 * The unplug handler unmapped the memory region, it cannot be
1163 * found via an address space anymore. Unset ourselves.
1164 */
1165 memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
1166
1167 qemu_unregister_resettable(OBJECT(vmem->system_reset));
1168 object_unref(OBJECT(vmem->system_reset));
1169
1170 if (vmem->early_migration) {
1171 vmstate_unregister(VMSTATE_IF(vmem), &vmstate_virtio_mem_device_early,
1172 vmem);
1173 }
1174 vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
1175 host_memory_backend_set_mapped(vmem->memdev, false);
1176 virtio_del_queue(vdev, 0);
1177 virtio_cleanup(vdev);
1178 g_free(vmem->bitmap);
1179 ram_block_coordinated_discard_require(false);
1180 }
1181
1182 static int virtio_mem_discard_range_cb(VirtIOMEM *vmem, void *arg,
1183 uint64_t offset, uint64_t size)
1184 {
1185 RAMBlock *rb = vmem->memdev->mr.ram_block;
1186
1187 return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0;
1188 }
1189
1190 static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
1191 {
1192 /* Make sure all memory is really discarded after migration. */
1193 return virtio_mem_for_each_unplugged_range(vmem, NULL,
1194 virtio_mem_discard_range_cb);
1195 }
1196
1197 static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg,
1198 uint64_t offset, uint64_t size)
1199 {
1200 virtio_mem_activate_memslots_to_plug(vmem, offset, size);
1201 return 0;
1202 }
1203
1204 static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem)
1205 {
1206 RamDiscardListener *rdl;
1207 int ret;
1208
1209 /*
1210 * We restored the bitmap and updated the requested size; activate all
1211 * memslots (so listeners register) before notifying about plugged blocks.
1212 */
1213 if (vmem->dynamic_memslots) {
1214 /*
1215 * We don't expect any active memslots at this point to deactivate: no
1216 * memory was plugged on the migration destination.
1217 */
1218 virtio_mem_for_each_plugged_range(vmem, NULL,
1219 virtio_mem_activate_memslot_range_cb);
1220 }
1221
1222 /*
1223 * We started out with all memory discarded and our memory region is mapped
1224 * into an address space. Replay, now that we updated the bitmap.
1225 */
1226 QLIST_FOREACH(rdl, &vmem->rdl_list, next) {
1227 ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1228 virtio_mem_notify_populate_cb);
1229 if (ret) {
1230 return ret;
1231 }
1232 }
1233 return 0;
1234 }
1235
1236 static int virtio_mem_post_load(void *opaque, int version_id)
1237 {
1238 VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1239 int ret;
1240
1241 if (!vmem->early_migration) {
1242 ret = virtio_mem_post_load_bitmap(vmem);
1243 if (ret) {
1244 return ret;
1245 }
1246 }
1247
1248 /*
1249 * If shared RAM is migrated using the file content and not using QEMU,
1250 * don't mess with preallocation and postcopy.
1251 */
1252 if (migrate_ram_is_ignored(vmem->memdev->mr.ram_block)) {
1253 return 0;
1254 }
1255
1256 if (vmem->prealloc && !vmem->early_migration) {
1257 warn_report("Proper preallocation with migration requires a newer QEMU machine");
1258 }
1259
1260 if (migration_in_incoming_postcopy()) {
1261 return 0;
1262 }
1263
1264 return virtio_mem_restore_unplugged(vmem);
1265 }
1266
1267 static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
1268 uint64_t offset, uint64_t size)
1269 {
1270 void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
1271 int fd = memory_region_get_fd(&vmem->memdev->mr);
1272 Error *local_err = NULL;
1273
1274 if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
1275 error_report_err(local_err);
1276 return -ENOMEM;
1277 }
1278 return 0;
1279 }
1280
1281 static int virtio_mem_post_load_early(void *opaque, int version_id)
1282 {
1283 VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1284 RAMBlock *rb = vmem->memdev->mr.ram_block;
1285 int ret;
1286
1287 if (!vmem->prealloc) {
1288 goto post_load_bitmap;
1289 }
1290
1291 /*
1292 * If shared RAM is migrated using the file content and not using QEMU,
1293 * don't mess with preallocation and postcopy.
1294 */
1295 if (migrate_ram_is_ignored(rb)) {
1296 goto post_load_bitmap;
1297 }
1298
1299 /*
1300 * We restored the bitmap and verified that the basic properties
1301 * match on source and destination, so we can go ahead and preallocate
1302 * memory for all plugged memory blocks, before actual RAM migration starts
1303 * touching this memory.
1304 */
1305 ret = virtio_mem_for_each_plugged_range(vmem, NULL,
1306 virtio_mem_prealloc_range_cb);
1307 if (ret) {
1308 return ret;
1309 }
1310
1311 /*
1312 * This is tricky: postcopy wants to start with a clean slate. On
1313 * POSTCOPY_INCOMING_ADVISE, postcopy code discards all (ordinarily
1314 * preallocated) RAM such that postcopy will work as expected later.
1315 *
1316 * However, we run after POSTCOPY_INCOMING_ADVISE -- but before actual
1317 * RAM migration. So let's discard all memory again. This looks like an
1318 * expensive NOP, but actually serves a purpose: we made sure that we
1319 * were able to allocate all required backend memory once. We cannot
1320 * guarantee that the backend memory we will free will remain free
1321 * until we need it during postcopy, but at least we can catch the
1322 * obvious setup issues this way.
1323 */
1324 if (migration_incoming_postcopy_advised()) {
1325 if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) {
1326 return -EBUSY;
1327 }
1328 }
1329
1330 post_load_bitmap:
1331 /* Finally, update any other state to be consistent with the new bitmap. */
1332 return virtio_mem_post_load_bitmap(vmem);
1333 }
1334
1335 typedef struct VirtIOMEMMigSanityChecks {
1336 VirtIOMEM *parent;
1337 uint64_t addr;
1338 uint64_t region_size;
1339 uint64_t block_size;
1340 uint32_t node;
1341 } VirtIOMEMMigSanityChecks;
1342
1343 static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
1344 {
1345 VirtIOMEMMigSanityChecks *tmp = opaque;
1346 VirtIOMEM *vmem = tmp->parent;
1347
1348 tmp->addr = vmem->addr;
1349 tmp->region_size = memory_region_size(&vmem->memdev->mr);
1350 tmp->block_size = vmem->block_size;
1351 tmp->node = vmem->node;
1352 return 0;
1353 }
1354
1355 static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
1356 {
1357 VirtIOMEMMigSanityChecks *tmp = opaque;
1358 VirtIOMEM *vmem = tmp->parent;
1359 const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);
1360
1361 if (tmp->addr != vmem->addr) {
1362 error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1363 VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
1364 return -EINVAL;
1365 }
1366 /*
1367 * Note: Preparation for resizable memory regions. The maximum size
1368 * of the memory region must not change during migration.
1369 */
1370 if (tmp->region_size != new_region_size) {
1371 error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
1372 PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
1373 new_region_size);
1374 return -EINVAL;
1375 }
1376 if (tmp->block_size != vmem->block_size) {
1377 error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
1378 VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
1379 vmem->block_size);
1380 return -EINVAL;
1381 }
1382 if (tmp->node != vmem->node) {
1383 error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
1384 VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
1385 return -EINVAL;
1386 }
1387 return 0;
1388 }
1389
1390 static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
1391 .name = "virtio-mem-device/sanity-checks",
1392 .pre_save = virtio_mem_mig_sanity_checks_pre_save,
1393 .post_load = virtio_mem_mig_sanity_checks_post_load,
1394 .fields = (const VMStateField[]) {
1395 VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
1396 VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
1397 VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
1398 VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
1399 VMSTATE_END_OF_LIST(),
1400 },
1401 };
1402
1403 static bool virtio_mem_vmstate_field_exists(void *opaque, int version_id)
1404 {
1405 const VirtIOMEM *vmem = VIRTIO_MEM(opaque);
1406
1407 /* With early migration, these fields were already migrated. */
1408 return !vmem->early_migration;
1409 }
1410
1411 static const VMStateDescription vmstate_virtio_mem_device = {
1412 .name = "virtio-mem-device",
1413 .minimum_version_id = 1,
1414 .version_id = 1,
1415 .priority = MIG_PRI_VIRTIO_MEM,
1416 .post_load = virtio_mem_post_load,
1417 .fields = (const VMStateField[]) {
1418 VMSTATE_WITH_TMP_TEST(VirtIOMEM, virtio_mem_vmstate_field_exists,
1419 VirtIOMEMMigSanityChecks,
1420 vmstate_virtio_mem_sanity_checks),
1421 VMSTATE_UINT64(usable_region_size, VirtIOMEM),
1422 VMSTATE_UINT64_TEST(size, VirtIOMEM, virtio_mem_vmstate_field_exists),
1423 VMSTATE_UINT64(requested_size, VirtIOMEM),
1424 VMSTATE_BITMAP_TEST(bitmap, VirtIOMEM, virtio_mem_vmstate_field_exists,
1425 0, bitmap_size),
1426 VMSTATE_END_OF_LIST()
1427 },
1428 };
1429
1430 /*
1431 * Transfer properties that are immutable while migration is active early,
1432 * such that we have this information around before migrating any RAM
1433 * content.
1434 *
1435 * Note that virtio_mem_is_busy() makes sure these properties can no longer
1436 * change on the migration source until migration completed.
1437 *
1438 * With QEMU compat machines, we transmit these properties later, via
1439 * vmstate_virtio_mem_device instead -- see virtio_mem_vmstate_field_exists().
1440 */
1441 static const VMStateDescription vmstate_virtio_mem_device_early = {
1442 .name = "virtio-mem-device-early",
1443 .minimum_version_id = 1,
1444 .version_id = 1,
1445 .early_setup = true,
1446 .post_load = virtio_mem_post_load_early,
1447 .fields = (const VMStateField[]) {
1448 VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
1449 vmstate_virtio_mem_sanity_checks),
1450 VMSTATE_UINT64(size, VirtIOMEM),
1451 VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
1452 VMSTATE_END_OF_LIST()
1453 },
1454 };
1455
1456 static const VMStateDescription vmstate_virtio_mem = {
1457 .name = "virtio-mem",
1458 .minimum_version_id = 1,
1459 .version_id = 1,
1460 .fields = (const VMStateField[]) {
1461 VMSTATE_VIRTIO_DEVICE,
1462 VMSTATE_END_OF_LIST()
1463 },
1464 };
1465
1466 static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
1467 VirtioMEMDeviceInfo *vi)
1468 {
1469 vi->memaddr = vmem->addr;
1470 vi->node = vmem->node;
1471 vi->requested_size = vmem->requested_size;
1472 vi->size = vmem->size;
1473 vi->max_size = memory_region_size(&vmem->memdev->mr);
1474 vi->block_size = vmem->block_size;
1475 vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
1476 }
1477
1478 static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
1479 {
1480 if (!vmem->memdev) {
1481 error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
1482 return NULL;
1483 } else if (vmem->dynamic_memslots) {
1484 if (!vmem->mr) {
1485 virtio_mem_prepare_mr(vmem);
1486 }
1487 return vmem->mr;
1488 }
1489
1490 return &vmem->memdev->mr;
1491 }
1492
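/*
 * Decide how many memslots to use for the device memory region, given the
 * per-device memslot limit.
 */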
1493 static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit)
1494 {
1495 uint64_t region_size, memslot_size, min_memslot_size;
1496 unsigned int memslots;
1497 RAMBlock *rb;
1498
1499 if (!vmem->dynamic_memslots) {
1500 return;
1501 }
1502
1503 /* We're called exactly once, before realizing the device. */
1504 assert(!vmem->nb_memslots);
1505
1506 /* If realizing the device will fail, just assume a single memslot. */
1507 if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) {
1508 vmem->nb_memslots = 1;
1509 return;
1510 }
1511
1512 rb = vmem->memdev->mr.ram_block;
1513 region_size = memory_region_size(&vmem->memdev->mr);
1514
1515 /*
1516 * Determine the default block size now, to determine the minimum memslot
1517 * size. We want the minimum slot size to be at least the device block size.
1518 */
1519 if (!vmem->block_size) {
1520 vmem->block_size = virtio_mem_default_block_size(rb);
1521 }
1522 /* If realizing the device will fail, just assume a single memslot. */
1523 if (vmem->block_size < qemu_ram_pagesize(rb) ||
1524 !QEMU_IS_ALIGNED(region_size, vmem->block_size)) {
1525 vmem->nb_memslots = 1;
1526 return;
1527 }
1528
1529 /*
1530 * All memslots except the last one have a reasonable minimum size, and
1531 * all memslot sizes are aligned to the device block size.
1532 */
1533 memslot_size = QEMU_ALIGN_UP(region_size / limit, vmem->block_size);
1534 min_memslot_size = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE);
1535 memslot_size = MAX(memslot_size, min_memslot_size);
1536
1537 memslots = QEMU_ALIGN_UP(region_size, memslot_size) / memslot_size;
1538 if (memslots != 1) {
1539 vmem->memslot_size = memslot_size;
1540 }
1541 vmem->nb_memslots = memslots;
1542 }
1543
1544 static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem)
1545 {
1546 if (!vmem->dynamic_memslots) {
1547 /* Exactly one static RAM memory region. */
1548 return 1;
1549 }
1550
1551 /* We're called after instructed to make a decision. */
1552 g_assert(vmem->nb_memslots);
1553 return vmem->nb_memslots;
1554 }
1555
1556 static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
1557 Notifier *notifier)
1558 {
1559 notifier_list_add(&vmem->size_change_notifiers, notifier);
1560 }
1561
1562 static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
1563 Notifier *notifier)
1564 {
1565 notifier_remove(notifier);
1566 }
1567
1568 static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
1569 void *opaque, Error **errp)
1570 {
1571 const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1572 uint64_t value = vmem->size;
1573
1574 visit_type_size(v, name, &value, errp);
1575 }
1576
1577 static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
1578 const char *name, void *opaque,
1579 Error **errp)
1580 {
1581 const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1582 uint64_t value = vmem->requested_size;
1583
1584 visit_type_size(v, name, &value, errp);
1585 }
1586
1587 static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
1588 const char *name, void *opaque,
1589 Error **errp)
1590 {
1591 VirtIOMEM *vmem = VIRTIO_MEM(obj);
1592 uint64_t value;
1593
1594 if (!visit_type_size(v, name, &value, errp)) {
1595 return;
1596 }
1597
1598 /*
1599 * The block size and memory backend are not fixed until the device is
1600 * realized. realize() will verify these properties then.
1601 */
1602 if (DEVICE(obj)->realized) {
1603 if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
1604 error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64
1605 ")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
1606 vmem->block_size);
1607 return;
1608 } else if (value > memory_region_size(&vmem->memdev->mr)) {
1609 error_setg(errp, "'%s' cannot exceed the memory backend size"
1610 "(0x%" PRIx64 ")", name,
1611 memory_region_size(&vmem->memdev->mr));
1612 return;
1613 }
1614
1615 if (value != vmem->requested_size) {
1616 virtio_mem_resize_usable_region(vmem, value, false);
1617 vmem->requested_size = value;
1618 }
1619 /*
1620 * Trigger a config update so the guest gets notified. We trigger
1621 * even if the size didn't change (especially helpful for debugging).
1622 */
1623 virtio_notify_config(VIRTIO_DEVICE(vmem));
1624 } else {
1625 vmem->requested_size = value;
1626 }
1627 }
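/*
 * Example (sketch, hypothetical device id "vmem0"): at runtime, the requested
 * size is typically adjusted via QMP, ending up in the setter above:
 *   { "execute": "qom-set",
 *     "arguments": { "path": "/machine/peripheral/vmem0",
 *                    "property": "requested-size", "value": 4294967296 } }
 * The guest is then notified via a config update and plugs/unplugs device
 * blocks until it reaches the requested size.
 */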
1628
1629 static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
1630 void *opaque, Error **errp)
1631 {
1632 const VirtIOMEM *vmem = VIRTIO_MEM(obj);
1633 uint64_t value = vmem->block_size;
1634
1635 /*
1636 * If not configured by the user (and we're not realized yet), use the
1637 * default block size we would use with the current memory backend.
1638 */
1639 if (!value) {
1640 if (vmem->memdev && memory_region_is_ram(&vmem->memdev->mr)) {
1641 value = virtio_mem_default_block_size(vmem->memdev->mr.ram_block);
1642 } else {
1643 value = virtio_mem_thp_size();
1644 }
1645 }
1646
1647 visit_type_size(v, name, &value, errp);
1648 }
1649
1650 static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
1651 void *opaque, Error **errp)
1652 {
1653 VirtIOMEM *vmem = VIRTIO_MEM(obj);
1654 uint64_t value;
1655
1656 if (DEVICE(obj)->realized) {
1657 error_setg(errp, "'%s' cannot be changed", name);
1658 return;
1659 }
1660
1661 if (!visit_type_size(v, name, &value, errp)) {
1662 return;
1663 }
1664
1665 if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
1666 error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
1667 VIRTIO_MEM_MIN_BLOCK_SIZE);
1668 return;
1669 } else if (!is_power_of_2(value)) {
1670 error_setg(errp, "'%s' property has to be a power of two", name);
1671 return;
1672 }
1673 vmem->block_size = value;
1674 }
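/*
 * Example (sketch, hypothetical ids): the block size can only be configured
 * before the device is realized, e.g. on the command line:
 *   -device virtio-mem-pci,id=vmem0,memdev=mem0,block-size=2M
 * Values smaller than 1 MiB or that are not a power of two are rejected by
 * the setter above; if left unset, a default based on the memory backend and
 * the host THP size is used (see the getter above).
 */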
1675
1676 static void virtio_mem_instance_init(Object *obj)
1677 {
1678 VirtIOMEM *vmem = VIRTIO_MEM(obj);
1679
1680 notifier_list_init(&vmem->size_change_notifiers);
1681 QLIST_INIT(&vmem->rdl_list);
1682
1683 object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
1684 NULL, NULL, NULL);
1685 object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
1686 virtio_mem_get_requested_size,
1687 virtio_mem_set_requested_size, NULL, NULL);
1688 object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
1689 virtio_mem_get_block_size, virtio_mem_set_block_size,
1690 NULL, NULL);
1691 }
1692
1693 static void virtio_mem_instance_finalize(Object *obj)
1694 {
1695 VirtIOMEM *vmem = VIRTIO_MEM(obj);
1696
1697 /*
1698 * Note: the core already dropped the references on all memory regions
1699      * (the device is passed as the owner to memory_region_init_*()) and
1700      * finalized these objects. We can simply free the memory.
1701 */
1702 g_free(vmem->memslots);
1703 vmem->memslots = NULL;
1704 g_free(vmem->mr);
1705 vmem->mr = NULL;
1706 }
1707
1708 static const Property virtio_mem_properties[] = {
1709 DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
1710 DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
1711 DEFINE_PROP_BOOL(VIRTIO_MEM_PREALLOC_PROP, VirtIOMEM, prealloc, false),
1712 DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
1713 TYPE_MEMORY_BACKEND, HostMemoryBackend *),
1714 #if defined(VIRTIO_MEM_HAS_LEGACY_GUESTS)
1715 DEFINE_PROP_ON_OFF_AUTO(VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP, VirtIOMEM,
1716 unplugged_inaccessible, ON_OFF_AUTO_ON),
1717 #endif
1718 DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
1719 early_migration, true),
1720 DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,
1721 dynamic_memslots, false),
1722 };
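/*
 * Example (sketch, hypothetical ids): a typical instantiation wiring up the
 * properties above, assuming the machine was started with a sufficiently
 * large "maxmem":
 *   -object memory-backend-ram,id=mem0,size=16G
 *   -device virtio-mem-pci,id=vmem0,memdev=mem0,node=0,dynamic-memslots=on
 */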
1723
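/*
 * RamDiscardManager interface: these callbacks let other QEMU subsystems
 * (e.g., VFIO) track which parts of the device memory region are currently
 * plugged ("populated") vs. unplugged ("discarded"), so they only operate on
 * plugged ranges.
 */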
1724 static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm,
1725 const MemoryRegion *mr)
1726 {
1727 const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1728
1729 g_assert(mr == &vmem->memdev->mr);
1730 return vmem->block_size;
1731 }
1732
1733 static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm,
1734 const MemoryRegionSection *s)
1735 {
1736 const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1737 uint64_t start_gpa = vmem->addr + s->offset_within_region;
1738 uint64_t end_gpa = start_gpa + int128_get64(s->size);
1739
1740 g_assert(s->mr == &vmem->memdev->mr);
1741
1742 start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size);
1743 end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size);
1744
1745 if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) {
1746 return false;
1747 }
1748
1749 return virtio_mem_is_range_plugged(vmem, start_gpa, end_gpa - start_gpa);
1750 }
1751
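/*
 * Helper for the replay callbacks below: "fn" is stored as a plain pointer so
 * the same structure can carry either a ReplayRamPopulate or a
 * ReplayRamDiscard callback.
 */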
1752 struct VirtIOMEMReplayData {
1753 void *fn;
1754 void *opaque;
1755 };
1756
1757 static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg)
1758 {
1759 struct VirtIOMEMReplayData *data = arg;
1760
1761 return ((ReplayRamPopulate)data->fn)(s, data->opaque);
1762 }
1763
1764 static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
1765 MemoryRegionSection *s,
1766 ReplayRamPopulate replay_fn,
1767 void *opaque)
1768 {
1769 const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1770 struct VirtIOMEMReplayData data = {
1771 .fn = replay_fn,
1772 .opaque = opaque,
1773 };
1774
1775 g_assert(s->mr == &vmem->memdev->mr);
1776 return virtio_mem_for_each_plugged_section(vmem, s, &data,
1777 virtio_mem_rdm_replay_populated_cb);
1778 }
1779
1780 static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
1781 void *arg)
1782 {
1783 struct VirtIOMEMReplayData *data = arg;
1784
1785 ((ReplayRamDiscard)data->fn)(s, data->opaque);
1786 return 0;
1787 }
1788
1789 static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
1790 MemoryRegionSection *s,
1791 ReplayRamDiscard replay_fn,
1792 void *opaque)
1793 {
1794 const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1795 struct VirtIOMEMReplayData data = {
1796 .fn = replay_fn,
1797 .opaque = opaque,
1798 };
1799
1800 g_assert(s->mr == &vmem->memdev->mr);
1801 virtio_mem_for_each_unplugged_section(vmem, s, &data,
1802 virtio_mem_rdm_replay_discarded_cb);
1803 }
1804
1805 static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
1806 RamDiscardListener *rdl,
1807 MemoryRegionSection *s)
1808 {
1809 VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1810 int ret;
1811
1812 g_assert(s->mr == &vmem->memdev->mr);
1813 rdl->section = memory_region_section_new_copy(s);
1814
1815 QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next);
1816 ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1817 virtio_mem_notify_populate_cb);
1818 if (ret) {
1819 error_report("%s: Replaying plugged ranges failed: %s", __func__,
1820 strerror(-ret));
1821 }
1822 }
1823
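/*
 * On unregistration, the listener has to see all previously populated parts
 * as discarded again: either via a single discard of the whole section (if
 * the listener supports double discards) or by replaying a discard for each
 * individually plugged range.
 */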
1824 static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm,
1825 RamDiscardListener *rdl)
1826 {
1827 VirtIOMEM *vmem = VIRTIO_MEM(rdm);
1828
1829 g_assert(rdl->section->mr == &vmem->memdev->mr);
1830 if (vmem->size) {
1831 if (rdl->double_discard_supported) {
1832 rdl->notify_discard(rdl, rdl->section);
1833 } else {
1834 virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl,
1835 virtio_mem_notify_discard_cb);
1836 }
1837 }
1838
1839 memory_region_section_free_copy(rdl->section);
1840 rdl->section = NULL;
1841 QLIST_REMOVE(rdl, next);
1842 }
1843
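/*
 * Invoked when the user requests hot-unplug of the device (e.g., device_del):
 * unplugging is only allowed once "requested-size" is 0 and the guest has
 * actually released all of its memory.
 */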
1844 static void virtio_mem_unplug_request_check(VirtIOMEM *vmem, Error **errp)
1845 {
1846 if (vmem->unplugged_inaccessible == ON_OFF_AUTO_OFF) {
1847 /*
1848 * We could allow it with a usable region size of 0, but let's just
1849 * not care about that legacy setting.
1850 */
1851 error_setg(errp, "virtio-mem device cannot get unplugged while"
1852 " '" VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "' != 'on'");
1853 return;
1854 }
1855
1856 if (vmem->size) {
1857 error_setg(errp, "virtio-mem device cannot get unplugged while some"
1858 " of its memory is still plugged");
1859 return;
1860 }
1861 if (vmem->requested_size) {
1862 error_setg(errp, "virtio-mem device cannot get unplugged while"
1863 " '" VIRTIO_MEM_REQUESTED_SIZE_PROP "' != '0'");
1864 return;
1865 }
1866 }
1867
1868 static void virtio_mem_class_init(ObjectClass *klass, const void *data)
1869 {
1870 DeviceClass *dc = DEVICE_CLASS(klass);
1871 VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
1872 VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
1873 RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass);
1874
1875 device_class_set_props(dc, virtio_mem_properties);
1876 dc->vmsd = &vmstate_virtio_mem;
1877
1878 set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1879 vdc->realize = virtio_mem_device_realize;
1880 vdc->unrealize = virtio_mem_device_unrealize;
1881 vdc->get_config = virtio_mem_get_config;
1882 vdc->get_features = virtio_mem_get_features;
1883 vdc->validate_features = virtio_mem_validate_features;
1884 vdc->vmsd = &vmstate_virtio_mem_device;
1885
1886 vmc->fill_device_info = virtio_mem_fill_device_info;
1887 vmc->get_memory_region = virtio_mem_get_memory_region;
1888 vmc->decide_memslots = virtio_mem_decide_memslots;
1889 vmc->get_memslots = virtio_mem_get_memslots;
1890 vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
1891 vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
1892 vmc->unplug_request_check = virtio_mem_unplug_request_check;
1893
1894 rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
1895 rdmc->is_populated = virtio_mem_rdm_is_populated;
1896 rdmc->replay_populated = virtio_mem_rdm_replay_populated;
1897 rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
1898 rdmc->register_listener = virtio_mem_rdm_register_listener;
1899 rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
1900 }
1901
1902 static const TypeInfo virtio_mem_info = {
1903 .name = TYPE_VIRTIO_MEM,
1904 .parent = TYPE_VIRTIO_DEVICE,
1905 .instance_size = sizeof(VirtIOMEM),
1906 .instance_init = virtio_mem_instance_init,
1907 .instance_finalize = virtio_mem_instance_finalize,
1908 .class_init = virtio_mem_class_init,
1909 .class_size = sizeof(VirtIOMEMClass),
1910 .interfaces = (const InterfaceInfo[]) {
1911 { TYPE_RAM_DISCARD_MANAGER },
1912 { }
1913 },
1914 };
1915
1916 static void virtio_register_types(void)
1917 {
1918 type_register_static(&virtio_mem_info);
1919 }
1920
1921 type_init(virtio_register_types)
1922
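/*
 * Small helper object implementing the Resettable interface: it forwards
 * system resets to the virtio-mem device so that all memory gets unplugged on
 * a "real" reset, while wakeup from suspend leaves plugged memory alone.
 */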
1923 OBJECT_DEFINE_SIMPLE_TYPE_WITH_INTERFACES(VirtioMemSystemReset, virtio_mem_system_reset, VIRTIO_MEM_SYSTEM_RESET, OBJECT, { TYPE_RESETTABLE_INTERFACE }, { })
1924
1925 static void virtio_mem_system_reset_init(Object *obj)
1926 {
1927 }
1928
1929 static void virtio_mem_system_reset_finalize(Object *obj)
1930 {
1931 }
1932
1933 static ResettableState *virtio_mem_system_reset_get_state(Object *obj)
1934 {
1935 VirtioMemSystemReset *vmem_reset = VIRTIO_MEM_SYSTEM_RESET(obj);
1936
1937 return &vmem_reset->reset_state;
1938 }
1939
1940 static void virtio_mem_system_reset_hold(Object *obj, ResetType type)
1941 {
1942 VirtioMemSystemReset *vmem_reset = VIRTIO_MEM_SYSTEM_RESET(obj);
1943 VirtIOMEM *vmem = vmem_reset->vmem;
1944
1945 /*
1946 * When waking up from standby/suspend-to-ram, do not unplug any memory.
1947 */
1948 if (type == RESET_TYPE_WAKEUP) {
1949 return;
1950 }
1951
1952 /*
1953 * During usual resets, we will unplug all memory and shrink the usable
1954      * region size. This is, however, not possible in all scenarios; in that
1955      * case, the guest has to deal with it manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
1956 */
1957 virtio_mem_unplug_all(vmem);
1958 }
1959
1960 static void virtio_mem_system_reset_class_init(ObjectClass *klass,
1961 const void *data)
1962 {
1963 ResettableClass *rc = RESETTABLE_CLASS(klass);
1964
1965 rc->get_state = virtio_mem_system_reset_get_state;
1966 rc->phases.hold = virtio_mem_system_reset_hold;
1967 }
1968