xref: /qemu/system/physmem.c (revision c07cd110a1824e2d046581af7375f16dac26e96f)
1 /*
2  * RAM allocation and memory access
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "qemu/osdep.h"
21 #include "exec/page-vary.h"
22 #include "qapi/error.h"
23 
24 #include "qemu/cutils.h"
25 #include "qemu/cacheflush.h"
26 #include "qemu/hbitmap.h"
27 #include "qemu/madvise.h"
28 #include "qemu/lockable.h"
29 
30 #ifdef CONFIG_TCG
31 #include "accel/tcg/cpu-ops.h"
32 #endif /* CONFIG_TCG */
33 
34 #include "exec/exec-all.h"
35 #include "exec/cputlb.h"
36 #include "exec/page-protection.h"
37 #include "exec/target_page.h"
38 #include "exec/translation-block.h"
39 #include "hw/qdev-core.h"
40 #include "hw/qdev-properties.h"
41 #include "hw/boards.h"
42 #include "system/xen.h"
43 #include "system/kvm.h"
44 #include "system/tcg.h"
45 #include "system/qtest.h"
46 #include "qemu/timer.h"
47 #include "qemu/config-file.h"
48 #include "qemu/error-report.h"
49 #include "qemu/qemu-print.h"
50 #include "qemu/log.h"
51 #include "qemu/memalign.h"
52 #include "qemu/memfd.h"
53 #include "exec/memory.h"
54 #include "exec/ioport.h"
55 #include "system/dma.h"
56 #include "system/hostmem.h"
57 #include "system/hw_accel.h"
58 #include "system/xen-mapcache.h"
59 #include "trace.h"
60 
61 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
62 #include <linux/falloc.h>
63 #endif
64 
65 #include "qemu/rcu_queue.h"
66 #include "qemu/main-loop.h"
67 #include "system/replay.h"
68 
69 #include "exec/memory-internal.h"
70 #include "exec/ram_addr.h"
71 
72 #include "qemu/pmem.h"
73 
74 #include "qapi/qapi-types-migration.h"
75 #include "migration/blocker.h"
76 #include "migration/cpr.h"
77 #include "migration/options.h"
78 #include "migration/vmstate.h"
79 
80 #include "qemu/range.h"
81 #ifndef _WIN32
82 #include "qemu/mmap-alloc.h"
83 #endif
84 
85 #include "monitor/monitor.h"
86 
87 #ifdef CONFIG_LIBDAXCTL
88 #include <daxctl/libdaxctl.h>
89 #endif
90 
91 //#define DEBUG_SUBPAGE
92 
93 /* ram_list is read under rcu_read_lock()/rcu_read_unlock().  Writes
94  * are protected by the ramlist lock.
95  */
96 RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
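
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the typical reader pattern for ram_list implied by the comment above.
 * Readers walk the block list inside an RCU read-side critical section;
 * only writers take the ramlist mutex (qemu_mutex_lock_ramlist()).
 */
#if 0 /* example only, not compiled */
static void example_walk_ram_blocks(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();
    RAMBLOCK_FOREACH(block) {
        /* read-only inspection of each block, e.g. its used length */
        (void)qemu_ram_get_used_length(block);
    }
}
#endif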
97 
98 static MemoryRegion *system_memory;
99 static MemoryRegion *system_io;
100 
101 AddressSpace address_space_io;
102 AddressSpace address_space_memory;
103 
104 static MemoryRegion io_mem_unassigned;
105 
106 typedef struct PhysPageEntry PhysPageEntry;
107 
108 struct PhysPageEntry {
109     /* How many bits to skip to the next level (in units of L2_SIZE). 0 for a leaf. */
110     uint32_t skip : 6;
111     /* index into phys_sections (!skip) or phys_map_nodes (skip) */
112     uint32_t ptr : 26;
113 };
114 
115 #define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
116 
117 /* Size of the L2 (and L3, etc) page tables.  */
118 #define ADDR_SPACE_BITS 64
119 
120 #define P_L2_BITS 9
121 #define P_L2_SIZE (1 << P_L2_BITS)
122 
123 #define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
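
/*
 * Worked example (editor's addition, not part of the original file):
 * with 4 KiB target pages (TARGET_PAGE_BITS == 12) the map needs
 * P_L2_LEVELS = ((64 - 12 - 1) / 9) + 1 = 6 levels of 512-entry nodes
 * to cover the 64-bit physical address space; larger target page sizes
 * reduce the number of levels accordingly.
 */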
124 
125 typedef PhysPageEntry Node[P_L2_SIZE];
126 
127 typedef struct PhysPageMap {
128     struct rcu_head rcu;
129 
130     unsigned sections_nb;
131     unsigned sections_nb_alloc;
132     unsigned nodes_nb;
133     unsigned nodes_nb_alloc;
134     Node *nodes;
135     MemoryRegionSection *sections;
136 } PhysPageMap;
137 
138 struct AddressSpaceDispatch {
139     MemoryRegionSection *mru_section;
140     /* This is a multi-level map on the physical address space.
141      * The bottom level has pointers to MemoryRegionSections.
142      */
143     PhysPageEntry phys_map;
144     PhysPageMap map;
145 };
146 
147 #define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
148 typedef struct subpage_t {
149     MemoryRegion iomem;
150     FlatView *fv;
151     hwaddr base;
152     uint16_t sub_section[];
153 } subpage_t;
154 
155 #define PHYS_SECTION_UNASSIGNED 0
156 
157 static void io_mem_init(void);
158 static void memory_map_init(void);
159 static void tcg_log_global_after_sync(MemoryListener *listener);
160 static void tcg_commit(MemoryListener *listener);
161 static bool ram_is_cpr_compatible(RAMBlock *rb);
162 
163 /**
164  * CPUAddressSpace: all the information a CPU needs about an AddressSpace
165  * @cpu: the CPU whose AddressSpace this is
166  * @as: the AddressSpace itself
167  * @memory_dispatch: its dispatch pointer (cached, RCU protected)
168  * @tcg_as_listener: listener for tracking changes to the AddressSpace
169  */
170 typedef struct CPUAddressSpace {
171     CPUState *cpu;
172     AddressSpace *as;
173     struct AddressSpaceDispatch *memory_dispatch;
174     MemoryListener tcg_as_listener;
175 } CPUAddressSpace;
176 
177 struct DirtyBitmapSnapshot {
178     ram_addr_t start;
179     ram_addr_t end;
180     unsigned long dirty[];
181 };
182 
183 static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
184 {
185     static unsigned alloc_hint = 16;
186     if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
187         map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
188         map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
189         alloc_hint = map->nodes_nb_alloc;
190     }
191 }
192 
193 static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
194 {
195     unsigned i;
196     uint32_t ret;
197     PhysPageEntry e;
198     PhysPageEntry *p;
199 
200     ret = map->nodes_nb++;
201     p = map->nodes[ret];
202     assert(ret != PHYS_MAP_NODE_NIL);
203     assert(ret != map->nodes_nb_alloc);
204 
205     e.skip = leaf ? 0 : 1;
206     e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
207     for (i = 0; i < P_L2_SIZE; ++i) {
208         memcpy(&p[i], &e, sizeof(e));
209     }
210     return ret;
211 }
212 
213 static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
214                                 hwaddr *index, uint64_t *nb, uint16_t leaf,
215                                 int level)
216 {
217     PhysPageEntry *p;
218     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
219 
220     if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
221         lp->ptr = phys_map_node_alloc(map, level == 0);
222     }
223     p = map->nodes[lp->ptr];
224     lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
225 
226     while (*nb && lp < &p[P_L2_SIZE]) {
227         if ((*index & (step - 1)) == 0 && *nb >= step) {
228             lp->skip = 0;
229             lp->ptr = leaf;
230             *index += step;
231             *nb -= step;
232         } else {
233             phys_page_set_level(map, lp, index, nb, leaf, level - 1);
234         }
235         ++lp;
236     }
237 }
238 
239 static void phys_page_set(AddressSpaceDispatch *d,
240                           hwaddr index, uint64_t nb,
241                           uint16_t leaf)
242 {
243     /* Wildly overreserve - it doesn't matter much. */
244     phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
245 
246     phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
247 }
248 
249 /* Compact a non-leaf page entry. Simply detect that the entry has a single child,
250  * and update our entry so we can skip it and go directly to the destination.
251  */
252 static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
253 {
254     unsigned valid_ptr = P_L2_SIZE;
255     int valid = 0;
256     PhysPageEntry *p;
257     int i;
258 
259     if (lp->ptr == PHYS_MAP_NODE_NIL) {
260         return;
261     }
262 
263     p = nodes[lp->ptr];
264     for (i = 0; i < P_L2_SIZE; i++) {
265         if (p[i].ptr == PHYS_MAP_NODE_NIL) {
266             continue;
267         }
268 
269         valid_ptr = i;
270         valid++;
271         if (p[i].skip) {
272             phys_page_compact(&p[i], nodes);
273         }
274     }
275 
276     /* We can only compress if there's only one child. */
277     if (valid != 1) {
278         return;
279     }
280 
281     assert(valid_ptr < P_L2_SIZE);
282 
283     /* Don't compress if it won't fit in the # of bits we have. */
284     if (P_L2_LEVELS >= (1 << 6) &&
285         lp->skip + p[valid_ptr].skip >= (1 << 6)) {
286         return;
287     }
288 
289     lp->ptr = p[valid_ptr].ptr;
290     if (!p[valid_ptr].skip) {
291         /* If our only child is a leaf, make this a leaf. */
292         /* By design, we should have made this node a leaf to begin with so we
293          * should never reach here.
294          * But since it's so simple to handle this, let's do it just in case we
295          * change this rule.
296          */
297         lp->skip = 0;
298     } else {
299         lp->skip += p[valid_ptr].skip;
300     }
301 }
302 
303 void address_space_dispatch_compact(AddressSpaceDispatch *d)
304 {
305     if (d->phys_map.skip) {
306         phys_page_compact(&d->phys_map, d->map.nodes);
307     }
308 }
309 
310 static inline bool section_covers_addr(const MemoryRegionSection *section,
311                                        hwaddr addr)
312 {
313     /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
314      * the section must cover the entire address space.
315      */
316     return int128_gethi(section->size) ||
317            range_covers_byte(section->offset_within_address_space,
318                              int128_getlo(section->size), addr);
319 }
320 
321 static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
322 {
323     PhysPageEntry lp = d->phys_map, *p;
324     Node *nodes = d->map.nodes;
325     MemoryRegionSection *sections = d->map.sections;
326     hwaddr index = addr >> TARGET_PAGE_BITS;
327     int i;
328 
329     for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
330         if (lp.ptr == PHYS_MAP_NODE_NIL) {
331             return &sections[PHYS_SECTION_UNASSIGNED];
332         }
333         p = nodes[lp.ptr];
334         lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
335     }
336 
337     if (section_covers_addr(&sections[lp.ptr], addr)) {
338         return &sections[lp.ptr];
339     } else {
340         return &sections[PHYS_SECTION_UNASSIGNED];
341     }
342 }
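
/*
 * Worked example (editor's addition, not part of the original file):
 * with TARGET_PAGE_BITS == 12, a physical address is first shifted right
 * by 12 to form the page index, and each level then consumes
 * P_L2_BITS == 9 of the remaining bits, i.e. the slot at level i is
 * (index >> (i * 9)) & 511.  A node's non-zero 'skip' lets the walk
 * drop that many levels at once where the tree has been compacted.
 */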
343 
344 /* Called from RCU critical section */
345 static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
346                                                         hwaddr addr,
347                                                         bool resolve_subpage)
348 {
349     MemoryRegionSection *section = qatomic_read(&d->mru_section);
350     subpage_t *subpage;
351 
352     if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
353         !section_covers_addr(section, addr)) {
354         section = phys_page_find(d, addr);
355         qatomic_set(&d->mru_section, section);
356     }
357     if (resolve_subpage && section->mr->subpage) {
358         subpage = container_of(section->mr, subpage_t, iomem);
359         section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
360     }
361     return section;
362 }
363 
364 /* Called from RCU critical section */
365 static MemoryRegionSection *
366 address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
367                                  hwaddr *plen, bool resolve_subpage)
368 {
369     MemoryRegionSection *section;
370     MemoryRegion *mr;
371     Int128 diff;
372 
373     section = address_space_lookup_region(d, addr, resolve_subpage);
374     /* Compute offset within MemoryRegionSection */
375     addr -= section->offset_within_address_space;
376 
377     /* Compute offset within MemoryRegion */
378     *xlat = addr + section->offset_within_region;
379 
380     mr = section->mr;
381 
382     /* MMIO registers can be expected to perform full-width accesses based only
383      * on their address, without considering adjacent registers that could
384      * decode to completely different MemoryRegions.  When such registers
385      * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
386      * regions overlap wildly.  For this reason we cannot clamp the accesses
387      * here.
388      *
389      * If the length is small (as is the case for address_space_ldl/stl),
390      * everything works fine.  If the incoming length is large, however,
391      * the caller really has to do the clamping through memory_access_size.
392      */
393     if (memory_region_is_ram(mr)) {
394         diff = int128_sub(section->size, int128_make64(addr));
395         *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
396     }
397     return section;
398 }
399 
400 /**
401  * address_space_translate_iommu - translate an address through an IOMMU
402  * memory region and then through the target address space.
403  *
404  * @iommu_mr: the IOMMU memory region that we start the translation from
405  * @addr: the address to be translated through the MMU
406  * @xlat: the translated address offset within the destination memory region.
407  *        It cannot be %NULL.
408  * @plen_out: valid read/write length of the translated address. It
409  *            cannot be %NULL.
410  * @page_mask_out: page mask for the translated address. This is
411  *            only meaningful for IOMMU-translated addresses, since
412  *            huge pages may be reflected in the mask. It can be
413  *            %NULL if we don't care about it.
414  * @is_write: whether the translation operation is for write
415  * @is_mmio: whether this can be MMIO, set true if it can
416  * @target_as: the address space targeted by the IOMMU
417  * @attrs: transaction attributes
418  *
419  * This function is called from RCU critical section.  It is the common
420  * part of flatview_do_translate and address_space_translate_cached.
421  */
422 static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
423                                                          hwaddr *xlat,
424                                                          hwaddr *plen_out,
425                                                          hwaddr *page_mask_out,
426                                                          bool is_write,
427                                                          bool is_mmio,
428                                                          AddressSpace **target_as,
429                                                          MemTxAttrs attrs)
430 {
431     MemoryRegionSection *section;
432     hwaddr page_mask = (hwaddr)-1;
433 
434     do {
435         hwaddr addr = *xlat;
436         IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
437         int iommu_idx = 0;
438         IOMMUTLBEntry iotlb;
439 
440         if (imrc->attrs_to_index) {
441             iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
442         }
443 
444         iotlb = imrc->translate(iommu_mr, addr, is_write ?
445                                 IOMMU_WO : IOMMU_RO, iommu_idx);
446 
447         if (!(iotlb.perm & (1 << is_write))) {
448             goto unassigned;
449         }
450 
451         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
452                 | (addr & iotlb.addr_mask));
453         page_mask &= iotlb.addr_mask;
454         *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
455         *target_as = iotlb.target_as;
456 
457         section = address_space_translate_internal(
458                 address_space_to_dispatch(iotlb.target_as), addr, xlat,
459                 plen_out, is_mmio);
460 
461         iommu_mr = memory_region_get_iommu(section->mr);
462     } while (unlikely(iommu_mr));
463 
464     if (page_mask_out) {
465         *page_mask_out = page_mask;
466     }
467     return *section;
468 
469 unassigned:
470     return (MemoryRegionSection) { .mr = &io_mem_unassigned };
471 }
472 
473 /**
474  * flatview_do_translate - translate an address in FlatView
475  *
476  * @fv: the flat view that we want to translate on
477  * @addr: the address to be translated in above address space
478  * @xlat: the translated address offset within memory region. It
479  *        cannot be %NULL.
480  * @plen_out: valid read/write length of the translated address. It
481  *            can be %NULL when we don't care about it.
482  * @page_mask_out: page mask for the translated address. This is
483  *            only meaningful for IOMMU-translated addresses, since
484  *            huge pages may be reflected in the mask. It can be
485  *            %NULL if we don't care about it.
486  * @is_write: whether the translation operation is for write
487  * @is_mmio: whether this can be MMIO, set true if it can
488  * @target_as: the address space targeted by the IOMMU
489  * @attrs: memory transaction attributes
490  *
491  * This function is called from RCU critical section
492  */
493 static MemoryRegionSection flatview_do_translate(FlatView *fv,
494                                                  hwaddr addr,
495                                                  hwaddr *xlat,
496                                                  hwaddr *plen_out,
497                                                  hwaddr *page_mask_out,
498                                                  bool is_write,
499                                                  bool is_mmio,
500                                                  AddressSpace **target_as,
501                                                  MemTxAttrs attrs)
502 {
503     MemoryRegionSection *section;
504     IOMMUMemoryRegion *iommu_mr;
505     hwaddr plen = (hwaddr)(-1);
506 
507     if (!plen_out) {
508         plen_out = &plen;
509     }
510 
511     section = address_space_translate_internal(
512             flatview_to_dispatch(fv), addr, xlat,
513             plen_out, is_mmio);
514 
515     iommu_mr = memory_region_get_iommu(section->mr);
516     if (unlikely(iommu_mr)) {
517         return address_space_translate_iommu(iommu_mr, xlat,
518                                              plen_out, page_mask_out,
519                                              is_write, is_mmio,
520                                              target_as, attrs);
521     }
522     if (page_mask_out) {
523         /* Not behind an IOMMU, use default page size. */
524         *page_mask_out = ~TARGET_PAGE_MASK;
525     }
526 
527     return *section;
528 }
529 
530 /* Called from RCU critical section */
531 IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
532                                             bool is_write, MemTxAttrs attrs)
533 {
534     MemoryRegionSection section;
535     hwaddr xlat, page_mask;
536 
537     /*
538      * This can never be MMIO; we don't care about plen here,
539      * only about the page mask.
540      */
541     section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
542                                     NULL, &page_mask, is_write, false, &as,
543                                     attrs);
544 
545     /* Illegal translation */
546     if (section.mr == &io_mem_unassigned) {
547         goto iotlb_fail;
548     }
549 
550     /* Convert memory region offset into address space offset */
551     xlat += section.offset_within_address_space -
552         section.offset_within_region;
553 
554     return (IOMMUTLBEntry) {
555         .target_as = as,
556         .iova = addr & ~page_mask,
557         .translated_addr = xlat & ~page_mask,
558         .addr_mask = page_mask,
559         /* IOTLBs are for DMA, and DMA is only allowed on RAM. */
560         .perm = IOMMU_RW,
561     };
562 
563 iotlb_fail:
564     return (IOMMUTLBEntry) {0};
565 }
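
/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * file): resolving a guest IOVA the way a vhost-style caller might, inside
 * an RCU read-side critical section.  The helper name and its arguments
 * are hypothetical.
 */
#if 0 /* example only, not compiled */
static bool example_resolve_iova(hwaddr iova, hwaddr *translated)
{
    IOMMUTLBEntry entry;

    RCU_READ_LOCK_GUARD();
    entry = address_space_get_iotlb_entry(&address_space_memory, iova,
                                          true, MEMTXATTRS_UNSPECIFIED);
    if (entry.perm == IOMMU_NONE) {
        return false;   /* translation failed */
    }
    *translated = entry.translated_addr | (iova & entry.addr_mask);
    return true;
}
#endif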
566 
567 /* Called from RCU critical section */
568 MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
569                                  hwaddr *plen, bool is_write,
570                                  MemTxAttrs attrs)
571 {
572     MemoryRegion *mr;
573     MemoryRegionSection section;
574     AddressSpace *as = NULL;
575 
576     /* This can be MMIO, so set up the MMIO bit. */
577     section = flatview_do_translate(fv, addr, xlat, plen, NULL,
578                                     is_write, true, &as, attrs);
579     mr = section.mr;
580 
581     if (xen_enabled() && memory_access_is_direct(mr, is_write, attrs)) {
582         hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
583         *plen = MIN(page, *plen);
584     }
585 
586     return mr;
587 }
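
/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * file): callers normally reach flatview_translate() through
 * address_space_translate(), roughly as below.  The wrapper name is
 * hypothetical; the RCU requirement matches the real API.
 */
#if 0 /* example only, not compiled */
static MemoryRegion *example_translate(AddressSpace *as, hwaddr addr,
                                       hwaddr *xlat, hwaddr *len,
                                       bool is_write)
{
    /* The caller must already hold the RCU read lock. */
    return flatview_translate(address_space_to_flatview(as), addr,
                              xlat, len, is_write, MEMTXATTRS_UNSPECIFIED);
}
#endif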
588 
589 typedef struct TCGIOMMUNotifier {
590     IOMMUNotifier n;
591     MemoryRegion *mr;
592     CPUState *cpu;
593     int iommu_idx;
594     bool active;
595 } TCGIOMMUNotifier;
596 
597 static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
598 {
599     TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);
600 
601     if (!notifier->active) {
602         return;
603     }
604     tlb_flush(notifier->cpu);
605     notifier->active = false;
606     /* We leave the notifier struct on the list to avoid reallocating it later.
607      * Generally the number of IOMMUs a CPU deals with will be small.
608      * In any case we can't unregister the iommu notifier from a notify
609      * callback.
610      */
611 }
612 
613 static void tcg_register_iommu_notifier(CPUState *cpu,
614                                         IOMMUMemoryRegion *iommu_mr,
615                                         int iommu_idx)
616 {
617     /* Make sure this CPU has an IOMMU notifier registered for this
618      * IOMMU/IOMMU index combination, so that we can flush its TLB
619      * when the IOMMU tells us the mappings we've cached have changed.
620      */
621     MemoryRegion *mr = MEMORY_REGION(iommu_mr);
622     TCGIOMMUNotifier *notifier = NULL;
623     int i;
624 
625     for (i = 0; i < cpu->iommu_notifiers->len; i++) {
626         notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
627         if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
628             break;
629         }
630     }
631     if (i == cpu->iommu_notifiers->len) {
632         /* Not found, add a new entry at the end of the array */
633         cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
634         notifier = g_new0(TCGIOMMUNotifier, 1);
635         g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
636 
637         notifier->mr = mr;
638         notifier->iommu_idx = iommu_idx;
639         notifier->cpu = cpu;
640         /* Rather than trying to register interest in the specific part
641          * of the iommu's address space that we've accessed and then
642          * expand it later as subsequent accesses touch more of it, we
643          * just register interest in the whole thing, on the assumption
644          * that iommu reconfiguration will be rare.
645          */
646         iommu_notifier_init(&notifier->n,
647                             tcg_iommu_unmap_notify,
648                             IOMMU_NOTIFIER_UNMAP,
649                             0,
650                             HWADDR_MAX,
651                             iommu_idx);
652         memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
653                                               &error_fatal);
654     }
655 
656     if (!notifier->active) {
657         notifier->active = true;
658     }
659 }
660 
661 void tcg_iommu_free_notifier_list(CPUState *cpu)
662 {
663     /* Destroy the CPU's notifier list */
664     int i;
665     TCGIOMMUNotifier *notifier;
666 
667     for (i = 0; i < cpu->iommu_notifiers->len; i++) {
668         notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
669         memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
670         g_free(notifier);
671     }
672     g_array_free(cpu->iommu_notifiers, true);
673 }
674 
675 void tcg_iommu_init_notifier_list(CPUState *cpu)
676 {
677     cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
678 }
679 
680 /* Called from RCU critical section */
681 MemoryRegionSection *
682 address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr,
683                                   hwaddr *xlat, hwaddr *plen,
684                                   MemTxAttrs attrs, int *prot)
685 {
686     MemoryRegionSection *section;
687     IOMMUMemoryRegion *iommu_mr;
688     IOMMUMemoryRegionClass *imrc;
689     IOMMUTLBEntry iotlb;
690     int iommu_idx;
691     hwaddr addr = orig_addr;
692     AddressSpaceDispatch *d = cpu->cpu_ases[asidx].memory_dispatch;
693 
694     for (;;) {
695         section = address_space_translate_internal(d, addr, &addr, plen, false);
696 
697         iommu_mr = memory_region_get_iommu(section->mr);
698         if (!iommu_mr) {
699             break;
700         }
701 
702         imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
703 
704         iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
705         tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
706         /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
707          * doesn't short-cut its translation table walk.
708          */
709         iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
710         addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
711                 | (addr & iotlb.addr_mask));
712         /* Update the caller's prot bits to remove the permissions for which
713          * the IOMMU returned a failure response. If no permissions are left
714          * at all, we can give up now.
715          */
716         if (!(iotlb.perm & IOMMU_RO)) {
717             *prot &= ~(PAGE_READ | PAGE_EXEC);
718         }
719         if (!(iotlb.perm & IOMMU_WO)) {
720             *prot &= ~PAGE_WRITE;
721         }
722 
723         if (!*prot) {
724             goto translate_fail;
725         }
726 
727         d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
728     }
729 
730     assert(!memory_region_is_iommu(section->mr));
731     *xlat = addr;
732     return section;
733 
734 translate_fail:
735     /*
736      * We should be given a page-aligned address -- certainly
737      * tlb_set_page_with_attrs() does so.  The page offset of xlat
738      * is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0.
739      * The page portion of xlat will be logged by memory_region_access_valid()
740      * when this memory access is rejected, so use the original untranslated
741      * physical address.
742      */
743     assert((orig_addr & ~TARGET_PAGE_MASK) == 0);
744     *xlat = orig_addr;
745     return &d->map.sections[PHYS_SECTION_UNASSIGNED];
746 }
747 
748 void cpu_address_space_init(CPUState *cpu, int asidx,
749                             const char *prefix, MemoryRegion *mr)
750 {
751     CPUAddressSpace *newas;
752     AddressSpace *as = g_new0(AddressSpace, 1);
753     char *as_name;
754 
755     assert(mr);
756     as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
757     address_space_init(as, mr, as_name);
758     g_free(as_name);
759 
760     /* Target code should have set num_ases before calling us */
761     assert(asidx < cpu->num_ases);
762 
763     if (asidx == 0) {
764         /* address space 0 gets the convenience alias */
765         cpu->as = as;
766     }
767 
768     /* KVM cannot currently support multiple address spaces. */
769     assert(asidx == 0 || !kvm_enabled());
770 
771     if (!cpu->cpu_ases) {
772         cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases);
773         cpu->cpu_ases_count = cpu->num_ases;
774     }
775 
776     newas = &cpu->cpu_ases[asidx];
777     newas->cpu = cpu;
778     newas->as = as;
779     if (tcg_enabled()) {
780         newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
781         newas->tcg_as_listener.commit = tcg_commit;
782         newas->tcg_as_listener.name = "tcg";
783         memory_listener_register(&newas->tcg_as_listener, as);
784     }
785 }
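
/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * file): a target CPU with a secondary address space might wire things up
 * during realize roughly like this.  The function name, the SMRAM region
 * and the asidx layout are hypothetical.
 */
#if 0 /* example only, not compiled */
static void example_cpu_wire_address_spaces(CPUState *cs,
                                            MemoryRegion *smram_root)
{
    cs->num_ases = 2;   /* must be set before cpu_address_space_init() */
    cpu_address_space_init(cs, 0, "cpu-memory", get_system_memory());
    cpu_address_space_init(cs, 1, "cpu-smm", smram_root);
}
#endif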
786 
787 void cpu_address_space_destroy(CPUState *cpu, int asidx)
788 {
789     CPUAddressSpace *cpuas;
790 
791     assert(cpu->cpu_ases);
792     assert(asidx >= 0 && asidx < cpu->num_ases);
793     /* KVM cannot currently support multiple address spaces. */
794     assert(asidx == 0 || !kvm_enabled());
795 
796     cpuas = &cpu->cpu_ases[asidx];
797     if (tcg_enabled()) {
798         memory_listener_unregister(&cpuas->tcg_as_listener);
799     }
800 
801     address_space_destroy(cpuas->as);
802     g_free_rcu(cpuas->as, rcu);
803 
804     if (asidx == 0) {
805         /* reset the convenience alias for address space 0 */
806         cpu->as = NULL;
807     }
808 
809     if (--cpu->cpu_ases_count == 0) {
810         g_free(cpu->cpu_ases);
811         cpu->cpu_ases = NULL;
812     }
813 }
814 
815 AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
816 {
817     /* Return the AddressSpace corresponding to the specified index */
818     return cpu->cpu_ases[asidx].as;
819 }
820 
821 /* Called from RCU critical section */
822 static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
823 {
824     RAMBlock *block;
825 
826     block = qatomic_rcu_read(&ram_list.mru_block);
827     if (block && addr - block->offset < block->max_length) {
828         return block;
829     }
830     RAMBLOCK_FOREACH(block) {
831         if (addr - block->offset < block->max_length) {
832             goto found;
833         }
834     }
835 
836     fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
837     abort();
838 
839 found:
840     /* It is safe to write mru_block outside the BQL.  This
841      * is what happens:
842      *
843      *     mru_block = xxx
844      *     rcu_read_unlock()
845      *                                        xxx removed from list
846      *                  rcu_read_lock()
847      *                  read mru_block
848      *                                        mru_block = NULL;
849      *                                        call_rcu(reclaim_ramblock, xxx);
850      *                  rcu_read_unlock()
851      *
852      * qatomic_rcu_set is not needed here.  The block was already published
853      * when it was placed into the list.  Here we're just making an extra
854      * copy of the pointer.
855      */
856     ram_list.mru_block = block;
857     return block;
858 }
859 
860 void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
861 {
862     CPUState *cpu;
863     ram_addr_t start1;
864     RAMBlock *block;
865     ram_addr_t end;
866 
867     assert(tcg_enabled());
868     end = TARGET_PAGE_ALIGN(start + length);
869     start &= TARGET_PAGE_MASK;
870 
871     RCU_READ_LOCK_GUARD();
872     block = qemu_get_ram_block(start);
873     assert(block == qemu_get_ram_block(end - 1));
874     start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
875     CPU_FOREACH(cpu) {
876         tlb_reset_dirty(cpu, start1, length);
877     }
878 }
879 
880 /* Note: start and end must be within the same ram block.  */
881 bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start,
882                                               ram_addr_t length,
883                                               unsigned client)
884 {
885     DirtyMemoryBlocks *blocks;
886     unsigned long end, page, start_page;
887     bool dirty = false;
888     RAMBlock *ramblock;
889     uint64_t mr_offset, mr_size;
890 
891     if (length == 0) {
892         return false;
893     }
894 
895     end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
896     start_page = start >> TARGET_PAGE_BITS;
897     page = start_page;
898 
899     WITH_RCU_READ_LOCK_GUARD() {
900         blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
901         ramblock = qemu_get_ram_block(start);
902         /* Range sanity check on the ramblock */
903         assert(start >= ramblock->offset &&
904                start + length <= ramblock->offset + ramblock->used_length);
905 
906         while (page < end) {
907             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
908             unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
909             unsigned long num = MIN(end - page,
910                                     DIRTY_MEMORY_BLOCK_SIZE - offset);
911 
912             dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
913                                                   offset, num);
914             page += num;
915         }
916 
917         mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset;
918         mr_size = (end - start_page) << TARGET_PAGE_BITS;
919         memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
920     }
921 
922     if (dirty) {
923         cpu_physical_memory_dirty_bits_cleared(start, length);
924     }
925 
926     return dirty;
927 }
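
/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * file): a migration-style caller clearing and testing the dirty state of
 * one 2 MiB range within a block.  The helper name and 'page_offset' are
 * hypothetical.
 */
#if 0 /* example only, not compiled */
static void example_sync_one_huge_page(RAMBlock *rb, ram_addr_t page_offset)
{
    /* Both start and length must stay within the same RAMBlock. */
    if (cpu_physical_memory_test_and_clear_dirty(rb->offset + page_offset,
                                                 2 * 1024 * 1024,
                                                 DIRTY_MEMORY_MIGRATION)) {
        /* the range was dirty and needs to be (re)sent */
    }
}
#endif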
928 
929 DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty
930     (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
931 {
932     DirtyMemoryBlocks *blocks;
933     ram_addr_t start, first, last;
934     unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
935     DirtyBitmapSnapshot *snap;
936     unsigned long page, end, dest;
937 
938     start = memory_region_get_ram_addr(mr);
939     /* We know we're only called for RAM MemoryRegions */
940     assert(start != RAM_ADDR_INVALID);
941     start += offset;
942 
943     first = QEMU_ALIGN_DOWN(start, align);
944     last  = QEMU_ALIGN_UP(start + length, align);
945 
946     snap = g_malloc0(sizeof(*snap) +
947                      ((last - first) >> (TARGET_PAGE_BITS + 3)));
948     snap->start = first;
949     snap->end   = last;
950 
951     page = first >> TARGET_PAGE_BITS;
952     end  = last  >> TARGET_PAGE_BITS;
953     dest = 0;
954 
955     WITH_RCU_READ_LOCK_GUARD() {
956         blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
957 
958         while (page < end) {
959             unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
960             unsigned long ofs = page % DIRTY_MEMORY_BLOCK_SIZE;
961             unsigned long num = MIN(end - page,
962                                     DIRTY_MEMORY_BLOCK_SIZE - ofs);
963 
964             assert(QEMU_IS_ALIGNED(ofs, (1 << BITS_PER_LEVEL)));
965             assert(QEMU_IS_ALIGNED(num,    (1 << BITS_PER_LEVEL)));
966             ofs >>= BITS_PER_LEVEL;
967 
968             bitmap_copy_and_clear_atomic(snap->dirty + dest,
969                                          blocks->blocks[idx] + ofs,
970                                          num);
971             page += num;
972             dest += num >> BITS_PER_LEVEL;
973         }
974     }
975 
976     cpu_physical_memory_dirty_bits_cleared(start, length);
977 
978     memory_region_clear_dirty_bitmap(mr, offset, length);
979 
980     return snap;
981 }
982 
983 bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
984                                             ram_addr_t start,
985                                             ram_addr_t length)
986 {
987     unsigned long page, end;
988 
989     assert(start >= snap->start);
990     assert(start + length <= snap->end);
991 
992     end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
993     page = (start - snap->start) >> TARGET_PAGE_BITS;
994 
995     while (page < end) {
996         if (test_bit(page, snap->dirty)) {
997             return true;
998         }
999         page++;
1000     }
1001     return false;
1002 }
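
/*
 * Illustrative usage sketch (editor's addition, not part of the original
 * file): display code typically reaches this snapshot pair through the
 * MemoryRegion wrappers.  The function name, 'fb_mr', 'stride', 'y' and
 * 'fb_size' are hypothetical.
 */
#if 0 /* example only, not compiled */
static void example_refresh_scanline(MemoryRegion *fb_mr, hwaddr stride,
                                     int y, hwaddr fb_size)
{
    DirtyBitmapSnapshot *snap;

    snap = memory_region_snapshot_and_clear_dirty(fb_mr, 0, fb_size,
                                                  DIRTY_MEMORY_VGA);
    if (memory_region_snapshot_get_dirty(fb_mr, snap, y * stride, stride)) {
        /* redraw scanline y */
    }
    g_free(snap);
}
#endif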
1003 
1004 /* Called from RCU critical section */
1005 hwaddr memory_region_section_get_iotlb(CPUState *cpu,
1006                                        MemoryRegionSection *section)
1007 {
1008     AddressSpaceDispatch *d = flatview_to_dispatch(section->fv);
1009     return section - d->map.sections;
1010 }
1011 
1012 static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
1013                             uint16_t section);
1014 static subpage_t *subpage_init(FlatView *fv, hwaddr base);
1015 
1016 static uint16_t phys_section_add(PhysPageMap *map,
1017                                  MemoryRegionSection *section)
1018 {
1019     /* The physical section number is ORed with a page-aligned
1020      * pointer to produce the iotlb entries.  Thus it should
1021      * never overflow into the page-aligned value.
1022      */
1023     assert(map->sections_nb < TARGET_PAGE_SIZE);
1024 
1025     if (map->sections_nb == map->sections_nb_alloc) {
1026         map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
1027         map->sections = g_renew(MemoryRegionSection, map->sections,
1028                                 map->sections_nb_alloc);
1029     }
1030     map->sections[map->sections_nb] = *section;
1031     memory_region_ref(section->mr);
1032     return map->sections_nb++;
1033 }
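
/*
 * Worked example (editor's addition, not part of the original file):
 * with 4 KiB target pages a dispatch can hold at most TARGET_PAGE_SIZE
 * (0x1000, i.e. 4096) sections, because the section number is ORed into
 * the low bits of a page-aligned pointer when forming iotlb entries and
 * must never spill into the page-aligned part; that is what the assert
 * above enforces.
 */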
1034 
1035 static void phys_section_destroy(MemoryRegion *mr)
1036 {
1037     bool have_sub_page = mr->subpage;
1038 
1039     memory_region_unref(mr);
1040 
1041     if (have_sub_page) {
1042         subpage_t *subpage = container_of(mr, subpage_t, iomem);
1043         object_unref(OBJECT(&subpage->iomem));
1044         g_free(subpage);
1045     }
1046 }
1047 
1048 static void phys_sections_free(PhysPageMap *map)
1049 {
1050     while (map->sections_nb > 0) {
1051         MemoryRegionSection *section = &map->sections[--map->sections_nb];
1052         phys_section_destroy(section->mr);
1053     }
1054     g_free(map->sections);
1055     g_free(map->nodes);
1056 }
1057 
1058 static void register_subpage(FlatView *fv, MemoryRegionSection *section)
1059 {
1060     AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1061     subpage_t *subpage;
1062     hwaddr base = section->offset_within_address_space
1063         & TARGET_PAGE_MASK;
1064     MemoryRegionSection *existing = phys_page_find(d, base);
1065     MemoryRegionSection subsection = {
1066         .offset_within_address_space = base,
1067         .size = int128_make64(TARGET_PAGE_SIZE),
1068     };
1069     hwaddr start, end;
1070 
1071     assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);
1072 
1073     if (!(existing->mr->subpage)) {
1074         subpage = subpage_init(fv, base);
1075         subsection.fv = fv;
1076         subsection.mr = &subpage->iomem;
1077         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
1078                       phys_section_add(&d->map, &subsection));
1079     } else {
1080         subpage = container_of(existing->mr, subpage_t, iomem);
1081     }
1082     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
1083     end = start + int128_get64(section->size) - 1;
1084     subpage_register(subpage, start, end,
1085                      phys_section_add(&d->map, section));
1086 }
1087 
1088 
1089 static void register_multipage(FlatView *fv,
1090                                MemoryRegionSection *section)
1091 {
1092     AddressSpaceDispatch *d = flatview_to_dispatch(fv);
1093     hwaddr start_addr = section->offset_within_address_space;
1094     uint16_t section_index = phys_section_add(&d->map, section);
1095     uint64_t num_pages = int128_get64(int128_rshift(section->size,
1096                                                     TARGET_PAGE_BITS));
1097 
1098     assert(num_pages);
1099     phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
1100 }
1101 
1102 /*
1103  * The range in *section* may look like this:
1104  *
1105  *      |s|PPPPPPP|s|
1106  *
1107  * where s stands for subpage and P for page.
1108  */
1109 void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
1110 {
1111     MemoryRegionSection remain = *section;
1112     Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
1113 
1114     /* register first subpage */
1115     if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
1116         uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
1117                         - remain.offset_within_address_space;
1118 
1119         MemoryRegionSection now = remain;
1120         now.size = int128_min(int128_make64(left), now.size);
1121         register_subpage(fv, &now);
1122         if (int128_eq(remain.size, now.size)) {
1123             return;
1124         }
1125         remain.size = int128_sub(remain.size, now.size);
1126         remain.offset_within_address_space += int128_get64(now.size);
1127         remain.offset_within_region += int128_get64(now.size);
1128     }
1129 
1130     /* register whole pages */
1131     if (int128_ge(remain.size, page_size)) {
1132         MemoryRegionSection now = remain;
1133         now.size = int128_and(now.size, int128_neg(page_size));
1134         register_multipage(fv, &now);
1135         if (int128_eq(remain.size, now.size)) {
1136             return;
1137         }
1138         remain.size = int128_sub(remain.size, now.size);
1139         remain.offset_within_address_space += int128_get64(now.size);
1140         remain.offset_within_region += int128_get64(now.size);
1141     }
1142 
1143     /* register last subpage */
1144     register_subpage(fv, &remain);
1145 }
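
/*
 * Worked example (editor's addition, not part of the original file),
 * assuming 4 KiB target pages: a section covering [0x1800, 0x4800) is
 * split into a head subpage [0x1800, 0x2000), full pages [0x2000, 0x4000)
 * registered via register_multipage(), and a tail subpage
 * [0x4000, 0x4800).
 */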
1146 
1147 void qemu_flush_coalesced_mmio_buffer(void)
1148 {
1149     if (kvm_enabled())
1150         kvm_flush_coalesced_mmio_buffer();
1151 }
1152 
1153 void qemu_mutex_lock_ramlist(void)
1154 {
1155     qemu_mutex_lock(&ram_list.mutex);
1156 }
1157 
1158 void qemu_mutex_unlock_ramlist(void)
1159 {
1160     qemu_mutex_unlock(&ram_list.mutex);
1161 }
1162 
1163 GString *ram_block_format(void)
1164 {
1165     RAMBlock *block;
1166     char *psize;
1167     GString *buf = g_string_new("");
1168 
1169     RCU_READ_LOCK_GUARD();
1170     g_string_append_printf(buf, "%24s %8s  %18s %18s %18s %18s %3s\n",
1171                            "Block Name", "PSize", "Offset", "Used", "Total",
1172                            "HVA", "RO");
1173 
1174     RAMBLOCK_FOREACH(block) {
1175         psize = size_to_str(block->page_size);
1176         g_string_append_printf(buf, "%24s %8s  0x%016" PRIx64 " 0x%016" PRIx64
1177                                " 0x%016" PRIx64 " 0x%016" PRIx64 " %3s\n",
1178                                block->idstr, psize,
1179                                (uint64_t)block->offset,
1180                                (uint64_t)block->used_length,
1181                                (uint64_t)block->max_length,
1182                                (uint64_t)(uintptr_t)block->host,
1183                                block->mr->readonly ? "ro" : "rw");
1184 
1185         g_free(psize);
1186     }
1187 
1188     return buf;
1189 }
1190 
1191 static int find_min_backend_pagesize(Object *obj, void *opaque)
1192 {
1193     long *hpsize_min = opaque;
1194 
1195     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1196         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1197         long hpsize = host_memory_backend_pagesize(backend);
1198 
1199         if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) {
1200             *hpsize_min = hpsize;
1201         }
1202     }
1203 
1204     return 0;
1205 }
1206 
1207 static int find_max_backend_pagesize(Object *obj, void *opaque)
1208 {
1209     long *hpsize_max = opaque;
1210 
1211     if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
1212         HostMemoryBackend *backend = MEMORY_BACKEND(obj);
1213         long hpsize = host_memory_backend_pagesize(backend);
1214 
1215         if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) {
1216             *hpsize_max = hpsize;
1217         }
1218     }
1219 
1220     return 0;
1221 }
1222 
1223 /*
1224  * TODO: We assume right now that all mapped host memory backends are
1225  * used as RAM; however, some might be used for different purposes.
1226  */
1227 long qemu_minrampagesize(void)
1228 {
1229     long hpsize = LONG_MAX;
1230     Object *memdev_root = object_resolve_path("/objects", NULL);
1231 
1232     object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
1233     return hpsize;
1234 }
1235 
1236 long qemu_maxrampagesize(void)
1237 {
1238     long pagesize = 0;
1239     Object *memdev_root = object_resolve_path("/objects", NULL);
1240 
1241     object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize);
1242     return pagesize;
1243 }
1244 
1245 #ifdef CONFIG_POSIX
1246 static int64_t get_file_size(int fd)
1247 {
1248     int64_t size;
1249 #if defined(__linux__)
1250     struct stat st;
1251 
1252     if (fstat(fd, &st) < 0) {
1253         return -errno;
1254     }
1255 
1256     /* Special handling for devdax character devices */
1257     if (S_ISCHR(st.st_mode)) {
1258         g_autofree char *subsystem_path = NULL;
1259         g_autofree char *subsystem = NULL;
1260 
1261         subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
1262                                          major(st.st_rdev), minor(st.st_rdev));
1263         subsystem = g_file_read_link(subsystem_path, NULL);
1264 
1265         if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
1266             g_autofree char *size_path = NULL;
1267             g_autofree char *size_str = NULL;
1268 
1269             size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
1270                                     major(st.st_rdev), minor(st.st_rdev));
1271 
1272             if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
1273                 return g_ascii_strtoll(size_str, NULL, 0);
1274             }
1275         }
1276     }
1277 #endif /* defined(__linux__) */
1278 
1279     /* st.st_size may be zero for special files yet lseek(2) works */
1280     size = lseek(fd, 0, SEEK_END);
1281     if (size < 0) {
1282         return -errno;
1283     }
1284     return size;
1285 }
1286 
1287 static int64_t get_file_align(int fd)
1288 {
1289     int64_t align = -1;
1290 #if defined(__linux__) && defined(CONFIG_LIBDAXCTL)
1291     struct stat st;
1292 
1293     if (fstat(fd, &st) < 0) {
1294         return -errno;
1295     }
1296 
1297     /* Special handling for devdax character devices */
1298     if (S_ISCHR(st.st_mode)) {
1299         g_autofree char *path = NULL;
1300         g_autofree char *rpath = NULL;
1301         struct daxctl_ctx *ctx;
1302         struct daxctl_region *region;
1303         int rc = 0;
1304 
1305         path = g_strdup_printf("/sys/dev/char/%d:%d",
1306                     major(st.st_rdev), minor(st.st_rdev));
1307         rpath = realpath(path, NULL);
1308         if (!rpath) {
1309             return -errno;
1310         }
1311 
1312         rc = daxctl_new(&ctx);
1313         if (rc) {
1314             return -1;
1315         }
1316 
1317         daxctl_region_foreach(ctx, region) {
1318             if (strstr(rpath, daxctl_region_get_path(region))) {
1319                 align = daxctl_region_get_align(region);
1320                 break;
1321             }
1322         }
1323         daxctl_unref(ctx);
1324     }
1325 #endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */
1326 
1327     return align;
1328 }
1329 
1330 static int file_ram_open(const char *path,
1331                          const char *region_name,
1332                          bool readonly,
1333                          bool *created)
1334 {
1335     char *filename;
1336     char *sanitized_name;
1337     char *c;
1338     int fd = -1;
1339 
1340     *created = false;
1341     for (;;) {
1342         fd = open(path, readonly ? O_RDONLY : O_RDWR);
1343         if (fd >= 0) {
1344             /*
1345              * open(O_RDONLY) won't fail with EISDIR. Check manually if we
1346              * opened a directory and fail similarly to how we fail ENOENT
1347              * in readonly mode. Note that mkstemp() would imply O_RDWR.
1348              */
1349             if (readonly) {
1350                 struct stat file_stat;
1351 
1352                 if (fstat(fd, &file_stat)) {
1353                     close(fd);
1354                     if (errno == EINTR) {
1355                         continue;
1356                     }
1357                     return -errno;
1358                 } else if (S_ISDIR(file_stat.st_mode)) {
1359                     close(fd);
1360                     return -EISDIR;
1361                 }
1362             }
1363             /* @path names an existing file, use it */
1364             break;
1365         }
1366         if (errno == ENOENT) {
1367             if (readonly) {
1368                 /* Refuse to create new, readonly files. */
1369                 return -ENOENT;
1370             }
1371             /* @path names a file that doesn't exist, create it */
1372             fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
1373             if (fd >= 0) {
1374                 *created = true;
1375                 break;
1376             }
1377         } else if (errno == EISDIR) {
1378             /* @path names a directory, create a file there */
1379             /* Make name safe to use with mkstemp by replacing '/' with '_'. */
1380             sanitized_name = g_strdup(region_name);
1381             for (c = sanitized_name; *c != '\0'; c++) {
1382                 if (*c == '/') {
1383                     *c = '_';
1384                 }
1385             }
1386 
1387             filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
1388                                        sanitized_name);
1389             g_free(sanitized_name);
1390 
1391             fd = mkstemp(filename);
1392             if (fd >= 0) {
1393                 unlink(filename);
1394                 g_free(filename);
1395                 break;
1396             }
1397             g_free(filename);
1398         }
1399         if (errno != EEXIST && errno != EINTR) {
1400             return -errno;
1401         }
1402         /*
1403          * Try again on EINTR and EEXIST.  The latter happens when
1404          * something else creates the file between our two open().
1405          */
1406     }
1407 
1408     return fd;
1409 }
1410 
1411 static void *file_ram_alloc(RAMBlock *block,
1412                             ram_addr_t memory,
1413                             int fd,
1414                             bool truncate,
1415                             off_t offset,
1416                             Error **errp)
1417 {
1418     uint32_t qemu_map_flags;
1419     void *area;
1420 
1421     block->page_size = qemu_fd_getpagesize(fd);
1422     if (block->mr->align % block->page_size) {
1423         error_setg(errp, "alignment 0x%" PRIx64
1424                    " must be a multiple of page size 0x%zx",
1425                    block->mr->align, block->page_size);
1426         return NULL;
1427     } else if (block->mr->align && !is_power_of_2(block->mr->align)) {
1428         error_setg(errp, "alignment 0x%" PRIx64
1429                    " must be a power of two", block->mr->align);
1430         return NULL;
1431     } else if (offset % block->page_size) {
1432         error_setg(errp, "offset 0x%" PRIx64
1433                    " must be a multiple of page size 0x%zx",
1434                    offset, block->page_size);
1435         return NULL;
1436     }
1437     block->mr->align = MAX(block->page_size, block->mr->align);
1438 #if defined(__s390x__)
1439     if (kvm_enabled()) {
1440         block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
1441     }
1442 #endif
1443 
1444     if (memory < block->page_size) {
1445         error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
1446                    "or larger than page size 0x%zx",
1447                    memory, block->page_size);
1448         return NULL;
1449     }
1450 
1451     memory = ROUND_UP(memory, block->page_size);
1452 
1453     /*
1454      * ftruncate is not supported by hugetlbfs in older
1455      * hosts, so don't bother bailing out on errors.
1456      * If anything goes wrong with it under other filesystems,
1457      * mmap will fail.
1458      *
1459      * Do not truncate the non-empty backend file to avoid corrupting
1460      * the existing data in the file. Disabling shrinking is not
1461      * enough. For example, the current vNVDIMM implementation stores
1462      * the guest NVDIMM labels at the end of the backend file. If the
1463      * backend file is later extended, QEMU will not be able to find
1464      * those labels. Therefore, extending the non-empty backend file
1465      * is disabled as well.
1466      */
1467     if (truncate && ftruncate(fd, offset + memory)) {
1468         perror("ftruncate");
1469     }
1470 
1471     qemu_map_flags = (block->flags & RAM_READONLY) ? QEMU_MAP_READONLY : 0;
1472     qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0;
1473     qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0;
1474     qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0;
1475     area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset);
1476     if (area == MAP_FAILED) {
1477         error_setg_errno(errp, errno,
1478                          "unable to map backing store for guest RAM");
1479         return NULL;
1480     }
1481 
1482     block->fd = fd;
1483     block->fd_offset = offset;
1484     return area;
1485 }
1486 #endif
1487 
1488 /* Allocate space within the ram_addr_t space that governs the
1489  * dirty bitmaps.
1490  * Called with the ramlist lock held.
1491  */
1492 static ram_addr_t find_ram_offset(ram_addr_t size)
1493 {
1494     RAMBlock *block, *next_block;
1495     ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
1496 
1497     assert(size != 0); /* it would hand out same offset multiple times */
1498 
1499     if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
1500         return 0;
1501     }
1502 
1503     RAMBLOCK_FOREACH(block) {
1504         ram_addr_t candidate, next = RAM_ADDR_MAX;
1505 
1506         /* Align blocks to start on a 'long' in the bitmap
1507          * which makes the bitmap sync'ing take the fast path.
1508          */
1509         candidate = block->offset + block->max_length;
1510         candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
1511 
1512         /* Search for the closest following block
1513          * and find the gap.
1514          */
1515         RAMBLOCK_FOREACH(next_block) {
1516             if (next_block->offset >= candidate) {
1517                 next = MIN(next, next_block->offset);
1518             }
1519         }
1520 
1521         /* If it fits, remember our place and the size of the gap,
1522          * but keep going in case we find a smaller gap to fill,
1523          * thus avoiding fragmentation.
1524          */
1525         if (next - candidate >= size && next - candidate < mingap) {
1526             offset = candidate;
1527             mingap = next - candidate;
1528         }
1529 
1530         trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
1531     }
1532 
1533     if (offset == RAM_ADDR_MAX) {
1534         fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
1535                 (uint64_t)size);
1536         abort();
1537     }
1538 
1539     trace_find_ram_offset(size, offset);
1540 
1541     return offset;
1542 }
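
/*
 * Worked example (editor's addition, not part of the original file):
 * on a 64-bit host with 4 KiB target pages, candidate offsets are rounded
 * up to BITS_PER_LONG << TARGET_PAGE_BITS = 64 * 4 KiB = 256 KiB, so every
 * block starts on a 'long' boundary of the dirty bitmap and bitmap
 * synchronisation can use the word-at-a-time fast path.
 */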
1543 
1544 static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
1545 {
1546     int ret;
1547 
1548     /* Use MADV_DONTDUMP if the user doesn't want the guest memory in the core dump */
1549     if (!machine_dump_guest_core(current_machine)) {
1550         ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP);
1551         if (ret) {
1552             perror("qemu_madvise");
1553             fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
1554                             "but dump-guest-core=off specified\n");
1555         }
1556     }
1557 }
1558 
1559 const char *qemu_ram_get_idstr(RAMBlock *rb)
1560 {
1561     return rb->idstr;
1562 }
1563 
1564 void *qemu_ram_get_host_addr(RAMBlock *rb)
1565 {
1566     return rb->host;
1567 }
1568 
1569 ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
1570 {
1571     return rb->offset;
1572 }
1573 
1574 ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
1575 {
1576     return rb->used_length;
1577 }
1578 
1579 ram_addr_t qemu_ram_get_max_length(RAMBlock *rb)
1580 {
1581     return rb->max_length;
1582 }
1583 
1584 bool qemu_ram_is_shared(RAMBlock *rb)
1585 {
1586     return rb->flags & RAM_SHARED;
1587 }
1588 
1589 bool qemu_ram_is_noreserve(RAMBlock *rb)
1590 {
1591     return rb->flags & RAM_NORESERVE;
1592 }
1593 
1594 /* Note: Only set at the start of postcopy */
1595 bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
1596 {
1597     return rb->flags & RAM_UF_ZEROPAGE;
1598 }
1599 
1600 void qemu_ram_set_uf_zeroable(RAMBlock *rb)
1601 {
1602     rb->flags |= RAM_UF_ZEROPAGE;
1603 }
1604 
1605 bool qemu_ram_is_migratable(RAMBlock *rb)
1606 {
1607     return rb->flags & RAM_MIGRATABLE;
1608 }
1609 
1610 void qemu_ram_set_migratable(RAMBlock *rb)
1611 {
1612     rb->flags |= RAM_MIGRATABLE;
1613 }
1614 
1615 void qemu_ram_unset_migratable(RAMBlock *rb)
1616 {
1617     rb->flags &= ~RAM_MIGRATABLE;
1618 }
1619 
1620 bool qemu_ram_is_named_file(RAMBlock *rb)
1621 {
1622     return rb->flags & RAM_NAMED_FILE;
1623 }
1624 
1625 int qemu_ram_get_fd(RAMBlock *rb)
1626 {
1627     return rb->fd;
1628 }
1629 
1630 /* Called with the BQL held.  */
1631 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
1632 {
1633     RAMBlock *block;
1634 
1635     assert(new_block);
1636     assert(!new_block->idstr[0]);
1637 
1638     if (dev) {
1639         char *id = qdev_get_dev_path(dev);
1640         if (id) {
1641             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
1642             g_free(id);
1643         }
1644     }
1645     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
1646 
1647     RCU_READ_LOCK_GUARD();
1648     RAMBLOCK_FOREACH(block) {
1649         if (block != new_block &&
1650             !strcmp(block->idstr, new_block->idstr)) {
1651             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
1652                     new_block->idstr);
1653             abort();
1654         }
1655     }
1656 }
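/*
 * Illustrative sketch (hypothetical device and region name, not taken from
 * this file): for a RAM region that belongs to a qdev device, the resulting
 * idstr combines the device path and the region name, e.g.
 *
 *     qemu_ram_set_idstr(block, "e1000e.rom", DEVICE(pci_dev));
 *     // idstr becomes something like "0000:00:02.0/e1000e.rom";
 *     // without a device it is just "e1000e.rom".
 */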
1657 
1658 /* Called with the BQL held.  */
1659 void qemu_ram_unset_idstr(RAMBlock *block)
1660 {
1661     /* FIXME: arch_init.c assumes that this is not called throughout
1662      * migration.  Ignore the problem since hot-unplug during migration
1663      * does not work anyway.
1664      */
1665     if (block) {
1666         memset(block->idstr, 0, sizeof(block->idstr));
1667     }
1668 }
1669 
1670 static char *cpr_name(MemoryRegion *mr)
1671 {
1672     const char *mr_name = memory_region_name(mr);
1673     g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;
1674 
1675     if (id) {
1676         return g_strdup_printf("%s/%s", id, mr_name);
1677     } else {
1678         return g_strdup(mr_name);
1679     }
1680 }
1681 
1682 size_t qemu_ram_pagesize(RAMBlock *rb)
1683 {
1684     return rb->page_size;
1685 }
1686 
1687 /* Returns the largest size of page in use */
1688 size_t qemu_ram_pagesize_largest(void)
1689 {
1690     RAMBlock *block;
1691     size_t largest = 0;
1692 
1693     RAMBLOCK_FOREACH(block) {
1694         largest = MAX(largest, qemu_ram_pagesize(block));
1695     }
1696 
1697     return largest;
1698 }
1699 
1700 static int memory_try_enable_merging(void *addr, size_t len)
1701 {
1702     if (!machine_mem_merge(current_machine)) {
1703         /* disabled by the user */
1704         return 0;
1705     }
1706 
1707     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
1708 }
1709 
1710 /*
1711  * Resizing RAM while migrating can result in the migration being canceled.
1712  * Care has to be taken if the guest might have already detected the memory.
1713  *
1714  * As memory core doesn't know how is memory accessed, it is up to
1715  * resize callback to update device state and/or add assertions to detect
1716  * misuse, if necessary.
1717  */
1718 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
1719 {
1720     const ram_addr_t oldsize = block->used_length;
1721     const ram_addr_t unaligned_size = newsize;
1722 
1723     assert(block);
1724 
1725     newsize = TARGET_PAGE_ALIGN(newsize);
1726     newsize = REAL_HOST_PAGE_ALIGN(newsize);
1727 
1728     if (block->used_length == newsize) {
1729         /*
1730          * We don't have to resize the ram block (which only knows aligned
1731          * sizes), however, we have to notify if the unaligned size changed.
1732          */
1733         if (unaligned_size != memory_region_size(block->mr)) {
1734             memory_region_set_size(block->mr, unaligned_size);
1735             if (block->resized) {
1736                 block->resized(block->idstr, unaligned_size, block->host);
1737             }
1738         }
1739         return 0;
1740     }
1741 
1742     if (!(block->flags & RAM_RESIZEABLE)) {
1743         error_setg_errno(errp, EINVAL,
1744                          "Size mismatch: %s: 0x" RAM_ADDR_FMT
1745                          " != 0x" RAM_ADDR_FMT, block->idstr,
1746                          newsize, block->used_length);
1747         return -EINVAL;
1748     }
1749 
1750     if (block->max_length < newsize) {
1751         error_setg_errno(errp, EINVAL,
1752                          "Size too large: %s: 0x" RAM_ADDR_FMT
1753                          " > 0x" RAM_ADDR_FMT, block->idstr,
1754                          newsize, block->max_length);
1755         return -EINVAL;
1756     }
1757 
1758     /* Notify before modifying the ram block and touching the bitmaps. */
1759     if (block->host) {
1760         ram_block_notify_resize(block->host, oldsize, newsize);
1761     }
1762 
1763     cpu_physical_memory_clear_dirty_range(block->offset, block->used_length);
1764     block->used_length = newsize;
1765     cpu_physical_memory_set_dirty_range(block->offset, block->used_length,
1766                                         DIRTY_CLIENTS_ALL);
1767     memory_region_set_size(block->mr, unaligned_size);
1768     if (block->resized) {
1769         block->resized(block->idstr, unaligned_size, block->host);
1770     }
1771     return 0;
1772 }
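/*
 * Illustrative sketch of the resize flow described above. The callback name
 * and sizes are hypothetical; the callback signature is the one implied by
 * the block->resized(idstr, length, host) call sites in this function:
 *
 *     static void my_blob_resized(const char *id, uint64_t len, void *host)
 *     {
 *         // Update any device state that caches the usable length.
 *     }
 *
 *     rb = qemu_ram_alloc_resizeable(1 * MiB, 8 * MiB, my_blob_resized,
 *                                    mr, &err);
 *     ...
 *     qemu_ram_resize(rb, 2 * MiB, &err);  // invokes my_blob_resized()
 */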
1773 
1774 /*
1775  * Trigger sync on the given ram block for range [start, start + length]
1776  * with the backing store if one is available.
1777  * Otherwise no-op.
1778  * @Note: this is supposed to be a synchronous op.
1779  */
1780 void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
1781 {
1782     /* The requested range should fit in within the block range */
1783     g_assert((start + length) <= block->used_length);
1784 
1785 #ifdef CONFIG_LIBPMEM
1786     /* The lack of support for pmem should not block the sync */
1787     if (ramblock_is_pmem(block)) {
1788         void *addr = ramblock_ptr(block, start);
1789         pmem_persist(addr, length);
1790         return;
1791     }
1792 #endif
1793     if (block->fd >= 0) {
1794         /**
1795          * In case there is no support for PMEM, or the memory has not been
1796          * specified as persistent (or is not persistent), fall back to msync.
1797          * Less optimal, but it still achieves the same goal.
1798          */
1799         void *addr = ramblock_ptr(block, start);
1800         if (qemu_msync(addr, length, block->fd)) {
1801             warn_report("%s: failed to sync memory range: start: "
1802                     RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
1803                     __func__, start, length);
1804         }
1805     }
1806 }
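/*
 * Illustrative sketch (block and range are hypothetical): after updating
 * guest memory that is backed by a file, e.g. an nvdimm-style backend, a
 * caller would flush the touched range back to the backing store:
 *
 *     qemu_ram_msync(block, start, len);  // pmem_persist() or msync()
 */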
1807 
1808 /* Called with ram_list.mutex held */
1809 static void dirty_memory_extend(ram_addr_t new_ram_size)
1810 {
1811     unsigned int old_num_blocks = ram_list.num_dirty_blocks;
1812     unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size,
1813                                                DIRTY_MEMORY_BLOCK_SIZE);
1814     int i;
1815 
1816     /* Only need to extend if block count increased */
1817     if (new_num_blocks <= old_num_blocks) {
1818         return;
1819     }
1820 
1821     for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
1822         DirtyMemoryBlocks *old_blocks;
1823         DirtyMemoryBlocks *new_blocks;
1824         int j;
1825 
1826         old_blocks = qatomic_rcu_read(&ram_list.dirty_memory[i]);
1827         new_blocks = g_malloc(sizeof(*new_blocks) +
1828                               sizeof(new_blocks->blocks[0]) * new_num_blocks);
1829 
1830         if (old_num_blocks) {
1831             memcpy(new_blocks->blocks, old_blocks->blocks,
1832                    old_num_blocks * sizeof(old_blocks->blocks[0]));
1833         }
1834 
1835         for (j = old_num_blocks; j < new_num_blocks; j++) {
1836             new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
1837         }
1838 
1839         qatomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
1840 
1841         if (old_blocks) {
1842             g_free_rcu(old_blocks, rcu);
1843         }
1844     }
1845 
1846     ram_list.num_dirty_blocks = new_num_blocks;
1847 }
1848 
1849 static void ram_block_add(RAMBlock *new_block, Error **errp)
1850 {
1851     const bool noreserve = qemu_ram_is_noreserve(new_block);
1852     const bool shared = qemu_ram_is_shared(new_block);
1853     RAMBlock *block;
1854     RAMBlock *last_block = NULL;
1855     bool free_on_error = false;
1856     ram_addr_t ram_size;
1857     Error *err = NULL;
1858 
1859     qemu_mutex_lock_ramlist();
1860     new_block->offset = find_ram_offset(new_block->max_length);
1861 
1862     if (!new_block->host) {
1863         if (xen_enabled()) {
1864             xen_ram_alloc(new_block->offset, new_block->max_length,
1865                           new_block->mr, &err);
1866             if (err) {
1867                 error_propagate(errp, err);
1868                 qemu_mutex_unlock_ramlist();
1869                 return;
1870             }
1871         } else {
1872             new_block->host = qemu_anon_ram_alloc(new_block->max_length,
1873                                                   &new_block->mr->align,
1874                                                   shared, noreserve);
1875             if (!new_block->host) {
1876                 error_setg_errno(errp, errno,
1877                                  "cannot set up guest memory '%s'",
1878                                  memory_region_name(new_block->mr));
1879                 qemu_mutex_unlock_ramlist();
1880                 return;
1881             }
1882             memory_try_enable_merging(new_block->host, new_block->max_length);
1883             free_on_error = true;
1884         }
1885     }
1886 
1887     if (new_block->flags & RAM_GUEST_MEMFD) {
1888         int ret;
1889 
1890         if (!kvm_enabled()) {
1891             error_setg(errp, "cannot set up private guest memory for %s: KVM required",
1892                        object_get_typename(OBJECT(current_machine->cgs)));
1893             goto out_free;
1894         }
1895         assert(new_block->guest_memfd < 0);
1896 
1897         ret = ram_block_discard_require(true);
1898         if (ret < 0) {
1899             error_setg_errno(errp, -ret,
1900                              "cannot set up private guest memory: discard currently blocked");
1901             error_append_hint(errp, "Are you using assigned devices?\n");
1902             goto out_free;
1903         }
1904 
1905         new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length,
1906                                                         0, errp);
1907         if (new_block->guest_memfd < 0) {
1908             qemu_mutex_unlock_ramlist();
1909             goto out_free;
1910         }
1911 
1912         /*
1913          * Add a specific guest_memfd blocker if a generic one would not be
1914          * added by ram_block_add_cpr_blocker.
1915          */
1916         if (ram_is_cpr_compatible(new_block)) {
1917             error_setg(&new_block->cpr_blocker,
1918                        "Memory region %s uses guest_memfd, "
1919                        "which is not supported with CPR.",
1920                        memory_region_name(new_block->mr));
1921             migrate_add_blocker_modes(&new_block->cpr_blocker, errp,
1922                                       MIG_MODE_CPR_TRANSFER, -1);
1923         }
1924     }
1925 
1926     ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
1927     dirty_memory_extend(ram_size);
1928     /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
1929      * QLIST (which has an RCU-friendly variant) does not have insertion at
1930      * tail, so save the last element in last_block.
1931      */
1932     RAMBLOCK_FOREACH(block) {
1933         last_block = block;
1934         if (block->max_length < new_block->max_length) {
1935             break;
1936         }
1937     }
1938     if (block) {
1939         QLIST_INSERT_BEFORE_RCU(block, new_block, next);
1940     } else if (last_block) {
1941         QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
1942     } else { /* list is empty */
1943         QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
1944     }
1945     ram_list.mru_block = NULL;
1946 
1947     /* Write list before version */
1948     smp_wmb();
1949     ram_list.version++;
1950     qemu_mutex_unlock_ramlist();
1951 
1952     cpu_physical_memory_set_dirty_range(new_block->offset,
1953                                         new_block->used_length,
1954                                         DIRTY_CLIENTS_ALL);
1955 
1956     if (new_block->host) {
1957         qemu_ram_setup_dump(new_block->host, new_block->max_length);
1958         qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
1959         /*
1960          * MADV_DONTFORK is also needed by KVM in the absence of a synchronous
1961          * MMU. Configure it unless the machine is a qtest server, in which
1962          * case KVM is not used and QEMU may be forked (e.g. for fuzzing).
1963          */
1964         if (!qtest_enabled()) {
1965             qemu_madvise(new_block->host, new_block->max_length,
1966                          QEMU_MADV_DONTFORK);
1967         }
1968         ram_block_notify_add(new_block->host, new_block->used_length,
1969                              new_block->max_length);
1970     }
1971     return;
1972 
1973 out_free:
1974     if (free_on_error) {
1975         qemu_anon_ram_free(new_block->host, new_block->max_length);
1976         new_block->host = NULL;
1977     }
1978 }
1979 
1980 #ifdef CONFIG_POSIX
1981 RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size,
1982                                  qemu_ram_resize_cb resized, MemoryRegion *mr,
1983                                  uint32_t ram_flags, int fd, off_t offset,
1984                                  bool grow,
1985                                  Error **errp)
1986 {
1987     ERRP_GUARD();
1988     RAMBlock *new_block;
1989     Error *local_err = NULL;
1990     int64_t file_size, file_align, share_flags;
1991 
1992     share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
1993     assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
1994     ram_flags &= ~RAM_PRIVATE;
1995 
1996     /* Only these ram flags are supported for now. */
1997     assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
1998                           RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
1999                           RAM_READONLY_FD | RAM_GUEST_MEMFD |
2000                           RAM_RESIZEABLE)) == 0);
2001     assert(max_size >= size);
2002 
2003     if (xen_enabled()) {
2004         error_setg(errp, "-mem-path not supported with Xen");
2005         return NULL;
2006     }
2007 
2008     if (kvm_enabled() && !kvm_has_sync_mmu()) {
2009         error_setg(errp,
2010                    "host lacks kvm mmu notifiers, -mem-path unsupported");
2011         return NULL;
2012     }
2013 
2014     size = TARGET_PAGE_ALIGN(size);
2015     size = REAL_HOST_PAGE_ALIGN(size);
2016     max_size = TARGET_PAGE_ALIGN(max_size);
2017     max_size = REAL_HOST_PAGE_ALIGN(max_size);
2018 
2019     file_size = get_file_size(fd);
2020     if (file_size && file_size < offset + max_size && !grow) {
2021         error_setg(errp, "%s backing store size 0x%" PRIx64
2022                    " is too small for 'size' option 0x" RAM_ADDR_FMT
2023                    " plus 'offset' option 0x%" PRIx64,
2024                    memory_region_name(mr), file_size, max_size,
2025                    (uint64_t)offset);
2026         return NULL;
2027     }
2028 
2029     file_align = get_file_align(fd);
2030     if (file_align > 0 && file_align > mr->align) {
2031         error_setg(errp, "backing store align 0x%" PRIx64
2032                    " is larger than 'align' option 0x%" PRIx64,
2033                    file_align, mr->align);
2034         return NULL;
2035     }
2036 
2037     new_block = g_malloc0(sizeof(*new_block));
2038     new_block->mr = mr;
2039     new_block->used_length = size;
2040     new_block->max_length = max_size;
2041     new_block->resized = resized;
2042     new_block->flags = ram_flags;
2043     new_block->guest_memfd = -1;
2044     new_block->host = file_ram_alloc(new_block, max_size, fd,
2045                                      file_size < offset + max_size,
2046                                      offset, errp);
2047     if (!new_block->host) {
2048         g_free(new_block);
2049         return NULL;
2050     }
2051 
2052     ram_block_add(new_block, &local_err);
2053     if (local_err) {
2054         g_free(new_block);
2055         error_propagate(errp, local_err);
2056         return NULL;
2057     }
2058     return new_block;
2059 
2060 }
2061 
2062 
2063 RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
2064                                    uint32_t ram_flags, const char *mem_path,
2065                                    off_t offset, Error **errp)
2066 {
2067     int fd;
2068     bool created;
2069     RAMBlock *block;
2070 
2071     fd = file_ram_open(mem_path, memory_region_name(mr),
2072                        !!(ram_flags & RAM_READONLY_FD), &created);
2073     if (fd < 0) {
2074         error_setg_errno(errp, -fd, "can't open backing store %s for guest RAM",
2075                          mem_path);
2076         if (!(ram_flags & RAM_READONLY_FD) && !(ram_flags & RAM_SHARED) &&
2077             fd == -EACCES) {
2078             /*
2079              * If we can open the file R/O (note: will never create a new file)
2080              * and we are dealing with a private mapping, there are still ways
2081              * to consume such files and get RAM instead of ROM.
2082              */
2083             fd = file_ram_open(mem_path, memory_region_name(mr), true,
2084                                &created);
2085             if (fd < 0) {
2086                 return NULL;
2087             }
2088             assert(!created);
2089             close(fd);
2090             error_append_hint(errp, "Consider opening the backing store"
2091                 " read-only but still creating writable RAM using"
2092                 " '-object memory-backend-file,readonly=on,rom=off...'"
2093                 " (see \"VM templating\" documentation)\n");
2094         }
2095         return NULL;
2096     }
2097 
2098     block = qemu_ram_alloc_from_fd(size, size, NULL, mr, ram_flags, fd, offset,
2099                                    false, errp);
2100     if (!block) {
2101         if (created) {
2102             unlink(mem_path);
2103         }
2104         close(fd);
2105         return NULL;
2106     }
2107 
2108     return block;
2109 }
2110 #endif
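/*
 * Illustrative sketch of how a file-backed block typically comes into being
 * from the command line (path and sizes are hypothetical); the backend ends
 * up in qemu_ram_alloc_from_file() above:
 *
 *     -object memory-backend-file,id=mem0,size=4G,mem-path=/dev/hugepages/vm0,share=on
 *     -machine memory-backend=mem0
 */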
2111 
2112 #ifdef CONFIG_POSIX
2113 /*
2114  * Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor, so they can be
2115  * shared with another process if CPR is being used.  Use memfd if available
2116  * because it has no size limits, else use POSIX shm.
2117  */
2118 static int qemu_ram_get_shared_fd(const char *name, bool *reused, Error **errp)
2119 {
2120     int fd = cpr_find_fd(name, 0);
2121 
2122     if (fd >= 0) {
2123         *reused = true;
2124         return fd;
2125     }
2126 
2127     if (qemu_memfd_check(0)) {
2128         fd = qemu_memfd_create(name, 0, 0, 0, 0, errp);
2129     } else {
2130         fd = qemu_shm_alloc(0, errp);
2131     }
2132 
2133     if (fd >= 0) {
2134         cpr_save_fd(name, 0, fd);
2135     }
2136     *reused = false;
2137     return fd;
2138 }
2139 #endif
2140 
2141 static
2142 RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
2143                                   qemu_ram_resize_cb resized,
2144                                   void *host, uint32_t ram_flags,
2145                                   MemoryRegion *mr, Error **errp)
2146 {
2147     RAMBlock *new_block;
2148     Error *local_err = NULL;
2149     int align, share_flags;
2150 
2151     share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
2152     assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
2153     ram_flags &= ~RAM_PRIVATE;
2154 
2155     assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
2156                           RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
2157     assert(!host ^ (ram_flags & RAM_PREALLOC));
2158     assert(max_size >= size);
2159 
2160 #ifdef CONFIG_POSIX         /* ignore RAM_SHARED for Windows */
2161     if (!host) {
2162         if (!share_flags && current_machine->aux_ram_share) {
2163             ram_flags |= RAM_SHARED;
2164         }
2165         if (ram_flags & RAM_SHARED) {
2166             bool reused;
2167             g_autofree char *name = cpr_name(mr);
2168             int fd = qemu_ram_get_shared_fd(name, &reused, errp);
2169 
2170             if (fd < 0) {
2171                 return NULL;
2172             }
2173 
2174             /* Use same alignment as qemu_anon_ram_alloc */
2175             mr->align = QEMU_VMALLOC_ALIGN;
2176 
2177             /*
2178              * This can fail if the shm mount size is too small, or alloc from
2179              * fd is not supported, but previous QEMU versions that called
2180              * qemu_anon_ram_alloc for anonymous shared memory could have
2181              * succeeded.  Quietly fail and fall back.
2182              *
2183              * After cpr-transfer, new QEMU could create a memory region
2184              * with a larger max size than old, so pass reused to grow the
2185              * region if necessary.  The extra space will be usable after a
2186              * guest reset.
2187              */
2188             new_block = qemu_ram_alloc_from_fd(size, max_size, resized, mr,
2189                                                ram_flags, fd, 0, reused, NULL);
2190             if (new_block) {
2191                 trace_qemu_ram_alloc_shared(name, new_block->used_length,
2192                                             new_block->max_length, fd,
2193                                             new_block->host);
2194                 return new_block;
2195             }
2196 
2197             cpr_delete_fd(name, 0);
2198             close(fd);
2199             /* fall back to anon allocation */
2200         }
2201     }
2202 #endif
2203 
2204     align = qemu_real_host_page_size();
2205     align = MAX(align, TARGET_PAGE_SIZE);
2206     size = ROUND_UP(size, align);
2207     max_size = ROUND_UP(max_size, align);
2208 
2209     new_block = g_malloc0(sizeof(*new_block));
2210     new_block->mr = mr;
2211     new_block->resized = resized;
2212     new_block->used_length = size;
2213     new_block->max_length = max_size;
2214     new_block->fd = -1;
2215     new_block->guest_memfd = -1;
2216     new_block->page_size = qemu_real_host_page_size();
2217     new_block->host = host;
2218     new_block->flags = ram_flags;
2219     ram_block_add(new_block, &local_err);
2220     if (local_err) {
2221         g_free(new_block);
2222         error_propagate(errp, local_err);
2223         return NULL;
2224     }
2225     return new_block;
2226 }
2227 
2228 RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
2229                                    MemoryRegion *mr, Error **errp)
2230 {
2231     return qemu_ram_alloc_internal(size, size, NULL, host, RAM_PREALLOC, mr,
2232                                    errp);
2233 }
2234 
2235 RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
2236                          MemoryRegion *mr, Error **errp)
2237 {
2238     assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD |
2239                           RAM_PRIVATE)) == 0);
2240     return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
2241 }
2242 
2243 RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
2244                                     qemu_ram_resize_cb resized,
2245                                     MemoryRegion *mr, Error **errp)
2246 {
2247     return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
2248                                    RAM_RESIZEABLE, mr, errp);
2249 }
2250 
2251 static void reclaim_ramblock(RAMBlock *block)
2252 {
2253     if (block->flags & RAM_PREALLOC) {
2254         ;
2255     } else if (xen_enabled()) {
2256         xen_invalidate_map_cache_entry(block->host);
2257 #ifndef _WIN32
2258     } else if (block->fd >= 0) {
2259         qemu_ram_munmap(block->fd, block->host, block->max_length);
2260         close(block->fd);
2261 #endif
2262     } else {
2263         qemu_anon_ram_free(block->host, block->max_length);
2264     }
2265 
2266     if (block->guest_memfd >= 0) {
2267         close(block->guest_memfd);
2268         ram_block_discard_require(false);
2269     }
2270 
2271     g_free(block);
2272 }
2273 
2274 void qemu_ram_free(RAMBlock *block)
2275 {
2276     g_autofree char *name = NULL;
2277 
2278     if (!block) {
2279         return;
2280     }
2281 
2282     if (block->host) {
2283         ram_block_notify_remove(block->host, block->used_length,
2284                                 block->max_length);
2285     }
2286 
2287     qemu_mutex_lock_ramlist();
2288     name = cpr_name(block->mr);
2289     cpr_delete_fd(name, 0);
2290     QLIST_REMOVE_RCU(block, next);
2291     ram_list.mru_block = NULL;
2292     /* Write list before version */
2293     smp_wmb();
2294     ram_list.version++;
2295     call_rcu(block, reclaim_ramblock, rcu);
2296     qemu_mutex_unlock_ramlist();
2297 }
2298 
2299 #ifndef _WIN32
2300 /* Simply remap the given VM memory location from start to start+length */
2301 static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length)
2302 {
2303     int flags, prot;
2304     void *area;
2305     void *host_startaddr = block->host + start;
2306 
2307     assert(block->fd < 0);
2308     flags = MAP_FIXED | MAP_ANONYMOUS;
2309     flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
2310     flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
2311     prot = PROT_READ;
2312     prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
2313     area = mmap(host_startaddr, length, prot, flags, -1, 0);
2314     return area != host_startaddr ? -errno : 0;
2315 }
2316 
2317 /*
2318  * qemu_ram_remap - remap a single RAM page
2319  *
2320  * @addr: address in ram_addr_t address space.
2321  *
2322  * This function will try remapping a single page of guest RAM identified by
2323  * @addr, essentially discarding memory to recover from previously poisoned
2324  * memory (MCE). The page size depends on the RAMBlock (i.e., hugetlb). @addr
2325  * does not have to point at the start of the page.
2326  *
2327  * This function is only to be used during system resets; it will kill the
2328  * VM if remapping failed.
2329  */
2330 void qemu_ram_remap(ram_addr_t addr)
2331 {
2332     RAMBlock *block;
2333     uint64_t offset;
2334     void *vaddr;
2335     size_t page_size;
2336 
2337     RAMBLOCK_FOREACH(block) {
2338         offset = addr - block->offset;
2339         if (offset < block->max_length) {
2340             /* Respect the pagesize of our RAMBlock */
2341             page_size = qemu_ram_pagesize(block);
2342             offset = QEMU_ALIGN_DOWN(offset, page_size);
2343 
2344             vaddr = ramblock_ptr(block, offset);
2345             if (block->flags & RAM_PREALLOC) {
2346                 ;
2347             } else if (xen_enabled()) {
2348                 abort();
2349             } else {
2350                 if (ram_block_discard_range(block, offset, page_size) != 0) {
2351                     /*
2352                      * Fall back to using mmap() only for anonymous mappings:
2353                      * if a backing file is associated, we may not be able to
2354                      * recover the memory in all cases, so don't take the risk
2355                      * of using mmap() alone and fail now instead.
2356                      */
2357                     if (block->fd >= 0) {
2358                         error_report("Could not remap RAM %s:%" PRIx64 "+%"
2359                                      PRIx64 " +%zx", block->idstr, offset,
2360                                      block->fd_offset, page_size);
2361                         exit(1);
2362                     }
2363                     if (qemu_ram_remap_mmap(block, offset, page_size) != 0) {
2364                         error_report("Could not remap RAM %s:%" PRIx64 " +%zx",
2365                                      block->idstr, offset, page_size);
2366                         exit(1);
2367                     }
2368                 }
2369                 memory_try_enable_merging(vaddr, page_size);
2370                 qemu_ram_setup_dump(vaddr, page_size);
2371             }
2372 
2373             break;
2374         }
2375     }
2376 }
2377 #endif /* !_WIN32 */
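/*
 * Illustrative sketch (the bookkeeping array is hypothetical): a reset
 * handler that recorded previously poisoned pages could discard and remap
 * them before the guest restarts:
 *
 *     for (i = 0; i < nr_poisoned; i++) {
 *         qemu_ram_remap(poisoned_addr[i]);  // ram_addr_t values
 *     }
 */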
2378 
2379 /*
2380  * Return a host pointer to guest's ram.
2381  * For Xen, foreign mappings get created if they don't already exist.
2382  *
2383  * @block: block for the RAM to lookup (optional and may be NULL).
2384  * @addr: address within the memory region.
2385  * @size: pointer to requested size (optional and may be NULL).
2386  *        size may get modified and return a value smaller than
2387  *        what was requested.
2388  * @lock: whether to lock the mapping in xen-mapcache until invalidated.
2389  * @is_write: hint whether to map RW or RO in the xen-mapcache
2390  *            (optional and may always be set to true).
2391  *
2392  * Called within RCU critical section.
2393  */
2394 static void *qemu_ram_ptr_length(RAMBlock *block, ram_addr_t addr,
2395                                  hwaddr *size, bool lock,
2396                                  bool is_write)
2397 {
2398     hwaddr len = 0;
2399 
2400     if (size && *size == 0) {
2401         return NULL;
2402     }
2403 
2404     if (block == NULL) {
2405         block = qemu_get_ram_block(addr);
2406         addr -= block->offset;
2407     }
2408     if (size) {
2409         *size = MIN(*size, block->max_length - addr);
2410         len = *size;
2411     }
2412 
2413     if (xen_enabled() && block->host == NULL) {
2414         /* We need to check whether the requested address is in RAM
2415          * because we don't want to map the entire memory in QEMU.
2416          * In that case, just map the requested area.
2417          */
2418         if (xen_mr_is_memory(block->mr)) {
2419             return xen_map_cache(block->mr, block->offset + addr,
2420                                  len, block->offset,
2421                                  lock, lock, is_write);
2422         }
2423 
2424         block->host = xen_map_cache(block->mr, block->offset,
2425                                     block->max_length,
2426                                     block->offset,
2427                                     1, lock, is_write);
2428     }
2429 
2430     return ramblock_ptr(block, addr);
2431 }
2432 
2433 /*
2434  * Return a host pointer to ram allocated with qemu_ram_alloc.
2435  * This should not be used for general purpose DMA.  Use address_space_map
2436  * or address_space_rw instead. For local memory (e.g. video ram) that the
2437  * device owns, use memory_region_get_ram_ptr.
2438  *
2439  * Called within RCU critical section.
2440  */
2441 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
2442 {
2443     return qemu_ram_ptr_length(ram_block, addr, NULL, false, true);
2444 }
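/*
 * Illustrative sketch (guest address, buffer and length are hypothetical):
 * for guest-visible DMA, go through the address space API instead of
 * mapping the RAMBlock directly:
 *
 *     address_space_rw(&address_space_memory, gpa, MEMTXATTRS_UNSPECIFIED,
 *                      buf, len, true);
 */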
2445 
2446 /* Return the offset of a host pointer within a RAMBlock */
2447 ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
2448 {
2449     ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
2450     assert((uintptr_t)host >= (uintptr_t)rb->host);
2451     assert(res < rb->max_length);
2452 
2453     return res;
2454 }
2455 
2456 RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
2457                                    ram_addr_t *offset)
2458 {
2459     RAMBlock *block;
2460     uint8_t *host = ptr;
2461 
2462     if (xen_enabled()) {
2463         ram_addr_t ram_addr;
2464         RCU_READ_LOCK_GUARD();
2465         ram_addr = xen_ram_addr_from_mapcache(ptr);
2466         if (ram_addr == RAM_ADDR_INVALID) {
2467             return NULL;
2468         }
2469 
2470         block = qemu_get_ram_block(ram_addr);
2471         if (block) {
2472             *offset = ram_addr - block->offset;
2473         }
2474         return block;
2475     }
2476 
2477     RCU_READ_LOCK_GUARD();
2478     block = qatomic_rcu_read(&ram_list.mru_block);
2479     if (block && block->host && host - block->host < block->max_length) {
2480         goto found;
2481     }
2482 
2483     RAMBLOCK_FOREACH(block) {
2484         /* This case happens when the block is not mapped. */
2485         if (block->host == NULL) {
2486             continue;
2487         }
2488         if (host - block->host < block->max_length) {
2489             goto found;
2490         }
2491     }
2492 
2493     return NULL;
2494 
2495 found:
2496     *offset = (host - block->host);
2497     if (round_offset) {
2498         *offset &= TARGET_PAGE_MASK;
2499     }
2500     return block;
2501 }
2502 
2503 /*
2504  * Finds the named RAMBlock
2505  *
2506  * name: The name of the RAMBlock to find
2507  *
2508  * Returns: RAMBlock (or NULL if not found)
2509  */
2510 RAMBlock *qemu_ram_block_by_name(const char *name)
2511 {
2512     RAMBlock *block;
2513 
2514     RAMBLOCK_FOREACH(block) {
2515         if (!strcmp(name, block->idstr)) {
2516             return block;
2517         }
2518     }
2519 
2520     return NULL;
2521 }
2522 
2523 /*
2524  * Some of the system routines need to translate from a host pointer
2525  * (typically a TLB entry) back to a ram offset.
2526  */
2527 ram_addr_t qemu_ram_addr_from_host(void *ptr)
2528 {
2529     RAMBlock *block;
2530     ram_addr_t offset;
2531 
2532     block = qemu_ram_block_from_host(ptr, false, &offset);
2533     if (!block) {
2534         return RAM_ADDR_INVALID;
2535     }
2536 
2537     return block->offset + offset;
2538 }
2539 
2540 ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
2541 {
2542     ram_addr_t ram_addr;
2543 
2544     ram_addr = qemu_ram_addr_from_host(ptr);
2545     if (ram_addr == RAM_ADDR_INVALID) {
2546         error_report("Bad ram pointer %p", ptr);
2547         abort();
2548     }
2549     return ram_addr;
2550 }
2551 
2552 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
2553                                  MemTxAttrs attrs, void *buf, hwaddr len);
2554 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
2555                                   const void *buf, hwaddr len);
2556 static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
2557                                   bool is_write, MemTxAttrs attrs);
2558 
2559 static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
2560                                 unsigned len, MemTxAttrs attrs)
2561 {
2562     subpage_t *subpage = opaque;
2563     uint8_t buf[8];
2564     MemTxResult res;
2565 
2566 #if defined(DEBUG_SUBPAGE)
2567     printf("%s: subpage %p len %u addr " HWADDR_FMT_plx "\n", __func__,
2568            subpage, len, addr);
2569 #endif
2570     res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len);
2571     if (res) {
2572         return res;
2573     }
2574     *data = ldn_p(buf, len);
2575     return MEMTX_OK;
2576 }
2577 
2578 static MemTxResult subpage_write(void *opaque, hwaddr addr,
2579                                  uint64_t value, unsigned len, MemTxAttrs attrs)
2580 {
2581     subpage_t *subpage = opaque;
2582     uint8_t buf[8];
2583 
2584 #if defined(DEBUG_SUBPAGE)
2585     printf("%s: subpage %p len %u addr " HWADDR_FMT_plx
2586            " value %"PRIx64"\n",
2587            __func__, subpage, len, addr, value);
2588 #endif
2589     stn_p(buf, len, value);
2590     return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len);
2591 }
2592 
2593 static bool subpage_accepts(void *opaque, hwaddr addr,
2594                             unsigned len, bool is_write,
2595                             MemTxAttrs attrs)
2596 {
2597     subpage_t *subpage = opaque;
2598 #if defined(DEBUG_SUBPAGE)
2599     printf("%s: subpage %p %c len %u addr " HWADDR_FMT_plx "\n",
2600            __func__, subpage, is_write ? 'w' : 'r', len, addr);
2601 #endif
2602 
2603     return flatview_access_valid(subpage->fv, addr + subpage->base,
2604                                  len, is_write, attrs);
2605 }
2606 
2607 static const MemoryRegionOps subpage_ops = {
2608     .read_with_attrs = subpage_read,
2609     .write_with_attrs = subpage_write,
2610     .impl.min_access_size = 1,
2611     .impl.max_access_size = 8,
2612     .valid.min_access_size = 1,
2613     .valid.max_access_size = 8,
2614     .valid.accepts = subpage_accepts,
2615     .endianness = DEVICE_NATIVE_ENDIAN,
2616 };
2617 
2618 static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
2619                             uint16_t section)
2620 {
2621     int idx, eidx;
2622 
2623     if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE)
2624         return -1;
2625     idx = SUBPAGE_IDX(start);
2626     eidx = SUBPAGE_IDX(end);
2627 #if defined(DEBUG_SUBPAGE)
2628     printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n",
2629            __func__, mmio, start, end, idx, eidx, section);
2630 #endif
2631     for (; idx <= eidx; idx++) {
2632         mmio->sub_section[idx] = section;
2633     }
2634 
2635     return 0;
2636 }
2637 
2638 static subpage_t *subpage_init(FlatView *fv, hwaddr base)
2639 {
2640     subpage_t *mmio;
2641 
2642     /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
2643     mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t));
2644     mmio->fv = fv;
2645     mmio->base = base;
2646     memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
2647                           NULL, TARGET_PAGE_SIZE);
2648     mmio->iomem.subpage = true;
2649 #if defined(DEBUG_SUBPAGE)
2650     printf("%s: %p base " HWADDR_FMT_plx " len %08x\n", __func__,
2651            mmio, base, TARGET_PAGE_SIZE);
2652 #endif
2653 
2654     return mmio;
2655 }
2656 
2657 static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
2658 {
2659     assert(fv);
2660     MemoryRegionSection section = {
2661         .fv = fv,
2662         .mr = mr,
2663         .offset_within_address_space = 0,
2664         .offset_within_region = 0,
2665         .size = int128_2_64(),
2666     };
2667 
2668     return phys_section_add(map, &section);
2669 }
2670 
2671 MemoryRegionSection *iotlb_to_section(CPUState *cpu,
2672                                       hwaddr index, MemTxAttrs attrs)
2673 {
2674     int asidx = cpu_asidx_from_attrs(cpu, attrs);
2675     CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx];
2676     AddressSpaceDispatch *d = cpuas->memory_dispatch;
2677     int section_index = index & ~TARGET_PAGE_MASK;
2678     MemoryRegionSection *ret;
2679 
2680     assert(section_index < d->map.sections_nb);
2681     ret = d->map.sections + section_index;
2682     assert(ret->mr);
2683     assert(ret->mr->ops);
2684 
2685     return ret;
2686 }
2687 
2688 static void io_mem_init(void)
2689 {
2690     memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
2691                           NULL, UINT64_MAX);
2692 }
2693 
2694 AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
2695 {
2696     AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
2697     uint16_t n;
2698 
2699     n = dummy_section(&d->map, fv, &io_mem_unassigned);
2700     assert(n == PHYS_SECTION_UNASSIGNED);
2701 
2702     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
2703 
2704     return d;
2705 }
2706 
2707 void address_space_dispatch_free(AddressSpaceDispatch *d)
2708 {
2709     phys_sections_free(&d->map);
2710     g_free(d);
2711 }
2712 
2713 static void do_nothing(CPUState *cpu, run_on_cpu_data d)
2714 {
2715 }
2716 
2717 static void tcg_log_global_after_sync(MemoryListener *listener)
2718 {
2719     CPUAddressSpace *cpuas;
2720 
2721     /* Wait for the CPU to end the current TB.  This avoids the following
2722      * incorrect race:
2723      *
2724      *      vCPU                         migration
2725      *      ----------------------       -------------------------
2726      *      TLB check -> slow path
2727      *        notdirty_mem_write
2728      *          write to RAM
2729      *          mark dirty
2730      *                                   clear dirty flag
2731      *      TLB check -> fast path
2732      *                                   read memory
2733      *        write to RAM
2734      *
2735      * by pushing the migration thread's memory read after the vCPU thread has
2736      * written the memory.
2737      */
2738     if (replay_mode == REPLAY_MODE_NONE) {
2739         /*
2740          * VGA can make calls to this function while updating the screen.
2741          * In record/replay mode this causes a deadlock, because
2742          * run_on_cpu waits for the rr mutex. No races are possible in
2743          * that case anyway, so there is no need to call run_on_cpu when
2744          * record/replay is enabled.
2745          */
2746         cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2747         run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
2748     }
2749 }
2750 
2751 static void tcg_commit_cpu(CPUState *cpu, run_on_cpu_data data)
2752 {
2753     CPUAddressSpace *cpuas = data.host_ptr;
2754 
2755     cpuas->memory_dispatch = address_space_to_dispatch(cpuas->as);
2756     tlb_flush(cpu);
2757 }
2758 
2759 static void tcg_commit(MemoryListener *listener)
2760 {
2761     CPUAddressSpace *cpuas;
2762     CPUState *cpu;
2763 
2764     assert(tcg_enabled());
2765     /* since each CPU stores ram addresses in its TLB cache, we must
2766        reset the modified entries */
2767     cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2768     cpu = cpuas->cpu;
2769 
2770     /*
2771      * Defer changes to as->memory_dispatch until the cpu is quiescent.
2772      * Otherwise we race between (1) other cpu threads and (2) ongoing
2773      * i/o for the current cpu thread, with data cached by mmu_lookup().
2774      *
2775      * In addition, queueing the work function will kick the cpu back to
2776      * the main loop, which will end the RCU critical section and reclaim
2777      * the memory data structures.
2778      *
2779      * That said, the listener is also called during realize, before
2780      * all of the tcg machinery for run-on is initialized: thus halt_cond.
2781      */
2782     if (cpu->halt_cond) {
2783         async_run_on_cpu(cpu, tcg_commit_cpu, RUN_ON_CPU_HOST_PTR(cpuas));
2784     } else {
2785         tcg_commit_cpu(cpu, RUN_ON_CPU_HOST_PTR(cpuas));
2786     }
2787 }
2788 
2789 static void memory_map_init(void)
2790 {
2791     system_memory = g_malloc(sizeof(*system_memory));
2792 
2793     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2794     address_space_init(&address_space_memory, system_memory, "memory");
2795 
2796     system_io = g_malloc(sizeof(*system_io));
2797     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2798                           65536);
2799     address_space_init(&address_space_io, system_io, "I/O");
2800 }
2801 
2802 MemoryRegion *get_system_memory(void)
2803 {
2804     return system_memory;
2805 }
2806 
2807 MemoryRegion *get_system_io(void)
2808 {
2809     return system_io;
2810 }
2811 
2812 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
2813                                      hwaddr length)
2814 {
2815     uint8_t dirty_log_mask = memory_region_get_dirty_log_mask(mr);
2816     ram_addr_t ramaddr = memory_region_get_ram_addr(mr);
2817 
2818     /* We know we're only called for RAM MemoryRegions */
2819     assert(ramaddr != RAM_ADDR_INVALID);
2820     addr += ramaddr;
2821 
2822     /* No early return if dirty_log_mask is or becomes 0, because
2823      * cpu_physical_memory_set_dirty_range will still call
2824      * xen_modified_memory.
2825      */
2826     if (dirty_log_mask) {
2827         dirty_log_mask =
2828             cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask);
2829     }
2830     if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) {
2831         assert(tcg_enabled());
2832         tb_invalidate_phys_range(addr, addr + length - 1);
2833         dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE);
2834     }
2835     cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask);
2836 }
2837 
2838 void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
2839 {
2840     /*
2841      * In principle this function would work on other memory region types too,
2842      * but the ROM device use case is the only one where this operation is
2843      * necessary.  Other memory regions should use the
2844      * address_space_read/write() APIs.
2845      */
2846     assert(memory_region_is_romd(mr));
2847 
2848     invalidate_and_set_dirty(mr, addr, size);
2849 }
2850 
2851 int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
2852 {
2853     unsigned access_size_max = mr->ops->valid.max_access_size;
2854 
2855     /* Regions are assumed to support 1-4 byte accesses unless
2856        otherwise specified.  */
2857     if (access_size_max == 0) {
2858         access_size_max = 4;
2859     }
2860 
2861     /* Bound the maximum access by the alignment of the address.  */
2862     if (!mr->ops->impl.unaligned) {
2863         unsigned align_size_max = addr & -addr;
2864         if (align_size_max != 0 && align_size_max < access_size_max) {
2865             access_size_max = align_size_max;
2866         }
2867     }
2868 
2869     /* Don't attempt accesses larger than the maximum.  */
2870     if (l > access_size_max) {
2871         l = access_size_max;
2872     }
2873     l = pow2floor(l);
2874 
2875     return l;
2876 }
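/*
 * Worked examples, assuming valid.max_access_size == 4 and no unaligned
 * support: for addr == 0x1000 and l == 8 the alignment allows 4-byte
 * accesses, so 4 is returned; for addr == 0x1006 and l == 8, addr & -addr
 * is 2, so the access is capped at 2 bytes.
 */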
2877 
2878 bool prepare_mmio_access(MemoryRegion *mr)
2879 {
2880     bool release_lock = false;
2881 
2882     if (!bql_locked()) {
2883         bql_lock();
2884         release_lock = true;
2885     }
2886     if (mr->flush_coalesced_mmio) {
2887         qemu_flush_coalesced_mmio_buffer();
2888     }
2889 
2890     return release_lock;
2891 }
2892 
2893 /**
2894  * flatview_access_allowed
2895  * @mr: #MemoryRegion to be accessed
2896  * @attrs: memory transaction attributes
2897  * @addr: address within that memory region
2898  * @len: the number of bytes to access
2899  *
2900  * Check if a memory transaction is allowed.
2901  *
2902  * Returns: true if transaction is allowed, false if denied.
2903  */
2904 static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs,
2905                                     hwaddr addr, hwaddr len)
2906 {
2907     if (likely(!attrs.memory)) {
2908         return true;
2909     }
2910     if (memory_region_is_ram(mr)) {
2911         return true;
2912     }
2913     qemu_log_mask(LOG_INVALID_MEM,
2914                   "Invalid access to non-RAM device at "
2915                   "addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", "
2916                   "region '%s'\n", addr, len, memory_region_name(mr));
2917     return false;
2918 }
2919 
2920 static MemTxResult flatview_write_continue_step(MemTxAttrs attrs,
2921                                                 const uint8_t *buf,
2922                                                 hwaddr len, hwaddr mr_addr,
2923                                                 hwaddr *l, MemoryRegion *mr)
2924 {
2925     if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) {
2926         return MEMTX_ACCESS_ERROR;
2927     }
2928 
2929     if (!memory_access_is_direct(mr, true, attrs)) {
2930         uint64_t val;
2931         MemTxResult result;
2932         bool release_lock = prepare_mmio_access(mr);
2933 
2934         *l = memory_access_size(mr, *l, mr_addr);
2935         /*
2936          * XXX: could force current_cpu to NULL to avoid
2937          * potential bugs
2938          */
2939 
2940         /*
2941          * Assure Coverity (and ourselves) that we are not going to OVERRUN
2942          * the buffer with the following ldn_he_p().
2943          */
2944 #ifdef QEMU_STATIC_ANALYSIS
2945         assert((*l == 1 && len >= 1) ||
2946                (*l == 2 && len >= 2) ||
2947                (*l == 4 && len >= 4) ||
2948                (*l == 8 && len >= 8));
2949 #endif
2950         val = ldn_he_p(buf, *l);
2951         result = memory_region_dispatch_write(mr, mr_addr, val,
2952                                               size_memop(*l), attrs);
2953         if (release_lock) {
2954             bql_unlock();
2955         }
2956 
2957         return result;
2958     } else {
2959         /* RAM case */
2960         uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
2961                                                false, true);
2962 
2963         memmove(ram_ptr, buf, *l);
2964         invalidate_and_set_dirty(mr, mr_addr, *l);
2965 
2966         return MEMTX_OK;
2967     }
2968 }
2969 
2970 /* Called within RCU critical section.  */
2971 static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
2972                                            MemTxAttrs attrs,
2973                                            const void *ptr,
2974                                            hwaddr len, hwaddr mr_addr,
2975                                            hwaddr l, MemoryRegion *mr)
2976 {
2977     MemTxResult result = MEMTX_OK;
2978     const uint8_t *buf = ptr;
2979 
2980     for (;;) {
2981         result |= flatview_write_continue_step(attrs, buf, len, mr_addr, &l,
2982                                                mr);
2983 
2984         len -= l;
2985         buf += l;
2986         addr += l;
2987 
2988         if (!len) {
2989             break;
2990         }
2991 
2992         l = len;
2993         mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs);
2994     }
2995 
2996     return result;
2997 }
2998 
2999 /* Called from RCU critical section.  */
3000 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
3001                                   const void *buf, hwaddr len)
3002 {
3003     hwaddr l;
3004     hwaddr mr_addr;
3005     MemoryRegion *mr;
3006 
3007     l = len;
3008     mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs);
3009     if (!flatview_access_allowed(mr, attrs, addr, len)) {
3010         return MEMTX_ACCESS_ERROR;
3011     }
3012     return flatview_write_continue(fv, addr, attrs, buf, len,
3013                                    mr_addr, l, mr);
3014 }
3015 
3016 static MemTxResult flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf,
3017                                                hwaddr len, hwaddr mr_addr,
3018                                                hwaddr *l,
3019                                                MemoryRegion *mr)
3020 {
3021     if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) {
3022         return MEMTX_ACCESS_ERROR;
3023     }
3024 
3025     if (!memory_access_is_direct(mr, false, attrs)) {
3026         /* I/O case */
3027         uint64_t val;
3028         MemTxResult result;
3029         bool release_lock = prepare_mmio_access(mr);
3030 
3031         *l = memory_access_size(mr, *l, mr_addr);
3032         result = memory_region_dispatch_read(mr, mr_addr, &val, size_memop(*l),
3033                                              attrs);
3034 
3035         /*
3036          * Assure Coverity (and ourselves) that we are not going to OVERRUN
3037          * the buffer with the following stn_he_p().
3038          */
3039 #ifdef QEMU_STATIC_ANALYSIS
3040         assert((*l == 1 && len >= 1) ||
3041                (*l == 2 && len >= 2) ||
3042                (*l == 4 && len >= 4) ||
3043                (*l == 8 && len >= 8));
3044 #endif
3045         stn_he_p(buf, *l, val);
3046 
3047         if (release_lock) {
3048             bql_unlock();
3049         }
3050         return result;
3051     } else {
3052         /* RAM case */
3053         uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
3054                                                false, false);
3055 
3056         memcpy(buf, ram_ptr, *l);
3057 
3058         return MEMTX_OK;
3059     }
3060 }
3061 
3062 /* Called within RCU critical section.  */
3063 MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
3064                                    MemTxAttrs attrs, void *ptr,
3065                                    hwaddr len, hwaddr mr_addr, hwaddr l,
3066                                    MemoryRegion *mr)
3067 {
3068     MemTxResult result = MEMTX_OK;
3069     uint8_t *buf = ptr;
3070 
3071     fuzz_dma_read_cb(addr, len, mr);
3072     for (;;) {
3073         result |= flatview_read_continue_step(attrs, buf, len, mr_addr, &l, mr);
3074 
3075         len -= l;
3076         buf += l;
3077         addr += l;
3078 
3079         if (!len) {
3080             break;
3081         }
3082 
3083         l = len;
3084         mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs);
3085     }
3086 
3087     return result;
3088 }
3089 
3090 /* Called from RCU critical section.  */
3091 static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
3092                                  MemTxAttrs attrs, void *buf, hwaddr len)
3093 {
3094     hwaddr l;
3095     hwaddr mr_addr;
3096     MemoryRegion *mr;
3097 
3098     l = len;
3099     mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs);
3100     if (!flatview_access_allowed(mr, attrs, addr, len)) {
3101         return MEMTX_ACCESS_ERROR;
3102     }
3103     return flatview_read_continue(fv, addr, attrs, buf, len,
3104                                   mr_addr, l, mr);
3105 }
3106 
3107 MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
3108                                     MemTxAttrs attrs, void *buf, hwaddr len)
3109 {
3110     MemTxResult result = MEMTX_OK;
3111     FlatView *fv;
3112 
3113     if (len > 0) {
3114         RCU_READ_LOCK_GUARD();
3115         fv = address_space_to_flatview(as);
3116         result = flatview_read(fv, addr, attrs, buf, len);
3117     }
3118 
3119     return result;
3120 }
3121 
3122 MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
3123                                 MemTxAttrs attrs,
3124                                 const void *buf, hwaddr len)
3125 {
3126     MemTxResult result = MEMTX_OK;
3127     FlatView *fv;
3128 
3129     if (len > 0) {
3130         RCU_READ_LOCK_GUARD();
3131         fv = address_space_to_flatview(as);
3132         result = flatview_write(fv, addr, attrs, buf, len);
3133     }
3134 
3135     return result;
3136 }
3137 
3138 MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
3139                              void *buf, hwaddr len, bool is_write)
3140 {
3141     if (is_write) {
3142         return address_space_write(as, addr, attrs, buf, len);
3143     } else {
3144         return address_space_read_full(as, addr, attrs, buf, len);
3145     }
3146 }
3147 
3148 MemTxResult address_space_set(AddressSpace *as, hwaddr addr,
3149                               uint8_t c, hwaddr len, MemTxAttrs attrs)
3150 {
3151 #define FILLBUF_SIZE 512
3152     uint8_t fillbuf[FILLBUF_SIZE];
3153     int l;
3154     MemTxResult error = MEMTX_OK;
3155 
3156     memset(fillbuf, c, FILLBUF_SIZE);
3157     while (len > 0) {
3158         l = len < FILLBUF_SIZE ? len : FILLBUF_SIZE;
3159         error |= address_space_write(as, addr, attrs, fillbuf, l);
3160         len -= l;
3161         addr += l;
3162     }
3163 
3164     return error;
3165 }
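/*
 * Illustrative sketch (address and length are hypothetical): zero-fill a
 * guest physical range, e.g. when a device clears a buffer on reset:
 *
 *     address_space_set(&address_space_memory, gpa, 0, 4096,
 *                       MEMTXATTRS_UNSPECIFIED);
 */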
3166 
3167 void cpu_physical_memory_rw(hwaddr addr, void *buf,
3168                             hwaddr len, bool is_write)
3169 {
3170     address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
3171                      buf, len, is_write);
3172 }
3173 
3174 enum write_rom_type {
3175     WRITE_DATA,
3176     FLUSH_CACHE,
3177 };
3178 
3179 static inline MemTxResult address_space_write_rom_internal(AddressSpace *as,
3180                                                            hwaddr addr,
3181                                                            MemTxAttrs attrs,
3182                                                            const void *ptr,
3183                                                            hwaddr len,
3184                                                            enum write_rom_type type)
3185 {
3186     hwaddr l;
3187     uint8_t *ram_ptr;
3188     hwaddr addr1;
3189     MemoryRegion *mr;
3190     const uint8_t *buf = ptr;
3191 
3192     RCU_READ_LOCK_GUARD();
3193     while (len > 0) {
3194         l = len;
3195         mr = address_space_translate(as, addr, &addr1, &l, true, attrs);
3196 
3197         if (!memory_region_supports_direct_access(mr)) {
3198             l = memory_access_size(mr, l, addr1);
3199         } else {
3200             /* ROM/RAM case */
3201             ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
3202             switch (type) {
3203             case WRITE_DATA:
3204                 memcpy(ram_ptr, buf, l);
3205                 invalidate_and_set_dirty(mr, addr1, l);
3206                 break;
3207             case FLUSH_CACHE:
3208                 flush_idcache_range((uintptr_t)ram_ptr, (uintptr_t)ram_ptr, l);
3209                 break;
3210             }
3211         }
3212         len -= l;
3213         buf += l;
3214         addr += l;
3215     }
3216     return MEMTX_OK;
3217 }
3218 
3219 /* Used for ROM loading: can write to both RAM and ROM. */
3220 MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
3221                                     MemTxAttrs attrs,
3222                                     const void *buf, hwaddr len)
3223 {
3224     return address_space_write_rom_internal(as, addr, attrs,
3225                                             buf, len, WRITE_DATA);
3226 }
3227 
3228 void cpu_flush_icache_range(hwaddr start, hwaddr len)
3229 {
3230     /*
3231      * This function should do the same thing as an icache flush that was
3232      * triggered from within the guest. For TCG we are always cache coherent,
3233      * so there is no need to flush anything. For KVM / Xen we need to flush
3234      * the host's instruction cache at least.
3235      */
3236     if (tcg_enabled()) {
3237         return;
3238     }
3239 
3240     address_space_write_rom_internal(&address_space_memory,
3241                                      start, MEMTXATTRS_UNSPECIFIED,
3242                                      NULL, len, FLUSH_CACHE);
3243 }
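
/*
 * A minimal sketch combining the two helpers above: write instruction bytes
 * through address_space_write_rom() so that ROMD regions are reachable, then
 * flush the host instruction cache for KVM/Xen. The function name and the
 * notion of "patching" are assumptions for the example; nothing calls it.
 */
static void G_GNUC_UNUSED example_patch_guest_code(hwaddr gpa,
                                                   const void *insns,
                                                   hwaddr size)
{
    if (address_space_write_rom(&address_space_memory, gpa,
                                MEMTXATTRS_UNSPECIFIED, insns,
                                size) != MEMTX_OK) {
        error_report("example: code patch at 0x%" HWADDR_PRIx " failed", gpa);
        return;
    }
    /*
     * No-op under TCG (see the comment above); flushes the host icache
     * for hardware accelerators.
     */
    cpu_flush_icache_range(gpa, size);
}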
3244 
3245 /*
3246  * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
3247  * to detect illegal pointers passed to address_space_unmap.
3248  */
3249 #define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
3250 
3251 typedef struct {
3252     uint64_t magic;
3253     MemoryRegion *mr;
3254     hwaddr addr;
3255     size_t len;
3256     uint8_t buffer[];
3257 } BounceBuffer;
3258 
3259 static void
3260 address_space_unregister_map_client_do(AddressSpaceMapClient *client)
3261 {
3262     QLIST_REMOVE(client, link);
3263     g_free(client);
3264 }
3265 
3266 static void address_space_notify_map_clients_locked(AddressSpace *as)
3267 {
3268     AddressSpaceMapClient *client;
3269 
3270     while (!QLIST_EMPTY(&as->map_client_list)) {
3271         client = QLIST_FIRST(&as->map_client_list);
3272         qemu_bh_schedule(client->bh);
3273         address_space_unregister_map_client_do(client);
3274     }
3275 }
3276 
3277 void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
3278 {
3279     AddressSpaceMapClient *client = g_malloc(sizeof(*client));
3280 
3281     QEMU_LOCK_GUARD(&as->map_client_list_lock);
3282     client->bh = bh;
3283     QLIST_INSERT_HEAD(&as->map_client_list, client, link);
3284     /* Write map_client_list before reading bounce_buffer_size. */
3285     smp_mb();
3286     if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
3287         address_space_notify_map_clients_locked(as);
3288     }
3289 }
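
/*
 * A minimal sketch of the map-client mechanism above: a caller whose
 * address_space_map() attempt failed for lack of bounce-buffer space can
 * register a bottom half that fires once space is released. The callback,
 * its opaque pointer and the outer helper are hypothetical; the BH would
 * eventually be freed with qemu_bh_delete() by its owner.
 */
static void example_map_retry_cb(void *opaque)
{
    /* Re-issue the deferred address_space_map() attempt here. */
}

static void G_GNUC_UNUSED example_wait_for_bounce_space(AddressSpace *as,
                                                        void *opaque)
{
    QEMUBH *bh = qemu_bh_new(example_map_retry_cb, opaque);

    /* The BH is scheduled (and unregistered) once space becomes available. */
    address_space_register_map_client(as, bh);
}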
3290 
3291 void cpu_exec_init_all(void)
3292 {
3293     qemu_mutex_init(&ram_list.mutex);
3294     /* The data structures we set up here depend on knowing the page size,
3295      * so no more changes can be made after this point.
3296      * In an ideal world, nothing we did before we had finished the
3297      * machine setup would care about the target page size, and we could
3298      * do this much later, rather than requiring board models to state
3299      * up front what their requirements are.
3300      */
3301     finalize_target_page_bits();
3302     io_mem_init();
3303     memory_map_init();
3304 }
3305 
3306 void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh)
3307 {
3308     AddressSpaceMapClient *client;
3309 
3310     QEMU_LOCK_GUARD(&as->map_client_list_lock);
3311     QLIST_FOREACH(client, &as->map_client_list, link) {
3312         if (client->bh == bh) {
3313             address_space_unregister_map_client_do(client);
3314             break;
3315         }
3316     }
3317 }
3318 
3319 static void address_space_notify_map_clients(AddressSpace *as)
3320 {
3321     QEMU_LOCK_GUARD(&as->map_client_list_lock);
3322     address_space_notify_map_clients_locked(as);
3323 }
3324 
3325 static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
3326                                   bool is_write, MemTxAttrs attrs)
3327 {
3328     MemoryRegion *mr;
3329     hwaddr l, xlat;
3330 
3331     while (len > 0) {
3332         l = len;
3333         mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3334         if (!memory_access_is_direct(mr, is_write, attrs)) {
3335             l = memory_access_size(mr, l, addr);
3336             if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) {
3337                 return false;
3338             }
3339         }
3340 
3341         len -= l;
3342         addr += l;
3343     }
3344     return true;
3345 }
3346 
3347 bool address_space_access_valid(AddressSpace *as, hwaddr addr,
3348                                 hwaddr len, bool is_write,
3349                                 MemTxAttrs attrs)
3350 {
3351     FlatView *fv;
3352 
3353     RCU_READ_LOCK_GUARD();
3354     fv = address_space_to_flatview(as);
3355     return flatview_access_valid(fv, addr, len, is_write, attrs);
3356 }
3357 
3358 static hwaddr
3359 flatview_extend_translation(FlatView *fv, hwaddr addr,
3360                             hwaddr target_len,
3361                             MemoryRegion *mr, hwaddr base, hwaddr len,
3362                             bool is_write, MemTxAttrs attrs)
3363 {
3364     hwaddr done = 0;
3365     hwaddr xlat;
3366     MemoryRegion *this_mr;
3367 
3368     for (;;) {
3369         target_len -= len;
3370         addr += len;
3371         done += len;
3372         if (target_len == 0) {
3373             return done;
3374         }
3375 
3376         len = target_len;
3377         this_mr = flatview_translate(fv, addr, &xlat,
3378                                      &len, is_write, attrs);
3379         if (this_mr != mr || xlat != base + done) {
3380             return done;
3381         }
3382     }
3383 }
3384 
3385 /* Map a physical memory region into a host virtual address.
3386  * May map a subset of the requested range, given by and returned in *plen.
3387  * May return NULL if resources needed to perform the mapping are exhausted.
3388  * Use only for reads OR writes - not for read-modify-write operations.
3389  * Use address_space_register_map_client() to know when retrying the map
3390  * operation is likely to succeed.
3391  */
3392 void *address_space_map(AddressSpace *as,
3393                         hwaddr addr,
3394                         hwaddr *plen,
3395                         bool is_write,
3396                         MemTxAttrs attrs)
3397 {
3398     hwaddr len = *plen;
3399     hwaddr l, xlat;
3400     MemoryRegion *mr;
3401     FlatView *fv;
3402 
3403     trace_address_space_map(as, addr, len, is_write, *(uint32_t *) &attrs);
3404 
3405     if (len == 0) {
3406         return NULL;
3407     }
3408 
3409     l = len;
3410     RCU_READ_LOCK_GUARD();
3411     fv = address_space_to_flatview(as);
3412     mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
3413 
3414     if (!memory_access_is_direct(mr, is_write, attrs)) {
3415         size_t used = qatomic_read(&as->bounce_buffer_size);
3416         for (;;) {
3417             hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l);
3418             size_t new_size = used + alloc;
3419             size_t actual =
3420                 qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size);
3421             if (actual == used) {
3422                 l = alloc;
3423                 break;
3424             }
3425             used = actual;
3426         }
3427 
3428         if (l == 0) {
3429             *plen = 0;
3430             return NULL;
3431         }
3432 
3433         BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
3434         bounce->magic = BOUNCE_BUFFER_MAGIC;
3435         memory_region_ref(mr);
3436         bounce->mr = mr;
3437         bounce->addr = addr;
3438         bounce->len = l;
3439 
3440         if (!is_write) {
3441             flatview_read(fv, addr, attrs,
3442                           bounce->buffer, l);
3443         }
3444 
3445         *plen = l;
3446         return bounce->buffer;
3447     }
3448 
3449     memory_region_ref(mr);
3450     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
3451                                         l, is_write, attrs);
3452     fuzz_dma_read_cb(addr, *plen, mr);
3453     return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true, is_write);
3454 }
3455 
3456 /* Unmaps a memory region previously mapped by address_space_map().
3457  * Will also mark the memory as dirty if is_write is true.  access_len gives
3458  * the amount of memory that was actually read or written by the caller.
3459  */
3460 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
3461                          bool is_write, hwaddr access_len)
3462 {
3463     MemoryRegion *mr;
3464     ram_addr_t addr1;
3465 
3466     mr = memory_region_from_host(buffer, &addr1);
3467     if (mr != NULL) {
3468         if (is_write) {
3469             invalidate_and_set_dirty(mr, addr1, access_len);
3470         }
3471         if (xen_enabled()) {
3472             xen_invalidate_map_cache_entry(buffer);
3473         }
3474         memory_region_unref(mr);
3475         return;
3476     }
3477 
3478 
3479     BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
3480     assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
3481 
3482     if (is_write) {
3483         address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
3484                             bounce->buffer, access_len);
3485     }
3486 
3487     qatomic_sub(&as->bounce_buffer_size, bounce->len);
3488     bounce->magic = ~BOUNCE_BUFFER_MAGIC;
3489     memory_region_unref(bounce->mr);
3490     g_free(bounce);
3491     /* Write bounce_buffer_size before reading map_client_list. */
3492     smp_mb();
3493     address_space_notify_map_clients(as);
3494 }
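
/*
 * A minimal sketch of the canonical map/unmap pairing for a DMA-style read.
 * address_space_map() may return a shorter mapping than requested, or NULL
 * when bounce-buffer space is exhausted, so real callers loop or defer via
 * the map-client API. The function name is hypothetical; nothing calls it.
 */
static void G_GNUC_UNUSED example_dma_peek(AddressSpace *as, hwaddr gpa,
                                           hwaddr size)
{
    hwaddr plen = size;
    void *host = address_space_map(as, gpa, &plen, false,
                                   MEMTXATTRS_UNSPECIFIED);

    if (!host) {
        /* Out of resources: see address_space_register_map_client(). */
        return;
    }

    /* ... consume up to plen bytes at 'host' here ... */

    /* is_write == false, so nothing is dirtied; plen bytes were accessed. */
    address_space_unmap(as, host, plen, false, plen);
}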
3495 
3496 void *cpu_physical_memory_map(hwaddr addr,
3497                               hwaddr *plen,
3498                               bool is_write)
3499 {
3500     return address_space_map(&address_space_memory, addr, plen, is_write,
3501                              MEMTXATTRS_UNSPECIFIED);
3502 }
3503 
3504 void cpu_physical_memory_unmap(void *buffer, hwaddr len,
3505                                bool is_write, hwaddr access_len)
3506 {
3507     return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len);
3508 }
3509 
3510 #define ARG1_DECL                AddressSpace *as
3511 #define ARG1                     as
3512 #define SUFFIX
3513 #define TRANSLATE(...)           address_space_translate(as, __VA_ARGS__)
3514 #define RCU_READ_LOCK(...)       rcu_read_lock()
3515 #define RCU_READ_UNLOCK(...)     rcu_read_unlock()
3516 #include "memory_ldst.c.inc"
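
/*
 * The include above instantiates the fixed-size helpers declared in
 * exec/memory.h (address_space_ldub(), address_space_stl_le(), ...) for a
 * plain AddressSpace argument. A minimal sketch of their use follows; the
 * function name is hypothetical and nothing in this file calls it.
 */
static uint32_t G_GNUC_UNUSED example_bump_le_counter(AddressSpace *as,
                                                      hwaddr gpa)
{
    MemTxResult res;
    uint32_t val;

    /* 32-bit little-endian load, then store back val + 1 on success. */
    val = address_space_ldl_le(as, gpa, MEMTXATTRS_UNSPECIFIED, &res);
    if (res == MEMTX_OK) {
        address_space_stl_le(as, gpa, val + 1, MEMTXATTRS_UNSPECIFIED, &res);
    }
    return val;
}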
3517 
3518 int64_t address_space_cache_init(MemoryRegionCache *cache,
3519                                  AddressSpace *as,
3520                                  hwaddr addr,
3521                                  hwaddr len,
3522                                  bool is_write)
3523 {
3524     AddressSpaceDispatch *d;
3525     hwaddr l;
3526     MemoryRegion *mr;
3527     Int128 diff;
3528 
3529     assert(len > 0);
3530 
3531     l = len;
3532     cache->fv = address_space_get_flatview(as);
3533     d = flatview_to_dispatch(cache->fv);
3534     cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
3535 
3536     /*
3537      * cache->xlat is now relative to cache->mrs.mr, not to the section itself.
3538      * Take that into account to compute how many bytes are there between
3539      * cache->xlat and the end of the section.
3540      */
3541     diff = int128_sub(cache->mrs.size,
3542                       int128_make64(cache->xlat - cache->mrs.offset_within_region));
3543     l = int128_get64(int128_min(diff, int128_make64(l)));
3544 
3545     mr = cache->mrs.mr;
3546     memory_region_ref(mr);
3547     if (memory_access_is_direct(mr, is_write, MEMTXATTRS_UNSPECIFIED)) {
3548         /* We don't care about the memory attributes here as we're only
3549          * doing this if we found actual RAM, which behaves the same
3550          * regardless of attributes; so UNSPECIFIED is fine.
3551          */
3552         l = flatview_extend_translation(cache->fv, addr, len, mr,
3553                                         cache->xlat, l, is_write,
3554                                         MEMTXATTRS_UNSPECIFIED);
3555         cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true,
3556                                          is_write);
3557     } else {
3558         cache->ptr = NULL;
3559     }
3560 
3561     cache->len = l;
3562     cache->is_write = is_write;
3563     return l;
3564 }
3565 
3566 void address_space_cache_invalidate(MemoryRegionCache *cache,
3567                                     hwaddr addr,
3568                                     hwaddr access_len)
3569 {
3570     assert(cache->is_write);
3571     if (likely(cache->ptr)) {
3572         invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
3573     }
3574 }
3575 
3576 void address_space_cache_destroy(MemoryRegionCache *cache)
3577 {
3578     if (!cache->mrs.mr) {
3579         return;
3580     }
3581 
3582     if (xen_enabled()) {
3583         xen_invalidate_map_cache_entry(cache->ptr);
3584     }
3585     memory_region_unref(cache->mrs.mr);
3586     flatview_unref(cache->fv);
3587     cache->mrs.mr = NULL;
3588     cache->fv = NULL;
3589 }
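
/*
 * A minimal sketch of the MemoryRegionCache API above, in the spirit of
 * virtio-style ring processing: translate a small window once, then issue
 * repeated accesses against the cached translation. The function name and
 * the 64-byte window are assumptions; nothing in this file calls it.
 */
static void G_GNUC_UNUSED example_cached_reads(AddressSpace *as, hwaddr base)
{
    MemoryRegionCache cache = MEMORY_REGION_CACHE_INVALID;
    uint16_t idx;

    if (address_space_cache_init(&cache, as, base, 64, false) < 64) {
        /* Window was truncated (e.g. it crossed a region boundary). */
        address_space_cache_destroy(&cache);
        return;
    }

    /* Repeated accesses reuse the cached translation (fast path for RAM). */
    address_space_read_cached(&cache, 0, &idx, sizeof(idx));

    address_space_cache_destroy(&cache);
}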
3590 
3591 /* Called from RCU critical section.  This function has the same
3592  * semantics as address_space_translate, but it only works on a
3593  * predefined range of a MemoryRegion that was mapped with
3594  * address_space_cache_init.
3595  */
3596 static inline MemoryRegion *address_space_translate_cached(
3597     MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
3598     hwaddr *plen, bool is_write, MemTxAttrs attrs)
3599 {
3600     MemoryRegionSection section;
3601     MemoryRegion *mr;
3602     IOMMUMemoryRegion *iommu_mr;
3603     AddressSpace *target_as;
3604 
3605     assert(!cache->ptr);
3606     *xlat = addr + cache->xlat;
3607 
3608     mr = cache->mrs.mr;
3609     iommu_mr = memory_region_get_iommu(mr);
3610     if (!iommu_mr) {
3611         /* MMIO region.  */
3612         return mr;
3613     }
3614 
3615     section = address_space_translate_iommu(iommu_mr, xlat, plen,
3616                                             NULL, is_write, true,
3617                                             &target_as, attrs);
3618     return section.mr;
3619 }
3620 
3621 /* Called within RCU critical section.  */
3622 static MemTxResult address_space_write_continue_cached(MemTxAttrs attrs,
3623                                                        const void *ptr,
3624                                                        hwaddr len,
3625                                                        hwaddr mr_addr,
3626                                                        hwaddr l,
3627                                                        MemoryRegion *mr)
3628 {
3629     MemTxResult result = MEMTX_OK;
3630     const uint8_t *buf = ptr;
3631 
3632     for (;;) {
3633         result |= flatview_write_continue_step(attrs, buf, len, mr_addr, &l,
3634                                                mr);
3635 
3636         len -= l;
3637         buf += l;
3638         mr_addr += l;
3639 
3640         if (!len) {
3641             break;
3642         }
3643 
3644         l = len;
3645     }
3646 
3647     return result;
3648 }
3649 
3650 /* Called within RCU critical section.  */
3651 static MemTxResult address_space_read_continue_cached(MemTxAttrs attrs,
3652                                                       void *ptr, hwaddr len,
3653                                                       hwaddr mr_addr, hwaddr l,
3654                                                       MemoryRegion *mr)
3655 {
3656     MemTxResult result = MEMTX_OK;
3657     uint8_t *buf = ptr;
3658 
3659     for (;;) {
3660         result |= flatview_read_continue_step(attrs, buf, len, mr_addr, &l, mr);
3661         len -= l;
3662         buf += l;
3663         mr_addr += l;
3664 
3665         if (!len) {
3666             break;
3667         }
3668         l = len;
3669     }
3670 
3671     return result;
3672 }
3673 
3674 /* Called from RCU critical section. address_space_read_cached uses this
3675  * out of line function when the target is an MMIO or IOMMU region.
3676  */
3677 MemTxResult
3678 address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3679                                    void *buf, hwaddr len)
3680 {
3681     hwaddr mr_addr, l;
3682     MemoryRegion *mr;
3683 
3684     l = len;
3685     mr = address_space_translate_cached(cache, addr, &mr_addr, &l, false,
3686                                         MEMTXATTRS_UNSPECIFIED);
3687     return address_space_read_continue_cached(MEMTXATTRS_UNSPECIFIED,
3688                                               buf, len, mr_addr, l, mr);
3689 }
3690 
3691 /* Called from RCU critical section. address_space_write_cached uses this
3692  * out of line function when the target is an MMIO or IOMMU region.
3693  */
3694 MemTxResult
3695 address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
3696                                     const void *buf, hwaddr len)
3697 {
3698     hwaddr mr_addr, l;
3699     MemoryRegion *mr;
3700 
3701     l = len;
3702     mr = address_space_translate_cached(cache, addr, &mr_addr, &l, true,
3703                                         MEMTXATTRS_UNSPECIFIED);
3704     return address_space_write_continue_cached(MEMTXATTRS_UNSPECIFIED,
3705                                                buf, len, mr_addr, l, mr);
3706 }
3707 
3708 #define ARG1_DECL                MemoryRegionCache *cache
3709 #define ARG1                     cache
3710 #define SUFFIX                   _cached_slow
3711 #define TRANSLATE(...)           address_space_translate_cached(cache, __VA_ARGS__)
3712 #define RCU_READ_LOCK()          ((void)0)
3713 #define RCU_READ_UNLOCK()        ((void)0)
3714 #include "memory_ldst.c.inc"
3715 
3716 /* virtual memory access for debug (includes writing to ROM) */
3717 int cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
3718                         void *ptr, size_t len, bool is_write)
3719 {
3720     hwaddr phys_addr;
3721     vaddr l, page;
3722     uint8_t *buf = ptr;
3723 
3724     cpu_synchronize_state(cpu);
3725     while (len > 0) {
3726         int asidx;
3727         MemTxAttrs attrs;
3728         MemTxResult res;
3729 
3730         page = addr & TARGET_PAGE_MASK;
3731         phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
3732         asidx = cpu_asidx_from_attrs(cpu, attrs);
3733         /* if no physical page mapped, return an error */
3734         if (phys_addr == -1) {
3735             return -1;
3736         }
3737         l = (page + TARGET_PAGE_SIZE) - addr;
3738         l = MIN(l, len);
3739         phys_addr += (addr & ~TARGET_PAGE_MASK);
3740         res = address_space_rw(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf,
3741                                l, is_write);
3742         if (res != MEMTX_OK) {
3743             return -1;
3744         }
3745         len -= l;
3746         buf += l;
3747         addr += l;
3748     }
3749     return 0;
3750 }
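
/*
 * A minimal sketch of cpu_memory_rw_debug() above: read guest-virtual memory
 * the way a monitor or gdbstub front end would, one page-sized chunk at a
 * time behind the scenes. The function name is hypothetical; nothing in this
 * file calls it.
 */
static void G_GNUC_UNUSED example_peek_guest_vaddr(CPUState *cpu, vaddr va)
{
    uint8_t bytes[8];

    /*
     * is_write == false: read; an unmapped page anywhere in the range
     * makes the whole call fail with -1.
     */
    if (cpu_memory_rw_debug(cpu, va, bytes, sizeof(bytes), false) < 0) {
        error_report("example: no mapping at guest vaddr 0x%" PRIx64,
                     (uint64_t)va);
        return;
    }
    /* ... hand 'bytes' to the debugger/monitor front end ... */
}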
3751 
3752 bool cpu_physical_memory_is_io(hwaddr phys_addr)
3753 {
3754     MemoryRegion *mr;
3755     hwaddr l = 1;
3756 
3757     RCU_READ_LOCK_GUARD();
3758     mr = address_space_translate(&address_space_memory,
3759                                  phys_addr, &phys_addr, &l, false,
3760                                  MEMTXATTRS_UNSPECIFIED);
3761 
3762     return !(memory_region_is_ram(mr) || memory_region_is_romd(mr));
3763 }
3764 
3765 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
3766 {
3767     RAMBlock *block;
3768     int ret = 0;
3769 
3770     RCU_READ_LOCK_GUARD();
3771     RAMBLOCK_FOREACH(block) {
3772         ret = func(block, opaque);
3773         if (ret) {
3774             break;
3775         }
3776     }
3777     return ret;
3778 }
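
/*
 * A minimal sketch of a RAMBlockIterFunc for qemu_ram_foreach_block() above:
 * count the RAM blocks; returning non-zero from the callback would stop the
 * walk early. Both function names are hypothetical; nothing calls them.
 */
static int example_count_one_block(RAMBlock *rb, void *opaque)
{
    unsigned *count = opaque;

    (*count)++;
    return 0;   /* keep iterating */
}

static unsigned G_GNUC_UNUSED example_count_ram_blocks(void)
{
    unsigned count = 0;

    qemu_ram_foreach_block(example_count_one_block, &count);
    return count;
}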
3779 
3780 /*
3781  * Unmap pages of memory from start to start+length such that
3782  * they (a) read as zero and (b) trigger whatever fault mechanism
3783  * the OS provides for postcopy.
3784  * The pages must be unmapped by the end of the function.
3785  * Returns: 0 on success, non-zero on failure.
3786  *
3787  */
3788 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3789 {
3790     int ret = -1;
3791 
3792     uint8_t *host_startaddr = rb->host + start;
3793 
3794     if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
3795         error_report("%s: Unaligned start address: %p",
3796                      __func__, host_startaddr);
3797         goto err;
3798     }
3799 
3800     if ((start + length) <= rb->max_length) {
3801         bool need_madvise, need_fallocate;
3802         if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
3803             error_report("%s: Unaligned length: %zx", __func__, length);
3804             goto err;
3805         }
3806 
3807         errno = ENOTSUP; /* If we are missing MADVISE etc */
3808 
3809         /* The logic here is messy;
3810          *    madvise DONTNEED fails for hugepages
3811          *    fallocate works on hugepages and shmem
3812          *    shared anonymous memory requires madvise REMOVE
3813          */
3814         need_madvise = (rb->page_size == qemu_real_host_page_size());
3815         need_fallocate = rb->fd != -1;
3816         if (need_fallocate) {
3817             /* For a file, this causes the area of the file to be zero'd
3818             /* For a file, this causes the area of the file to be zeroed
3819              * so a userfault will trigger.
3820              */
3821 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3822             /*
3823              * fallocate() will fail with readonly files. Let's print a
3824              * proper error message.
3825              */
3826             if (rb->flags & RAM_READONLY_FD) {
3827                 error_report("%s: Discarding RAM with readonly files is not"
3828                              " supported", __func__);
3829                 goto err;
3830 
3831             }
3832             /*
3833              * We'll discard data from the actual file, even though we only
3834              * have a MAP_PRIVATE mapping, possibly messing with other
3835              * MAP_PRIVATE/MAP_SHARED mappings. There is no easy way to
3836              * change that behavior whithout violating the promised
3837              * change that behavior without violating the promised
3838              *
3839              * Only warn, because it works as long as nobody else uses that
3840              * file.
3841              */
3842             if (!qemu_ram_is_shared(rb)) {
3843                 warn_report_once("%s: Discarding RAM"
3844                                  " in private file mappings is possibly"
3845                                  " dangerous, because it will modify the"
3846                                  " underlying file and will affect other"
3847                                  " users of the file", __func__);
3848             }
3849 
3850             ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3851                             start + rb->fd_offset, length);
3852             if (ret) {
3853                 ret = -errno;
3854                 error_report("%s: Failed to fallocate %s:%" PRIx64 "+%" PRIx64
3855                              " +%zx (%d)", __func__, rb->idstr, start,
3856                              rb->fd_offset, length, ret);
3857                 goto err;
3858             }
3859 #else
3860             ret = -ENOSYS;
3861             error_report("%s: fallocate not available/file "
3862                          "%s:%" PRIx64 "+%" PRIx64 " +%zx (%d)", __func__,
3863                          rb->idstr, start, rb->fd_offset, length, ret);
3864             goto err;
3865 #endif
3866         }
3867         if (need_madvise) {
3868             /* For normal RAM this causes it to be unmapped,
3869              * for shared memory it causes the local mapping to disappear
3870              * and to fall back on the file contents (which we just
3871              * fallocate'd away).
3872              */
3873 #if defined(CONFIG_MADVISE)
3874             if (qemu_ram_is_shared(rb) && rb->fd < 0) {
3875                 ret = madvise(host_startaddr, length, QEMU_MADV_REMOVE);
3876             } else {
3877                 ret = madvise(host_startaddr, length, QEMU_MADV_DONTNEED);
3878             }
3879             if (ret) {
3880                 ret = -errno;
3881                 error_report("%s: Failed to discard range "
3882                              "%s:%" PRIx64 " +%zx (%d)",
3883                              __func__, rb->idstr, start, length, ret);
3884                 goto err;
3885             }
3886 #else
3887             ret = -ENOSYS;
3888             error_report("%s: MADVISE not available %s:%" PRIx64 " +%zx (%d)",
3889                          __func__, rb->idstr, start, length, ret);
3890             goto err;
3891 #endif
3892         }
3893         trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
3894                                       need_madvise, need_fallocate, ret);
3895     } else {
3896         error_report("%s: Overrun block '%s' (%" PRIu64 "/%zx/" RAM_ADDR_FMT")",
3897                      __func__, rb->idstr, start, length, rb->max_length);
3898     }
3899 
3900 err:
3901     return ret;
3902 }
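
/*
 * A minimal sketch of ram_block_discard_range() above: drop one host page of
 * a RAMBlock so that it reads back as zero, roughly what postcopy does for
 * pages it is about to re-receive. Offsets must be aligned to the block's
 * page size. The function name is hypothetical; nothing in this file calls
 * it.
 */
static void G_GNUC_UNUSED example_discard_one_page(RAMBlock *rb,
                                                   uint64_t offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t start = QEMU_ALIGN_DOWN(offset, pagesize);

    if (ram_block_discard_range(rb, start, pagesize)) {
        error_report("example: discard of '%s' +0x%" PRIx64 " failed",
                     qemu_ram_get_idstr(rb), start);
    }
}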
3903 
3904 int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t start,
3905                                         size_t length)
3906 {
3907     int ret = -1;
3908 
3909 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3910     /* ignore fd_offset with guest_memfd */
3911     ret = fallocate(rb->guest_memfd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3912                     start, length);
3913 
3914     if (ret) {
3915         ret = -errno;
3916         error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)",
3917                      __func__, rb->idstr, start, length, ret);
3918     }
3919 #else
3920     ret = -ENOSYS;
3921     error_report("%s: fallocate not available %s:%" PRIx64 " +%zx (%d)",
3922                  __func__, rb->idstr, start, length, ret);
3923 #endif
3924 
3925     return ret;
3926 }
3927 
3928 bool ramblock_is_pmem(RAMBlock *rb)
3929 {
3930     return rb->flags & RAM_PMEM;
3931 }
3932 
3933 static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
3934 {
3935     if (start == end - 1) {
3936         qemu_printf("\t%3d      ", start);
3937     } else {
3938         qemu_printf("\t%3d..%-3d ", start, end - 1);
3939     }
3940     qemu_printf(" skip=%d ", skip);
3941     if (ptr == PHYS_MAP_NODE_NIL) {
3942         qemu_printf(" ptr=NIL");
3943     } else if (!skip) {
3944         qemu_printf(" ptr=#%d", ptr);
3945     } else {
3946         qemu_printf(" ptr=[%d]", ptr);
3947     }
3948     qemu_printf("\n");
3949 }
3950 
3951 #define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
3952                            int128_sub((size), int128_one())) : 0)
3953 
3954 void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
3955 {
3956     int i;
3957 
3958     qemu_printf("  Dispatch\n");
3959     qemu_printf("    Physical sections\n");
3960 
3961     for (i = 0; i < d->map.sections_nb; ++i) {
3962         MemoryRegionSection *s = d->map.sections + i;
3963         const char *names[] = { " [unassigned]", " [not dirty]",
3964                                 " [ROM]", " [watch]" };
3965 
3966         qemu_printf("      #%d @" HWADDR_FMT_plx ".." HWADDR_FMT_plx
3967                     " %s%s%s%s%s",
3968             i,
3969             s->offset_within_address_space,
3970             s->offset_within_address_space + MR_SIZE(s->size),
3971             s->mr->name ? s->mr->name : "(noname)",
3972             i < ARRAY_SIZE(names) ? names[i] : "",
3973             s->mr == root ? " [ROOT]" : "",
3974             s == d->mru_section ? " [MRU]" : "",
3975             s->mr->is_iommu ? " [iommu]" : "");
3976 
3977         if (s->mr->alias) {
3978             qemu_printf(" alias=%s", s->mr->alias->name ?
3979                     s->mr->alias->name : "noname");
3980         }
3981         qemu_printf("\n");
3982     }
3983 
3984     qemu_printf("    Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
3985                P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
3986     for (i = 0; i < d->map.nodes_nb; ++i) {
3987         int j, jprev;
3988         PhysPageEntry prev;
3989         Node *n = d->map.nodes + i;
3990 
3991         qemu_printf("      [%d]\n", i);
3992 
3993         for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) {
3994             PhysPageEntry *pe = *n + j;
3995 
3996             if (pe->ptr == prev.ptr && pe->skip == prev.skip) {
3997                 continue;
3998             }
3999 
4000             mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4001 
4002             jprev = j;
4003             prev = *pe;
4004         }
4005 
4006         if (jprev != ARRAY_SIZE(*n)) {
4007             mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr);
4008         }
4009     }
4010 }
4011 
4012 /* Require any discards to work. */
4013 static unsigned int ram_block_discard_required_cnt;
4014 /* Require only coordinated discards to work. */
4015 static unsigned int ram_block_coordinated_discard_required_cnt;
4016 /* Disable any discards. */
4017 static unsigned int ram_block_discard_disabled_cnt;
4018 /* Disable only uncoordinated discards. */
4019 static unsigned int ram_block_uncoordinated_discard_disabled_cnt;
4020 static QemuMutex ram_block_discard_disable_mutex;
4021 
4022 static void ram_block_discard_disable_mutex_lock(void)
4023 {
4024     static gsize initialized;
4025 
4026     if (g_once_init_enter(&initialized)) {
4027         qemu_mutex_init(&ram_block_discard_disable_mutex);
4028         g_once_init_leave(&initialized, 1);
4029     }
4030     qemu_mutex_lock(&ram_block_discard_disable_mutex);
4031 }
4032 
4033 static void ram_block_discard_disable_mutex_unlock(void)
4034 {
4035     qemu_mutex_unlock(&ram_block_discard_disable_mutex);
4036 }
4037 
4038 int ram_block_discard_disable(bool state)
4039 {
4040     int ret = 0;
4041 
4042     ram_block_discard_disable_mutex_lock();
4043     if (!state) {
4044         ram_block_discard_disabled_cnt--;
4045     } else if (ram_block_discard_required_cnt ||
4046                ram_block_coordinated_discard_required_cnt) {
4047         ret = -EBUSY;
4048     } else {
4049         ram_block_discard_disabled_cnt++;
4050     }
4051     ram_block_discard_disable_mutex_unlock();
4052     return ret;
4053 }
4054 
4055 int ram_block_uncoordinated_discard_disable(bool state)
4056 {
4057     int ret = 0;
4058 
4059     ram_block_discard_disable_mutex_lock();
4060     if (!state) {
4061         ram_block_uncoordinated_discard_disabled_cnt--;
4062     } else if (ram_block_discard_required_cnt) {
4063         ret = -EBUSY;
4064     } else {
4065         ram_block_uncoordinated_discard_disabled_cnt++;
4066     }
4067     ram_block_discard_disable_mutex_unlock();
4068     return ret;
4069 }
4070 
4071 int ram_block_discard_require(bool state)
4072 {
4073     int ret = 0;
4074 
4075     ram_block_discard_disable_mutex_lock();
4076     if (!state) {
4077         ram_block_discard_required_cnt--;
4078     } else if (ram_block_discard_disabled_cnt ||
4079                ram_block_uncoordinated_discard_disabled_cnt) {
4080         ret = -EBUSY;
4081     } else {
4082         ram_block_discard_required_cnt++;
4083     }
4084     ram_block_discard_disable_mutex_unlock();
4085     return ret;
4086 }
4087 
4088 int ram_block_coordinated_discard_require(bool state)
4089 {
4090     int ret = 0;
4091 
4092     ram_block_discard_disable_mutex_lock();
4093     if (!state) {
4094         ram_block_coordinated_discard_required_cnt--;
4095     } else if (ram_block_discard_disabled_cnt) {
4096         ret = -EBUSY;
4097     } else {
4098         ram_block_coordinated_discard_required_cnt++;
4099     }
4100     ram_block_discard_disable_mutex_unlock();
4101     return ret;
4102 }
4103 
4104 bool ram_block_discard_is_disabled(void)
4105 {
4106     return qatomic_read(&ram_block_discard_disabled_cnt) ||
4107            qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt);
4108 }
4109 
4110 bool ram_block_discard_is_required(void)
4111 {
4112     return qatomic_read(&ram_block_discard_required_cnt) ||
4113            qatomic_read(&ram_block_coordinated_discard_required_cnt);
4114 }
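
/*
 * A minimal sketch of how the counters above are meant to be used: disable
 * discards in balanced pairs around a feature that cannot tolerate them
 * (long-term DMA pinning, for instance). The function name is hypothetical;
 * nothing in this file calls it.
 */
static int G_GNUC_UNUSED example_pin_guest_ram(void)
{
    int ret = ram_block_discard_disable(true);

    if (ret) {
        /* Another user currently requires discards to work (-EBUSY). */
        return ret;
    }

    /* ... establish the long-term mapping/pinning here ... */

    /* On teardown, the matching decrement re-enables discards. */
    ram_block_discard_disable(false);
    return 0;
}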
4115 
4116 /*
4117  * Return true if RAM is compatible with CPR.  Do not exclude ROM,
4118  * because the ROM file could change in new QEMU.
4119  */
4120 static bool ram_is_cpr_compatible(RAMBlock *rb)
4121 {
4122     MemoryRegion *mr = rb->mr;
4123 
4124     if (!mr || !memory_region_is_ram(mr)) {
4125         return true;
4126     }
4127 
4128     /* Ram device is remapped in new QEMU */
4129     /* A RAM device is remapped in new QEMU. */
4130         return true;
4131     }
4132 
4133     /*
4134      * A file descriptor is passed to new QEMU and remapped, or its backing
4135      * file is reopened and mapped.  It must be shared to avoid COW.
4136      */
4137     if (rb->fd >= 0 && qemu_ram_is_shared(rb)) {
4138         return true;
4139     }
4140 
4141     return false;
4142 }
4143 
4144 /*
4145  * Add a blocker for each volatile ram block.  This function should only be
4146  * called after we know that the block is migratable.  Non-migratable blocks
4147  * are either re-created in new QEMU, or are handled specially, or are covered
4148  * by a device-level CPR blocker.
4149  */
4150 void ram_block_add_cpr_blocker(RAMBlock *rb, Error **errp)
4151 {
4152     assert(qemu_ram_is_migratable(rb));
4153 
4154     if (ram_is_cpr_compatible(rb)) {
4155         return;
4156     }
4157 
4158     error_setg(&rb->cpr_blocker,
4159                "Memory region %s is not compatible with CPR. share=on is "
4160                "required for memory-backend objects, and aux-ram-share=on is "
4161                "required.", memory_region_name(rb->mr));
4162     migrate_add_blocker_modes(&rb->cpr_blocker, errp, MIG_MODE_CPR_TRANSFER,
4163                               -1);
4164 }
4165 
4166 void ram_block_del_cpr_blocker(RAMBlock *rb)
4167 {
4168     migrate_del_blocker(&rb->cpr_blocker);
4169 }
4170