xref: /qemu/migration/postcopy-ram.c (revision 2a4c42f18c987496c2c48764d4785a9d6448874a)
1eb59db53SDr. David Alan Gilbert /*
2eb59db53SDr. David Alan Gilbert  * Postcopy migration for RAM
3eb59db53SDr. David Alan Gilbert  *
4eb59db53SDr. David Alan Gilbert  * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
5eb59db53SDr. David Alan Gilbert  *
6eb59db53SDr. David Alan Gilbert  * Authors:
7eb59db53SDr. David Alan Gilbert  *  Dave Gilbert  <dgilbert@redhat.com>
8eb59db53SDr. David Alan Gilbert  *
9eb59db53SDr. David Alan Gilbert  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10eb59db53SDr. David Alan Gilbert  * See the COPYING file in the top-level directory.
11eb59db53SDr. David Alan Gilbert  *
12eb59db53SDr. David Alan Gilbert  */
13eb59db53SDr. David Alan Gilbert 
14eb59db53SDr. David Alan Gilbert /*
15eb59db53SDr. David Alan Gilbert  * Postcopy is a migration technique where the execution flips from the
16eb59db53SDr. David Alan Gilbert  * source to the destination before all the data has been copied.
17eb59db53SDr. David Alan Gilbert  */
18eb59db53SDr. David Alan Gilbert 
191393a485SPeter Maydell #include "qemu/osdep.h"
2051180423SJuan Quintela #include "exec/target_page.h"
216666c96aSJuan Quintela #include "migration.h"
2208a0aee1SJuan Quintela #include "qemu-file.h"
2320a519a0SJuan Quintela #include "savevm.h"
24be07b0acSJuan Quintela #include "postcopy-ram.h"
257b1e1a22SJuan Quintela #include "ram.h"
261693c64cSDr. David Alan Gilbert #include "qapi/error.h"
271693c64cSDr. David Alan Gilbert #include "qemu/notify.h"
28eb59db53SDr. David Alan Gilbert #include "sysemu/sysemu.h"
29371ff5a3SDr. David Alan Gilbert #include "sysemu/balloon.h"
30eb59db53SDr. David Alan Gilbert #include "qemu/error-report.h"
31eb59db53SDr. David Alan Gilbert #include "trace.h"
32eb59db53SDr. David Alan Gilbert 
/*
 * Cap on how many discard ranges go into one discard command.
 * The value is arbitrary; it keeps each command at roughly 200 bytes.
 */
#define MAX_DISCARDS_PER_COMMAND 12

/*
 * Batch of discard ranges being collected for a single RAMBlock.
 */
struct PostcopyDiscardState {
    /* Name of the RAMBlock the ranges belong to */
    const char *ramblock_name;
    /* NOTE(review): presumably the number of slots used below - confirm */
    uint16_t cur_entry;
    /*
     * Parallel arrays: entry i describes one discard range by its
     * start and its length, both in bytes.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    /* NOTE(review): look like running totals of words/commands sent */
    unsigned int nsentwords;
    unsigned int nsentcmds;
};
49e0b266f0SDr. David Alan Gilbert 
501693c64cSDr. David Alan Gilbert static NotifierWithReturnList postcopy_notifier_list;
511693c64cSDr. David Alan Gilbert 
521693c64cSDr. David Alan Gilbert void postcopy_infrastructure_init(void)
531693c64cSDr. David Alan Gilbert {
541693c64cSDr. David Alan Gilbert     notifier_with_return_list_init(&postcopy_notifier_list);
551693c64cSDr. David Alan Gilbert }
561693c64cSDr. David Alan Gilbert 
571693c64cSDr. David Alan Gilbert void postcopy_add_notifier(NotifierWithReturn *nn)
581693c64cSDr. David Alan Gilbert {
591693c64cSDr. David Alan Gilbert     notifier_with_return_list_add(&postcopy_notifier_list, nn);
601693c64cSDr. David Alan Gilbert }
611693c64cSDr. David Alan Gilbert 
621693c64cSDr. David Alan Gilbert void postcopy_remove_notifier(NotifierWithReturn *n)
631693c64cSDr. David Alan Gilbert {
641693c64cSDr. David Alan Gilbert     notifier_with_return_remove(n);
651693c64cSDr. David Alan Gilbert }
661693c64cSDr. David Alan Gilbert 
671693c64cSDr. David Alan Gilbert int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
681693c64cSDr. David Alan Gilbert {
691693c64cSDr. David Alan Gilbert     struct PostcopyNotifyData pnd;
701693c64cSDr. David Alan Gilbert     pnd.reason = reason;
711693c64cSDr. David Alan Gilbert     pnd.errp = errp;
721693c64cSDr. David Alan Gilbert 
731693c64cSDr. David Alan Gilbert     return notifier_with_return_list_notify(&postcopy_notifier_list,
741693c64cSDr. David Alan Gilbert                                             &pnd);
751693c64cSDr. David Alan Gilbert }
761693c64cSDr. David Alan Gilbert 
77eb59db53SDr. David Alan Gilbert /* Postcopy needs to detect accesses to pages that haven't yet been copied
78eb59db53SDr. David Alan Gilbert  * across, and efficiently map new pages in, the techniques for doing this
79eb59db53SDr. David Alan Gilbert  * are target OS specific.
80eb59db53SDr. David Alan Gilbert  */
81eb59db53SDr. David Alan Gilbert #if defined(__linux__)
82eb59db53SDr. David Alan Gilbert 
83c4faeed2SDr. David Alan Gilbert #include <poll.h>
84eb59db53SDr. David Alan Gilbert #include <sys/ioctl.h>
85eb59db53SDr. David Alan Gilbert #include <sys/syscall.h>
86eb59db53SDr. David Alan Gilbert #include <asm/types.h> /* for __u64 */
87eb59db53SDr. David Alan Gilbert #endif
88eb59db53SDr. David Alan Gilbert 
89d8b9d771SMatthew Fortune #if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
90d8b9d771SMatthew Fortune #include <sys/eventfd.h>
91eb59db53SDr. David Alan Gilbert #include <linux/userfaultfd.h>
92eb59db53SDr. David Alan Gilbert 
93*2a4c42f1SAlexey Perevalov typedef struct PostcopyBlocktimeContext {
94*2a4c42f1SAlexey Perevalov     /* time when page fault initiated per vCPU */
95*2a4c42f1SAlexey Perevalov     uint32_t *page_fault_vcpu_time;
96*2a4c42f1SAlexey Perevalov     /* page address per vCPU */
97*2a4c42f1SAlexey Perevalov     uintptr_t *vcpu_addr;
98*2a4c42f1SAlexey Perevalov     uint32_t total_blocktime;
99*2a4c42f1SAlexey Perevalov     /* blocktime per vCPU */
100*2a4c42f1SAlexey Perevalov     uint32_t *vcpu_blocktime;
101*2a4c42f1SAlexey Perevalov     /* point in time when last page fault was initiated */
102*2a4c42f1SAlexey Perevalov     uint32_t last_begin;
103*2a4c42f1SAlexey Perevalov     /* number of vCPU are suspended */
104*2a4c42f1SAlexey Perevalov     int smp_cpus_down;
105*2a4c42f1SAlexey Perevalov     uint64_t start_time;
106*2a4c42f1SAlexey Perevalov 
107*2a4c42f1SAlexey Perevalov     /*
108*2a4c42f1SAlexey Perevalov      * Handler for exit event, necessary for
109*2a4c42f1SAlexey Perevalov      * releasing whole blocktime_ctx
110*2a4c42f1SAlexey Perevalov      */
111*2a4c42f1SAlexey Perevalov     Notifier exit_notifier;
112*2a4c42f1SAlexey Perevalov } PostcopyBlocktimeContext;
113*2a4c42f1SAlexey Perevalov 
114*2a4c42f1SAlexey Perevalov static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
115*2a4c42f1SAlexey Perevalov {
116*2a4c42f1SAlexey Perevalov     g_free(ctx->page_fault_vcpu_time);
117*2a4c42f1SAlexey Perevalov     g_free(ctx->vcpu_addr);
118*2a4c42f1SAlexey Perevalov     g_free(ctx->vcpu_blocktime);
119*2a4c42f1SAlexey Perevalov     g_free(ctx);
120*2a4c42f1SAlexey Perevalov }
121*2a4c42f1SAlexey Perevalov 
122*2a4c42f1SAlexey Perevalov static void migration_exit_cb(Notifier *n, void *data)
123*2a4c42f1SAlexey Perevalov {
124*2a4c42f1SAlexey Perevalov     PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
125*2a4c42f1SAlexey Perevalov                                                  exit_notifier);
126*2a4c42f1SAlexey Perevalov     destroy_blocktime_context(ctx);
127*2a4c42f1SAlexey Perevalov }
128*2a4c42f1SAlexey Perevalov 
129*2a4c42f1SAlexey Perevalov static struct PostcopyBlocktimeContext *blocktime_context_new(void)
130*2a4c42f1SAlexey Perevalov {
131*2a4c42f1SAlexey Perevalov     PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
132*2a4c42f1SAlexey Perevalov     ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
133*2a4c42f1SAlexey Perevalov     ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
134*2a4c42f1SAlexey Perevalov     ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
135*2a4c42f1SAlexey Perevalov 
136*2a4c42f1SAlexey Perevalov     ctx->exit_notifier.notify = migration_exit_cb;
137*2a4c42f1SAlexey Perevalov     ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
138*2a4c42f1SAlexey Perevalov     qemu_add_exit_notifier(&ctx->exit_notifier);
139*2a4c42f1SAlexey Perevalov     return ctx;
140*2a4c42f1SAlexey Perevalov }
141ca6011c2SAlexey Perevalov 
14254ae0886SAlexey Perevalov /**
14354ae0886SAlexey Perevalov  * receive_ufd_features: check userfault fd features, to request only supported
14454ae0886SAlexey Perevalov  * features in the future.
14554ae0886SAlexey Perevalov  *
14654ae0886SAlexey Perevalov  * Returns: true on success
14754ae0886SAlexey Perevalov  *
14854ae0886SAlexey Perevalov  * __NR_userfaultfd - should be checked before
14954ae0886SAlexey Perevalov  *  @features: out parameter will contain uffdio_api.features provided by kernel
15054ae0886SAlexey Perevalov  *              in case of success
15154ae0886SAlexey Perevalov  */
15254ae0886SAlexey Perevalov static bool receive_ufd_features(uint64_t *features)
15354ae0886SAlexey Perevalov {
15454ae0886SAlexey Perevalov     struct uffdio_api api_struct = {0};
15554ae0886SAlexey Perevalov     int ufd;
15654ae0886SAlexey Perevalov     bool ret = true;
15754ae0886SAlexey Perevalov 
15854ae0886SAlexey Perevalov     /* if we are here __NR_userfaultfd should exists */
15954ae0886SAlexey Perevalov     ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
16054ae0886SAlexey Perevalov     if (ufd == -1) {
16154ae0886SAlexey Perevalov         error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
16254ae0886SAlexey Perevalov                      strerror(errno));
16354ae0886SAlexey Perevalov         return false;
16454ae0886SAlexey Perevalov     }
16554ae0886SAlexey Perevalov 
16654ae0886SAlexey Perevalov     /* ask features */
167eb59db53SDr. David Alan Gilbert     api_struct.api = UFFD_API;
168eb59db53SDr. David Alan Gilbert     api_struct.features = 0;
169eb59db53SDr. David Alan Gilbert     if (ioctl(ufd, UFFDIO_API, &api_struct)) {
1705553499fSAlexey Perevalov         error_report("%s: UFFDIO_API failed: %s", __func__,
171eb59db53SDr. David Alan Gilbert                      strerror(errno));
17254ae0886SAlexey Perevalov         ret = false;
17354ae0886SAlexey Perevalov         goto release_ufd;
17454ae0886SAlexey Perevalov     }
17554ae0886SAlexey Perevalov 
17654ae0886SAlexey Perevalov     *features = api_struct.features;
17754ae0886SAlexey Perevalov 
17854ae0886SAlexey Perevalov release_ufd:
17954ae0886SAlexey Perevalov     close(ufd);
18054ae0886SAlexey Perevalov     return ret;
18154ae0886SAlexey Perevalov }
18254ae0886SAlexey Perevalov 
/**
 * request_ufd_features: perform the UFFDIO_API handshake on @ufd, asking
 * for @features, and check that the fd offers the register and unregister
 * ioctls.
 *
 * Call this exactly once on a freshly opened ufd; a second call on the
 * same fd leads to an error.
 *
 * @ufd: fd obtained from the userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 *
 * Returns: true on success, false (after reporting an error) otherwise
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = { .api = UFFD_API, .features = features };
    const uint64_t required_ioctls = (__u64)1 << _UFFDIO_REGISTER |
                                     (__u64)1 << _UFFDIO_UNREGISTER;
    uint64_t missing;

    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* Bits that are required but which the kernel did not advertise */
    missing = (uint64_t)(~api_struct.ioctls & required_ioctls);
    if (missing) {
        error_report("Missing userfault features: %" PRIx64, missing);
        return false;
    }

    return true;
}
21554ae0886SAlexey Perevalov 
21654ae0886SAlexey Perevalov static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
21754ae0886SAlexey Perevalov {
21854ae0886SAlexey Perevalov     uint64_t asked_features = 0;
21954ae0886SAlexey Perevalov     static uint64_t supported_features;
22054ae0886SAlexey Perevalov 
22154ae0886SAlexey Perevalov     /*
22254ae0886SAlexey Perevalov      * it's not possible to
22354ae0886SAlexey Perevalov      * request UFFD_API twice per one fd
22454ae0886SAlexey Perevalov      * userfault fd features is persistent
22554ae0886SAlexey Perevalov      */
22654ae0886SAlexey Perevalov     if (!supported_features) {
22754ae0886SAlexey Perevalov         if (!receive_ufd_features(&supported_features)) {
22854ae0886SAlexey Perevalov             error_report("%s failed", __func__);
22954ae0886SAlexey Perevalov             return false;
23054ae0886SAlexey Perevalov         }
23154ae0886SAlexey Perevalov     }
23254ae0886SAlexey Perevalov 
233*2a4c42f1SAlexey Perevalov #ifdef UFFD_FEATURE_THREAD_ID
234*2a4c42f1SAlexey Perevalov     if (migrate_postcopy_blocktime() && mis &&
235*2a4c42f1SAlexey Perevalov         UFFD_FEATURE_THREAD_ID & supported_features) {
236*2a4c42f1SAlexey Perevalov         /* kernel supports that feature */
237*2a4c42f1SAlexey Perevalov         /* don't create blocktime_context if it exists */
238*2a4c42f1SAlexey Perevalov         if (!mis->blocktime_ctx) {
239*2a4c42f1SAlexey Perevalov             mis->blocktime_ctx = blocktime_context_new();
240*2a4c42f1SAlexey Perevalov         }
241*2a4c42f1SAlexey Perevalov 
242*2a4c42f1SAlexey Perevalov         asked_features |= UFFD_FEATURE_THREAD_ID;
243*2a4c42f1SAlexey Perevalov     }
244*2a4c42f1SAlexey Perevalov #endif
245*2a4c42f1SAlexey Perevalov 
24654ae0886SAlexey Perevalov     /*
24754ae0886SAlexey Perevalov      * request features, even if asked_features is 0, due to
24854ae0886SAlexey Perevalov      * kernel expects UFFD_API before UFFDIO_REGISTER, per
24954ae0886SAlexey Perevalov      * userfault file descriptor
25054ae0886SAlexey Perevalov      */
25154ae0886SAlexey Perevalov     if (!request_ufd_features(ufd, asked_features)) {
25254ae0886SAlexey Perevalov         error_report("%s failed: features %" PRIu64, __func__,
25354ae0886SAlexey Perevalov                      asked_features);
25454ae0886SAlexey Perevalov         return false;
25554ae0886SAlexey Perevalov     }
25654ae0886SAlexey Perevalov 
2577e8cafb7SDr. David Alan Gilbert     if (getpagesize() != ram_pagesize_summary()) {
2587e8cafb7SDr. David Alan Gilbert         bool have_hp = false;
2597e8cafb7SDr. David Alan Gilbert         /* We've got a huge page */
2607e8cafb7SDr. David Alan Gilbert #ifdef UFFD_FEATURE_MISSING_HUGETLBFS
26154ae0886SAlexey Perevalov         have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
2627e8cafb7SDr. David Alan Gilbert #endif
2637e8cafb7SDr. David Alan Gilbert         if (!have_hp) {
2647e8cafb7SDr. David Alan Gilbert             error_report("Userfault on this host does not support huge pages");
2657e8cafb7SDr. David Alan Gilbert             return false;
2667e8cafb7SDr. David Alan Gilbert         }
2677e8cafb7SDr. David Alan Gilbert     }
268eb59db53SDr. David Alan Gilbert     return true;
269eb59db53SDr. David Alan Gilbert }
270eb59db53SDr. David Alan Gilbert 
2718679638bSDr. David Alan Gilbert /* Callback from postcopy_ram_supported_by_host block iterator.
2728679638bSDr. David Alan Gilbert  */
2735d214a92SDr. David Alan Gilbert static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
2748679638bSDr. David Alan Gilbert                              ram_addr_t offset, ram_addr_t length, void *opaque)
2758679638bSDr. David Alan Gilbert {
2765d214a92SDr. David Alan Gilbert     RAMBlock *rb = qemu_ram_block_by_name(block_name);
2775d214a92SDr. David Alan Gilbert     size_t pagesize = qemu_ram_pagesize(rb);
2785d214a92SDr. David Alan Gilbert 
2795d214a92SDr. David Alan Gilbert     if (length % pagesize) {
2805d214a92SDr. David Alan Gilbert         error_report("Postcopy requires RAM blocks to be a page size multiple,"
2815d214a92SDr. David Alan Gilbert                      " block %s is 0x" RAM_ADDR_FMT " bytes with a "
2825d214a92SDr. David Alan Gilbert                      "page size of 0x%zx", block_name, length, pagesize);
2835d214a92SDr. David Alan Gilbert         return 1;
2845d214a92SDr. David Alan Gilbert     }
2858679638bSDr. David Alan Gilbert     return 0;
2868679638bSDr. David Alan Gilbert }
2878679638bSDr. David Alan Gilbert 
28858b7c17eSDr. David Alan Gilbert /*
28958b7c17eSDr. David Alan Gilbert  * Note: This has the side effect of munlock'ing all of RAM, that's
29058b7c17eSDr. David Alan Gilbert  * normally fine since if the postcopy succeeds it gets turned back on at the
29158b7c17eSDr. David Alan Gilbert  * end.
29258b7c17eSDr. David Alan Gilbert  */
293d7651f15SAlexey Perevalov bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
294eb59db53SDr. David Alan Gilbert {
295eb59db53SDr. David Alan Gilbert     long pagesize = getpagesize();
296eb59db53SDr. David Alan Gilbert     int ufd = -1;
297eb59db53SDr. David Alan Gilbert     bool ret = false; /* Error unless we change it */
298eb59db53SDr. David Alan Gilbert     void *testarea = NULL;
299eb59db53SDr. David Alan Gilbert     struct uffdio_register reg_struct;
300eb59db53SDr. David Alan Gilbert     struct uffdio_range range_struct;
301eb59db53SDr. David Alan Gilbert     uint64_t feature_mask;
3021693c64cSDr. David Alan Gilbert     Error *local_err = NULL;
303eb59db53SDr. David Alan Gilbert 
30420afaed9SJuan Quintela     if (qemu_target_page_size() > pagesize) {
305eb59db53SDr. David Alan Gilbert         error_report("Target page size bigger than host page size");
306eb59db53SDr. David Alan Gilbert         goto out;
307eb59db53SDr. David Alan Gilbert     }
308eb59db53SDr. David Alan Gilbert 
309eb59db53SDr. David Alan Gilbert     ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
310eb59db53SDr. David Alan Gilbert     if (ufd == -1) {
311eb59db53SDr. David Alan Gilbert         error_report("%s: userfaultfd not available: %s", __func__,
312eb59db53SDr. David Alan Gilbert                      strerror(errno));
313eb59db53SDr. David Alan Gilbert         goto out;
314eb59db53SDr. David Alan Gilbert     }
315eb59db53SDr. David Alan Gilbert 
3161693c64cSDr. David Alan Gilbert     /* Give devices a chance to object */
3171693c64cSDr. David Alan Gilbert     if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
3181693c64cSDr. David Alan Gilbert         error_report_err(local_err);
3191693c64cSDr. David Alan Gilbert         goto out;
3201693c64cSDr. David Alan Gilbert     }
3211693c64cSDr. David Alan Gilbert 
322eb59db53SDr. David Alan Gilbert     /* Version and features check */
32354ae0886SAlexey Perevalov     if (!ufd_check_and_apply(ufd, mis)) {
324eb59db53SDr. David Alan Gilbert         goto out;
325eb59db53SDr. David Alan Gilbert     }
326eb59db53SDr. David Alan Gilbert 
3278679638bSDr. David Alan Gilbert     /* We don't support postcopy with shared RAM yet */
3285d214a92SDr. David Alan Gilbert     if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
3298679638bSDr. David Alan Gilbert         goto out;
3308679638bSDr. David Alan Gilbert     }
3318679638bSDr. David Alan Gilbert 
332eb59db53SDr. David Alan Gilbert     /*
33358b7c17eSDr. David Alan Gilbert      * userfault and mlock don't go together; we'll put it back later if
33458b7c17eSDr. David Alan Gilbert      * it was enabled.
33558b7c17eSDr. David Alan Gilbert      */
33658b7c17eSDr. David Alan Gilbert     if (munlockall()) {
33758b7c17eSDr. David Alan Gilbert         error_report("%s: munlockall: %s", __func__,  strerror(errno));
33858b7c17eSDr. David Alan Gilbert         return -1;
33958b7c17eSDr. David Alan Gilbert     }
34058b7c17eSDr. David Alan Gilbert 
34158b7c17eSDr. David Alan Gilbert     /*
342eb59db53SDr. David Alan Gilbert      *  We need to check that the ops we need are supported on anon memory
343eb59db53SDr. David Alan Gilbert      *  To do that we need to register a chunk and see the flags that
344eb59db53SDr. David Alan Gilbert      *  are returned.
345eb59db53SDr. David Alan Gilbert      */
346eb59db53SDr. David Alan Gilbert     testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
347eb59db53SDr. David Alan Gilbert                                     MAP_ANONYMOUS, -1, 0);
348eb59db53SDr. David Alan Gilbert     if (testarea == MAP_FAILED) {
349eb59db53SDr. David Alan Gilbert         error_report("%s: Failed to map test area: %s", __func__,
350eb59db53SDr. David Alan Gilbert                      strerror(errno));
351eb59db53SDr. David Alan Gilbert         goto out;
352eb59db53SDr. David Alan Gilbert     }
353eb59db53SDr. David Alan Gilbert     g_assert(((size_t)testarea & (pagesize-1)) == 0);
354eb59db53SDr. David Alan Gilbert 
355eb59db53SDr. David Alan Gilbert     reg_struct.range.start = (uintptr_t)testarea;
356eb59db53SDr. David Alan Gilbert     reg_struct.range.len = pagesize;
357eb59db53SDr. David Alan Gilbert     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
358eb59db53SDr. David Alan Gilbert 
359eb59db53SDr. David Alan Gilbert     if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
360eb59db53SDr. David Alan Gilbert         error_report("%s userfault register: %s", __func__, strerror(errno));
361eb59db53SDr. David Alan Gilbert         goto out;
362eb59db53SDr. David Alan Gilbert     }
363eb59db53SDr. David Alan Gilbert 
364eb59db53SDr. David Alan Gilbert     range_struct.start = (uintptr_t)testarea;
365eb59db53SDr. David Alan Gilbert     range_struct.len = pagesize;
366eb59db53SDr. David Alan Gilbert     if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
367eb59db53SDr. David Alan Gilbert         error_report("%s userfault unregister: %s", __func__, strerror(errno));
368eb59db53SDr. David Alan Gilbert         goto out;
369eb59db53SDr. David Alan Gilbert     }
370eb59db53SDr. David Alan Gilbert 
371eb59db53SDr. David Alan Gilbert     feature_mask = (__u64)1 << _UFFDIO_WAKE |
372eb59db53SDr. David Alan Gilbert                    (__u64)1 << _UFFDIO_COPY |
373eb59db53SDr. David Alan Gilbert                    (__u64)1 << _UFFDIO_ZEROPAGE;
374eb59db53SDr. David Alan Gilbert     if ((reg_struct.ioctls & feature_mask) != feature_mask) {
375eb59db53SDr. David Alan Gilbert         error_report("Missing userfault map features: %" PRIx64,
376eb59db53SDr. David Alan Gilbert                      (uint64_t)(~reg_struct.ioctls & feature_mask));
377eb59db53SDr. David Alan Gilbert         goto out;
378eb59db53SDr. David Alan Gilbert     }
379eb59db53SDr. David Alan Gilbert 
380eb59db53SDr. David Alan Gilbert     /* Success! */
381eb59db53SDr. David Alan Gilbert     ret = true;
382eb59db53SDr. David Alan Gilbert out:
383eb59db53SDr. David Alan Gilbert     if (testarea) {
384eb59db53SDr. David Alan Gilbert         munmap(testarea, pagesize);
385eb59db53SDr. David Alan Gilbert     }
386eb59db53SDr. David Alan Gilbert     if (ufd != -1) {
387eb59db53SDr. David Alan Gilbert         close(ufd);
388eb59db53SDr. David Alan Gilbert     }
389eb59db53SDr. David Alan Gilbert     return ret;
390eb59db53SDr. David Alan Gilbert }
391eb59db53SDr. David Alan Gilbert 
3921caddf8aSDr. David Alan Gilbert /*
3931caddf8aSDr. David Alan Gilbert  * Setup an area of RAM so that it *can* be used for postcopy later; this
3941caddf8aSDr. David Alan Gilbert  * must be done right at the start prior to pre-copy.
3951caddf8aSDr. David Alan Gilbert  * opaque should be the MIS.
3961caddf8aSDr. David Alan Gilbert  */
3971caddf8aSDr. David Alan Gilbert static int init_range(const char *block_name, void *host_addr,
3981caddf8aSDr. David Alan Gilbert                       ram_addr_t offset, ram_addr_t length, void *opaque)
3991caddf8aSDr. David Alan Gilbert {
4001caddf8aSDr. David Alan Gilbert     trace_postcopy_init_range(block_name, host_addr, offset, length);
4011caddf8aSDr. David Alan Gilbert 
4021caddf8aSDr. David Alan Gilbert     /*
4031caddf8aSDr. David Alan Gilbert      * We need the whole of RAM to be truly empty for postcopy, so things
4041caddf8aSDr. David Alan Gilbert      * like ROMs and any data tables built during init must be zero'd
4051caddf8aSDr. David Alan Gilbert      * - we're going to get the copy from the source anyway.
4061caddf8aSDr. David Alan Gilbert      * (Precopy will just overwrite this data, so doesn't need the discard)
4071caddf8aSDr. David Alan Gilbert      */
408aaa2064cSJuan Quintela     if (ram_discard_range(block_name, 0, length)) {
4091caddf8aSDr. David Alan Gilbert         return -1;
4101caddf8aSDr. David Alan Gilbert     }
4111caddf8aSDr. David Alan Gilbert 
4121caddf8aSDr. David Alan Gilbert     return 0;
4131caddf8aSDr. David Alan Gilbert }
4141caddf8aSDr. David Alan Gilbert 
4151caddf8aSDr. David Alan Gilbert /*
4161caddf8aSDr. David Alan Gilbert  * At the end of migration, undo the effects of init_range
4171caddf8aSDr. David Alan Gilbert  * opaque should be the MIS.
4181caddf8aSDr. David Alan Gilbert  */
4191caddf8aSDr. David Alan Gilbert static int cleanup_range(const char *block_name, void *host_addr,
4201caddf8aSDr. David Alan Gilbert                         ram_addr_t offset, ram_addr_t length, void *opaque)
4211caddf8aSDr. David Alan Gilbert {
4221caddf8aSDr. David Alan Gilbert     MigrationIncomingState *mis = opaque;
4231caddf8aSDr. David Alan Gilbert     struct uffdio_range range_struct;
4241caddf8aSDr. David Alan Gilbert     trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
4251caddf8aSDr. David Alan Gilbert 
4261caddf8aSDr. David Alan Gilbert     /*
4271caddf8aSDr. David Alan Gilbert      * We turned off hugepage for the precopy stage with postcopy enabled
4281caddf8aSDr. David Alan Gilbert      * we can turn it back on now.
4291caddf8aSDr. David Alan Gilbert      */
4301d741439SDr. David Alan Gilbert     qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
4311caddf8aSDr. David Alan Gilbert 
4321caddf8aSDr. David Alan Gilbert     /*
4331caddf8aSDr. David Alan Gilbert      * We can also turn off userfault now since we should have all the
4341caddf8aSDr. David Alan Gilbert      * pages.   It can be useful to leave it on to debug postcopy
4351caddf8aSDr. David Alan Gilbert      * if you're not sure it's always getting every page.
4361caddf8aSDr. David Alan Gilbert      */
4371caddf8aSDr. David Alan Gilbert     range_struct.start = (uintptr_t)host_addr;
4381caddf8aSDr. David Alan Gilbert     range_struct.len = length;
4391caddf8aSDr. David Alan Gilbert 
4401caddf8aSDr. David Alan Gilbert     if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
4411caddf8aSDr. David Alan Gilbert         error_report("%s: userfault unregister %s", __func__, strerror(errno));
4421caddf8aSDr. David Alan Gilbert 
4431caddf8aSDr. David Alan Gilbert         return -1;
4441caddf8aSDr. David Alan Gilbert     }
4451caddf8aSDr. David Alan Gilbert 
4461caddf8aSDr. David Alan Gilbert     return 0;
4471caddf8aSDr. David Alan Gilbert }
4481caddf8aSDr. David Alan Gilbert 
4491caddf8aSDr. David Alan Gilbert /*
4501caddf8aSDr. David Alan Gilbert  * Initialise postcopy-ram, setting the RAM to a state where we can go into
4511caddf8aSDr. David Alan Gilbert  * postcopy later; must be called prior to any precopy.
4521caddf8aSDr. David Alan Gilbert  * called from arch_init's similarly named ram_postcopy_incoming_init
4531caddf8aSDr. David Alan Gilbert  */
4541caddf8aSDr. David Alan Gilbert int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
4551caddf8aSDr. David Alan Gilbert {
456aaa2064cSJuan Quintela     if (qemu_ram_foreach_block(init_range, NULL)) {
4571caddf8aSDr. David Alan Gilbert         return -1;
4581caddf8aSDr. David Alan Gilbert     }
4591caddf8aSDr. David Alan Gilbert 
4601caddf8aSDr. David Alan Gilbert     return 0;
4611caddf8aSDr. David Alan Gilbert }
4621caddf8aSDr. David Alan Gilbert 
4631caddf8aSDr. David Alan Gilbert /*
4641caddf8aSDr. David Alan Gilbert  * At the end of a migration where postcopy_ram_incoming_init was called.
4651caddf8aSDr. David Alan Gilbert  */
4661caddf8aSDr. David Alan Gilbert int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
4671caddf8aSDr. David Alan Gilbert {
468c4faeed2SDr. David Alan Gilbert     trace_postcopy_ram_incoming_cleanup_entry();
469c4faeed2SDr. David Alan Gilbert 
470c4faeed2SDr. David Alan Gilbert     if (mis->have_fault_thread) {
47146343570SDr. David Alan Gilbert         Error *local_err = NULL;
47246343570SDr. David Alan Gilbert 
47346343570SDr. David Alan Gilbert         if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
47446343570SDr. David Alan Gilbert             error_report_err(local_err);
47546343570SDr. David Alan Gilbert             return -1;
47646343570SDr. David Alan Gilbert         }
47746343570SDr. David Alan Gilbert 
4781caddf8aSDr. David Alan Gilbert         if (qemu_ram_foreach_block(cleanup_range, mis)) {
4791caddf8aSDr. David Alan Gilbert             return -1;
4801caddf8aSDr. David Alan Gilbert         }
4819ab7ef9bSPeter Xu         /* Let the fault thread quit */
48264f615feSPeter Xu         atomic_set(&mis->fault_thread_quit, 1);
4839ab7ef9bSPeter Xu         postcopy_fault_thread_notify(mis);
484c4faeed2SDr. David Alan Gilbert         trace_postcopy_ram_incoming_cleanup_join();
485c4faeed2SDr. David Alan Gilbert         qemu_thread_join(&mis->fault_thread);
4869ab7ef9bSPeter Xu 
487c4faeed2SDr. David Alan Gilbert         trace_postcopy_ram_incoming_cleanup_closeuf();
488c4faeed2SDr. David Alan Gilbert         close(mis->userfault_fd);
48964f615feSPeter Xu         close(mis->userfault_event_fd);
490c4faeed2SDr. David Alan Gilbert         mis->have_fault_thread = false;
491c4faeed2SDr. David Alan Gilbert     }
492c4faeed2SDr. David Alan Gilbert 
493371ff5a3SDr. David Alan Gilbert     qemu_balloon_inhibit(false);
494371ff5a3SDr. David Alan Gilbert 
49558b7c17eSDr. David Alan Gilbert     if (enable_mlock) {
49658b7c17eSDr. David Alan Gilbert         if (os_mlock() < 0) {
49758b7c17eSDr. David Alan Gilbert             error_report("mlock: %s", strerror(errno));
49858b7c17eSDr. David Alan Gilbert             /*
49958b7c17eSDr. David Alan Gilbert              * It doesn't feel right to fail at this point, we have a valid
50058b7c17eSDr. David Alan Gilbert              * VM state.
50158b7c17eSDr. David Alan Gilbert              */
50258b7c17eSDr. David Alan Gilbert         }
50358b7c17eSDr. David Alan Gilbert     }
50458b7c17eSDr. David Alan Gilbert 
505c4faeed2SDr. David Alan Gilbert     postcopy_state_set(POSTCOPY_INCOMING_END);
5061caddf8aSDr. David Alan Gilbert 
507696ed9a9SDr. David Alan Gilbert     if (mis->postcopy_tmp_page) {
508df9ff5e1SDr. David Alan Gilbert         munmap(mis->postcopy_tmp_page, mis->largest_page_size);
509696ed9a9SDr. David Alan Gilbert         mis->postcopy_tmp_page = NULL;
510696ed9a9SDr. David Alan Gilbert     }
51141d84210SDr. David Alan Gilbert     if (mis->postcopy_tmp_zero_page) {
51241d84210SDr. David Alan Gilbert         munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
51341d84210SDr. David Alan Gilbert         mis->postcopy_tmp_zero_page = NULL;
51441d84210SDr. David Alan Gilbert     }
515c4faeed2SDr. David Alan Gilbert     trace_postcopy_ram_incoming_cleanup_exit();
5161caddf8aSDr. David Alan Gilbert     return 0;
5171caddf8aSDr. David Alan Gilbert }
5181caddf8aSDr. David Alan Gilbert 
519f0a227adSDr. David Alan Gilbert /*
520f9527107SDr. David Alan Gilbert  * Disable huge pages on an area
521f9527107SDr. David Alan Gilbert  */
522f9527107SDr. David Alan Gilbert static int nhp_range(const char *block_name, void *host_addr,
523f9527107SDr. David Alan Gilbert                     ram_addr_t offset, ram_addr_t length, void *opaque)
524f9527107SDr. David Alan Gilbert {
525f9527107SDr. David Alan Gilbert     trace_postcopy_nhp_range(block_name, host_addr, offset, length);
526f9527107SDr. David Alan Gilbert 
527f9527107SDr. David Alan Gilbert     /*
528f9527107SDr. David Alan Gilbert      * Before we do discards we need to ensure those discards really
529f9527107SDr. David Alan Gilbert      * do delete areas of the page, even if THP thinks a hugepage would
530f9527107SDr. David Alan Gilbert      * be a good idea, so force hugepages off.
531f9527107SDr. David Alan Gilbert      */
5321d741439SDr. David Alan Gilbert     qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
533f9527107SDr. David Alan Gilbert 
534f9527107SDr. David Alan Gilbert     return 0;
535f9527107SDr. David Alan Gilbert }
536f9527107SDr. David Alan Gilbert 
537f9527107SDr. David Alan Gilbert /*
538f9527107SDr. David Alan Gilbert  * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
539f9527107SDr. David Alan Gilbert  * however leaving it until after precopy means that most of the precopy
540f9527107SDr. David Alan Gilbert  * data is still THPd
541f9527107SDr. David Alan Gilbert  */
542f9527107SDr. David Alan Gilbert int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
543f9527107SDr. David Alan Gilbert {
544f9527107SDr. David Alan Gilbert     if (qemu_ram_foreach_block(nhp_range, mis)) {
545f9527107SDr. David Alan Gilbert         return -1;
546f9527107SDr. David Alan Gilbert     }
547f9527107SDr. David Alan Gilbert 
548f9527107SDr. David Alan Gilbert     postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
549f9527107SDr. David Alan Gilbert 
550f9527107SDr. David Alan Gilbert     return 0;
551f9527107SDr. David Alan Gilbert }
552f9527107SDr. David Alan Gilbert 
553f9527107SDr. David Alan Gilbert /*
554f0a227adSDr. David Alan Gilbert  * Mark the given area of RAM as requiring notification to unwritten areas
555f0a227adSDr. David Alan Gilbert  * Used as a  callback on qemu_ram_foreach_block.
556f0a227adSDr. David Alan Gilbert  *   host_addr: Base of area to mark
557f0a227adSDr. David Alan Gilbert  *   offset: Offset in the whole ram arena
558f0a227adSDr. David Alan Gilbert  *   length: Length of the section
559f0a227adSDr. David Alan Gilbert  *   opaque: MigrationIncomingState pointer
560f0a227adSDr. David Alan Gilbert  * Returns 0 on success
561f0a227adSDr. David Alan Gilbert  */
562f0a227adSDr. David Alan Gilbert static int ram_block_enable_notify(const char *block_name, void *host_addr,
563f0a227adSDr. David Alan Gilbert                                    ram_addr_t offset, ram_addr_t length,
564f0a227adSDr. David Alan Gilbert                                    void *opaque)
565f0a227adSDr. David Alan Gilbert {
566f0a227adSDr. David Alan Gilbert     MigrationIncomingState *mis = opaque;
567f0a227adSDr. David Alan Gilbert     struct uffdio_register reg_struct;
568f0a227adSDr. David Alan Gilbert 
569f0a227adSDr. David Alan Gilbert     reg_struct.range.start = (uintptr_t)host_addr;
570f0a227adSDr. David Alan Gilbert     reg_struct.range.len = length;
571f0a227adSDr. David Alan Gilbert     reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
572f0a227adSDr. David Alan Gilbert 
573f0a227adSDr. David Alan Gilbert     /* Now tell our userfault_fd that it's responsible for this area */
574f0a227adSDr. David Alan Gilbert     if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
575f0a227adSDr. David Alan Gilbert         error_report("%s userfault register: %s", __func__, strerror(errno));
576f0a227adSDr. David Alan Gilbert         return -1;
577f0a227adSDr. David Alan Gilbert     }
578665414adSDr. David Alan Gilbert     if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
579665414adSDr. David Alan Gilbert         error_report("%s userfault: Region doesn't support COPY", __func__);
580665414adSDr. David Alan Gilbert         return -1;
581665414adSDr. David Alan Gilbert     }
5822ce16640SDr. David Alan Gilbert     if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
5832ce16640SDr. David Alan Gilbert         RAMBlock *rb = qemu_ram_block_by_name(block_name);
5842ce16640SDr. David Alan Gilbert         qemu_ram_set_uf_zeroable(rb);
5852ce16640SDr. David Alan Gilbert     }
586f0a227adSDr. David Alan Gilbert 
587f0a227adSDr. David Alan Gilbert     return 0;
588f0a227adSDr. David Alan Gilbert }
589f0a227adSDr. David Alan Gilbert 
5905efc3564SDr. David Alan Gilbert int postcopy_wake_shared(struct PostCopyFD *pcfd,
5915efc3564SDr. David Alan Gilbert                          uint64_t client_addr,
5925efc3564SDr. David Alan Gilbert                          RAMBlock *rb)
5935efc3564SDr. David Alan Gilbert {
5945efc3564SDr. David Alan Gilbert     size_t pagesize = qemu_ram_pagesize(rb);
5955efc3564SDr. David Alan Gilbert     struct uffdio_range range;
5965efc3564SDr. David Alan Gilbert     int ret;
5975efc3564SDr. David Alan Gilbert     trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
5985efc3564SDr. David Alan Gilbert     range.start = client_addr & ~(pagesize - 1);
5995efc3564SDr. David Alan Gilbert     range.len = pagesize;
6005efc3564SDr. David Alan Gilbert     ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
6015efc3564SDr. David Alan Gilbert     if (ret) {
6025efc3564SDr. David Alan Gilbert         error_report("%s: Failed to wake: %zx in %s (%s)",
6035efc3564SDr. David Alan Gilbert                      __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
6045efc3564SDr. David Alan Gilbert                      strerror(errno));
6055efc3564SDr. David Alan Gilbert     }
6065efc3564SDr. David Alan Gilbert     return ret;
6075efc3564SDr. David Alan Gilbert }
6085efc3564SDr. David Alan Gilbert 
609f0a227adSDr. David Alan Gilbert /*
610096bf4c8SDr. David Alan Gilbert  * Callback from shared fault handlers to ask for a page,
611096bf4c8SDr. David Alan Gilbert  * the page must be specified by a RAMBlock and an offset in that rb
612096bf4c8SDr. David Alan Gilbert  * Note: Only for use by shared fault handlers (in fault thread)
613096bf4c8SDr. David Alan Gilbert  */
614096bf4c8SDr. David Alan Gilbert int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
615096bf4c8SDr. David Alan Gilbert                                  uint64_t client_addr, uint64_t rb_offset)
616096bf4c8SDr. David Alan Gilbert {
617096bf4c8SDr. David Alan Gilbert     size_t pagesize = qemu_ram_pagesize(rb);
618096bf4c8SDr. David Alan Gilbert     uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
619096bf4c8SDr. David Alan Gilbert     MigrationIncomingState *mis = migration_incoming_get_current();
620096bf4c8SDr. David Alan Gilbert 
621096bf4c8SDr. David Alan Gilbert     trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
622096bf4c8SDr. David Alan Gilbert                                        rb_offset);
623dedfb4b2SDr. David Alan Gilbert     if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
624dedfb4b2SDr. David Alan Gilbert         trace_postcopy_request_shared_page_present(pcfd->idstr,
625dedfb4b2SDr. David Alan Gilbert                                         qemu_ram_get_idstr(rb), rb_offset);
626dedfb4b2SDr. David Alan Gilbert         return postcopy_wake_shared(pcfd, client_addr, rb);
627dedfb4b2SDr. David Alan Gilbert     }
628096bf4c8SDr. David Alan Gilbert     if (rb != mis->last_rb) {
629096bf4c8SDr. David Alan Gilbert         mis->last_rb = rb;
630096bf4c8SDr. David Alan Gilbert         migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
631096bf4c8SDr. David Alan Gilbert                                   aligned_rbo, pagesize);
632096bf4c8SDr. David Alan Gilbert     } else {
633096bf4c8SDr. David Alan Gilbert         /* Save some space */
634096bf4c8SDr. David Alan Gilbert         migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
635096bf4c8SDr. David Alan Gilbert     }
636096bf4c8SDr. David Alan Gilbert     return 0;
637096bf4c8SDr. David Alan Gilbert }
638096bf4c8SDr. David Alan Gilbert 
639096bf4c8SDr. David Alan Gilbert /*
640f0a227adSDr. David Alan Gilbert  * Handle faults detected by the USERFAULT markings
641f0a227adSDr. David Alan Gilbert  */
642f0a227adSDr. David Alan Gilbert static void *postcopy_ram_fault_thread(void *opaque)
643f0a227adSDr. David Alan Gilbert {
644f0a227adSDr. David Alan Gilbert     MigrationIncomingState *mis = opaque;
645c4faeed2SDr. David Alan Gilbert     struct uffd_msg msg;
646c4faeed2SDr. David Alan Gilbert     int ret;
64700fa4fc8SDr. David Alan Gilbert     size_t index;
648c4faeed2SDr. David Alan Gilbert     RAMBlock *rb = NULL;
649f0a227adSDr. David Alan Gilbert 
650c4faeed2SDr. David Alan Gilbert     trace_postcopy_ram_fault_thread_entry();
651096bf4c8SDr. David Alan Gilbert     mis->last_rb = NULL; /* last RAMBlock we sent part of */
652f0a227adSDr. David Alan Gilbert     qemu_sem_post(&mis->fault_thread_sem);
653c4faeed2SDr. David Alan Gilbert 
65400fa4fc8SDr. David Alan Gilbert     struct pollfd *pfd;
65500fa4fc8SDr. David Alan Gilbert     size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
65600fa4fc8SDr. David Alan Gilbert 
65700fa4fc8SDr. David Alan Gilbert     pfd = g_new0(struct pollfd, pfd_len);
65800fa4fc8SDr. David Alan Gilbert 
65900fa4fc8SDr. David Alan Gilbert     pfd[0].fd = mis->userfault_fd;
66000fa4fc8SDr. David Alan Gilbert     pfd[0].events = POLLIN;
66100fa4fc8SDr. David Alan Gilbert     pfd[1].fd = mis->userfault_event_fd;
66200fa4fc8SDr. David Alan Gilbert     pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
66300fa4fc8SDr. David Alan Gilbert     trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
66400fa4fc8SDr. David Alan Gilbert     for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
66500fa4fc8SDr. David Alan Gilbert         struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
66600fa4fc8SDr. David Alan Gilbert                                                  struct PostCopyFD, index);
66700fa4fc8SDr. David Alan Gilbert         pfd[2 + index].fd = pcfd->fd;
66800fa4fc8SDr. David Alan Gilbert         pfd[2 + index].events = POLLIN;
66900fa4fc8SDr. David Alan Gilbert         trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
67000fa4fc8SDr. David Alan Gilbert                                                   pcfd->fd);
67100fa4fc8SDr. David Alan Gilbert     }
67200fa4fc8SDr. David Alan Gilbert 
673c4faeed2SDr. David Alan Gilbert     while (true) {
674c4faeed2SDr. David Alan Gilbert         ram_addr_t rb_offset;
67500fa4fc8SDr. David Alan Gilbert         int poll_result;
676c4faeed2SDr. David Alan Gilbert 
677c4faeed2SDr. David Alan Gilbert         /*
678c4faeed2SDr. David Alan Gilbert          * We're mainly waiting for the kernel to give us a faulting HVA,
679c4faeed2SDr. David Alan Gilbert          * however we can be told to quit via userfault_quit_fd which is
680c4faeed2SDr. David Alan Gilbert          * an eventfd
681c4faeed2SDr. David Alan Gilbert          */
682c4faeed2SDr. David Alan Gilbert 
68300fa4fc8SDr. David Alan Gilbert         poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
68400fa4fc8SDr. David Alan Gilbert         if (poll_result == -1) {
685c4faeed2SDr. David Alan Gilbert             error_report("%s: userfault poll: %s", __func__, strerror(errno));
686c4faeed2SDr. David Alan Gilbert             break;
687f0a227adSDr. David Alan Gilbert         }
688f0a227adSDr. David Alan Gilbert 
689c4faeed2SDr. David Alan Gilbert         if (pfd[1].revents) {
69064f615feSPeter Xu             uint64_t tmp64 = 0;
69164f615feSPeter Xu 
69264f615feSPeter Xu             /* Consume the signal */
69364f615feSPeter Xu             if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
69464f615feSPeter Xu                 /* Nothing obviously nicer than posting this error. */
69564f615feSPeter Xu                 error_report("%s: read() failed", __func__);
69664f615feSPeter Xu             }
69764f615feSPeter Xu 
69864f615feSPeter Xu             if (atomic_read(&mis->fault_thread_quit)) {
699c4faeed2SDr. David Alan Gilbert                 trace_postcopy_ram_fault_thread_quit();
700c4faeed2SDr. David Alan Gilbert                 break;
701c4faeed2SDr. David Alan Gilbert             }
70264f615feSPeter Xu         }
703c4faeed2SDr. David Alan Gilbert 
70400fa4fc8SDr. David Alan Gilbert         if (pfd[0].revents) {
70500fa4fc8SDr. David Alan Gilbert             poll_result--;
706c4faeed2SDr. David Alan Gilbert             ret = read(mis->userfault_fd, &msg, sizeof(msg));
707c4faeed2SDr. David Alan Gilbert             if (ret != sizeof(msg)) {
708c4faeed2SDr. David Alan Gilbert                 if (errno == EAGAIN) {
709c4faeed2SDr. David Alan Gilbert                     /*
710c4faeed2SDr. David Alan Gilbert                      * if a wake up happens on the other thread just after
711c4faeed2SDr. David Alan Gilbert                      * the poll, there is nothing to read.
712c4faeed2SDr. David Alan Gilbert                      */
713c4faeed2SDr. David Alan Gilbert                     continue;
714c4faeed2SDr. David Alan Gilbert                 }
715c4faeed2SDr. David Alan Gilbert                 if (ret < 0) {
71600fa4fc8SDr. David Alan Gilbert                     error_report("%s: Failed to read full userfault "
71700fa4fc8SDr. David Alan Gilbert                                  "message: %s",
718c4faeed2SDr. David Alan Gilbert                                  __func__, strerror(errno));
719c4faeed2SDr. David Alan Gilbert                     break;
720c4faeed2SDr. David Alan Gilbert                 } else {
72100fa4fc8SDr. David Alan Gilbert                     error_report("%s: Read %d bytes from userfaultfd "
72200fa4fc8SDr. David Alan Gilbert                                  "expected %zd",
723c4faeed2SDr. David Alan Gilbert                                  __func__, ret, sizeof(msg));
724c4faeed2SDr. David Alan Gilbert                     break; /* Lost alignment, don't know what we'd read next */
725c4faeed2SDr. David Alan Gilbert                 }
726c4faeed2SDr. David Alan Gilbert             }
727c4faeed2SDr. David Alan Gilbert             if (msg.event != UFFD_EVENT_PAGEFAULT) {
728c4faeed2SDr. David Alan Gilbert                 error_report("%s: Read unexpected event %ud from userfaultfd",
729c4faeed2SDr. David Alan Gilbert                              __func__, msg.event);
730c4faeed2SDr. David Alan Gilbert                 continue; /* It's not a page fault, shouldn't happen */
731c4faeed2SDr. David Alan Gilbert             }
732c4faeed2SDr. David Alan Gilbert 
733c4faeed2SDr. David Alan Gilbert             rb = qemu_ram_block_from_host(
734c4faeed2SDr. David Alan Gilbert                      (void *)(uintptr_t)msg.arg.pagefault.address,
735f615f396SPaolo Bonzini                      true, &rb_offset);
736c4faeed2SDr. David Alan Gilbert             if (!rb) {
737c4faeed2SDr. David Alan Gilbert                 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
738c4faeed2SDr. David Alan Gilbert                              PRIx64, (uint64_t)msg.arg.pagefault.address);
739c4faeed2SDr. David Alan Gilbert                 break;
740c4faeed2SDr. David Alan Gilbert             }
741c4faeed2SDr. David Alan Gilbert 
742332847f0SDr. David Alan Gilbert             rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
743c4faeed2SDr. David Alan Gilbert             trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
744c4faeed2SDr. David Alan Gilbert                                                 qemu_ram_get_idstr(rb),
745ee86981bSPeter Maydell                                                 rb_offset);
746c4faeed2SDr. David Alan Gilbert             /*
747c4faeed2SDr. David Alan Gilbert              * Send the request to the source - we want to request one
748c4faeed2SDr. David Alan Gilbert              * of our host page sizes (which is >= TPS)
749c4faeed2SDr. David Alan Gilbert              */
750096bf4c8SDr. David Alan Gilbert             if (rb != mis->last_rb) {
751096bf4c8SDr. David Alan Gilbert                 mis->last_rb = rb;
752c4faeed2SDr. David Alan Gilbert                 migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
753332847f0SDr. David Alan Gilbert                                          rb_offset, qemu_ram_pagesize(rb));
754c4faeed2SDr. David Alan Gilbert             } else {
755c4faeed2SDr. David Alan Gilbert                 /* Save some space */
756c4faeed2SDr. David Alan Gilbert                 migrate_send_rp_req_pages(mis, NULL,
757332847f0SDr. David Alan Gilbert                                          rb_offset, qemu_ram_pagesize(rb));
758c4faeed2SDr. David Alan Gilbert             }
759c4faeed2SDr. David Alan Gilbert         }
76000fa4fc8SDr. David Alan Gilbert 
76100fa4fc8SDr. David Alan Gilbert         /* Now handle any requests from external processes on shared memory */
76200fa4fc8SDr. David Alan Gilbert         /* TODO: May need to handle devices deregistering during postcopy */
76300fa4fc8SDr. David Alan Gilbert         for (index = 2; index < pfd_len && poll_result; index++) {
76400fa4fc8SDr. David Alan Gilbert             if (pfd[index].revents) {
76500fa4fc8SDr. David Alan Gilbert                 struct PostCopyFD *pcfd =
76600fa4fc8SDr. David Alan Gilbert                     &g_array_index(mis->postcopy_remote_fds,
76700fa4fc8SDr. David Alan Gilbert                                    struct PostCopyFD, index - 2);
76800fa4fc8SDr. David Alan Gilbert 
76900fa4fc8SDr. David Alan Gilbert                 poll_result--;
77000fa4fc8SDr. David Alan Gilbert                 if (pfd[index].revents & POLLERR) {
77100fa4fc8SDr. David Alan Gilbert                     error_report("%s: POLLERR on poll %zd fd=%d",
77200fa4fc8SDr. David Alan Gilbert                                  __func__, index, pcfd->fd);
77300fa4fc8SDr. David Alan Gilbert                     pfd[index].events = 0;
77400fa4fc8SDr. David Alan Gilbert                     continue;
77500fa4fc8SDr. David Alan Gilbert                 }
77600fa4fc8SDr. David Alan Gilbert 
77700fa4fc8SDr. David Alan Gilbert                 ret = read(pcfd->fd, &msg, sizeof(msg));
77800fa4fc8SDr. David Alan Gilbert                 if (ret != sizeof(msg)) {
77900fa4fc8SDr. David Alan Gilbert                     if (errno == EAGAIN) {
78000fa4fc8SDr. David Alan Gilbert                         /*
78100fa4fc8SDr. David Alan Gilbert                          * if a wake up happens on the other thread just after
78200fa4fc8SDr. David Alan Gilbert                          * the poll, there is nothing to read.
78300fa4fc8SDr. David Alan Gilbert                          */
78400fa4fc8SDr. David Alan Gilbert                         continue;
78500fa4fc8SDr. David Alan Gilbert                     }
78600fa4fc8SDr. David Alan Gilbert                     if (ret < 0) {
78700fa4fc8SDr. David Alan Gilbert                         error_report("%s: Failed to read full userfault "
78800fa4fc8SDr. David Alan Gilbert                                      "message: %s (shared) revents=%d",
78900fa4fc8SDr. David Alan Gilbert                                      __func__, strerror(errno),
79000fa4fc8SDr. David Alan Gilbert                                      pfd[index].revents);
79100fa4fc8SDr. David Alan Gilbert                         /*TODO: Could just disable this sharer */
79200fa4fc8SDr. David Alan Gilbert                         break;
79300fa4fc8SDr. David Alan Gilbert                     } else {
79400fa4fc8SDr. David Alan Gilbert                         error_report("%s: Read %d bytes from userfaultfd "
79500fa4fc8SDr. David Alan Gilbert                                      "expected %zd (shared)",
79600fa4fc8SDr. David Alan Gilbert                                      __func__, ret, sizeof(msg));
79700fa4fc8SDr. David Alan Gilbert                         /*TODO: Could just disable this sharer */
79800fa4fc8SDr. David Alan Gilbert                         break; /*Lost alignment,don't know what we'd read next*/
79900fa4fc8SDr. David Alan Gilbert                     }
80000fa4fc8SDr. David Alan Gilbert                 }
80100fa4fc8SDr. David Alan Gilbert                 if (msg.event != UFFD_EVENT_PAGEFAULT) {
80200fa4fc8SDr. David Alan Gilbert                     error_report("%s: Read unexpected event %ud "
80300fa4fc8SDr. David Alan Gilbert                                  "from userfaultfd (shared)",
80400fa4fc8SDr. David Alan Gilbert                                  __func__, msg.event);
80500fa4fc8SDr. David Alan Gilbert                     continue; /* It's not a page fault, shouldn't happen */
80600fa4fc8SDr. David Alan Gilbert                 }
80700fa4fc8SDr. David Alan Gilbert                 /* Call the device handler registered with us */
80800fa4fc8SDr. David Alan Gilbert                 ret = pcfd->handler(pcfd, &msg);
80900fa4fc8SDr. David Alan Gilbert                 if (ret) {
81000fa4fc8SDr. David Alan Gilbert                     error_report("%s: Failed to resolve shared fault on %zd/%s",
81100fa4fc8SDr. David Alan Gilbert                                  __func__, index, pcfd->idstr);
81200fa4fc8SDr. David Alan Gilbert                     /* TODO: Fail? Disable this sharer? */
81300fa4fc8SDr. David Alan Gilbert                 }
81400fa4fc8SDr. David Alan Gilbert             }
81500fa4fc8SDr. David Alan Gilbert         }
81600fa4fc8SDr. David Alan Gilbert     }
817c4faeed2SDr. David Alan Gilbert     trace_postcopy_ram_fault_thread_exit();
818fc6008f3SMarc-André Lureau     g_free(pfd);
819f0a227adSDr. David Alan Gilbert     return NULL;
820f0a227adSDr. David Alan Gilbert }
821f0a227adSDr. David Alan Gilbert 
822f0a227adSDr. David Alan Gilbert int postcopy_ram_enable_notify(MigrationIncomingState *mis)
823f0a227adSDr. David Alan Gilbert {
824c4faeed2SDr. David Alan Gilbert     /* Open the fd for the kernel to give us userfaults */
825c4faeed2SDr. David Alan Gilbert     mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
826c4faeed2SDr. David Alan Gilbert     if (mis->userfault_fd == -1) {
827c4faeed2SDr. David Alan Gilbert         error_report("%s: Failed to open userfault fd: %s", __func__,
828c4faeed2SDr. David Alan Gilbert                      strerror(errno));
829c4faeed2SDr. David Alan Gilbert         return -1;
830c4faeed2SDr. David Alan Gilbert     }
831c4faeed2SDr. David Alan Gilbert 
832c4faeed2SDr. David Alan Gilbert     /*
833c4faeed2SDr. David Alan Gilbert      * Although the host check already tested the API, we need to
834c4faeed2SDr. David Alan Gilbert      * do the check again as an ABI handshake on the new fd.
835c4faeed2SDr. David Alan Gilbert      */
83654ae0886SAlexey Perevalov     if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
837c4faeed2SDr. David Alan Gilbert         return -1;
838c4faeed2SDr. David Alan Gilbert     }
839c4faeed2SDr. David Alan Gilbert 
840c4faeed2SDr. David Alan Gilbert     /* Now an eventfd we use to tell the fault-thread to quit */
84164f615feSPeter Xu     mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
84264f615feSPeter Xu     if (mis->userfault_event_fd == -1) {
84364f615feSPeter Xu         error_report("%s: Opening userfault_event_fd: %s", __func__,
844c4faeed2SDr. David Alan Gilbert                      strerror(errno));
845c4faeed2SDr. David Alan Gilbert         close(mis->userfault_fd);
846c4faeed2SDr. David Alan Gilbert         return -1;
847c4faeed2SDr. David Alan Gilbert     }
848c4faeed2SDr. David Alan Gilbert 
849f0a227adSDr. David Alan Gilbert     qemu_sem_init(&mis->fault_thread_sem, 0);
850f0a227adSDr. David Alan Gilbert     qemu_thread_create(&mis->fault_thread, "postcopy/fault",
851f0a227adSDr. David Alan Gilbert                        postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
852f0a227adSDr. David Alan Gilbert     qemu_sem_wait(&mis->fault_thread_sem);
853f0a227adSDr. David Alan Gilbert     qemu_sem_destroy(&mis->fault_thread_sem);
854c4faeed2SDr. David Alan Gilbert     mis->have_fault_thread = true;
855f0a227adSDr. David Alan Gilbert 
856f0a227adSDr. David Alan Gilbert     /* Mark so that we get notified of accesses to unwritten areas */
857f0a227adSDr. David Alan Gilbert     if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
858f0a227adSDr. David Alan Gilbert         return -1;
859f0a227adSDr. David Alan Gilbert     }
860f0a227adSDr. David Alan Gilbert 
861371ff5a3SDr. David Alan Gilbert     /*
862371ff5a3SDr. David Alan Gilbert      * Ballooning can mark pages as absent while we're postcopying
863371ff5a3SDr. David Alan Gilbert      * that would cause false userfaults.
864371ff5a3SDr. David Alan Gilbert      */
865371ff5a3SDr. David Alan Gilbert     qemu_balloon_inhibit(true);
866371ff5a3SDr. David Alan Gilbert 
867c4faeed2SDr. David Alan Gilbert     trace_postcopy_ram_enable_notify();
868c4faeed2SDr. David Alan Gilbert 
869f0a227adSDr. David Alan Gilbert     return 0;
870f0a227adSDr. David Alan Gilbert }
871f0a227adSDr. David Alan Gilbert 
872727b9d7eSAlexey Perevalov static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
873f9494614SAlexey Perevalov                                void *from_addr, uint64_t pagesize, RAMBlock *rb)
874727b9d7eSAlexey Perevalov {
875f9494614SAlexey Perevalov     int ret;
876727b9d7eSAlexey Perevalov     if (from_addr) {
877727b9d7eSAlexey Perevalov         struct uffdio_copy copy_struct;
878727b9d7eSAlexey Perevalov         copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
879727b9d7eSAlexey Perevalov         copy_struct.src = (uint64_t)(uintptr_t)from_addr;
880727b9d7eSAlexey Perevalov         copy_struct.len = pagesize;
881727b9d7eSAlexey Perevalov         copy_struct.mode = 0;
882f9494614SAlexey Perevalov         ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
883727b9d7eSAlexey Perevalov     } else {
884727b9d7eSAlexey Perevalov         struct uffdio_zeropage zero_struct;
885727b9d7eSAlexey Perevalov         zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
886727b9d7eSAlexey Perevalov         zero_struct.range.len = pagesize;
887727b9d7eSAlexey Perevalov         zero_struct.mode = 0;
888f9494614SAlexey Perevalov         ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
889727b9d7eSAlexey Perevalov     }
890f9494614SAlexey Perevalov     if (!ret) {
891f9494614SAlexey Perevalov         ramblock_recv_bitmap_set_range(rb, host_addr,
892f9494614SAlexey Perevalov                                        pagesize / qemu_target_page_size());
893f9494614SAlexey Perevalov     }
894f9494614SAlexey Perevalov     return ret;
895727b9d7eSAlexey Perevalov }
896727b9d7eSAlexey Perevalov 
897d488b349SDr. David Alan Gilbert int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
898d488b349SDr. David Alan Gilbert {
899d488b349SDr. David Alan Gilbert     int i;
900d488b349SDr. David Alan Gilbert     MigrationIncomingState *mis = migration_incoming_get_current();
901d488b349SDr. David Alan Gilbert     GArray *pcrfds = mis->postcopy_remote_fds;
902d488b349SDr. David Alan Gilbert 
903d488b349SDr. David Alan Gilbert     for (i = 0; i < pcrfds->len; i++) {
904d488b349SDr. David Alan Gilbert         struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
905d488b349SDr. David Alan Gilbert         int ret = cur->waker(cur, rb, offset);
906d488b349SDr. David Alan Gilbert         if (ret) {
907d488b349SDr. David Alan Gilbert             return ret;
908d488b349SDr. David Alan Gilbert         }
909d488b349SDr. David Alan Gilbert     }
910d488b349SDr. David Alan Gilbert     return 0;
911d488b349SDr. David Alan Gilbert }
912d488b349SDr. David Alan Gilbert 
913696ed9a9SDr. David Alan Gilbert /*
914696ed9a9SDr. David Alan Gilbert  * Place a host page (from) at (host) atomically
915696ed9a9SDr. David Alan Gilbert  * returns 0 on success
916696ed9a9SDr. David Alan Gilbert  */
917df9ff5e1SDr. David Alan Gilbert int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
9188be4620bSAlexey Perevalov                         RAMBlock *rb)
919696ed9a9SDr. David Alan Gilbert {
9208be4620bSAlexey Perevalov     size_t pagesize = qemu_ram_pagesize(rb);
921696ed9a9SDr. David Alan Gilbert 
922696ed9a9SDr. David Alan Gilbert     /* copy also acks to the kernel waking the stalled thread up
923696ed9a9SDr. David Alan Gilbert      * TODO: We can inhibit that ack and only do it if it was requested
924696ed9a9SDr. David Alan Gilbert      * which would be slightly cheaper, but we'd have to be careful
925696ed9a9SDr. David Alan Gilbert      * of the order of updating our page state.
926696ed9a9SDr. David Alan Gilbert      */
927f9494614SAlexey Perevalov     if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
928696ed9a9SDr. David Alan Gilbert         int e = errno;
929df9ff5e1SDr. David Alan Gilbert         error_report("%s: %s copy host: %p from: %p (size: %zd)",
930df9ff5e1SDr. David Alan Gilbert                      __func__, strerror(e), host, from, pagesize);
931696ed9a9SDr. David Alan Gilbert 
932696ed9a9SDr. David Alan Gilbert         return -e;
933696ed9a9SDr. David Alan Gilbert     }
934696ed9a9SDr. David Alan Gilbert 
935696ed9a9SDr. David Alan Gilbert     trace_postcopy_place_page(host);
936dedfb4b2SDr. David Alan Gilbert     return postcopy_notify_shared_wake(rb,
937dedfb4b2SDr. David Alan Gilbert                                        qemu_ram_block_host_offset(rb, host));
938696ed9a9SDr. David Alan Gilbert }
939696ed9a9SDr. David Alan Gilbert 
940696ed9a9SDr. David Alan Gilbert /*
941696ed9a9SDr. David Alan Gilbert  * Place a zero page at (host) atomically
942696ed9a9SDr. David Alan Gilbert  * returns 0 on success
943696ed9a9SDr. David Alan Gilbert  */
944df9ff5e1SDr. David Alan Gilbert int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
9458be4620bSAlexey Perevalov                              RAMBlock *rb)
946696ed9a9SDr. David Alan Gilbert {
9472ce16640SDr. David Alan Gilbert     size_t pagesize = qemu_ram_pagesize(rb);
948df9ff5e1SDr. David Alan Gilbert     trace_postcopy_place_page_zero(host);
949696ed9a9SDr. David Alan Gilbert 
9502ce16640SDr. David Alan Gilbert     /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
9512ce16640SDr. David Alan Gilbert      * but it's not available for everything (e.g. hugetlbpages)
9522ce16640SDr. David Alan Gilbert      */
9532ce16640SDr. David Alan Gilbert     if (qemu_ram_is_uf_zeroable(rb)) {
9542ce16640SDr. David Alan Gilbert         if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
955696ed9a9SDr. David Alan Gilbert             int e = errno;
956696ed9a9SDr. David Alan Gilbert             error_report("%s: %s zero host: %p",
957696ed9a9SDr. David Alan Gilbert                          __func__, strerror(e), host);
958696ed9a9SDr. David Alan Gilbert 
959696ed9a9SDr. David Alan Gilbert             return -e;
960696ed9a9SDr. David Alan Gilbert         }
961dedfb4b2SDr. David Alan Gilbert         return postcopy_notify_shared_wake(rb,
962dedfb4b2SDr. David Alan Gilbert                                            qemu_ram_block_host_offset(rb,
963dedfb4b2SDr. David Alan Gilbert                                                                       host));
964df9ff5e1SDr. David Alan Gilbert     } else {
96541d84210SDr. David Alan Gilbert         /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
96641d84210SDr. David Alan Gilbert         if (!mis->postcopy_tmp_zero_page) {
96741d84210SDr. David Alan Gilbert             mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
96841d84210SDr. David Alan Gilbert                                                PROT_READ | PROT_WRITE,
96941d84210SDr. David Alan Gilbert                                                MAP_PRIVATE | MAP_ANONYMOUS,
97041d84210SDr. David Alan Gilbert                                                -1, 0);
97141d84210SDr. David Alan Gilbert             if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
97241d84210SDr. David Alan Gilbert                 int e = errno;
97341d84210SDr. David Alan Gilbert                 mis->postcopy_tmp_zero_page = NULL;
97441d84210SDr. David Alan Gilbert                 error_report("%s: %s mapping large zero page",
97541d84210SDr. David Alan Gilbert                              __func__, strerror(e));
97641d84210SDr. David Alan Gilbert                 return -e;
97741d84210SDr. David Alan Gilbert             }
97841d84210SDr. David Alan Gilbert             memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
97941d84210SDr. David Alan Gilbert         }
98041d84210SDr. David Alan Gilbert         return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
9818be4620bSAlexey Perevalov                                    rb);
982df9ff5e1SDr. David Alan Gilbert     }
983696ed9a9SDr. David Alan Gilbert }
984696ed9a9SDr. David Alan Gilbert 
985696ed9a9SDr. David Alan Gilbert /*
986696ed9a9SDr. David Alan Gilbert  * Returns a target page of memory that can be mapped at a later point in time
987696ed9a9SDr. David Alan Gilbert  * using postcopy_place_page
988696ed9a9SDr. David Alan Gilbert  * The same address is used repeatedly, postcopy_place_page just takes the
989696ed9a9SDr. David Alan Gilbert  * backing page away.
990696ed9a9SDr. David Alan Gilbert  * Returns: Pointer to allocated page
991696ed9a9SDr. David Alan Gilbert  *
992696ed9a9SDr. David Alan Gilbert  */
993696ed9a9SDr. David Alan Gilbert void *postcopy_get_tmp_page(MigrationIncomingState *mis)
994696ed9a9SDr. David Alan Gilbert {
995696ed9a9SDr. David Alan Gilbert     if (!mis->postcopy_tmp_page) {
996df9ff5e1SDr. David Alan Gilbert         mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
997696ed9a9SDr. David Alan Gilbert                              PROT_READ | PROT_WRITE, MAP_PRIVATE |
998696ed9a9SDr. David Alan Gilbert                              MAP_ANONYMOUS, -1, 0);
9990e8b3cdfSEvgeny Yakovlev         if (mis->postcopy_tmp_page == MAP_FAILED) {
10000e8b3cdfSEvgeny Yakovlev             mis->postcopy_tmp_page = NULL;
1001696ed9a9SDr. David Alan Gilbert             error_report("%s: %s", __func__, strerror(errno));
1002696ed9a9SDr. David Alan Gilbert             return NULL;
1003696ed9a9SDr. David Alan Gilbert         }
1004696ed9a9SDr. David Alan Gilbert     }
1005696ed9a9SDr. David Alan Gilbert 
1006696ed9a9SDr. David Alan Gilbert     return mis->postcopy_tmp_page;
1007696ed9a9SDr. David Alan Gilbert }
1008696ed9a9SDr. David Alan Gilbert 
1009eb59db53SDr. David Alan Gilbert #else
1010eb59db53SDr. David Alan Gilbert /* No target OS support, stubs just fail */
1011d7651f15SAlexey Perevalov bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
1012eb59db53SDr. David Alan Gilbert {
1013eb59db53SDr. David Alan Gilbert     error_report("%s: No OS support", __func__);
1014eb59db53SDr. David Alan Gilbert     return false;
1015eb59db53SDr. David Alan Gilbert }
1016eb59db53SDr. David Alan Gilbert 
10171caddf8aSDr. David Alan Gilbert int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
10181caddf8aSDr. David Alan Gilbert {
10191caddf8aSDr. David Alan Gilbert     error_report("postcopy_ram_incoming_init: No OS support");
10201caddf8aSDr. David Alan Gilbert     return -1;
10211caddf8aSDr. David Alan Gilbert }
10221caddf8aSDr. David Alan Gilbert 
10231caddf8aSDr. David Alan Gilbert int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
10241caddf8aSDr. David Alan Gilbert {
10251caddf8aSDr. David Alan Gilbert     assert(0);
10261caddf8aSDr. David Alan Gilbert     return -1;
10271caddf8aSDr. David Alan Gilbert }
10281caddf8aSDr. David Alan Gilbert 
1029f9527107SDr. David Alan Gilbert int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1030f9527107SDr. David Alan Gilbert {
1031f9527107SDr. David Alan Gilbert     assert(0);
1032f9527107SDr. David Alan Gilbert     return -1;
1033f9527107SDr. David Alan Gilbert }
1034f9527107SDr. David Alan Gilbert 
1035c188c539SMichael S. Tsirkin int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1036c188c539SMichael S. Tsirkin                                  uint64_t client_addr, uint64_t rb_offset)
1037c188c539SMichael S. Tsirkin {
1038c188c539SMichael S. Tsirkin     assert(0);
1039c188c539SMichael S. Tsirkin     return -1;
1040c188c539SMichael S. Tsirkin }
1041c188c539SMichael S. Tsirkin 
1042f0a227adSDr. David Alan Gilbert int postcopy_ram_enable_notify(MigrationIncomingState *mis)
1043f0a227adSDr. David Alan Gilbert {
1044f0a227adSDr. David Alan Gilbert     assert(0);
1045f0a227adSDr. David Alan Gilbert     return -1;
1046f0a227adSDr. David Alan Gilbert }
1047696ed9a9SDr. David Alan Gilbert 
1048df9ff5e1SDr. David Alan Gilbert int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
10498be4620bSAlexey Perevalov                         RAMBlock *rb)
1050696ed9a9SDr. David Alan Gilbert {
1051696ed9a9SDr. David Alan Gilbert     assert(0);
1052696ed9a9SDr. David Alan Gilbert     return -1;
1053696ed9a9SDr. David Alan Gilbert }
1054696ed9a9SDr. David Alan Gilbert 
1055df9ff5e1SDr. David Alan Gilbert int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
10568be4620bSAlexey Perevalov                         RAMBlock *rb)
1057696ed9a9SDr. David Alan Gilbert {
1058696ed9a9SDr. David Alan Gilbert     assert(0);
1059696ed9a9SDr. David Alan Gilbert     return -1;
1060696ed9a9SDr. David Alan Gilbert }
1061696ed9a9SDr. David Alan Gilbert 
1062696ed9a9SDr. David Alan Gilbert void *postcopy_get_tmp_page(MigrationIncomingState *mis)
1063696ed9a9SDr. David Alan Gilbert {
1064696ed9a9SDr. David Alan Gilbert     assert(0);
1065696ed9a9SDr. David Alan Gilbert     return NULL;
1066696ed9a9SDr. David Alan Gilbert }
1067696ed9a9SDr. David Alan Gilbert 
10685efc3564SDr. David Alan Gilbert int postcopy_wake_shared(struct PostCopyFD *pcfd,
10695efc3564SDr. David Alan Gilbert                          uint64_t client_addr,
10705efc3564SDr. David Alan Gilbert                          RAMBlock *rb)
10715efc3564SDr. David Alan Gilbert {
10725efc3564SDr. David Alan Gilbert     assert(0);
10735efc3564SDr. David Alan Gilbert     return -1;
10745efc3564SDr. David Alan Gilbert }
1075eb59db53SDr. David Alan Gilbert #endif
1076eb59db53SDr. David Alan Gilbert 
1077e0b266f0SDr. David Alan Gilbert /* ------------------------------------------------------------------------- */
1078e0b266f0SDr. David Alan Gilbert 
10799ab7ef9bSPeter Xu void postcopy_fault_thread_notify(MigrationIncomingState *mis)
10809ab7ef9bSPeter Xu {
10819ab7ef9bSPeter Xu     uint64_t tmp64 = 1;
10829ab7ef9bSPeter Xu 
10839ab7ef9bSPeter Xu     /*
10849ab7ef9bSPeter Xu      * Wakeup the fault_thread.  It's an eventfd that should currently
10859ab7ef9bSPeter Xu      * be at 0, we're going to increment it to 1
10869ab7ef9bSPeter Xu      */
10879ab7ef9bSPeter Xu     if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
10889ab7ef9bSPeter Xu         /* Not much we can do here, but may as well report it */
10899ab7ef9bSPeter Xu         error_report("%s: incrementing failed: %s", __func__,
10909ab7ef9bSPeter Xu                      strerror(errno));
10919ab7ef9bSPeter Xu     }
10929ab7ef9bSPeter Xu }
10939ab7ef9bSPeter Xu 
1094e0b266f0SDr. David Alan Gilbert /**
1095e0b266f0SDr. David Alan Gilbert  * postcopy_discard_send_init: Called at the start of each RAMBlock before
1096e0b266f0SDr. David Alan Gilbert  *   asking to discard individual ranges.
1097e0b266f0SDr. David Alan Gilbert  *
1098e0b266f0SDr. David Alan Gilbert  * @ms: The current migration state.
1099e0b266f0SDr. David Alan Gilbert  * @offset: the bitmap offset of the named RAMBlock in the migration
1100e0b266f0SDr. David Alan Gilbert  *   bitmap.
1101e0b266f0SDr. David Alan Gilbert  * @name: RAMBlock that discards will operate on.
1102e0b266f0SDr. David Alan Gilbert  *
1103e0b266f0SDr. David Alan Gilbert  * returns: a new PDS.
1104e0b266f0SDr. David Alan Gilbert  */
1105e0b266f0SDr. David Alan Gilbert PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
1106e0b266f0SDr. David Alan Gilbert                                                  const char *name)
1107e0b266f0SDr. David Alan Gilbert {
1108e0b266f0SDr. David Alan Gilbert     PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
1109e0b266f0SDr. David Alan Gilbert 
1110e0b266f0SDr. David Alan Gilbert     if (res) {
1111e0b266f0SDr. David Alan Gilbert         res->ramblock_name = name;
1112e0b266f0SDr. David Alan Gilbert     }
1113e0b266f0SDr. David Alan Gilbert 
1114e0b266f0SDr. David Alan Gilbert     return res;
1115e0b266f0SDr. David Alan Gilbert }
1116e0b266f0SDr. David Alan Gilbert 
1117e0b266f0SDr. David Alan Gilbert /**
1118e0b266f0SDr. David Alan Gilbert  * postcopy_discard_send_range: Called by the bitmap code for each chunk to
1119e0b266f0SDr. David Alan Gilbert  *   discard. May send a discard message, may just leave it queued to
1120e0b266f0SDr. David Alan Gilbert  *   be sent later.
1121e0b266f0SDr. David Alan Gilbert  *
1122e0b266f0SDr. David Alan Gilbert  * @ms: Current migration state.
1123e0b266f0SDr. David Alan Gilbert  * @pds: Structure initialised by postcopy_discard_send_init().
1124e0b266f0SDr. David Alan Gilbert  * @start,@length: a range of pages in the migration bitmap in the
1125e0b266f0SDr. David Alan Gilbert  *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
1126e0b266f0SDr. David Alan Gilbert  */
1127e0b266f0SDr. David Alan Gilbert void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
1128e0b266f0SDr. David Alan Gilbert                                 unsigned long start, unsigned long length)
1129e0b266f0SDr. David Alan Gilbert {
113020afaed9SJuan Quintela     size_t tp_size = qemu_target_page_size();
1131e0b266f0SDr. David Alan Gilbert     /* Convert to byte offsets within the RAM block */
11326b6712efSJuan Quintela     pds->start_list[pds->cur_entry] = start  * tp_size;
113320afaed9SJuan Quintela     pds->length_list[pds->cur_entry] = length * tp_size;
1134e0b266f0SDr. David Alan Gilbert     trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
1135e0b266f0SDr. David Alan Gilbert     pds->cur_entry++;
1136e0b266f0SDr. David Alan Gilbert     pds->nsentwords++;
1137e0b266f0SDr. David Alan Gilbert 
1138e0b266f0SDr. David Alan Gilbert     if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
1139e0b266f0SDr. David Alan Gilbert         /* Full set, ship it! */
114089a02a9fSzhanghailiang         qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
114189a02a9fSzhanghailiang                                               pds->ramblock_name,
1142e0b266f0SDr. David Alan Gilbert                                               pds->cur_entry,
1143e0b266f0SDr. David Alan Gilbert                                               pds->start_list,
1144e0b266f0SDr. David Alan Gilbert                                               pds->length_list);
1145e0b266f0SDr. David Alan Gilbert         pds->nsentcmds++;
1146e0b266f0SDr. David Alan Gilbert         pds->cur_entry = 0;
1147e0b266f0SDr. David Alan Gilbert     }
1148e0b266f0SDr. David Alan Gilbert }
1149e0b266f0SDr. David Alan Gilbert 
1150e0b266f0SDr. David Alan Gilbert /**
1151e0b266f0SDr. David Alan Gilbert  * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
1152e0b266f0SDr. David Alan Gilbert  * bitmap code. Sends any outstanding discard messages, frees the PDS
1153e0b266f0SDr. David Alan Gilbert  *
1154e0b266f0SDr. David Alan Gilbert  * @ms: Current migration state.
1155e0b266f0SDr. David Alan Gilbert  * @pds: Structure initialised by postcopy_discard_send_init().
1156e0b266f0SDr. David Alan Gilbert  */
1157e0b266f0SDr. David Alan Gilbert void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
1158e0b266f0SDr. David Alan Gilbert {
1159e0b266f0SDr. David Alan Gilbert     /* Anything unsent? */
1160e0b266f0SDr. David Alan Gilbert     if (pds->cur_entry) {
116189a02a9fSzhanghailiang         qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
116289a02a9fSzhanghailiang                                               pds->ramblock_name,
1163e0b266f0SDr. David Alan Gilbert                                               pds->cur_entry,
1164e0b266f0SDr. David Alan Gilbert                                               pds->start_list,
1165e0b266f0SDr. David Alan Gilbert                                               pds->length_list);
1166e0b266f0SDr. David Alan Gilbert         pds->nsentcmds++;
1167e0b266f0SDr. David Alan Gilbert     }
1168e0b266f0SDr. David Alan Gilbert 
1169e0b266f0SDr. David Alan Gilbert     trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
1170e0b266f0SDr. David Alan Gilbert                                        pds->nsentcmds);
1171e0b266f0SDr. David Alan Gilbert 
1172e0b266f0SDr. David Alan Gilbert     g_free(pds);
1173e0b266f0SDr. David Alan Gilbert }
1174bac3b212SJuan Quintela 
1175bac3b212SJuan Quintela /*
1176bac3b212SJuan Quintela  * Current state of incoming postcopy; note this is not part of
1177bac3b212SJuan Quintela  * MigrationIncomingState since it's state is used during cleanup
1178bac3b212SJuan Quintela  * at the end as MIS is being freed.
1179bac3b212SJuan Quintela  */
1180bac3b212SJuan Quintela static PostcopyState incoming_postcopy_state;
1181bac3b212SJuan Quintela 
1182bac3b212SJuan Quintela PostcopyState  postcopy_state_get(void)
1183bac3b212SJuan Quintela {
1184bac3b212SJuan Quintela     return atomic_mb_read(&incoming_postcopy_state);
1185bac3b212SJuan Quintela }
1186bac3b212SJuan Quintela 
1187bac3b212SJuan Quintela /* Set the state and return the old state */
1188bac3b212SJuan Quintela PostcopyState postcopy_state_set(PostcopyState new_state)
1189bac3b212SJuan Quintela {
1190bac3b212SJuan Quintela     return atomic_xchg(&incoming_postcopy_state, new_state);
1191bac3b212SJuan Quintela }
119200fa4fc8SDr. David Alan Gilbert 
119300fa4fc8SDr. David Alan Gilbert /* Register a handler for external shared memory postcopy
119400fa4fc8SDr. David Alan Gilbert  * called on the destination.
119500fa4fc8SDr. David Alan Gilbert  */
119600fa4fc8SDr. David Alan Gilbert void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
119700fa4fc8SDr. David Alan Gilbert {
119800fa4fc8SDr. David Alan Gilbert     MigrationIncomingState *mis = migration_incoming_get_current();
119900fa4fc8SDr. David Alan Gilbert 
120000fa4fc8SDr. David Alan Gilbert     mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
120100fa4fc8SDr. David Alan Gilbert                                                   *pcfd);
120200fa4fc8SDr. David Alan Gilbert }
120300fa4fc8SDr. David Alan Gilbert 
120400fa4fc8SDr. David Alan Gilbert /* Unregister a handler for external shared memory postcopy
120500fa4fc8SDr. David Alan Gilbert  */
120600fa4fc8SDr. David Alan Gilbert void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
120700fa4fc8SDr. David Alan Gilbert {
120800fa4fc8SDr. David Alan Gilbert     guint i;
120900fa4fc8SDr. David Alan Gilbert     MigrationIncomingState *mis = migration_incoming_get_current();
121000fa4fc8SDr. David Alan Gilbert     GArray *pcrfds = mis->postcopy_remote_fds;
121100fa4fc8SDr. David Alan Gilbert 
121200fa4fc8SDr. David Alan Gilbert     for (i = 0; i < pcrfds->len; i++) {
121300fa4fc8SDr. David Alan Gilbert         struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
121400fa4fc8SDr. David Alan Gilbert         if (cur->fd == pcfd->fd) {
121500fa4fc8SDr. David Alan Gilbert             mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
121600fa4fc8SDr. David Alan Gilbert             return;
121700fa4fc8SDr. David Alan Gilbert         }
121800fa4fc8SDr. David Alan Gilbert     }
121900fa4fc8SDr. David Alan Gilbert }
1220