xref: /qemu/migration/ram.c (revision ead62c75f618c072a3a18221fd03ae99ae923cca)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/cpu-throttle.h"
55 #include "savevm.h"
56 #include "qemu/iov.h"
57 #include "multifd.h"
58 #include "sysemu/runstate.h"
59 
60 #if defined(__linux__)
61 #include "qemu/userfaultfd.h"
62 #endif /* defined(__linux__) */
63 
64 /***********************************************************/
65 /* ram save/restore */
66 
67 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
68  * worked for pages that were filled with the same char.  We switched
69  * it to only search for the zero value.  It was also renamed to avoid
70  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
71  */
72 
73 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
74 #define RAM_SAVE_FLAG_ZERO     0x02
75 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
76 #define RAM_SAVE_FLAG_PAGE     0x08
77 #define RAM_SAVE_FLAG_EOS      0x10
78 #define RAM_SAVE_FLAG_CONTINUE 0x20
79 #define RAM_SAVE_FLAG_XBZRLE   0x40
80 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
81 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
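
/*
 * Illustrative note (not a new format definition): these flags are
 * OR'ed into the low bits of the 8-byte page offset emitted by
 * save_page_header(), which works because offsets are
 * TARGET_PAGE_SIZE aligned.  Assuming 4K target pages, offset 0x2000
 * of an already-announced block sent as a zero page goes out as
 *
 *   0x2000 | RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_ZERO == 0x2022
 */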
82 
83 static inline bool is_zero_range(uint8_t *p, uint64_t size)
84 {
85     return buffer_is_zero(p, size);
86 }
87 
88 XBZRLECacheStats xbzrle_counters;
89 
90 /* struct containing the XBZRLE cache and a static page
91    used by the compression */
92 static struct {
93     /* buffer used for XBZRLE encoding */
94     uint8_t *encoded_buf;
95     /* buffer for storing page content */
96     uint8_t *current_buf;
97     /* Cache for XBZRLE, Protected by lock. */
98     PageCache *cache;
99     QemuMutex lock;
100     /* it will store a page full of zeros */
101     uint8_t *zero_target_page;
102     /* buffer used for XBZRLE decoding */
103     uint8_t *decoded_buf;
104 } XBZRLE;
105 
106 static void XBZRLE_cache_lock(void)
107 {
108     if (migrate_use_xbzrle()) {
109         qemu_mutex_lock(&XBZRLE.lock);
110     }
111 }
112 
113 static void XBZRLE_cache_unlock(void)
114 {
115     if (migrate_use_xbzrle()) {
116         qemu_mutex_unlock(&XBZRLE.lock);
117     }
118 }
119 
120 /**
121  * xbzrle_cache_resize: resize the xbzrle cache
122  *
123  * This function is called from migrate_params_apply in the main
124  * thread, possibly while a migration is in progress.  A running
125  * migration may be using the cache and might finish during this call,
126  * hence changes to the cache are protected by XBZRLE.lock.
127  *
128  * Returns 0 for success or -1 for error
129  *
130  * @new_size: new cache size
131  * @errp: set *errp if the check failed, with the reason
132  */
133 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
134 {
135     PageCache *new_cache;
136     int64_t ret = 0;
137 
138     /* Check for truncation */
139     if (new_size != (size_t)new_size) {
140         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
141                    "exceeding address space");
142         return -1;
143     }
144 
145     if (new_size == migrate_xbzrle_cache_size()) {
146         /* nothing to do */
147         return 0;
148     }
149 
150     XBZRLE_cache_lock();
151 
152     if (XBZRLE.cache != NULL) {
153         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
154         if (!new_cache) {
155             ret = -1;
156             goto out;
157         }
158 
159         cache_fini(XBZRLE.cache);
160         XBZRLE.cache = new_cache;
161     }
162 out:
163     XBZRLE_cache_unlock();
164     return ret;
165 }
166 
167 bool ramblock_is_ignored(RAMBlock *block)
168 {
169     return !qemu_ram_is_migratable(block) ||
170            (migrate_ignore_shared() && qemu_ram_is_shared(block));
171 }
172 
173 #undef RAMBLOCK_FOREACH
174 
175 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
176 {
177     RAMBlock *block;
178     int ret = 0;
179 
180     RCU_READ_LOCK_GUARD();
181 
182     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
183         ret = func(block, opaque);
184         if (ret) {
185             break;
186         }
187     }
188     return ret;
189 }
190 
191 static void ramblock_recv_map_init(void)
192 {
193     RAMBlock *rb;
194 
195     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
196         assert(!rb->receivedmap);
197         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
198     }
199 }
200 
201 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
202 {
203     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
204                     rb->receivedmap);
205 }
206 
207 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
208 {
209     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
210 }
211 
212 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
213 {
214     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
215 }
216 
217 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
218                                     size_t nr)
219 {
220     bitmap_set_atomic(rb->receivedmap,
221                       ramblock_recv_bitmap_offset(host_addr, rb),
222                       nr);
223 }
224 
225 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
226 
227 /*
228  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
229  *
230  * Returns the number of bytes sent (>0) on success, or <0 on error.
231  */
232 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
233                                   const char *block_name)
234 {
235     RAMBlock *block = qemu_ram_block_by_name(block_name);
236     unsigned long *le_bitmap, nbits;
237     uint64_t size;
238 
239     if (!block) {
240         error_report("%s: invalid block name: %s", __func__, block_name);
241         return -1;
242     }
243 
244     nbits = block->used_length >> TARGET_PAGE_BITS;
245 
246     /*
247      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
248      * machines we may need 4 more bytes for padding (see below
249      * comment). So extend it a bit beforehand.
250      */
251     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
252 
253     /*
254      * Always use little endian when sending the bitmap. This is
255      * required so that it works even when the source and destination
256      * VMs are not using the same endianness. (Note: big endian won't work.)
257      */
258     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
259 
260     /* Size of the bitmap, in bytes */
261     size = DIV_ROUND_UP(nbits, 8);
262 
263     /*
264      * size is always aligned to 8 bytes for 64bit machines, but it
265      * may not be true for 32bit machines. We need this padding to
266      * make sure the migration can survive even between 32bit and
267      * 64bit machines.
268      */
269     size = ROUND_UP(size, 8);
270 
271     qemu_put_be64(file, size);
272     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
273     /*
274      * Mark as an end, in case the middle part is screwed up due to
275      * some "mysterious" reason.
276      */
277     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
278     qemu_fflush(file);
279 
280     g_free(le_bitmap);
281 
282     if (qemu_file_get_error(file)) {
283         return qemu_file_get_error(file);
284     }
285 
286     return size + sizeof(size);
287 }
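
/*
 * Informal sketch of the stream written by ramblock_recv_bitmap_send()
 * above (derived from the qemu_put_* calls, not a separate format
 * definition):
 *
 *   be64:  size                      bitmap size in bytes, rounded up to 8
 *   bytes: le_bitmap[size]           receivedmap in little-endian bit order
 *   be64:  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * so the value returned on success is size + 8.
 */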
288 
289 /*
290  * An outstanding page request, on the source, having been received
291  * and queued
292  */
293 struct RAMSrcPageRequest {
294     RAMBlock *rb;
295     hwaddr    offset;
296     hwaddr    len;
297 
298     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
299 };
300 
301 /* State of RAM for migration */
302 struct RAMState {
303     /* QEMUFile used for this migration */
304     QEMUFile *f;
305     /* UFFD file descriptor, used in 'write-tracking' migration */
306     int uffdio_fd;
307     /* Last block that we have visited searching for dirty pages */
308     RAMBlock *last_seen_block;
309     /* Last block from where we have sent data */
310     RAMBlock *last_sent_block;
311     /* Last dirty target page we have sent */
312     ram_addr_t last_page;
313     /* last ram version we have seen */
314     uint32_t last_version;
315     /* We are in the first round */
316     bool ram_bulk_stage;
317     /* The free page optimization is enabled */
318     bool fpo_enabled;
319     /* How many times we have dirty too many pages */
320     int dirty_rate_high_cnt;
321     /* these variables are used for bitmap sync */
322     /* last time we did a full bitmap_sync */
323     int64_t time_last_bitmap_sync;
324     /* bytes transferred at start_time */
325     uint64_t bytes_xfer_prev;
326     /* number of dirty pages since start_time */
327     uint64_t num_dirty_pages_period;
328     /* xbzrle misses since the beginning of the period */
329     uint64_t xbzrle_cache_miss_prev;
330     /* Amount of xbzrle pages since the beginning of the period */
331     uint64_t xbzrle_pages_prev;
332     /* Amount of xbzrle encoded bytes since the beginning of the period */
333     uint64_t xbzrle_bytes_prev;
334 
335     /* compression statistics since the beginning of the period */
336     /* number of times there was no free thread to compress data */
337     uint64_t compress_thread_busy_prev;
338     /* amount of bytes after compression */
339     uint64_t compressed_size_prev;
340     /* amount of compressed pages */
341     uint64_t compress_pages_prev;
342 
343     /* total handled target pages at the beginning of period */
344     uint64_t target_page_count_prev;
345     /* total handled target pages since start */
346     uint64_t target_page_count;
347     /* number of dirty bits in the bitmap */
348     uint64_t migration_dirty_pages;
349     /* Protects modification of the bitmap and migration dirty pages */
350     QemuMutex bitmap_mutex;
351     /* The RAMBlock used in the last src_page_requests */
352     RAMBlock *last_req_rb;
353     /* Queue of outstanding page requests from the destination */
354     QemuMutex src_page_req_mutex;
355     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
356 };
357 typedef struct RAMState RAMState;
358 
359 static RAMState *ram_state;
360 
361 static NotifierWithReturnList precopy_notifier_list;
362 
363 void precopy_infrastructure_init(void)
364 {
365     notifier_with_return_list_init(&precopy_notifier_list);
366 }
367 
368 void precopy_add_notifier(NotifierWithReturn *n)
369 {
370     notifier_with_return_list_add(&precopy_notifier_list, n);
371 }
372 
373 void precopy_remove_notifier(NotifierWithReturn *n)
374 {
375     notifier_with_return_remove(n);
376 }
377 
378 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
379 {
380     PrecopyNotifyData pnd;
381     pnd.reason = reason;
382     pnd.errp = errp;
383 
384     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
385 }
386 
387 void precopy_enable_free_page_optimization(void)
388 {
389     if (!ram_state) {
390         return;
391     }
392 
393     ram_state->fpo_enabled = true;
394 }
395 
396 uint64_t ram_bytes_remaining(void)
397 {
398     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
399                        0;
400 }
401 
402 MigrationStats ram_counters;
403 
404 /* used by the search for pages to send */
405 struct PageSearchStatus {
406     /* Current block being searched */
407     RAMBlock    *block;
408     /* Current page to search from */
409     unsigned long page;
410     /* Set once we wrap around */
411     bool         complete_round;
412 };
413 typedef struct PageSearchStatus PageSearchStatus;
414 
415 CompressionStats compression_counters;
416 
417 struct CompressParam {
418     bool done;
419     bool quit;
420     bool zero_page;
421     QEMUFile *file;
422     QemuMutex mutex;
423     QemuCond cond;
424     RAMBlock *block;
425     ram_addr_t offset;
426 
427     /* internally used fields */
428     z_stream stream;
429     uint8_t *originbuf;
430 };
431 typedef struct CompressParam CompressParam;
432 
433 struct DecompressParam {
434     bool done;
435     bool quit;
436     QemuMutex mutex;
437     QemuCond cond;
438     void *des;
439     uint8_t *compbuf;
440     int len;
441     z_stream stream;
442 };
443 typedef struct DecompressParam DecompressParam;
444 
445 static CompressParam *comp_param;
446 static QemuThread *compress_threads;
447 /* comp_done_cond is used to wake up the migration thread when
448  * one of the compression threads has finished the compression.
449  * comp_done_lock is used together with comp_done_cond.
450  */
451 static QemuMutex comp_done_lock;
452 static QemuCond comp_done_cond;
453 /* The empty QEMUFileOps will be used by file in CompressParam */
454 static const QEMUFileOps empty_ops = { };
455 
456 static QEMUFile *decomp_file;
457 static DecompressParam *decomp_param;
458 static QemuThread *decompress_threads;
459 static QemuMutex decomp_done_lock;
460 static QemuCond decomp_done_cond;
461 
462 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
463                                  ram_addr_t offset, uint8_t *source_buf);
464 
465 static void *do_data_compress(void *opaque)
466 {
467     CompressParam *param = opaque;
468     RAMBlock *block;
469     ram_addr_t offset;
470     bool zero_page;
471 
472     qemu_mutex_lock(&param->mutex);
473     while (!param->quit) {
474         if (param->block) {
475             block = param->block;
476             offset = param->offset;
477             param->block = NULL;
478             qemu_mutex_unlock(&param->mutex);
479 
480             zero_page = do_compress_ram_page(param->file, &param->stream,
481                                              block, offset, param->originbuf);
482 
483             qemu_mutex_lock(&comp_done_lock);
484             param->done = true;
485             param->zero_page = zero_page;
486             qemu_cond_signal(&comp_done_cond);
487             qemu_mutex_unlock(&comp_done_lock);
488 
489             qemu_mutex_lock(&param->mutex);
490         } else {
491             qemu_cond_wait(&param->cond, &param->mutex);
492         }
493     }
494     qemu_mutex_unlock(&param->mutex);
495 
496     return NULL;
497 }
498 
499 static void compress_threads_save_cleanup(void)
500 {
501     int i, thread_count;
502 
503     if (!migrate_use_compression() || !comp_param) {
504         return;
505     }
506 
507     thread_count = migrate_compress_threads();
508     for (i = 0; i < thread_count; i++) {
509         /*
510          * we use it as an indicator which shows whether the thread is
511          * properly initialized or not
512          */
513         if (!comp_param[i].file) {
514             break;
515         }
516 
517         qemu_mutex_lock(&comp_param[i].mutex);
518         comp_param[i].quit = true;
519         qemu_cond_signal(&comp_param[i].cond);
520         qemu_mutex_unlock(&comp_param[i].mutex);
521 
522         qemu_thread_join(compress_threads + i);
523         qemu_mutex_destroy(&comp_param[i].mutex);
524         qemu_cond_destroy(&comp_param[i].cond);
525         deflateEnd(&comp_param[i].stream);
526         g_free(comp_param[i].originbuf);
527         qemu_fclose(comp_param[i].file);
528         comp_param[i].file = NULL;
529     }
530     qemu_mutex_destroy(&comp_done_lock);
531     qemu_cond_destroy(&comp_done_cond);
532     g_free(compress_threads);
533     g_free(comp_param);
534     compress_threads = NULL;
535     comp_param = NULL;
536 }
537 
538 static int compress_threads_save_setup(void)
539 {
540     int i, thread_count;
541 
542     if (!migrate_use_compression()) {
543         return 0;
544     }
545     thread_count = migrate_compress_threads();
546     compress_threads = g_new0(QemuThread, thread_count);
547     comp_param = g_new0(CompressParam, thread_count);
548     qemu_cond_init(&comp_done_cond);
549     qemu_mutex_init(&comp_done_lock);
550     for (i = 0; i < thread_count; i++) {
551         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
552         if (!comp_param[i].originbuf) {
553             goto exit;
554         }
555 
556         if (deflateInit(&comp_param[i].stream,
557                         migrate_compress_level()) != Z_OK) {
558             g_free(comp_param[i].originbuf);
559             goto exit;
560         }
561 
562         /* comp_param[i].file is just used as a dummy buffer to save data,
563          * set its ops to empty.
564          */
565         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
566         comp_param[i].done = true;
567         comp_param[i].quit = false;
568         qemu_mutex_init(&comp_param[i].mutex);
569         qemu_cond_init(&comp_param[i].cond);
570         qemu_thread_create(compress_threads + i, "compress",
571                            do_data_compress, comp_param + i,
572                            QEMU_THREAD_JOINABLE);
573     }
574     return 0;
575 
576 exit:
577     compress_threads_save_cleanup();
578     return -1;
579 }
580 
581 /**
582  * save_page_header: write page header to wire
583  *
584  * If this block differs from the last one sent, it also writes the block id
585  *
586  * Returns the number of bytes written
587  *
588  * @f: QEMUFile where to send the data
589  * @block: block that contains the page we want to send
590  * @offset: offset inside the block for the page
591  *          in the lower bits, it contains flags
592  */
593 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
594                                ram_addr_t offset)
595 {
596     size_t size, len;
597 
598     if (block == rs->last_sent_block) {
599         offset |= RAM_SAVE_FLAG_CONTINUE;
600     }
601     qemu_put_be64(f, offset);
602     size = 8;
603 
604     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
605         len = strlen(block->idstr);
606         qemu_put_byte(f, len);
607         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
608         size += 1 + len;
609         rs->last_sent_block = block;
610     }
611     return size;
612 }
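
/*
 * Sketch of what save_page_header() puts on the wire (illustrative,
 * derived from the code above):
 *
 *   be64: offset | flags                      always
 *   u8:   len, followed by len idstr bytes    only when the block changes,
 *                                             i.e. RAM_SAVE_FLAG_CONTINUE
 *                                             is not set
 */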
613 
614 /**
615  * mig_throttle_guest_down: throttle down the guest
616  *
617  * Reduce the amount of guest CPU execution to hopefully slow down memory
618  * writes. If guest dirty memory rate is reduced below the rate at
619  * which we can transfer pages to the destination then we should be
620  * able to complete migration. Some workloads dirty memory way too
621  * fast and will not effectively converge, even with auto-converge.
622  */
623 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
624                                     uint64_t bytes_dirty_threshold)
625 {
626     MigrationState *s = migrate_get_current();
627     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
628     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
629     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
630     int pct_max = s->parameters.max_cpu_throttle;
631 
632     uint64_t throttle_now = cpu_throttle_get_percentage();
633     uint64_t cpu_now, cpu_ideal, throttle_inc;
634 
635     /* We have not started throttling yet. Let's start it. */
636     if (!cpu_throttle_active()) {
637         cpu_throttle_set(pct_initial);
638     } else {
639         /* Throttling already on, just increase the rate */
640         if (!pct_tailslow) {
641             throttle_inc = pct_increment;
642         } else {
643             /* Compute the ideal CPU percentage used by Guest, which may
644              * make the dirty rate match the dirty rate threshold. */
645             cpu_now = 100 - throttle_now;
646             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
647                         bytes_dirty_period);
648             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
649         }
650         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
651     }
652 }
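
/*
 * Worked example for the tailslow case above (illustrative numbers
 * only): with the throttle currently at 20% the guest gets
 * cpu_now = 80.  If the period dirtied twice what could be sent
 * (bytes_dirty_threshold / bytes_dirty_period = 0.5), then
 * cpu_ideal = 80 * 0.5 = 40 and the throttle is raised by
 * MIN(80 - 40, pct_increment), still capped at pct_max.
 */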
653 
654 /**
655  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
656  *
657  * @rs: current RAM state
658  * @current_addr: address for the zero page
659  *
660  * Update the xbzrle cache to reflect a page that's been sent as all 0.
661  * The important thing is that a stale (not-yet-0'd) page be replaced
662  * by the new data.
663  * As a bonus, if the page wasn't in the cache it gets added so that
664  * when a small write is made into the 0'd page it gets XBZRLE sent.
665  */
666 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
667 {
668     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
669         return;
670     }
671 
672     /* We don't care if this fails to allocate a new cache page
673      * as long as it updated an old one */
674     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
675                  ram_counters.dirty_sync_count);
676 }
677 
678 #define ENCODING_FLAG_XBZRLE 0x1
679 
680 /**
681  * save_xbzrle_page: compress and send current page
682  *
683  * Returns: 1 means that we wrote the page
684  *          0 means that the page is identical to the one already sent
685  *          -1 means that xbzrle would be longer than normal
686  *
687  * @rs: current RAM state
688  * @current_data: pointer to the address of the page contents
689  * @current_addr: addr of the page
690  * @block: block that contains the page we want to send
691  * @offset: offset inside the block for the page
692  * @last_stage: if we are at the completion stage
693  */
694 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
695                             ram_addr_t current_addr, RAMBlock *block,
696                             ram_addr_t offset, bool last_stage)
697 {
698     int encoded_len = 0, bytes_xbzrle;
699     uint8_t *prev_cached_page;
700 
701     if (!cache_is_cached(XBZRLE.cache, current_addr,
702                          ram_counters.dirty_sync_count)) {
703         xbzrle_counters.cache_miss++;
704         if (!last_stage) {
705             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
706                              ram_counters.dirty_sync_count) == -1) {
707                 return -1;
708             } else {
709                 /* update *current_data when the page has been
710                    inserted into cache */
711                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
712             }
713         }
714         return -1;
715     }
716 
717     /*
718      * Reaching here means the page has hit the xbzrle cache, no matter what
719      * encoding result it is (normal encoding, overflow or skipping the page),
720      * count the page as encoded. This is used to calculate the encoding rate.
721      *
722      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
723      * 2nd page turns out to be skipped (i.e. no new bytes written to the
724      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
725      * skipped page included. In this way, the encoding rate can tell if the
726      * guest page is good for xbzrle encoding.
727      */
728     xbzrle_counters.pages++;
729     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
730 
731     /* save current buffer into memory */
732     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
733 
734     /* XBZRLE encoding (if there is no overflow) */
735     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
736                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
737                                        TARGET_PAGE_SIZE);
738 
739     /*
740      * Update the cache contents, so that it corresponds to the data
741      * sent, in all cases except where we skip the page.
742      */
743     if (!last_stage && encoded_len != 0) {
744         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
745         /*
746          * In the case where we couldn't compress, ensure that the caller
747          * sends the data from the cache, since the guest might have
748          * changed the RAM since we copied it.
749          */
750         *current_data = prev_cached_page;
751     }
752 
753     if (encoded_len == 0) {
754         trace_save_xbzrle_page_skipping();
755         return 0;
756     } else if (encoded_len == -1) {
757         trace_save_xbzrle_page_overflow();
758         xbzrle_counters.overflow++;
759         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
760         return -1;
761     }
762 
763     /* Send XBZRLE based compressed page */
764     bytes_xbzrle = save_page_header(rs, rs->f, block,
765                                     offset | RAM_SAVE_FLAG_XBZRLE);
766     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
767     qemu_put_be16(rs->f, encoded_len);
768     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
769     bytes_xbzrle += encoded_len + 1 + 2;
770     /*
771      * Like compressed_size (please see update_compress_thread_counts),
772      * the xbzrle encoded bytes don't count the 8 byte header with
773      * RAM_SAVE_FLAG_CONTINUE.
774      */
775     xbzrle_counters.bytes += bytes_xbzrle - 8;
776     ram_counters.transferred += bytes_xbzrle;
777 
778     return 1;
779 }
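
/*
 * An XBZRLE page on the wire, as written above (illustrative sketch):
 *
 *   save_page_header() with RAM_SAVE_FLAG_XBZRLE set
 *   u8:   ENCODING_FLAG_XBZRLE
 *   be16: encoded_len
 *   encoded_len bytes of XBZRLE-encoded data
 *
 * which is where the "encoded_len + 1 + 2" accounting comes from.
 */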
780 
781 /**
782  * migration_bitmap_find_dirty: find the next dirty page from start
783  *
784  * Returns the page offset within memory region of the start of a dirty page
785  *
786  * @rs: current RAM state
787  * @rb: RAMBlock where to search for dirty pages
788  * @start: page where we start the search
789  */
790 static inline
791 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
792                                           unsigned long start)
793 {
794     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
795     unsigned long *bitmap = rb->bmap;
796     unsigned long next;
797 
798     if (ramblock_is_ignored(rb)) {
799         return size;
800     }
801 
802     /*
803      * When the free page optimization is enabled, we need to check the bitmap
804      * to send the non-free pages rather than all the pages in the bulk stage.
805      */
806     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
807         next = start + 1;
808     } else {
809         next = find_next_bit(bitmap, size, start);
810     }
811 
812     return next;
813 }
814 
815 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
816                                                 RAMBlock *rb,
817                                                 unsigned long page)
818 {
819     bool ret;
820 
821     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
822 
823     /*
824      * Clear dirty bitmap if needed.  This _must_ be called before we
825      * send any page of the chunk because we need to make sure we can
826      * capture further page content changes the next time we sync the
827      * dirty log.  So as long as we are going to send any page of the
828      * chunk we clear the remote dirty bitmap for the whole chunk.
829      * Clearing it earlier won't be a problem, but clearing it too late will.
830      */
831     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
832         uint8_t shift = rb->clear_bmap_shift;
833         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
834         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
835 
836         /*
837          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
838          * can make things easier sometimes since then the start address
839          * of the small chunk will always be aligned to 64 pages, so the
840          * bitmap will always be aligned to unsigned long.  We should
841          * even be able to remove this restriction but I'm simply
842          * keeping it.
843          */
844         assert(shift >= 6);
845         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
846         memory_region_clear_dirty_bitmap(rb->mr, start, size);
847     }
848 
849     ret = test_and_clear_bit(page, rb->bmap);
850 
851     if (ret) {
852         rs->migration_dirty_pages--;
853     }
854 
855     return ret;
856 }
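
/*
 * Illustration of the clear_bmap chunking above, assuming 4K target
 * pages: with the minimum shift of 6, one clear_bmap bit covers
 * 64 pages (256K), start is aligned down to that chunk, and the whole
 * chunk's remote dirty bitmap is cleared before the first page of it
 * is sent.
 */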
857 
858 /* Called with RCU critical section */
859 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
860 {
861     uint64_t new_dirty_pages =
862         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
863 
864     rs->migration_dirty_pages += new_dirty_pages;
865     rs->num_dirty_pages_period += new_dirty_pages;
866 }
867 
868 /**
869  * ram_pagesize_summary: calculate all the pagesizes of a VM
870  *
871  * Returns a summary bitmap of the page sizes of all RAMBlocks
872  *
873  * For VMs with just normal pages this is equivalent to the host page
874  * size. If it's got some huge pages then it's the OR of all the
875  * different page sizes.
876  */
877 uint64_t ram_pagesize_summary(void)
878 {
879     RAMBlock *block;
880     uint64_t summary = 0;
881 
882     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
883         summary |= block->page_size;
884     }
885 
886     return summary;
887 }
888 
889 uint64_t ram_get_total_transferred_pages(void)
890 {
891     return  ram_counters.normal + ram_counters.duplicate +
892                 compression_counters.pages + xbzrle_counters.pages;
893 }
894 
895 static void migration_update_rates(RAMState *rs, int64_t end_time)
896 {
897     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
898     double compressed_size;
899 
900     /* calculate period counters */
901     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
902                 / (end_time - rs->time_last_bitmap_sync);
903 
904     if (!page_count) {
905         return;
906     }
907 
908     if (migrate_use_xbzrle()) {
909         double encoded_size, unencoded_size;
910 
911         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
912             rs->xbzrle_cache_miss_prev) / page_count;
913         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
914         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
915                          TARGET_PAGE_SIZE;
916         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
917         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
918             xbzrle_counters.encoding_rate = 0;
919         } else {
920             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
921         }
922         rs->xbzrle_pages_prev = xbzrle_counters.pages;
923         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
924     }
925 
926     if (migrate_use_compression()) {
927         compression_counters.busy_rate = (double)(compression_counters.busy -
928             rs->compress_thread_busy_prev) / page_count;
929         rs->compress_thread_busy_prev = compression_counters.busy;
930 
931         compressed_size = compression_counters.compressed_size -
932                           rs->compressed_size_prev;
933         if (compressed_size) {
934             double uncompressed_size = (compression_counters.pages -
935                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
936 
937             /* Compression-Ratio = Uncompressed-size / Compressed-size */
938             compression_counters.compression_rate =
939                                         uncompressed_size / compressed_size;
940 
941             rs->compress_pages_prev = compression_counters.pages;
942             rs->compressed_size_prev = compression_counters.compressed_size;
943         }
944     }
945 }
946 
947 static void migration_trigger_throttle(RAMState *rs)
948 {
949     MigrationState *s = migrate_get_current();
950     uint64_t threshold = s->parameters.throttle_trigger_threshold;
951 
952     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
953     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
954     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
955 
956     /* During block migration the auto-converge logic incorrectly detects
957      * that ram migration makes no progress. Avoid this by disabling the
958      * throttling logic during the bulk phase of block migration. */
959     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
960         /* The following detection logic can be refined later. For now:
961            Check to see if the ratio between dirtied bytes and the approx.
962            amount of bytes that just got transferred since the last time
963            we were in this routine reaches the threshold. If that happens
964            twice, start or increase throttling. */
965 
966         if ((bytes_dirty_period > bytes_dirty_threshold) &&
967             (++rs->dirty_rate_high_cnt >= 2)) {
968             trace_migration_throttle();
969             rs->dirty_rate_high_cnt = 0;
970             mig_throttle_guest_down(bytes_dirty_period,
971                                     bytes_dirty_threshold);
972         }
973     }
974 }
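
/*
 * Illustrative example: with a throttle-trigger-threshold of 50 and
 * 100MB transferred in the last period, bytes_dirty_threshold is
 * 50MB; dirtying more than that in a period twice triggers
 * mig_throttle_guest_down().
 */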
975 
976 static void migration_bitmap_sync(RAMState *rs)
977 {
978     RAMBlock *block;
979     int64_t end_time;
980 
981     ram_counters.dirty_sync_count++;
982 
983     if (!rs->time_last_bitmap_sync) {
984         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
985     }
986 
987     trace_migration_bitmap_sync_start();
988     memory_global_dirty_log_sync();
989 
990     qemu_mutex_lock(&rs->bitmap_mutex);
991     WITH_RCU_READ_LOCK_GUARD() {
992         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
993             ramblock_sync_dirty_bitmap(rs, block);
994         }
995         ram_counters.remaining = ram_bytes_remaining();
996     }
997     qemu_mutex_unlock(&rs->bitmap_mutex);
998 
999     memory_global_after_dirty_log_sync();
1000     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1001 
1002     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1003 
1004     /* more than 1 second = 1000 milliseconds */
1005     if (end_time > rs->time_last_bitmap_sync + 1000) {
1006         migration_trigger_throttle(rs);
1007 
1008         migration_update_rates(rs, end_time);
1009 
1010         rs->target_page_count_prev = rs->target_page_count;
1011 
1012         /* reset period counters */
1013         rs->time_last_bitmap_sync = end_time;
1014         rs->num_dirty_pages_period = 0;
1015         rs->bytes_xfer_prev = ram_counters.transferred;
1016     }
1017     if (migrate_use_events()) {
1018         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1019     }
1020 }
1021 
1022 static void migration_bitmap_sync_precopy(RAMState *rs)
1023 {
1024     Error *local_err = NULL;
1025 
1026     /*
1027      * The current notifier usage is just an optimization for migration, so we
1028      * don't stop the normal migration process in the error case.
1029      */
1030     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1031         error_report_err(local_err);
1032         local_err = NULL;
1033     }
1034 
1035     migration_bitmap_sync(rs);
1036 
1037     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1038         error_report_err(local_err);
1039     }
1040 }
1041 
1042 /**
1043  * save_zero_page_to_file: send the zero page to the file
1044  *
1045  * Returns the size of data written to the file, 0 means the page is not
1046  * a zero page
1047  *
1048  * @rs: current RAM state
1049  * @file: the file where the data is saved
1050  * @block: block that contains the page we want to send
1051  * @offset: offset inside the block for the page
1052  */
1053 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1054                                   RAMBlock *block, ram_addr_t offset)
1055 {
1056     uint8_t *p = block->host + offset;
1057     int len = 0;
1058 
1059     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1060         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1061         qemu_put_byte(file, 0);
1062         len += 1;
1063     }
1064     return len;
1065 }
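
/*
 * A zero page on the wire, as written above (illustrative sketch):
 *
 *   be64: offset | RAM_SAVE_FLAG_ZERO (plus RAM_SAVE_FLAG_CONTINUE and
 *         the idstr as per save_page_header())
 *   u8:   0
 *
 * hence the returned len is the header size plus one byte.
 */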
1066 
1067 /**
1068  * save_zero_page: send the zero page to the stream
1069  *
1070  * Returns the number of pages written.
1071  *
1072  * @rs: current RAM state
1073  * @block: block that contains the page we want to send
1074  * @offset: offset inside the block for the page
1075  */
1076 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1077 {
1078     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1079 
1080     if (len) {
1081         ram_counters.duplicate++;
1082         ram_counters.transferred += len;
1083         return 1;
1084     }
1085     return -1;
1086 }
1087 
1088 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1089 {
1090     if (!migrate_release_ram() || !migration_in_postcopy()) {
1091         return;
1092     }
1093 
1094     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1095 }
1096 
1097 /*
1098  * @pages: the number of pages written by the control path,
1099  *        < 0 - error
1100  *        > 0 - number of pages written
1101  *
1102  * Return true if the page has been saved, otherwise false is returned.
1103  */
1104 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1105                               int *pages)
1106 {
1107     uint64_t bytes_xmit = 0;
1108     int ret;
1109 
1110     *pages = -1;
1111     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1112                                 &bytes_xmit);
1113     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1114         return false;
1115     }
1116 
1117     if (bytes_xmit) {
1118         ram_counters.transferred += bytes_xmit;
1119         *pages = 1;
1120     }
1121 
1122     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1123         return true;
1124     }
1125 
1126     if (bytes_xmit > 0) {
1127         ram_counters.normal++;
1128     } else if (bytes_xmit == 0) {
1129         ram_counters.duplicate++;
1130     }
1131 
1132     return true;
1133 }
1134 
1135 /*
1136  * directly send the page to the stream
1137  *
1138  * Returns the number of pages written.
1139  *
1140  * @rs: current RAM state
1141  * @block: block that contains the page we want to send
1142  * @offset: offset inside the block for the page
1143  * @buf: the page to be sent
1144  * @async: send the page asynchronously
1145  */
1146 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1147                             uint8_t *buf, bool async)
1148 {
1149     ram_counters.transferred += save_page_header(rs, rs->f, block,
1150                                                  offset | RAM_SAVE_FLAG_PAGE);
1151     if (async) {
1152         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1153                               migrate_release_ram() &
1154                               migration_in_postcopy());
1155     } else {
1156         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1157     }
1158     ram_counters.transferred += TARGET_PAGE_SIZE;
1159     ram_counters.normal++;
1160     return 1;
1161 }
1162 
1163 /**
1164  * ram_save_page: send the given page to the stream
1165  *
1166  * Returns the number of pages written.
1167  *          < 0 - error
1168  *          >=0 - Number of pages written - this might legally be 0
1169  *                if xbzrle noticed the page was the same.
1170  *
1171  * @rs: current RAM state
1172  * @pss: data about the state of the current dirty page scan, which
1173  *       identifies the block and the page offset we want to send
1174  * @last_stage: if we are at the completion stage
1175  */
1176 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1177 {
1178     int pages = -1;
1179     uint8_t *p;
1180     bool send_async = true;
1181     RAMBlock *block = pss->block;
1182     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1183     ram_addr_t current_addr = block->offset + offset;
1184 
1185     p = block->host + offset;
1186     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1187 
1188     XBZRLE_cache_lock();
1189     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1190         migrate_use_xbzrle()) {
1191         pages = save_xbzrle_page(rs, &p, current_addr, block,
1192                                  offset, last_stage);
1193         if (!last_stage) {
1194             /* Can't send this cached data async, since the cache page
1195              * might get updated before it gets to the wire
1196              */
1197             send_async = false;
1198         }
1199     }
1200 
1201     /* XBZRLE overflow or normal page */
1202     if (pages == -1) {
1203         pages = save_normal_page(rs, block, offset, p, send_async);
1204     }
1205 
1206     XBZRLE_cache_unlock();
1207 
1208     return pages;
1209 }
1210 
1211 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1212                                  ram_addr_t offset)
1213 {
1214     if (multifd_queue_page(rs->f, block, offset) < 0) {
1215         return -1;
1216     }
1217     ram_counters.normal++;
1218 
1219     return 1;
1220 }
1221 
1222 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1223                                  ram_addr_t offset, uint8_t *source_buf)
1224 {
1225     RAMState *rs = ram_state;
1226     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1227     bool zero_page = false;
1228     int ret;
1229 
1230     if (save_zero_page_to_file(rs, f, block, offset)) {
1231         zero_page = true;
1232         goto exit;
1233     }
1234 
1235     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1236 
1237     /*
1238      * copy it to an internal buffer to avoid it being modified by the VM,
1239      * so that we can catch any error during compression and
1240      * decompression
1241      */
1242     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1243     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1244     if (ret < 0) {
1245         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1246         error_report("compressed data failed!");
1247         return false;
1248     }
1249 
1250 exit:
1251     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1252     return zero_page;
1253 }
1254 
1255 static void
1256 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1257 {
1258     ram_counters.transferred += bytes_xmit;
1259 
1260     if (param->zero_page) {
1261         ram_counters.duplicate++;
1262         return;
1263     }
1264 
1265     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1266     compression_counters.compressed_size += bytes_xmit - 8;
1267     compression_counters.pages++;
1268 }
1269 
1270 static bool save_page_use_compression(RAMState *rs);
1271 
1272 static void flush_compressed_data(RAMState *rs)
1273 {
1274     int idx, len, thread_count;
1275 
1276     if (!save_page_use_compression(rs)) {
1277         return;
1278     }
1279     thread_count = migrate_compress_threads();
1280 
1281     qemu_mutex_lock(&comp_done_lock);
1282     for (idx = 0; idx < thread_count; idx++) {
1283         while (!comp_param[idx].done) {
1284             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1285         }
1286     }
1287     qemu_mutex_unlock(&comp_done_lock);
1288 
1289     for (idx = 0; idx < thread_count; idx++) {
1290         qemu_mutex_lock(&comp_param[idx].mutex);
1291         if (!comp_param[idx].quit) {
1292             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1293             /*
1294              * it's safe to fetch zero_page without holding comp_done_lock
1295              * as there is no further request submitted to the thread,
1296              * i.e., the thread should be waiting for a request at this point.
1297              */
1298             update_compress_thread_counts(&comp_param[idx], len);
1299         }
1300         qemu_mutex_unlock(&comp_param[idx].mutex);
1301     }
1302 }
1303 
1304 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1305                                        ram_addr_t offset)
1306 {
1307     param->block = block;
1308     param->offset = offset;
1309 }
1310 
1311 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1312                                            ram_addr_t offset)
1313 {
1314     int idx, thread_count, bytes_xmit = -1, pages = -1;
1315     bool wait = migrate_compress_wait_thread();
1316 
1317     thread_count = migrate_compress_threads();
1318     qemu_mutex_lock(&comp_done_lock);
1319 retry:
1320     for (idx = 0; idx < thread_count; idx++) {
1321         if (comp_param[idx].done) {
1322             comp_param[idx].done = false;
1323             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1324             qemu_mutex_lock(&comp_param[idx].mutex);
1325             set_compress_params(&comp_param[idx], block, offset);
1326             qemu_cond_signal(&comp_param[idx].cond);
1327             qemu_mutex_unlock(&comp_param[idx].mutex);
1328             pages = 1;
1329             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1330             break;
1331         }
1332     }
1333 
1334     /*
1335      * wait for the free thread if the user specifies 'compress-wait-thread',
1336      * otherwise we will post the page out in the main thread as a normal page.
1337      */
1338     if (pages < 0 && wait) {
1339         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1340         goto retry;
1341     }
1342     qemu_mutex_unlock(&comp_done_lock);
1343 
1344     return pages;
1345 }
1346 
1347 /**
1348  * find_dirty_block: find the next dirty page and update any state
1349  * associated with the search process.
1350  *
1351  * Returns true if a page is found
1352  *
1353  * @rs: current RAM state
1354  * @pss: data about the state of the current dirty page scan
1355  * @again: set to false if the search has scanned the whole of RAM
1356  */
1357 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1358 {
1359     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1360     if (pss->complete_round && pss->block == rs->last_seen_block &&
1361         pss->page >= rs->last_page) {
1362         /*
1363          * We've been once around the RAM and haven't found anything.
1364          * Give up.
1365          */
1366         *again = false;
1367         return false;
1368     }
1369     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1370         >= pss->block->used_length) {
1371         /* Didn't find anything in this RAM Block */
1372         pss->page = 0;
1373         pss->block = QLIST_NEXT_RCU(pss->block, next);
1374         if (!pss->block) {
1375             /*
1376              * If memory migration starts over, we will meet a dirtied page
1377              * which may still exist in the compression threads' ring, so we
1378              * should flush the compressed data to make sure the new page
1379              * is not overwritten by the old one in the destination.
1380              *
1381              * Also, if xbzrle is on, stop using the data compression at this
1382              * point. In theory, xbzrle can do better than compression.
1383              */
1384             flush_compressed_data(rs);
1385 
1386             /* Hit the end of the list */
1387             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1388             /* Flag that we've looped */
1389             pss->complete_round = true;
1390             rs->ram_bulk_stage = false;
1391         }
1392         /* Didn't find anything this time, but try again on the new block */
1393         *again = true;
1394         return false;
1395     } else {
1396         /* Can go around again, but... */
1397         *again = true;
1398         /* We've found something so probably don't need to */
1399         return true;
1400     }
1401 }
1402 
1403 /**
1404  * unqueue_page: gets a page off the queue
1405  *
1406  * Helper for 'get_queued_page' - gets a page off the queue
1407  *
1408  * Returns the block of the page (or NULL if none available)
1409  *
1410  * @rs: current RAM state
1411  * @offset: used to return the offset within the RAMBlock
1412  */
1413 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1414 {
1415     RAMBlock *block = NULL;
1416 
1417     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1418         return NULL;
1419     }
1420 
1421     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1422     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1423         struct RAMSrcPageRequest *entry =
1424                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1425         block = entry->rb;
1426         *offset = entry->offset;
1427 
1428         if (entry->len > TARGET_PAGE_SIZE) {
1429             entry->len -= TARGET_PAGE_SIZE;
1430             entry->offset += TARGET_PAGE_SIZE;
1431         } else {
1432             memory_region_unref(block->mr);
1433             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1434             g_free(entry);
1435             migration_consume_urgent_request();
1436         }
1437     }
1438 
1439     return block;
1440 }
1441 
1442 #if defined(__linux__)
1443 /**
1444  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1445  *   is found, return RAM block pointer and page offset
1446  *
1447  * Returns pointer to the RAMBlock containing faulting page,
1448  *   NULL if no write faults are pending
1449  *
1450  * @rs: current RAM state
1451  * @offset: page offset from the beginning of the block
1452  */
1453 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1454 {
1455     struct uffd_msg uffd_msg;
1456     void *page_address;
1457     RAMBlock *block;
1458     int res;
1459 
1460     if (!migrate_background_snapshot()) {
1461         return NULL;
1462     }
1463 
1464     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1465     if (res <= 0) {
1466         return NULL;
1467     }
1468 
1469     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1470     block = qemu_ram_block_from_host(page_address, false, offset);
1471     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1472     return block;
1473 }
1474 
1475 /**
1476  * ram_save_release_protection: release UFFD write protection after
1477  *   a range of pages has been saved
1478  *
1479  * @rs: current RAM state
1480  * @pss: page-search-status structure
1481  * @start_page: index of the first page in the range relative to pss->block
1482  *
1483  * Returns 0 on success, negative value in case of an error
1484 */
1485 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1486         unsigned long start_page)
1487 {
1488     int res = 0;
1489 
1490     /* Check if page is from UFFD-managed region. */
1491     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1492         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1493         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1494 
1495         /* Flush async buffers before un-protect. */
1496         qemu_fflush(rs->f);
1497         /* Un-protect memory range. */
1498         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1499                 false, false);
1500     }
1501 
1502     return res;
1503 }
1504 
1505 /* ram_write_tracking_available: check if kernel supports required UFFD features
1506  *
1507  * Returns true if supported, false otherwise
1508  */
1509 bool ram_write_tracking_available(void)
1510 {
1511     uint64_t uffd_features;
1512     int res;
1513 
1514     res = uffd_query_features(&uffd_features);
1515     return (res == 0 &&
1516             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1517 }
1518 
1519 /* ram_write_tracking_compatible: check if guest configuration is
1520  *   compatible with 'write-tracking'
1521  *
1522  * Returns true if compatible, false otherwise
1523  */
1524 bool ram_write_tracking_compatible(void)
1525 {
1526     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1527     int uffd_fd;
1528     RAMBlock *block;
1529     bool ret = false;
1530 
1531     /* Open UFFD file descriptor */
1532     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1533     if (uffd_fd < 0) {
1534         return false;
1535     }
1536 
1537     RCU_READ_LOCK_GUARD();
1538 
1539     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1540         uint64_t uffd_ioctls;
1541 
1542         /* Nothing to do with read-only and MMIO-writable regions */
1543         if (block->mr->readonly || block->mr->rom_device) {
1544             continue;
1545         }
1546         /* Try to register block memory via UFFD-IO to track writes */
1547         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1548                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1549             goto out;
1550         }
1551         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1552             goto out;
1553         }
1554     }
1555     ret = true;
1556 
1557 out:
1558     uffd_close_fd(uffd_fd);
1559     return ret;
1560 }
1561 
1562 /*
1563  * ram_block_populate_pages: populate memory in the RAM block by reading
1564  *   an integer from the beginning of each page.
1565  *
1566  * Since it's solely used for the userfault_fd WP feature, here we just
1567  *   hardcode the page size to qemu_real_host_page_size.
1568  *
1569  * @block: RAM block to populate
1570  */
1571 static void ram_block_populate_pages(RAMBlock *block)
1572 {
1573     char *ptr = (char *) block->host;
1574 
1575     for (ram_addr_t offset = 0; offset < block->used_length;
1576             offset += qemu_real_host_page_size) {
1577         char tmp = *(ptr + offset);
1578 
1579         /* Don't optimize the read out */
1580         asm volatile("" : "+r" (tmp));
1581     }
1582 }
1583 
1584 /*
1585  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1586  */
1587 void ram_write_tracking_prepare(void)
1588 {
1589     RAMBlock *block;
1590 
1591     RCU_READ_LOCK_GUARD();
1592 
1593     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1594         /* Nothing to do with read-only and MMIO-writable regions */
1595         if (block->mr->readonly || block->mr->rom_device) {
1596             continue;
1597         }
1598 
1599         /*
1600          * Populate pages of the RAM block before enabling userfault_fd
1601          * write protection.
1602          *
1603          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1604          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1605          * pages with pte_none() entries in page table.
1606          */
1607         ram_block_populate_pages(block);
1608     }
1609 }
1610 
1611 /*
1612  * ram_write_tracking_start: start UFFD-WP memory tracking
1613  *
1614  * Returns 0 for success or negative value in case of error
1615  */
1616 int ram_write_tracking_start(void)
1617 {
1618     int uffd_fd;
1619     RAMState *rs = ram_state;
1620     RAMBlock *block;
1621 
1622     /* Open UFFD file descriptor */
1623     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1624     if (uffd_fd < 0) {
1625         return uffd_fd;
1626     }
1627     rs->uffdio_fd = uffd_fd;
1628 
1629     RCU_READ_LOCK_GUARD();
1630 
1631     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1632         /* Read-only and MMIO-writable (ROM device) regions need no write tracking */
1633         if (block->mr->readonly || block->mr->rom_device) {
1634             continue;
1635         }
1636 
1637         /* Register block memory with UFFD to track writes */
1638         if (uffd_register_memory(rs->uffdio_fd, block->host,
1639                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1640             goto fail;
1641         }
1642         /* Apply UFFD write protection to the block memory range */
1643         if (uffd_change_protection(rs->uffdio_fd, block->host,
1644                 block->max_length, true, false)) {
1645             goto fail;
1646         }
1647         block->flags |= RAM_UF_WRITEPROTECT;
1648         memory_region_ref(block->mr);
1649 
1650         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1651                 block->host, block->max_length);
1652     }
1653 
1654     return 0;
1655 
1656 fail:
1657     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1658 
1659     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1660         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1661             continue;
1662         }
1663         /*
1664          * If write-protecting some memory block failed, remove protection
1665          * from and unregister all blocks that did succeed.
1666          */
1667         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1668                 false, false);
1669         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1670         /* Cleanup flags and remove reference */
1671         block->flags &= ~RAM_UF_WRITEPROTECT;
1672         memory_region_unref(block->mr);
1673     }
1674 
1675     uffd_close_fd(uffd_fd);
1676     rs->uffdio_fd = -1;
1677     return -1;
1678 }
1679 
1680 /**
1681  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1682  */
1683 void ram_write_tracking_stop(void)
1684 {
1685     RAMState *rs = ram_state;
1686     RAMBlock *block;
1687 
1688     RCU_READ_LOCK_GUARD();
1689 
1690     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1691         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1692             continue;
1693         }
1694         /* Remove protection and unregister all affected RAM blocks */
1695         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1696                 false, false);
1697         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1698 
1699         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1700                 block->host, block->max_length);
1701 
1702         /* Cleanup flags and remove reference */
1703         block->flags &= ~RAM_UF_WRITEPROTECT;
1704         memory_region_unref(block->mr);
1705     }
1706 
1707     /* Finally close UFFD file descriptor */
1708     uffd_close_fd(rs->uffdio_fd);
1709     rs->uffdio_fd = -1;
1710 }
1711 
1712 #else
1713 /* No target OS support, stubs just fail or ignore */
1714 
1715 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1716 {
1717     (void) rs;
1718     (void) offset;
1719 
1720     return NULL;
1721 }
1722 
1723 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1724         unsigned long start_page)
1725 {
1726     (void) rs;
1727     (void) pss;
1728     (void) start_page;
1729 
1730     return 0;
1731 }
1732 
1733 bool ram_write_tracking_available(void)
1734 {
1735     return false;
1736 }
1737 
1738 bool ram_write_tracking_compatible(void)
1739 {
1740     assert(0);
1741     return false;
1742 }
1743 
1744 int ram_write_tracking_start(void)
1745 {
1746     assert(0);
1747     return -1;
1748 }
1749 
1750 void ram_write_tracking_stop(void)
1751 {
1752     assert(0);
1753 }
1754 #endif /* defined(__linux__) */
1755 
1756 /**
1757  * get_queued_page: unqueue a page from the postcopy requests
1758  *
1759  * Skips pages that are already sent (!dirty)
1760  *
1761  * Returns true if a queued page is found
1762  *
1763  * @rs: current RAM state
1764  * @pss: data about the state of the current dirty page scan
1765  */
1766 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1767 {
1768     RAMBlock  *block;
1769     ram_addr_t offset;
1770     bool dirty;
1771 
1772     do {
1773         block = unqueue_page(rs, &offset);
1774         /*
1775          * We're sending this page, and since it's postcopy nothing else
1776          * will dirty it, and we must make sure it doesn't get sent again
1777          * even if this queue request was received after the background
1778          * search already sent it.
1779          */
1780         if (block) {
1781             unsigned long page;
1782 
1783             page = offset >> TARGET_PAGE_BITS;
1784             dirty = test_bit(page, block->bmap);
1785             if (!dirty) {
1786                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1787                                                 page);
1788             } else {
1789                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1790             }
1791         }
1792 
1793     } while (block && !dirty);
1794 
1795     if (!block) {
1796         /*
1797          * Poll write faults too if background snapshot is enabled; that's
1798          * when vCPUs can get blocked by write-protected pages.
1799          */
1800         block = poll_fault_page(rs, &offset);
1801     }
1802 
1803     if (block) {
1804         /*
1805          * As soon as we start servicing pages out of order, we have to
1806          * kill the bulk stage, since the bulk stage assumes (in
1807          * migration_bitmap_find_and_reset_dirty) that every page is
1808          * dirty, and that's no longer true.
1809          */
1810         rs->ram_bulk_stage = false;
1811 
1812         /*
1813          * We want the background search to continue from the queued page
1814          * since the guest is likely to want other pages near to the page
1815          * it just requested.
1816          */
1817         pss->block = block;
1818         pss->page = offset >> TARGET_PAGE_BITS;
1819 
1820         /*
1821          * This unqueued page would break the "one round" check, even if
1822          * it's really rare.
1823          */
1824         pss->complete_round = false;
1825     }
1826 
1827     return !!block;
1828 }
1829 
1830 /**
1831  * migration_page_queue_free: drop any remaining pages in the ram
1832  * request queue
1833  *
1834  * It should be empty at the end anyway, but in error cases there may
1835  * be some left; any page still queued here is dropped.
1836  *
1837  */
1838 static void migration_page_queue_free(RAMState *rs)
1839 {
1840     struct RAMSrcPageRequest *mspr, *next_mspr;
1841     /* This queue should generally be empty - but a failed migration
1842      * might have left some entries behind.
1843      */
1844     RCU_READ_LOCK_GUARD();
1845     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1846         memory_region_unref(mspr->rb->mr);
1847         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1848         g_free(mspr);
1849     }
1850 }
1851 
1852 /**
1853  * ram_save_queue_pages: queue the page for transmission
1854  *
1855  * A request from postcopy destination for example.
1856  *
1857  * Returns zero on success or negative on error
1858  *
1859  * @rbname: Name of the RAMBlock of the request. NULL means the
1860  *          same as the last one.
1861  * @start: starting address from the start of the RAMBlock
1862  * @len: length (in bytes) to send
1863  */
1864 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1865 {
1866     RAMBlock *ramblock;
1867     RAMState *rs = ram_state;
1868 
1869     ram_counters.postcopy_requests++;
1870     RCU_READ_LOCK_GUARD();
1871 
1872     if (!rbname) {
1873         /* Reuse last RAMBlock */
1874         ramblock = rs->last_req_rb;
1875 
1876         if (!ramblock) {
1877             /*
1878              * Shouldn't happen, we can't reuse the last RAMBlock if
1879              * it's the 1st request.
1880              */
1881             error_report("ram_save_queue_pages no previous block");
1882             return -1;
1883         }
1884     } else {
1885         ramblock = qemu_ram_block_by_name(rbname);
1886 
1887         if (!ramblock) {
1888             /* We shouldn't be asked for a non-existent RAMBlock */
1889             error_report("ram_save_queue_pages no block '%s'", rbname);
1890             return -1;
1891         }
1892         rs->last_req_rb = ramblock;
1893     }
1894     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1895     if (start + len > ramblock->used_length) {
1896         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1897                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1898                      __func__, start, len, ramblock->used_length);
1899         return -1;
1900     }
1901 
1902     struct RAMSrcPageRequest *new_entry =
1903         g_malloc0(sizeof(struct RAMSrcPageRequest));
1904     new_entry->rb = ramblock;
1905     new_entry->offset = start;
1906     new_entry->len = len;
1907 
1908     memory_region_ref(ramblock->mr);
1909     qemu_mutex_lock(&rs->src_page_req_mutex);
1910     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1911     migration_make_urgent_request();
1912     qemu_mutex_unlock(&rs->src_page_req_mutex);
1913 
1914     return 0;
1915 }
1916 
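/*
 * save_page_use_compression: check whether the compression path should
 * be used for the page about to be sent.
 *
 * Compression is only used when it is enabled and, if xbzrle is also
 * enabled, only during the bulk stage (afterwards xbzrle takes over).
 */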
1917 static bool save_page_use_compression(RAMState *rs)
1918 {
1919     if (!migrate_use_compression()) {
1920         return false;
1921     }
1922 
1923     /*
1924      * If xbzrle is on, stop using data compression after the first
1925      * round of migration even if compression is enabled. In theory,
1926      * xbzrle can do better than compression.
1927      */
1928     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1929         return true;
1930     }
1931 
1932     return false;
1933 }
1934 
1935 /*
1936  * Try to compress the page before posting it out; return true if the page
1937  * has been properly handled by compression, otherwise it needs to be
1938  * handled by other paths.
1939  */
1940 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1941 {
1942     if (!save_page_use_compression(rs)) {
1943         return false;
1944     }
1945 
1946     /*
1947      * When starting to process a new block, the first page of the
1948      * block should be sent out before other pages in the same block,
1949      * and all the pages in the last block should have been sent out.
1950      * Keeping this order is important, because the 'cont' flag
1951      * is used to avoid resending the block name.
1952      *
1953      * We post the first page as a normal page because compression
1954      * takes a lot of CPU time.
1955      */
1956     if (block != rs->last_sent_block) {
1957         flush_compressed_data(rs);
1958         return false;
1959     }
1960 
1961     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1962         return true;
1963     }
1964 
1965     compression_counters.busy++;
1966     return false;
1967 }
1968 
1969 /**
1970  * ram_save_target_page: save one target page
1971  *
1972  * Returns the number of pages written
1973  *
1974  * @rs: current RAM state
1975  * @pss: data about the page we want to send
1976  * @last_stage: if we are at the completion stage
1977  */
1978 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1979                                 bool last_stage)
1980 {
1981     RAMBlock *block = pss->block;
1982     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1983     int res;
1984 
1985     if (control_save_page(rs, block, offset, &res)) {
1986         return res;
1987     }
1988 
1989     if (save_compress_page(rs, block, offset)) {
1990         return 1;
1991     }
1992 
1993     res = save_zero_page(rs, block, offset);
1994     if (res > 0) {
1995         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1996          * page would be stale
1997          */
1998         if (!save_page_use_compression(rs)) {
1999             XBZRLE_cache_lock();
2000             xbzrle_cache_zero_page(rs, block->offset + offset);
2001             XBZRLE_cache_unlock();
2002         }
2003         ram_release_pages(block->idstr, offset, res);
2004         return res;
2005     }
2006 
2007     /*
2008      * Do not use multifd for:
2009      * 1. Compression, as the first page in a new block should be posted out
2010      *    before sending the compressed page
2011      * 2. Postcopy, as one whole host page should be placed in one chunk
2012      */
2013     if (!save_page_use_compression(rs) && migrate_use_multifd()
2014         && !migration_in_postcopy()) {
2015         return ram_save_multifd_page(rs, block, offset);
2016     }
2017 
2018     return ram_save_page(rs, pss, last_stage);
2019 }
2020 
2021 /**
2022  * ram_save_host_page: save a whole host page
2023  *
2024  * Starting at pss->page, send pages up to the end of the current host
2025  * page. It's valid for the initial page to point into the middle of
2026  * a host page, in which case the remainder of the host page is sent.
2027  * Only dirty target pages are sent. Note that the host page size may
2028  * be a huge page for this block.
2029  * The saving stops at the boundary of the used_length of the block
2030  * if the RAMBlock isn't a multiple of the host page size.
2031  *
2032  * Returns the number of pages written or negative on error
2033  *
2034  * @rs: current RAM state
2036  * @pss: data about the page we want to send
2037  * @last_stage: if we are at the completion stage
2038  */
2039 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2040                               bool last_stage)
2041 {
2042     int tmppages, pages = 0;
2043     size_t pagesize_bits =
2044         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2045     unsigned long start_page = pss->page;
2046     int res;
2047 
2048     if (ramblock_is_ignored(pss->block)) {
2049         error_report("block %s should not be migrated !", pss->block->idstr);
2050         return 0;
2051     }
2052 
2053     do {
2054         /* Check if the page is dirty and, if it is, send it */
2055         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2056             pss->page++;
2057             continue;
2058         }
2059 
2060         tmppages = ram_save_target_page(rs, pss, last_stage);
2061         if (tmppages < 0) {
2062             return tmppages;
2063         }
2064 
2065         pages += tmppages;
2066         pss->page++;
2067         /* Allow rate limiting to happen in the middle of huge pages */
2068         migration_rate_limit();
2069     } while ((pss->page & (pagesize_bits - 1)) &&
2070              offset_in_ramblock(pss->block,
2071                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2072     /* The offset we leave with is the last one we looked at */
2073     pss->page--;
2074 
2075     res = ram_save_release_protection(rs, pss, start_page);
2076     return (res < 0 ? res : pages);
2077 }
2078 
2079 /**
2080  * ram_find_and_save_block: finds a dirty page and sends it to f
2081  *
2082  * Called within an RCU critical section.
2083  *
2084  * Returns the number of pages written where zero means no dirty pages,
2085  * or negative on error
2086  *
2087  * @rs: current RAM state
2088  * @last_stage: if we are at the completion stage
2089  *
2090  * On systems where host-page-size > target-page-size it will send all the
2091  * pages in a host page that are dirty.
2092  */
2093 
2094 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2095 {
2096     PageSearchStatus pss;
2097     int pages = 0;
2098     bool again, found;
2099 
2100     /* No dirty page as there is zero RAM */
2101     if (!ram_bytes_total()) {
2102         return pages;
2103     }
2104 
2105     pss.block = rs->last_seen_block;
2106     pss.page = rs->last_page;
2107     pss.complete_round = false;
2108 
2109     if (!pss.block) {
2110         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2111     }
2112 
2113     do {
2114         again = true;
2115         found = get_queued_page(rs, &pss);
2116 
2117         if (!found) {
2118             /* priority queue empty, so just search for something dirty */
2119             found = find_dirty_block(rs, &pss, &again);
2120         }
2121 
2122         if (found) {
2123             pages = ram_save_host_page(rs, &pss, last_stage);
2124         }
2125     } while (!pages && again);
2126 
2127     rs->last_seen_block = pss.block;
2128     rs->last_page = pss.page;
2129 
2130     return pages;
2131 }
2132 
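/*
 * acct_update_position: account for @size bytes of guest pages.
 *
 * Zero pages only bump the duplicate counter; normal pages also add to
 * the transferred byte count and advance the QEMUFile position.
 */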
2133 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2134 {
2135     uint64_t pages = size / TARGET_PAGE_SIZE;
2136 
2137     if (zero) {
2138         ram_counters.duplicate += pages;
2139     } else {
2140         ram_counters.normal += pages;
2141         ram_counters.transferred += size;
2142         qemu_update_position(f, size);
2143     }
2144 }
2145 
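/*
 * ram_bytes_total_common: total used size of all migratable RAM blocks
 *
 * @count_ignored: also include blocks that RAM migration otherwise ignores
 */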
2146 static uint64_t ram_bytes_total_common(bool count_ignored)
2147 {
2148     RAMBlock *block;
2149     uint64_t total = 0;
2150 
2151     RCU_READ_LOCK_GUARD();
2152 
2153     if (count_ignored) {
2154         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2155             total += block->used_length;
2156         }
2157     } else {
2158         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2159             total += block->used_length;
2160         }
2161     }
2162     return total;
2163 }
2164 
2165 uint64_t ram_bytes_total(void)
2166 {
2167     return ram_bytes_total_common(false);
2168 }
2169 
2170 static void xbzrle_load_setup(void)
2171 {
2172     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2173 }
2174 
2175 static void xbzrle_load_cleanup(void)
2176 {
2177     g_free(XBZRLE.decoded_buf);
2178     XBZRLE.decoded_buf = NULL;
2179 }
2180 
2181 static void ram_state_cleanup(RAMState **rsp)
2182 {
2183     if (*rsp) {
2184         migration_page_queue_free(*rsp);
2185         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2186         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2187         g_free(*rsp);
2188         *rsp = NULL;
2189     }
2190 }
2191 
2192 static void xbzrle_cleanup(void)
2193 {
2194     XBZRLE_cache_lock();
2195     if (XBZRLE.cache) {
2196         cache_fini(XBZRLE.cache);
2197         g_free(XBZRLE.encoded_buf);
2198         g_free(XBZRLE.current_buf);
2199         g_free(XBZRLE.zero_target_page);
2200         XBZRLE.cache = NULL;
2201         XBZRLE.encoded_buf = NULL;
2202         XBZRLE.current_buf = NULL;
2203         XBZRLE.zero_target_page = NULL;
2204     }
2205     XBZRLE_cache_unlock();
2206 }
2207 
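/*
 * ram_save_cleanup: tear down the state built up for saving RAM
 *
 * Stops dirty logging (unless a background snapshot is in use, which
 * never started it), frees the per-block dirty/clear bitmaps and
 * releases XBZRLE, compression-thread and RAMState resources.
 */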
2208 static void ram_save_cleanup(void *opaque)
2209 {
2210     RAMState **rsp = opaque;
2211     RAMBlock *block;
2212 
2213     /* We don't use dirty log with background snapshots */
2214     if (!migrate_background_snapshot()) {
2215         /* the caller holds the iothread lock or is in a bh, so there is
2216          * no write race against the migration bitmap
2217          */
2218         memory_global_dirty_log_stop();
2219     }
2220 
2221     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2222         g_free(block->clear_bmap);
2223         block->clear_bmap = NULL;
2224         g_free(block->bmap);
2225         block->bmap = NULL;
2226     }
2227 
2228     xbzrle_cleanup();
2229     compress_threads_save_cleanup();
2230     ram_state_cleanup(rsp);
2231 }
2232 
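/*
 * ram_state_reset: reset the page-scanning state so the next pass
 * starts again from the first block in bulk stage.
 */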
2233 static void ram_state_reset(RAMState *rs)
2234 {
2235     rs->last_seen_block = NULL;
2236     rs->last_sent_block = NULL;
2237     rs->last_page = 0;
2238     rs->last_version = ram_list.version;
2239     rs->ram_bulk_stage = true;
2240     rs->fpo_enabled = false;
2241 }
2242 
2243 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2244 
2245 /*
2246  * 'expected' is the value you expect the bitmap mostly to be full
2247  * of; it won't bother printing lines that are all this value.
2248  * 'todump' is the bitmap to dump; it must cover at least 'pages' bits.
2249  */
2250 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2251                            unsigned long pages)
2252 {
2253     int64_t cur;
2254     int64_t linelen = 128;
2255     char linebuf[129];
2256 
2257     for (cur = 0; cur < pages; cur += linelen) {
2258         int64_t curb;
2259         bool found = false;
2260         /*
2261          * Last line; catch the case where the line length
2262          * is longer than remaining ram
2263          */
2264         if (cur + linelen > pages) {
2265             linelen = pages - cur;
2266         }
2267         for (curb = 0; curb < linelen; curb++) {
2268             bool thisbit = test_bit(cur + curb, todump);
2269             linebuf[curb] = thisbit ? '1' : '.';
2270             found = found || (thisbit != expected);
2271         }
2272         if (found) {
2273             linebuf[curb] = '\0';
2274             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2275         }
2276     }
2277 }
2278 
2279 /* **** functions for postcopy ***** */
2280 
2281 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2282 {
2283     struct RAMBlock *block;
2284 
2285     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2286         unsigned long *bitmap = block->bmap;
2287         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2288         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2289 
2290         while (run_start < range) {
2291             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2292             ram_discard_range(block->idstr,
2293                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2294                               ((ram_addr_t)(run_end - run_start))
2295                                 << TARGET_PAGE_BITS);
2296             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2297         }
2298     }
2299 }
2300 
2301 /**
2302  * postcopy_send_discard_bm_ram: discard a RAMBlock
2303  *
2304  * Returns zero on success
2305  *
2306  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2307  *
2308  * @ms: current migration state
2309  * @block: RAMBlock to discard
2310  */
2311 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2312 {
2313     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2314     unsigned long current;
2315     unsigned long *bitmap = block->bmap;
2316 
2317     for (current = 0; current < end; ) {
2318         unsigned long one = find_next_bit(bitmap, end, current);
2319         unsigned long zero, discard_length;
2320 
2321         if (one >= end) {
2322             break;
2323         }
2324 
2325         zero = find_next_zero_bit(bitmap, end, one + 1);
2326 
2327         if (zero >= end) {
2328             discard_length = end - one;
2329         } else {
2330             discard_length = zero - one;
2331         }
2332         postcopy_discard_send_range(ms, one, discard_length);
2333         current = one + discard_length;
2334     }
2335 
2336     return 0;
2337 }
2338 
2339 /**
2340  * postcopy_each_ram_send_discard: discard all RAMBlocks
2341  *
2342  * Returns 0 for success or negative for error
2343  *
2344  * Utility for the outgoing postcopy code.
2345  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2346  *   passing it bitmap indexes and name.
2347  * (qemu_ram_foreach_block ends up passing unscaled lengths
2348  *  which would mean postcopy code would have to deal with target page)
2349  *
2350  * @ms: current migration state
2351  */
2352 static int postcopy_each_ram_send_discard(MigrationState *ms)
2353 {
2354     struct RAMBlock *block;
2355     int ret;
2356 
2357     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2358         postcopy_discard_send_init(ms, block->idstr);
2359 
2360         /*
2361          * Postcopy sends chunks of bitmap over the wire, but it
2362          * just needs indexes at this point; this avoids it having
2363          * any target-page-specific code.
2364          */
2365         ret = postcopy_send_discard_bm_ram(ms, block);
2366         postcopy_discard_send_finish(ms);
2367         if (ret) {
2368             return ret;
2369         }
2370     }
2371 
2372     return 0;
2373 }
2374 
2375 /**
2376  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2377  *
2378  * Helper for postcopy_chunk_hostpages: canonicalize the dirty bitmap
2379  * in host-page-sized chunks.
2380  *
2381  * Postcopy requires that all target pages in a hostpage are dirty or
2382  * clean, not a mix.  This function canonicalizes the bitmap so that
2383  * partially dirty host pages become fully dirty.
2384  *
2385  * @ms: current migration state
2386  * @block: block that contains the page we want to canonicalize
2387  */
2388 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2389 {
2390     RAMState *rs = ram_state;
2391     unsigned long *bitmap = block->bmap;
2392     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2393     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2394     unsigned long run_start;
2395 
2396     if (block->page_size == TARGET_PAGE_SIZE) {
2397         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2398         return;
2399     }
2400 
2401     /* Find a dirty page */
2402     run_start = find_next_bit(bitmap, pages, 0);
2403 
2404     while (run_start < pages) {
2405 
2406         /*
2407          * If the start of this run of pages is in the middle of a host
2408          * page, then we need to fixup this host page.
2409          */
2410         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2411             /* Find the end of this run */
2412             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2413             /*
2414              * If the end isn't at the start of a host page, then the
2415              * run doesn't finish at the end of a host page
2416              * and we need to discard.
2417              */
2418         }
2419 
2420         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2421             unsigned long page;
2422             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2423                                                              host_ratio);
2424             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2425 
2426             /* Clean up the bitmap */
2427             for (page = fixup_start_addr;
2428                  page < fixup_start_addr + host_ratio; page++) {
2429                 /*
2430                  * Remark them as dirty, updating the count for any pages
2431                  * that weren't previously dirty.
2432                  */
2433                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2434             }
2435         }
2436 
2437         /* Find the next dirty page for the next iteration */
2438         run_start = find_next_bit(bitmap, pages, run_start);
2439     }
2440 }
2441 
2442 /**
2443  * postcopy_chunk_hostpages: discard any partially sent host page
2444  *
2445  * Utility for the outgoing postcopy code.
2446  *
2447  * Discard any partially sent host-page size chunks, mark any partially
2448  * dirty host-page size chunks as all dirty.  In this case the host-page
2449  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2450  *
2451  * Returns zero on success
2452  *
2453  * @ms: current migration state
2454  * @block: block we want to work with
2455  */
2456 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2457 {
2458     postcopy_discard_send_init(ms, block->idstr);
2459 
2460     /*
2461      * Ensure that all partially dirty host pages are made fully dirty.
2462      */
2463     postcopy_chunk_hostpages_pass(ms, block);
2464 
2465     postcopy_discard_send_finish(ms);
2466     return 0;
2467 }
2468 
2469 /**
2470  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2471  *
2472  * Returns zero on success
2473  *
2474  * Transmit the set of pages to be discarded after precopy to the target;
2475  * these are pages that:
2476  *     a) Have been previously transmitted but are now dirty again
2477  *     b) Have never been transmitted; this ensures that any pages on the
2478  *        destination that have been mapped by background tasks get
2479  *        discarded (transparent huge pages are the specific concern)
2480  * Hopefully this is pretty sparse
2481  *
2482  * @ms: current migration state
2483  */
2484 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2485 {
2486     RAMState *rs = ram_state;
2487     RAMBlock *block;
2488     int ret;
2489 
2490     RCU_READ_LOCK_GUARD();
2491 
2492     /* This should be our last sync, the src is now paused */
2493     migration_bitmap_sync(rs);
2494 
2495     /* Easiest way to make sure we don't resume in the middle of a host-page */
2496     rs->last_seen_block = NULL;
2497     rs->last_sent_block = NULL;
2498     rs->last_page = 0;
2499 
2500     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2501         /* Deal with TPS != HPS and huge pages */
2502         ret = postcopy_chunk_hostpages(ms, block);
2503         if (ret) {
2504             return ret;
2505         }
2506 
2507 #ifdef DEBUG_POSTCOPY
2508         ram_debug_dump_bitmap(block->bmap, true,
2509                               block->used_length >> TARGET_PAGE_BITS);
2510 #endif
2511     }
2512     trace_ram_postcopy_send_discard_bitmap();
2513 
2514     return postcopy_each_ram_send_discard(ms);
2515 }
2516 
2517 /**
2518  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2519  *
2520  * Returns zero on success
2521  *
2522  * @rbname: name of the RAMBlock of the request. NULL means the
2523  *          same as the last one.
2524  * @start: starting offset (in bytes) within the RAMBlock
2525  * @length: length (in bytes) to discard
2526  */
2527 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2528 {
2529     trace_ram_discard_range(rbname, start, length);
2530 
2531     RCU_READ_LOCK_GUARD();
2532     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2533 
2534     if (!rb) {
2535         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2536         return -1;
2537     }
2538 
2539     /*
2540      * On source VM, we don't need to update the received bitmap since
2541      * we don't even have one.
2542      */
2543     if (rb->receivedmap) {
2544         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2545                      length >> qemu_target_page_bits());
2546     }
2547 
2548     return ram_block_discard_range(rb, start, length);
2549 }
2550 
2551 /*
2552  * For every allocation here, we try not to crash the VM if the
2553  * allocation fails.
2554  */
2555 static int xbzrle_init(void)
2556 {
2557     Error *local_err = NULL;
2558 
2559     if (!migrate_use_xbzrle()) {
2560         return 0;
2561     }
2562 
2563     XBZRLE_cache_lock();
2564 
2565     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2566     if (!XBZRLE.zero_target_page) {
2567         error_report("%s: Error allocating zero page", __func__);
2568         goto err_out;
2569     }
2570 
2571     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2572                               TARGET_PAGE_SIZE, &local_err);
2573     if (!XBZRLE.cache) {
2574         error_report_err(local_err);
2575         goto free_zero_page;
2576     }
2577 
2578     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2579     if (!XBZRLE.encoded_buf) {
2580         error_report("%s: Error allocating encoded_buf", __func__);
2581         goto free_cache;
2582     }
2583 
2584     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2585     if (!XBZRLE.current_buf) {
2586         error_report("%s: Error allocating current_buf", __func__);
2587         goto free_encoded_buf;
2588     }
2589 
2590     /* We are all good */
2591     XBZRLE_cache_unlock();
2592     return 0;
2593 
2594 free_encoded_buf:
2595     g_free(XBZRLE.encoded_buf);
2596     XBZRLE.encoded_buf = NULL;
2597 free_cache:
2598     cache_fini(XBZRLE.cache);
2599     XBZRLE.cache = NULL;
2600 free_zero_page:
2601     g_free(XBZRLE.zero_target_page);
2602     XBZRLE.zero_target_page = NULL;
2603 err_out:
2604     XBZRLE_cache_unlock();
2605     return -ENOMEM;
2606 }
2607 
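/*
 * ram_state_init: allocate and initialize the global RAMState
 *
 * Sets up the bitmap and page-request mutexes, the request queue and
 * the initial dirty-page count (all of RAM is considered dirty).
 */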
2608 static int ram_state_init(RAMState **rsp)
2609 {
2610     *rsp = g_try_new0(RAMState, 1);
2611 
2612     if (!*rsp) {
2613         error_report("%s: Init ramstate fail", __func__);
2614         return -1;
2615     }
2616 
2617     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2618     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2619     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2620 
2621     /*
2622      * Count the total number of pages used by ram blocks not including any
2623      * gaps due to alignment or unplugs.
2624      * This must match the initial value of the dirty bitmap.
2625      */
2626     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2627     ram_state_reset(*rsp);
2628 
2629     return 0;
2630 }
2631 
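/*
 * ram_list_init_bitmaps: allocate the dirty and clear bitmaps for every
 * migratable RAM block, with every page initially marked dirty.
 *
 * clear_bitmap_shift is clamped to the supported range before use.
 */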
2632 static void ram_list_init_bitmaps(void)
2633 {
2634     MigrationState *ms = migrate_get_current();
2635     RAMBlock *block;
2636     unsigned long pages;
2637     uint8_t shift;
2638 
2639     /* Skip setting bitmap if there is no RAM */
2640     if (ram_bytes_total()) {
2641         shift = ms->clear_bitmap_shift;
2642         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2643             error_report("clear_bitmap_shift (%u) too big, using "
2644                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2645             shift = CLEAR_BITMAP_SHIFT_MAX;
2646         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2647             error_report("clear_bitmap_shift (%u) too small, using "
2648                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2649             shift = CLEAR_BITMAP_SHIFT_MIN;
2650         }
2651 
2652         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2653             pages = block->max_length >> TARGET_PAGE_BITS;
2654             /*
2655              * The initial dirty bitmap for migration must be set with all
2656              * ones to make sure we'll migrate every guest RAM page to the
2657              * destination.
2658              * Here we set RAMBlock.bmap all to 1 because when restarting
2659              * migration after a failed attempt, ram_list.
2660              * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2661              * guest memory.
2662              */
2663             block->bmap = bitmap_new(pages);
2664             bitmap_set(block->bmap, 0, pages);
2665             block->clear_bmap_shift = shift;
2666             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2667         }
2668     }
2669 }
2670 
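/*
 * ram_init_bitmaps: initialize the migration bitmaps and start dirty
 * logging (with an initial sync) under the iothread and ramlist locks.
 *
 * Background snapshots skip dirty logging entirely.
 */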
2671 static void ram_init_bitmaps(RAMState *rs)
2672 {
2673     /* For memory_global_dirty_log_start below.  */
2674     qemu_mutex_lock_iothread();
2675     qemu_mutex_lock_ramlist();
2676 
2677     WITH_RCU_READ_LOCK_GUARD() {
2678         ram_list_init_bitmaps();
2679         /* We don't use dirty log with background snapshots */
2680         if (!migrate_background_snapshot()) {
2681             memory_global_dirty_log_start();
2682             migration_bitmap_sync_precopy(rs);
2683         }
2684     }
2685     qemu_mutex_unlock_ramlist();
2686     qemu_mutex_unlock_iothread();
2687 }
2688 
2689 static int ram_init_all(RAMState **rsp)
2690 {
2691     if (ram_state_init(rsp)) {
2692         return -1;
2693     }
2694 
2695     if (xbzrle_init()) {
2696         ram_state_cleanup(rsp);
2697         return -1;
2698     }
2699 
2700     ram_init_bitmaps(*rsp);
2701 
2702     return 0;
2703 }
2704 
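/*
 * ram_state_resume_prepare: prepare RAMState for a postcopy resume
 *
 * Recounts the dirty pages from the block bitmaps, restarts the page
 * scan, leaves the bulk stage and switches to the new output file.
 */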
2705 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2706 {
2707     RAMBlock *block;
2708     uint64_t pages = 0;
2709 
2710     /*
2711      * Postcopy does not use xbzrle/compression, so there is no need for that.
2712      * Also, since the source is already halted, we don't need to care
2713      * about dirty page logging either.
2714      */
2715 
2716     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717         pages += bitmap_count_one(block->bmap,
2718                                   block->used_length >> TARGET_PAGE_BITS);
2719     }
2720 
2721     /* This may not be aligned with current bitmaps. Recalculate. */
2722     rs->migration_dirty_pages = pages;
2723 
2724     rs->last_seen_block = NULL;
2725     rs->last_sent_block = NULL;
2726     rs->last_page = 0;
2727     rs->last_version = ram_list.version;
2728     /*
2729      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2730      * matter what we have sent.
2731      */
2732     rs->ram_bulk_stage = false;
2733 
2734     /* Update RAMState cache of output QEMUFile */
2735     rs->f = out;
2736 
2737     trace_ram_state_resume_prepare(pages);
2738 }
2739 
2740 /*
2741  * This function clears bits of the free pages reported by the caller from the
2742  * migration dirty bitmap. @addr is the host address corresponding to the
2743  * start of the contiguous guest free pages, and @len is the total bytes of
2744  * those pages.
2745  */
2746 void qemu_guest_free_page_hint(void *addr, size_t len)
2747 {
2748     RAMBlock *block;
2749     ram_addr_t offset;
2750     size_t used_len, start, npages;
2751     MigrationState *s = migrate_get_current();
2752 
2753     /* This function is currently expected to be used during live migration */
2754     if (!migration_is_setup_or_active(s->state)) {
2755         return;
2756     }
2757 
2758     for (; len > 0; len -= used_len, addr += used_len) {
2759         block = qemu_ram_block_from_host(addr, false, &offset);
2760         if (unlikely(!block || offset >= block->used_length)) {
2761             /*
2762              * The implementation might not support RAMBlock resize during
2763              * live migration, but it could happen in theory with future
2764              * updates. So we add a check here to capture that case.
2765              */
2766             error_report_once("%s unexpected error", __func__);
2767             return;
2768         }
2769 
2770         if (len <= block->used_length - offset) {
2771             used_len = len;
2772         } else {
2773             used_len = block->used_length - offset;
2774         }
2775 
2776         start = offset >> TARGET_PAGE_BITS;
2777         npages = used_len >> TARGET_PAGE_BITS;
2778 
2779         qemu_mutex_lock(&ram_state->bitmap_mutex);
2780         ram_state->migration_dirty_pages -=
2781                       bitmap_count_one_with_offset(block->bmap, start, npages);
2782         bitmap_clear(block->bmap, start, npages);
2783         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2784     }
2785 }
2786 
2787 /*
2788  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2789  * a long-running RCU critical section.  When rcu-reclaims in the code
2790  * start to become numerous it will be necessary to reduce the
2791  * granularity of these critical sections.
2792  */
2793 
2794 /**
2795  * ram_save_setup: Setup RAM for migration
2796  *
2797  * Returns zero to indicate success and negative for error
2798  *
2799  * @f: QEMUFile where to send the data
2800  * @opaque: RAMState pointer
2801  */
2802 static int ram_save_setup(QEMUFile *f, void *opaque)
2803 {
2804     RAMState **rsp = opaque;
2805     RAMBlock *block;
2806 
2807     if (compress_threads_save_setup()) {
2808         return -1;
2809     }
2810 
2811     /* migration has already set up the bitmap, reuse it. */
2812     if (!migration_in_colo_state()) {
2813         if (ram_init_all(rsp) != 0) {
2814             compress_threads_save_cleanup();
2815             return -1;
2816         }
2817     }
2818     (*rsp)->f = f;
2819 
2820     WITH_RCU_READ_LOCK_GUARD() {
2821         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2822 
2823         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2824             qemu_put_byte(f, strlen(block->idstr));
2825             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2826             qemu_put_be64(f, block->used_length);
2827             if (migrate_postcopy_ram() && block->page_size !=
2828                                           qemu_host_page_size) {
2829                 qemu_put_be64(f, block->page_size);
2830             }
2831             if (migrate_ignore_shared()) {
2832                 qemu_put_be64(f, block->mr->addr);
2833             }
2834         }
2835     }
2836 
2837     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2838     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2839 
2840     multifd_send_sync_main(f);
2841     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2842     qemu_fflush(f);
2843 
2844     return 0;
2845 }
2846 
2847 /**
2848  * ram_save_iterate: iterative stage for migration
2849  *
2850  * Returns zero to indicate success and negative for error
2851  *
2852  * @f: QEMUFile where to send the data
2853  * @opaque: RAMState pointer
2854  */
2855 static int ram_save_iterate(QEMUFile *f, void *opaque)
2856 {
2857     RAMState **temp = opaque;
2858     RAMState *rs = *temp;
2859     int ret = 0;
2860     int i;
2861     int64_t t0;
2862     int done = 0;
2863 
2864     if (blk_mig_bulk_active()) {
2865         /* Avoid transferring RAM during the bulk phase of block migration as
2866          * the bulk phase will usually take a long time and transferring
2867          * RAM updates during that time is pointless. */
2868         goto out;
2869     }
2870 
2871     WITH_RCU_READ_LOCK_GUARD() {
2872         if (ram_list.version != rs->last_version) {
2873             ram_state_reset(rs);
2874         }
2875 
2876         /* Read version before ram_list.blocks */
2877         smp_rmb();
2878 
2879         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2880 
2881         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2882         i = 0;
2883         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2884                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2885             int pages;
2886 
2887             if (qemu_file_get_error(f)) {
2888                 break;
2889             }
2890 
2891             pages = ram_find_and_save_block(rs, false);
2892             /* no more pages to send */
2893             if (pages == 0) {
2894                 done = 1;
2895                 break;
2896             }
2897 
2898             if (pages < 0) {
2899                 qemu_file_set_error(f, pages);
2900                 break;
2901             }
2902 
2903             rs->target_page_count += pages;
2904 
2905             /*
2906              * During postcopy, it is necessary to make sure one whole host
2907              * page is sent in one chunk.
2908              */
2909             if (migrate_postcopy_ram()) {
2910                 flush_compressed_data(rs);
2911             }
2912 
2913             /*
2914              * We want to check on the 1st loop iteration, just in case it was
2915              * the 1st time and we had to sync the dirty bitmap.
2916              * qemu_clock_get_ns() is a bit expensive, so we only check every
2917              * few iterations.
2918              */
2919             if ((i & 63) == 0) {
2920                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2921                               1000000;
2922                 if (t1 > MAX_WAIT) {
2923                     trace_ram_save_iterate_big_wait(t1, i);
2924                     break;
2925                 }
2926             }
2927             i++;
2928         }
2929     }
2930 
2931     /*
2932      * Must occur before EOS (or any QEMUFile operation)
2933      * because of RDMA protocol.
2934      */
2935     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2936 
2937 out:
2938     if (ret >= 0
2939         && migration_is_setup_or_active(migrate_get_current()->state)) {
2940         multifd_send_sync_main(rs->f);
2941         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2942         qemu_fflush(f);
2943         ram_counters.transferred += 8;
2944 
2945         ret = qemu_file_get_error(f);
2946     }
2947     if (ret < 0) {
2948         return ret;
2949     }
2950 
2951     return done;
2952 }
2953 
2954 /**
2955  * ram_save_complete: function called to send the remaining amount of ram
2956  *
2957  * Returns zero to indicate success or negative on error
2958  *
2959  * Called with iothread lock
2960  *
2961  * @f: QEMUFile where to send the data
2962  * @opaque: RAMState pointer
2963  */
2964 static int ram_save_complete(QEMUFile *f, void *opaque)
2965 {
2966     RAMState **temp = opaque;
2967     RAMState *rs = *temp;
2968     int ret = 0;
2969 
2970     WITH_RCU_READ_LOCK_GUARD() {
2971         if (!migration_in_postcopy()) {
2972             migration_bitmap_sync_precopy(rs);
2973         }
2974 
2975         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2976 
2977         /* try transferring iterative blocks of memory */
2978 
2979         /* flush all remaining blocks regardless of rate limiting */
2980         while (true) {
2981             int pages;
2982 
2983             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2984             /* no more blocks to send */
2985             if (pages == 0) {
2986                 break;
2987             }
2988             if (pages < 0) {
2989                 ret = pages;
2990                 break;
2991             }
2992         }
2993 
2994         flush_compressed_data(rs);
2995         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2996     }
2997 
2998     if (ret >= 0) {
2999         multifd_send_sync_main(rs->f);
3000         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3001         qemu_fflush(f);
3002     }
3003 
3004     return ret;
3005 }
3006 
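/*
 * ram_save_pending: report how much RAM still has to be sent
 *
 * Outside postcopy, re-sync the dirty bitmap once the estimate drops
 * below @max_size; the remainder is reported as postcopy-compatible
 * when postcopy RAM is enabled, otherwise as precopy-only.
 */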
3007 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3008                              uint64_t *res_precopy_only,
3009                              uint64_t *res_compatible,
3010                              uint64_t *res_postcopy_only)
3011 {
3012     RAMState **temp = opaque;
3013     RAMState *rs = *temp;
3014     uint64_t remaining_size;
3015 
3016     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3017 
3018     if (!migration_in_postcopy() &&
3019         remaining_size < max_size) {
3020         qemu_mutex_lock_iothread();
3021         WITH_RCU_READ_LOCK_GUARD() {
3022             migration_bitmap_sync_precopy(rs);
3023         }
3024         qemu_mutex_unlock_iothread();
3025         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3026     }
3027 
3028     if (migrate_postcopy_ram()) {
3029         /* We can do postcopy, and all the data is postcopiable */
3030         *res_compatible += remaining_size;
3031     } else {
3032         *res_precopy_only += remaining_size;
3033     }
3034 }
3035 
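/*
 * load_xbzrle: load and decode one XBZRLE-compressed page into @host
 *
 * Returns 0 on success, -1 on a malformed header or decode failure.
 */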
3036 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3037 {
3038     unsigned int xh_len;
3039     int xh_flags;
3040     uint8_t *loaded_data;
3041 
3042     /* extract RLE header */
3043     xh_flags = qemu_get_byte(f);
3044     xh_len = qemu_get_be16(f);
3045 
3046     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3047         error_report("Failed to load XBZRLE page - wrong compression!");
3048         return -1;
3049     }
3050 
3051     if (xh_len > TARGET_PAGE_SIZE) {
3052         error_report("Failed to load XBZRLE page - len overflow!");
3053         return -1;
3054     }
3055     loaded_data = XBZRLE.decoded_buf;
3056     /* load data and decode */
3057     /* it can change loaded_data to point to an internal buffer */
3058     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3059 
3060     /* decode RLE */
3061     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3062                              TARGET_PAGE_SIZE) == -1) {
3063         error_report("Failed to load XBZRLE page - decode error!");
3064         return -1;
3065     }
3066 
3067     return 0;
3068 }
3069 
3070 /**
3071  * ram_block_from_stream: read a RAMBlock id from the migration stream
3072  *
3073  * Must be called from within a rcu critical section.
3074  *
3075  * Returns a pointer from within the RCU-protected ram_list.
3076  *
3077  * @f: QEMUFile where to read the data from
3078  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3079  */
3080 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3081 {
3082     static RAMBlock *block;
3083     char id[256];
3084     uint8_t len;
3085 
3086     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3087         if (!block) {
3088             error_report("Ack, bad migration stream!");
3089             return NULL;
3090         }
3091         return block;
3092     }
3093 
3094     len = qemu_get_byte(f);
3095     qemu_get_buffer(f, (uint8_t *)id, len);
3096     id[len] = 0;
3097 
3098     block = qemu_ram_block_by_name(id);
3099     if (!block) {
3100         error_report("Can't find block %s", id);
3101         return NULL;
3102     }
3103 
3104     if (ramblock_is_ignored(block)) {
3105         error_report("block %s should not be migrated !", id);
3106         return NULL;
3107     }
3108 
3109     return block;
3110 }
3111 
3112 static inline void *host_from_ram_block_offset(RAMBlock *block,
3113                                                ram_addr_t offset)
3114 {
3115     if (!offset_in_ramblock(block, offset)) {
3116         return NULL;
3117     }
3118 
3119     return block->host + offset;
3120 }
3121 
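/*
 * colo_cache_from_block_offset: map @offset to a pointer into the
 * block's COLO cache, or NULL when the offset or cache is invalid.
 *
 * With @record_bitmap set, the page is also marked in the block's
 * bitmap so a later checkpoint knows to flush it into the VM's RAM.
 */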
3122 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3123                              ram_addr_t offset, bool record_bitmap)
3124 {
3125     if (!offset_in_ramblock(block, offset)) {
3126         return NULL;
3127     }
3128     if (!block->colo_cache) {
3129         error_report("%s: colo_cache is NULL in block :%s",
3130                      __func__, block->idstr);
3131         return NULL;
3132     }
3133 
3134     /*
3135      * During a COLO checkpoint, we need a bitmap of these migrated pages.
3136      * It helps us decide which pages in the ram cache should be flushed
3137      * into the VM's RAM later.
3138      */
3139     if (record_bitmap &&
3140         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3141         ram_state->migration_dirty_pages++;
3142     }
3143     return block->colo_cache + offset;
3144 }
3145 
3146 /**
3147  * ram_handle_compressed: handle the zero page case
3148  *
3149  * If a page (or a whole RDMA chunk) has been
3150  * determined to be zero, then zap it.
3151  *
3152  * @host: host address for the zero page
3153  * @ch: what the page is filled from.  We only support zero
3154  * @size: size of the zero page
3155  */
3156 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3157 {
3158     if (ch != 0 || !is_zero_range(host, size)) {
3159         memset(host, ch, size);
3160     }
3161 }
3162 
3163 /* return the size after decompression, or negative value on error */
3164 static int
3165 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3166                      const uint8_t *source, size_t source_len)
3167 {
3168     int err;
3169 
3170     err = inflateReset(stream);
3171     if (err != Z_OK) {
3172         return -1;
3173     }
3174 
3175     stream->avail_in = source_len;
3176     stream->next_in = (uint8_t *)source;
3177     stream->avail_out = dest_len;
3178     stream->next_out = dest;
3179 
3180     err = inflate(stream, Z_NO_FLUSH);
3181     if (err != Z_STREAM_END) {
3182         return -1;
3183     }
3184 
3185     return stream->total_out;
3186 }
3187 
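/*
 * do_data_decompress: decompression worker thread
 *
 * Waits for a compressed buffer to be handed over, inflates it into the
 * destination page and signals completion to the main loading thread.
 */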
3188 static void *do_data_decompress(void *opaque)
3189 {
3190     DecompressParam *param = opaque;
3191     unsigned long pagesize;
3192     uint8_t *des;
3193     int len, ret;
3194 
3195     qemu_mutex_lock(&param->mutex);
3196     while (!param->quit) {
3197         if (param->des) {
3198             des = param->des;
3199             len = param->len;
3200             param->des = 0;
3201             qemu_mutex_unlock(&param->mutex);
3202 
3203             pagesize = TARGET_PAGE_SIZE;
3204 
3205             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3206                                        param->compbuf, len);
3207             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3208                 error_report("decompress data failed");
3209                 qemu_file_set_error(decomp_file, ret);
3210             }
3211 
3212             qemu_mutex_lock(&decomp_done_lock);
3213             param->done = true;
3214             qemu_cond_signal(&decomp_done_cond);
3215             qemu_mutex_unlock(&decomp_done_lock);
3216 
3217             qemu_mutex_lock(&param->mutex);
3218         } else {
3219             qemu_cond_wait(&param->cond, &param->mutex);
3220         }
3221     }
3222     qemu_mutex_unlock(&param->mutex);
3223 
3224     return NULL;
3225 }
3226 
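/*
 * wait_for_decompress_done: wait until every decompression thread has
 * finished its current page; returns any error recorded on decomp_file.
 */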
3227 static int wait_for_decompress_done(void)
3228 {
3229     int idx, thread_count;
3230 
3231     if (!migrate_use_compression()) {
3232         return 0;
3233     }
3234 
3235     thread_count = migrate_decompress_threads();
3236     qemu_mutex_lock(&decomp_done_lock);
3237     for (idx = 0; idx < thread_count; idx++) {
3238         while (!decomp_param[idx].done) {
3239             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3240         }
3241     }
3242     qemu_mutex_unlock(&decomp_done_lock);
3243     return qemu_file_get_error(decomp_file);
3244 }
3245 
3246 static void compress_threads_load_cleanup(void)
3247 {
3248     int i, thread_count;
3249 
3250     if (!migrate_use_compression()) {
3251         return;
3252     }
3253     thread_count = migrate_decompress_threads();
3254     for (i = 0; i < thread_count; i++) {
3255         /*
3256          * we use it as an indicator of whether the thread was
3257          * properly initialized or not
3258          */
3259         if (!decomp_param[i].compbuf) {
3260             break;
3261         }
3262 
3263         qemu_mutex_lock(&decomp_param[i].mutex);
3264         decomp_param[i].quit = true;
3265         qemu_cond_signal(&decomp_param[i].cond);
3266         qemu_mutex_unlock(&decomp_param[i].mutex);
3267     }
3268     for (i = 0; i < thread_count; i++) {
3269         if (!decomp_param[i].compbuf) {
3270             break;
3271         }
3272 
3273         qemu_thread_join(decompress_threads + i);
3274         qemu_mutex_destroy(&decomp_param[i].mutex);
3275         qemu_cond_destroy(&decomp_param[i].cond);
3276         inflateEnd(&decomp_param[i].stream);
3277         g_free(decomp_param[i].compbuf);
3278         decomp_param[i].compbuf = NULL;
3279     }
3280     g_free(decompress_threads);
3281     g_free(decomp_param);
3282     decompress_threads = NULL;
3283     decomp_param = NULL;
3284     decomp_file = NULL;
3285 }
3286 
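/*
 * compress_threads_load_setup: create the decompression threads and
 * their per-thread zlib streams and buffers.
 *
 * Returns 0 on success or -1 after cleaning up on failure.
 */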
3287 static int compress_threads_load_setup(QEMUFile *f)
3288 {
3289     int i, thread_count;
3290 
3291     if (!migrate_use_compression()) {
3292         return 0;
3293     }
3294 
3295     thread_count = migrate_decompress_threads();
3296     decompress_threads = g_new0(QemuThread, thread_count);
3297     decomp_param = g_new0(DecompressParam, thread_count);
3298     qemu_mutex_init(&decomp_done_lock);
3299     qemu_cond_init(&decomp_done_cond);
3300     decomp_file = f;
3301     for (i = 0; i < thread_count; i++) {
3302         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3303             goto exit;
3304         }
3305 
3306         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3307         qemu_mutex_init(&decomp_param[i].mutex);
3308         qemu_cond_init(&decomp_param[i].cond);
3309         decomp_param[i].done = true;
3310         decomp_param[i].quit = false;
3311         qemu_thread_create(decompress_threads + i, "decompress",
3312                            do_data_decompress, decomp_param + i,
3313                            QEMU_THREAD_JOINABLE);
3314     }
3315     return 0;
3316 exit:
3317     compress_threads_load_cleanup();
3318     return -1;
3319 }
3320 
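/*
 * decompress_data_with_multi_threads: hand a compressed page to an idle
 * decompression thread, waiting for one to become available if needed.
 */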
3321 static void decompress_data_with_multi_threads(QEMUFile *f,
3322                                                void *host, int len)
3323 {
3324     int idx, thread_count;
3325 
3326     thread_count = migrate_decompress_threads();
3327     QEMU_LOCK_GUARD(&decomp_done_lock);
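         /*
          * Find an idle decompress thread and hand it the compressed data;
          * if none is idle, wait on decomp_done_cond until one finishes.
          */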
3328     while (true) {
3329         for (idx = 0; idx < thread_count; idx++) {
3330             if (decomp_param[idx].done) {
3331                 decomp_param[idx].done = false;
3332                 qemu_mutex_lock(&decomp_param[idx].mutex);
3333                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3334                 decomp_param[idx].des = host;
3335                 decomp_param[idx].len = len;
3336                 qemu_cond_signal(&decomp_param[idx].cond);
3337                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3338                 break;
3339             }
3340         }
3341         if (idx < thread_count) {
3342             break;
3343         } else {
3344             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3345         }
3346     }
3347 }
3348 
3349 /*
3350  * We must set ram_bulk_stage to false; otherwise, in
3351  * migration_bitmap_find_dirty() the bitmap will not be consulted and
3352  * all the pages in the RAM cache will be flushed to the RAM of the
3353  * secondary VM.
3354  */
3355 static void colo_init_ram_state(void)
3356 {
3357     ram_state_init(&ram_state);
3358     ram_state->ram_bulk_stage = false;
3359 }
3360 
3361 /*
3362  * colo cache: this is for the secondary VM. We cache the whole
3363  * memory of the secondary VM; the global lock must be held
3364  * when calling this helper.
3365  */
3366 int colo_init_ram_cache(void)
3367 {
3368     RAMBlock *block;
3369 
3370     WITH_RCU_READ_LOCK_GUARD() {
3371         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3372             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3373                                                     NULL,
3374                                                     false);
3375             if (!block->colo_cache) {
3376                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3377                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3378                              block->used_length);
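                     /*
                      * Roll back: free the colo_cache allocations that already
                      * succeeded before returning the error.
                      */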
3379                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3380                     if (block->colo_cache) {
3381                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3382                         block->colo_cache = NULL;
3383                     }
3384                 }
3385                 return -errno;
3386             }
3387         }
3388     }
3389 
3390     /*
3391      * Record the dirty pages that are sent by the PVM; we use this dirty
3392      * bitmap to decide which pages in the cache should be flushed into
3393      * the SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3394      */
3395     if (ram_bytes_total()) {
3396         RAMBlock *block;
3397 
3398         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3399             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3400             block->bmap = bitmap_new(pages);
3401         }
3402     }
3403 
3404     colo_init_ram_state();
3405     return 0;
3406 }
3407 
3408 /* TODO: duplicated with ram_init_bitmaps */
3409 void colo_incoming_start_dirty_log(void)
3410 {
3411     RAMBlock *block = NULL;
3412     /* For memory_global_dirty_log_start below. */
3413     qemu_mutex_lock_iothread();
3414     qemu_mutex_lock_ramlist();
3415 
3416     memory_global_dirty_log_sync();
3417     WITH_RCU_READ_LOCK_GUARD() {
3418         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3419             ramblock_sync_dirty_bitmap(ram_state, block);
3420             /* Discard this dirty bitmap record */
3421             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3422         }
3423         memory_global_dirty_log_start();
3424     }
3425     ram_state->migration_dirty_pages = 0;
3426     qemu_mutex_unlock_ramlist();
3427     qemu_mutex_unlock_iothread();
3428 }
3429 
3430 /* The global lock must be held to call this helper */
3431 void colo_release_ram_cache(void)
3432 {
3433     RAMBlock *block;
3434 
3435     memory_global_dirty_log_stop();
3436     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3437         g_free(block->bmap);
3438         block->bmap = NULL;
3439     }
3440 
3441     WITH_RCU_READ_LOCK_GUARD() {
3442         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3443             if (block->colo_cache) {
3444                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3445                 block->colo_cache = NULL;
3446             }
3447         }
3448     }
3449     ram_state_cleanup(&ram_state);
3450 }
3451 
3452 /**
3453  * ram_load_setup: Set up RAM for the incoming side of migration
3454  *
3455  * Returns zero on success and negative on error
3456  *
3457  * @f: QEMUFile where to receive the data
3458  * @opaque: RAMState pointer
3459  */
3460 static int ram_load_setup(QEMUFile *f, void *opaque)
3461 {
3462     if (compress_threads_load_setup(f)) {
3463         return -1;
3464     }
3465 
3466     xbzrle_load_setup();
3467     ramblock_recv_map_init();
3468 
3469     return 0;
3470 }
3471 
3472 static int ram_load_cleanup(void *opaque)
3473 {
3474     RAMBlock *rb;
3475 
3476     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3477         qemu_ram_block_writeback(rb);
3478     }
3479 
3480     xbzrle_load_cleanup();
3481     compress_threads_load_cleanup();
3482 
3483     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3484         g_free(rb->receivedmap);
3485         rb->receivedmap = NULL;
3486     }
3487 
3488     return 0;
3489 }
3490 
3491 /**
3492  * ram_postcopy_incoming_init: allocate postcopy data structures
3493  *
3494  * Returns 0 on success and negative on error
3495  *
3496  * @mis: current migration incoming state
3497  *
3498  * Allocate data structures etc. needed by incoming migration with
3499  * postcopy-ram. postcopy-ram's similarly named
3500  * postcopy_ram_incoming_init() does the work.
3501  */
3502 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3503 {
3504     return postcopy_ram_incoming_init(mis);
3505 }
3506 
3507 /**
3508  * ram_load_postcopy: load a page in postcopy case
3509  *
3510  * Returns 0 for success or -errno in case of error
3511  *
3512  * Called in postcopy mode by ram_load().
3513  * rcu_read_lock is taken prior to this being called.
3514  *
3515  * @f: QEMUFile to receive the data from
3516  */
3517 static int ram_load_postcopy(QEMUFile *f)
3518 {
3519     int flags = 0, ret = 0;
3520     bool place_needed = false;
3521     bool matches_target_page_size = false;
3522     MigrationIncomingState *mis = migration_incoming_get_current();
3523     /* Temporary page that is later 'placed' */
3524     void *postcopy_host_page = mis->postcopy_tmp_page;
3525     void *this_host = NULL;
3526     bool all_zero = true;
3527     int target_pages = 0;
3528 
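         /*
          * Each iteration handles one target page from the stream; target
          * pages are accumulated in the temporary host page and placed
          * atomically once the whole host page has been received.
          */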
3529     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3530         ram_addr_t addr;
3531         void *host = NULL;
3532         void *page_buffer = NULL;
3533         void *place_source = NULL;
3534         RAMBlock *block = NULL;
3535         uint8_t ch;
3536         int len;
3537 
3538         addr = qemu_get_be64(f);
3539 
3540         /*
3541          * If there is a QEMU file error, we should stop here; "addr"
3542          * may then be invalid.
3543          */
3544         ret = qemu_file_get_error(f);
3545         if (ret) {
3546             break;
3547         }
3548 
3549         flags = addr & ~TARGET_PAGE_MASK;
3550         addr &= TARGET_PAGE_MASK;
3551 
3552         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3553         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3554                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3555             block = ram_block_from_stream(f, flags);
3556 
3557             host = host_from_ram_block_offset(block, addr);
3558             if (!host) {
3559                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3560                 ret = -EINVAL;
3561                 break;
3562             }
3563             target_pages++;
3564             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3565             /*
3566              * Postcopy requires that we place whole host pages atomically;
3567              * these may be huge pages for RAMBlocks that are backed by
3568              * hugetlbfs.
3569              * To make it atomic, the data is read into a temporary page
3570              * that's moved into place later.
3571              * The migration protocol uses (possibly smaller) target pages;
3572              * however, the source ensures it always sends all the components
3573              * of a host page in one chunk.
3574              */
3575             page_buffer = postcopy_host_page +
3576                           ((uintptr_t)host & (block->page_size - 1));
3577             if (target_pages == 1) {
3578                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3579                                                     block->page_size);
3580             } else {
3581                 /* not the first target page within the host page */
3582                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3583                     (uintptr_t)this_host) {
3584                     error_report("Non-same host page %p/%p",
3585                                   host, this_host);
3586                     ret = -EINVAL;
3587                     break;
3588                 }
3589             }
3590 
3591             /*
3592              * If it's the last part of a host page then we place the host
3593              * page
3594              */
3595             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3596                 place_needed = true;
3597             }
3598             place_source = postcopy_host_page;
3599         }
3600 
3601         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3602         case RAM_SAVE_FLAG_ZERO:
3603             ch = qemu_get_byte(f);
3604             /*
3605              * We can skip writing page_buffer when this is a zero page
3606              * and (block->page_size == TARGET_PAGE_SIZE).
3607              */
3608             if (ch || !matches_target_page_size) {
3609                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3610             }
3611             if (ch) {
3612                 all_zero = false;
3613             }
3614             break;
3615 
3616         case RAM_SAVE_FLAG_PAGE:
3617             all_zero = false;
3618             if (!matches_target_page_size) {
3619                 /* For huge pages, we always use the temporary buffer */
3620                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3621             } else {
3622                 /*
3623                  * For small pages that match the target page size, we
3624                  * avoid the qemu_file copy.  Instead we directly use
3625                  * the buffer of QEMUFile to place the page.  Note: we
3626                  * cannot do any QEMUFile operation before using that
3627                  * buffer to make sure the buffer is valid when
3628                  * placing the page.
3629                  */
3630                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3631                                          TARGET_PAGE_SIZE);
3632             }
3633             break;
3634         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3635             all_zero = false;
3636             len = qemu_get_be32(f);
3637             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3638                 error_report("Invalid compressed data length: %d", len);
3639                 ret = -EINVAL;
3640                 break;
3641             }
3642             decompress_data_with_multi_threads(f, page_buffer, len);
3643             break;
3644 
3645         case RAM_SAVE_FLAG_EOS:
3646             /* normal exit */
3647             multifd_recv_sync_main();
3648             break;
3649         default:
3650             error_report("Unknown combination of migration flags: 0x%x"
3651                          " (postcopy mode)", flags);
3652             ret = -EINVAL;
3653             break;
3654         }
3655 
3656         /* Got the whole host page, wait for decompression before placing. */
3657         if (place_needed) {
3658             ret |= wait_for_decompress_done();
3659         }
3660 
3661         /* Check for any possible file errors */
3662         if (!ret && qemu_file_get_error(f)) {
3663             ret = qemu_file_get_error(f);
3664         }
3665 
3666         if (!ret && place_needed) {
3667             /* This is reached on the last target page of the host page */
3668             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3669                                                        block->page_size);
3670 
3671             if (all_zero) {
3672                 ret = postcopy_place_page_zero(mis, place_dest,
3673                                                block);
3674             } else {
3675                 ret = postcopy_place_page(mis, place_dest,
3676                                           place_source, block);
3677             }
3678             place_needed = false;
3679             target_pages = 0;
3680             /* Assume we have a zero page until we detect something different */
3681             all_zero = true;
3682         }
3683     }
3684 
3685     return ret;
3686 }
3687 
3688 static bool postcopy_is_advised(void)
3689 {
3690     PostcopyState ps = postcopy_state_get();
3691     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3692 }
3693 
3694 static bool postcopy_is_running(void)
3695 {
3696     PostcopyState ps = postcopy_state_get();
3697     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3698 }
3699 
3700 /*
3701  * Flush the contents of the RAM cache into the SVM's memory.
3702  * Only flush the pages that are dirtied by the PVM, the SVM, or both.
3703  */
3704 void colo_flush_ram_cache(void)
3705 {
3706     RAMBlock *block = NULL;
3707     void *dst_host;
3708     void *src_host;
3709     unsigned long offset = 0;
3710 
3711     memory_global_dirty_log_sync();
3712     WITH_RCU_READ_LOCK_GUARD() {
3713         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3714             ramblock_sync_dirty_bitmap(ram_state, block);
3715         }
3716     }
3717 
3718     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3719     WITH_RCU_READ_LOCK_GUARD() {
3720         block = QLIST_FIRST_RCU(&ram_list.blocks);
3721 
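             /*
              * Walk the dirty bitmap of every block; each dirty page is
              * copied from the COLO cache into the SVM's RAM.
              */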
3722         while (block) {
3723             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3724 
3725             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3726                 >= block->used_length) {
3727                 offset = 0;
3728                 block = QLIST_NEXT_RCU(block, next);
3729             } else {
3730                 migration_bitmap_clear_dirty(ram_state, block, offset);
3731                 dst_host = block->host
3732                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3733                 src_host = block->colo_cache
3734                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3735                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3736             }
3737         }
3738     }
3739     trace_colo_flush_ram_cache_end();
3740 }
3741 
3742 /**
3743  * ram_load_precopy: load pages in precopy case
3744  *
3745  * Returns 0 for success or -errno in case of error
3746  *
3747  * Called in precopy mode by ram_load().
3748  * rcu_read_lock is taken prior to this being called.
3749  *
3750  * @f: QEMUFile to receive the data from
3751  */
3752 static int ram_load_precopy(QEMUFile *f)
3753 {
3754     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3755     /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3756     bool postcopy_advised = postcopy_is_advised();
3757     if (!migrate_use_compression()) {
3758         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3759     }
3760 
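         /*
          * Main receive loop: each record starts with a 64-bit value whose
          * low bits carry the RAM_SAVE_FLAG_* flags and whose upper bits
          * carry the page address, optionally followed by page data.
          */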
3761     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3762         ram_addr_t addr, total_ram_bytes;
3763         void *host = NULL, *host_bak = NULL;
3764         uint8_t ch;
3765 
3766         /*
3767          * Yield periodically to let the main loop run, but an iteration of
3768          * the main loop is expensive, so only do it every so many iterations.
3769          */
3770         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3771             aio_co_schedule(qemu_get_current_aio_context(),
3772                             qemu_coroutine_self());
3773             qemu_coroutine_yield();
3774         }
3775         i++;
3776 
3777         addr = qemu_get_be64(f);
3778         flags = addr & ~TARGET_PAGE_MASK;
3779         addr &= TARGET_PAGE_MASK;
3780 
3781         if (flags & invalid_flags) {
3782             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3783                 error_report("Received an unexpected compressed page");
3784             }
3785 
3786             ret = -EINVAL;
3787             break;
3788         }
3789 
3790         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3791                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3792             RAMBlock *block = ram_block_from_stream(f, flags);
3793 
3794             host = host_from_ram_block_offset(block, addr);
3795             /*
3796              * After entering the COLO stage, we should not load pages into
3797              * the SVM's memory directly; we put them into colo_cache first.
3798              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3799              * Previously, we copied all of this memory in the COLO preparation
3800              * stage, during which the VM had to be stopped, which is time-
3801              * consuming.  Here we optimize it with a trick: back up every page
3802              * during the migration process while COLO is enabled.  Although
3803              * this affects the speed of the migration, it clearly reduces the
3804              * downtime compared to backing up all of the SVM's memory in the
3805              * COLO preparation stage.
3806              */
3806             if (migration_incoming_colo_enabled()) {
3807                 if (migration_incoming_in_colo_state()) {
3808                     /* In COLO stage, put all pages into cache temporarily */
3809                     host = colo_cache_from_block_offset(block, addr, true);
3810                 } else {
3811                     /*
3812                      * In the migration stage but before the COLO stage,
3813                      * put all pages into both the cache and the SVM's memory.
3814                      */
3815                     host_bak = colo_cache_from_block_offset(block, addr, false);
3816                 }
3817             }
3818             if (!host) {
3819                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3820                 ret = -EINVAL;
3821                 break;
3822             }
3823             if (!migration_incoming_in_colo_state()) {
3824                 ramblock_recv_bitmap_set(block, host);
3825             }
3826 
3827             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3828         }
3829 
3830         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3831         case RAM_SAVE_FLAG_MEM_SIZE:
3832             /* Synchronize RAM block list */
3833             total_ram_bytes = addr;
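                 /*
                  * The stream describes every migratable block by name and
                  * length; validate each one against the local RAM block list.
                  */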
3834             while (!ret && total_ram_bytes) {
3835                 RAMBlock *block;
3836                 char id[256];
3837                 ram_addr_t length;
3838 
3839                 len = qemu_get_byte(f);
3840                 qemu_get_buffer(f, (uint8_t *)id, len);
3841                 id[len] = 0;
3842                 length = qemu_get_be64(f);
3843 
3844                 block = qemu_ram_block_by_name(id);
3845                 if (block && !qemu_ram_is_migratable(block)) {
3846                     error_report("block %s should not be migrated!", id);
3847                     ret = -EINVAL;
3848                 } else if (block) {
3849                     if (length != block->used_length) {
3850                         Error *local_err = NULL;
3851 
3852                         ret = qemu_ram_resize(block, length,
3853                                               &local_err);
3854                         if (local_err) {
3855                             error_report_err(local_err);
3856                         }
3857                     }
3858                     /* For postcopy we need to check that hugepage sizes match */
3859                     if (postcopy_advised && migrate_postcopy_ram() &&
3860                         block->page_size != qemu_host_page_size) {
3861                         uint64_t remote_page_size = qemu_get_be64(f);
3862                         if (remote_page_size != block->page_size) {
3863                             error_report("Mismatched RAM page size %s "
3864                                          "(local) %zd != %" PRId64,
3865                                          id, block->page_size,
3866                                          remote_page_size);
3867                             ret = -EINVAL;
3868                         }
3869                     }
3870                     if (migrate_ignore_shared()) {
3871                         hwaddr addr = qemu_get_be64(f);
3872                         if (ramblock_is_ignored(block) &&
3873                             block->mr->addr != addr) {
3874                             error_report("Mismatched GPAs for block %s "
3875                                          "%" PRId64 "!= %" PRId64,
3876                                          id, (uint64_t)addr,
3877                                          (uint64_t)block->mr->addr);
3878                             ret = -EINVAL;
3879                         }
3880                     }
3881                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3882                                           block->idstr);
3883                 } else {
3884                     error_report("Unknown ramblock \"%s\", cannot "
3885                                  "accept migration", id);
3886                     ret = -EINVAL;
3887                 }
3888 
3889                 total_ram_bytes -= length;
3890             }
3891             break;
3892 
3893         case RAM_SAVE_FLAG_ZERO:
3894             ch = qemu_get_byte(f);
3895             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3896             break;
3897 
3898         case RAM_SAVE_FLAG_PAGE:
3899             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3900             break;
3901 
3902         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3903             len = qemu_get_be32(f);
3904             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3905                 error_report("Invalid compressed data length: %d", len);
3906                 ret = -EINVAL;
3907                 break;
3908             }
3909             decompress_data_with_multi_threads(f, host, len);
3910             break;
3911 
3912         case RAM_SAVE_FLAG_XBZRLE:
3913             if (load_xbzrle(f, addr, host) < 0) {
3914                 error_report("Failed to decompress XBZRLE page at "
3915                              RAM_ADDR_FMT, addr);
3916                 ret = -EINVAL;
3917                 break;
3918             }
3919             break;
3920         case RAM_SAVE_FLAG_EOS:
3921             /* normal exit */
3922             multifd_recv_sync_main();
3923             break;
3924         default:
3925             if (flags & RAM_SAVE_FLAG_HOOK) {
3926                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3927             } else {
3928                 error_report("Unknown combination of migration flags: 0x%x",
3929                              flags);
3930                 ret = -EINVAL;
3931             }
3932         }
3933         if (!ret) {
3934             ret = qemu_file_get_error(f);
3935         }
3936         if (!ret && host_bak) {
3937             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3938         }
3939     }
3940 
3941     ret |= wait_for_decompress_done();
3942     return ret;
3943 }
3944 
3945 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3946 {
3947     int ret = 0;
3948     static uint64_t seq_iter;
3949     /*
3950      * If the system is running in postcopy mode, page inserts into host
3951      * memory must be atomic.
3952      */
3953     bool postcopy_running = postcopy_is_running();
3954 
3955     seq_iter++;
3956 
3957     if (version_id != 4) {
3958         return -EINVAL;
3959     }
3960 
3961     /*
3962      * This RCU critical section can be very long running.
3963      * When RCU reclaims in the code start to become numerous,
3964      * it will be necessary to reduce the granularity of this
3965      * critical section.
3966      */
3967     WITH_RCU_READ_LOCK_GUARD() {
3968         if (postcopy_running) {
3969             ret = ram_load_postcopy(f);
3970         } else {
3971             ret = ram_load_precopy(f);
3972         }
3973     }
3974     trace_ram_load_complete(ret, seq_iter);
3975 
3976     return ret;
3977 }
3978 
3979 static bool ram_has_postcopy(void *opaque)
3980 {
3981     RAMBlock *rb;
3982     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3983         if (ramblock_is_pmem(rb)) {
3984             info_report("Block: %s, host: %p is nvdimm memory, postcopy "
3985                         "is not supported now!", rb->idstr, rb->host);
3986             return false;
3987         }
3988     }
3989 
3990     return migrate_postcopy_ram();
3991 }
3992 
3993 /* Sync all the dirty bitmaps with the destination VM.  */
3994 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3995 {
3996     RAMBlock *block;
3997     QEMUFile *file = s->to_dst_file;
3998     int ramblock_count = 0;
3999 
4000     trace_ram_dirty_bitmap_sync_start();
4001 
4002     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4003         qemu_savevm_send_recv_bitmap(file, block->idstr);
4004         trace_ram_dirty_bitmap_request(block->idstr);
4005         ramblock_count++;
4006     }
4007 
4008     trace_ram_dirty_bitmap_sync_wait();
4009 
4010     /* Wait until all the ramblocks' dirty bitmaps are synced */
4011     while (ramblock_count--) {
4012         qemu_sem_wait(&s->rp_state.rp_sem);
4013     }
4014 
4015     trace_ram_dirty_bitmap_sync_complete();
4016 
4017     return 0;
4018 }
4019 
4020 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4021 {
4022     qemu_sem_post(&s->rp_state.rp_sem);
4023 }
4024 
4025 /*
4026  * Read the received bitmap, revert it as the initial dirty bitmap.
4027  * This is only used when the postcopy migration is paused but wants
4028  * to resume from a middle point.
4029  */
4030 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4031 {
4032     int ret = -EINVAL;
4033     QEMUFile *file = s->rp_state.from_dst_file;
4034     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4035     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4036     uint64_t size, end_mark;
4037 
4038     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4039 
4040     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4041         error_report("%s: incorrect state %s", __func__,
4042                      MigrationStatus_str(s->state));
4043         return -EINVAL;
4044     }
4045 
4046     /*
4047      * Note: see comments in ramblock_recv_bitmap_send() on why we
4048      * need the endianness conversion, and the padding.
4049      */
4050     local_size = ROUND_UP(local_size, 8);
4051 
4052     /* Add padding */
4053     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4054 
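         /*
          * Wire format (see ramblock_recv_bitmap_send()): a 64-bit size,
          * 'size' bytes of little-endian bitmap data, then a 64-bit end mark.
          */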
4055     size = qemu_get_be64(file);
4056 
4057     /* The size of the bitmap should match that of our ramblock */
4058     if (size != local_size) {
4059         error_report("%s: ramblock '%s' bitmap size mismatch "
4060                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4061                      block->idstr, size, local_size);
4062         ret = -EINVAL;
4063         goto out;
4064     }
4065 
4066     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4067     end_mark = qemu_get_be64(file);
4068 
4069     ret = qemu_file_get_error(file);
4070     if (ret || size != local_size) {
4071         error_report("%s: read bitmap failed for ramblock '%s': %d"
4072                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4073                      __func__, block->idstr, ret, local_size, size);
4074         ret = -EIO;
4075         goto out;
4076     }
4077 
4078     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4079         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4080                      __func__, block->idstr, end_mark);
4081         ret = -EINVAL;
4082         goto out;
4083     }
4084 
4085     /*
4086      * Endianness conversion. We are in postcopy (though paused).
4087      * The dirty bitmap won't change. We can directly modify it.
4088      */
4089     bitmap_from_le(block->bmap, le_bitmap, nbits);
4090 
4091     /*
4092      * What we received is the "received bitmap". Invert it to form the
4093      * initial dirty bitmap for this ramblock.
4094      */
4095     bitmap_complement(block->bmap, block->bmap, nbits);
4096 
4097     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4098 
4099     /*
4100      * We succeeded in syncing the bitmap for the current ramblock. If this
4101      * is the last one to sync, we need to notify the main send thread.
4102      */
4103     ram_dirty_bitmap_reload_notify(s);
4104 
4105     ret = 0;
4106 out:
4107     g_free(le_bitmap);
4108     return ret;
4109 }
4110 
4111 static int ram_resume_prepare(MigrationState *s, void *opaque)
4112 {
4113     RAMState *rs = *(RAMState **)opaque;
4114     int ret;
4115 
4116     ret = ram_dirty_bitmap_sync_all(s, rs);
4117     if (ret) {
4118         return ret;
4119     }
4120 
4121     ram_state_resume_prepare(rs, s->to_dst_file);
4122 
4123     return 0;
4124 }
4125 
4126 static SaveVMHandlers savevm_ram_handlers = {
4127     .save_setup = ram_save_setup,
4128     .save_live_iterate = ram_save_iterate,
4129     .save_live_complete_postcopy = ram_save_complete,
4130     .save_live_complete_precopy = ram_save_complete,
4131     .has_postcopy = ram_has_postcopy,
4132     .save_live_pending = ram_save_pending,
4133     .load_state = ram_load,
4134     .save_cleanup = ram_save_cleanup,
4135     .load_setup = ram_load_setup,
4136     .load_cleanup = ram_load_cleanup,
4137     .resume_prepare = ram_resume_prepare,
4138 };
4139 
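     /*
      * Register the "ram" section with the live-migration framework; the
      * version number 4 here matches the version_id check in ram_load().
      */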
4140 void ram_mig_init(void)
4141 {
4142     qemu_mutex_init(&XBZRLE.lock);
4143     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4144 }
4145