xref: /qemu/migration/ram.c (revision 33c11879fd422b759483ed25fef133ea900ea8d7)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 
47 #ifdef DEBUG_MIGRATION_RAM
48 #define DPRINTF(fmt, ...) \
49     do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
50 #else
51 #define DPRINTF(fmt, ...) \
52     do { } while (0)
53 #endif
54 
55 static int dirty_rate_high_cnt;
56 
57 static uint64_t bitmap_sync_count;
58 
59 /***********************************************************/
60 /* ram save/restore */
61 
62 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
63 #define RAM_SAVE_FLAG_COMPRESS 0x02
64 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
65 #define RAM_SAVE_FLAG_PAGE     0x08
66 #define RAM_SAVE_FLAG_EOS      0x10
67 #define RAM_SAVE_FLAG_CONTINUE 0x20
68 #define RAM_SAVE_FLAG_XBZRLE   0x40
69 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
70 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
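/* A note on encoding (a sketch derived from save_page_header() below, not a
 * formal spec): page offsets on the wire are target-page aligned, so their
 * low TARGET_PAGE_BITS are always zero and the flags above are OR'd into
 * those low bits of the be64 offset field; the load side later in this file
 * splits them apart again with TARGET_PAGE_MASK.
 */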
71 
72 static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
73 
74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
75 {
76     return buffer_find_nonzero_offset(p, size) == size;
77 }
78 
79 /* This struct contains the XBZRLE cache and a static page
80    used by the compression */
81 static struct {
82     /* buffer used for XBZRLE encoding */
83     uint8_t *encoded_buf;
84     /* buffer for storing page content */
85     uint8_t *current_buf;
86     /* Cache for XBZRLE, Protected by lock. */
87     PageCache *cache;
88     QemuMutex lock;
89 } XBZRLE;
90 
91 /* buffer used for XBZRLE decoding */
92 static uint8_t *xbzrle_decoded_buf;
93 
94 static void XBZRLE_cache_lock(void)
95 {
96     if (migrate_use_xbzrle()) {
97         qemu_mutex_lock(&XBZRLE.lock);
    }
98 }
99 
100 static void XBZRLE_cache_unlock(void)
101 {
102     if (migrate_use_xbzrle()) {
103         qemu_mutex_unlock(&XBZRLE.lock);
    }
104 }
105 
106 /*
107  * Called from qmp_migrate_set_cache_size() in the main thread, possibly
108  * while a migration is in progress.
109  * A running migration may be using the cache and might finish during this
110  * call, hence changes to the cache are protected by XBZRLE.lock.
111  */
112 int64_t xbzrle_cache_resize(int64_t new_size)
113 {
114     PageCache *new_cache;
115     int64_t ret;
116 
117     if (new_size < TARGET_PAGE_SIZE) {
118         return -1;
119     }
120 
121     XBZRLE_cache_lock();
122 
123     if (XBZRLE.cache != NULL) {
124         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
125             goto out_new_size;
126         }
127         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
128                                         TARGET_PAGE_SIZE);
129         if (!new_cache) {
130             error_report("Error creating cache");
131             ret = -1;
132             goto out;
133         }
134 
135         cache_fini(XBZRLE.cache);
136         XBZRLE.cache = new_cache;
137     }
138 
139 out_new_size:
140     ret = pow2floor(new_size);
141 out:
142     XBZRLE_cache_unlock();
143     return ret;
144 }
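/* Worked example for the rounding above: a requested size of 5 MiB returns
 * pow2floor(5 MiB) = 4 MiB, so callers should treat the return value rather
 * than the requested size as the effective cache size.  Assuming 4 KiB
 * target pages, that corresponds to a cache of roughly 1024 pages.
 */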
145 
146 /* accounting for migration statistics */
147 typedef struct AccountingInfo {
148     uint64_t dup_pages;
149     uint64_t skipped_pages;
150     uint64_t norm_pages;
151     uint64_t iterations;
152     uint64_t xbzrle_bytes;
153     uint64_t xbzrle_pages;
154     uint64_t xbzrle_cache_miss;
155     double xbzrle_cache_miss_rate;
156     uint64_t xbzrle_overflows;
157 } AccountingInfo;
158 
159 static AccountingInfo acct_info;
160 
161 static void acct_clear(void)
162 {
163     memset(&acct_info, 0, sizeof(acct_info));
164 }
165 
166 uint64_t dup_mig_bytes_transferred(void)
167 {
168     return acct_info.dup_pages * TARGET_PAGE_SIZE;
169 }
170 
171 uint64_t dup_mig_pages_transferred(void)
172 {
173     return acct_info.dup_pages;
174 }
175 
176 uint64_t skipped_mig_bytes_transferred(void)
177 {
178     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
179 }
180 
181 uint64_t skipped_mig_pages_transferred(void)
182 {
183     return acct_info.skipped_pages;
184 }
185 
186 uint64_t norm_mig_bytes_transferred(void)
187 {
188     return acct_info.norm_pages * TARGET_PAGE_SIZE;
189 }
190 
191 uint64_t norm_mig_pages_transferred(void)
192 {
193     return acct_info.norm_pages;
194 }
195 
196 uint64_t xbzrle_mig_bytes_transferred(void)
197 {
198     return acct_info.xbzrle_bytes;
199 }
200 
201 uint64_t xbzrle_mig_pages_transferred(void)
202 {
203     return acct_info.xbzrle_pages;
204 }
205 
206 uint64_t xbzrle_mig_pages_cache_miss(void)
207 {
208     return acct_info.xbzrle_cache_miss;
209 }
210 
211 double xbzrle_mig_cache_miss_rate(void)
212 {
213     return acct_info.xbzrle_cache_miss_rate;
214 }
215 
216 uint64_t xbzrle_mig_pages_overflow(void)
217 {
218     return acct_info.xbzrle_overflows;
219 }
220 
221 /* This is the last block that we have visited searching for dirty pages
222  */
223 static RAMBlock *last_seen_block;
224 /* This is the last block from where we have sent data */
225 static RAMBlock *last_sent_block;
226 static ram_addr_t last_offset;
227 static QemuMutex migration_bitmap_mutex;
228 static uint64_t migration_dirty_pages;
229 static uint32_t last_version;
230 static bool ram_bulk_stage;
231 
232 /* used by the search for pages to send */
233 struct PageSearchStatus {
234     /* Current block being searched */
235     RAMBlock    *block;
236     /* Current offset to search from */
237     ram_addr_t   offset;
238     /* Set once we wrap around */
239     bool         complete_round;
240 };
241 typedef struct PageSearchStatus PageSearchStatus;
242 
243 static struct BitmapRcu {
244     struct rcu_head rcu;
245     /* Main migration bitmap */
246     unsigned long *bmap;
247     /* bitmap of pages that haven't been sent even once
248      * only maintained and used in postcopy at the moment
249      * where it's used to send the dirtymap at the start
250      * of the postcopy phase
251      */
252     unsigned long *unsentmap;
253 } *migration_bitmap_rcu;
254 
255 struct CompressParam {
256     bool start;
257     bool done;
258     QEMUFile *file;
259     QemuMutex mutex;
260     QemuCond cond;
261     RAMBlock *block;
262     ram_addr_t offset;
263 };
264 typedef struct CompressParam CompressParam;
265 
266 struct DecompressParam {
267     bool start;
268     QemuMutex mutex;
269     QemuCond cond;
270     void *des;
271     uint8_t *compbuf;
272     int len;
273 };
274 typedef struct DecompressParam DecompressParam;
275 
276 static CompressParam *comp_param;
277 static QemuThread *compress_threads;
278 /* comp_done_cond is used to wake up the migration thread when
279  * one of the compression threads has finished the compression.
280  * comp_done_lock is used together with comp_done_cond.
281  */
282 static QemuMutex *comp_done_lock;
283 static QemuCond *comp_done_cond;
284 /* The empty QEMUFileOps will be used by file in CompressParam */
285 static const QEMUFileOps empty_ops = { };
286 
287 static bool compression_switch;
288 static bool quit_comp_thread;
289 static bool quit_decomp_thread;
290 static DecompressParam *decomp_param;
291 static QemuThread *decompress_threads;
292 
293 static int do_compress_ram_page(CompressParam *param);
294 
295 static void *do_data_compress(void *opaque)
296 {
297     CompressParam *param = opaque;
298 
299     while (!quit_comp_thread) {
300         qemu_mutex_lock(&param->mutex);
301         /* Re-check quit_comp_thread in case
302          * terminate_compression_threads() was called just before
303          * qemu_mutex_lock(&param->mutex) and after
304          * while (!quit_comp_thread); re-checking it here makes
305          * sure the compression thread terminates as expected.
306          */
307         while (!param->start && !quit_comp_thread) {
308             qemu_cond_wait(&param->cond, &param->mutex);
309         }
310         if (!quit_comp_thread) {
311             do_compress_ram_page(param);
312         }
313         param->start = false;
314         qemu_mutex_unlock(&param->mutex);
315 
316         qemu_mutex_lock(comp_done_lock);
317         param->done = true;
318         qemu_cond_signal(comp_done_cond);
319         qemu_mutex_unlock(comp_done_lock);
320     }
321 
322     return NULL;
323 }
324 
325 static inline void terminate_compression_threads(void)
326 {
327     int idx, thread_count;
328 
329     thread_count = migrate_compress_threads();
330     quit_comp_thread = true;
331     for (idx = 0; idx < thread_count; idx++) {
332         qemu_mutex_lock(&comp_param[idx].mutex);
333         qemu_cond_signal(&comp_param[idx].cond);
334         qemu_mutex_unlock(&comp_param[idx].mutex);
335     }
336 }
337 
338 void migrate_compress_threads_join(void)
339 {
340     int i, thread_count;
341 
342     if (!migrate_use_compression()) {
343         return;
344     }
345     terminate_compression_threads();
346     thread_count = migrate_compress_threads();
347     for (i = 0; i < thread_count; i++) {
348         qemu_thread_join(compress_threads + i);
349         qemu_fclose(comp_param[i].file);
350         qemu_mutex_destroy(&comp_param[i].mutex);
351         qemu_cond_destroy(&comp_param[i].cond);
352     }
353     qemu_mutex_destroy(comp_done_lock);
354     qemu_cond_destroy(comp_done_cond);
355     g_free(compress_threads);
356     g_free(comp_param);
357     g_free(comp_done_cond);
358     g_free(comp_done_lock);
359     compress_threads = NULL;
360     comp_param = NULL;
361     comp_done_cond = NULL;
362     comp_done_lock = NULL;
363 }
364 
365 void migrate_compress_threads_create(void)
366 {
367     int i, thread_count;
368 
369     if (!migrate_use_compression()) {
370         return;
371     }
372     quit_comp_thread = false;
373     compression_switch = true;
374     thread_count = migrate_compress_threads();
375     compress_threads = g_new0(QemuThread, thread_count);
376     comp_param = g_new0(CompressParam, thread_count);
377     comp_done_cond = g_new0(QemuCond, 1);
378     comp_done_lock = g_new0(QemuMutex, 1);
379     qemu_cond_init(comp_done_cond);
380     qemu_mutex_init(comp_done_lock);
381     for (i = 0; i < thread_count; i++) {
382         /* comp_param[i].file is just used as a dummy buffer to save data; set
383          * its ops to empty.
384          */
385         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
386         comp_param[i].done = true;
387         qemu_mutex_init(&comp_param[i].mutex);
388         qemu_cond_init(&comp_param[i].cond);
389         qemu_thread_create(compress_threads + i, "compress",
390                            do_data_compress, comp_param + i,
391                            QEMU_THREAD_JOINABLE);
392     }
393 }
394 
395 /**
396  * save_page_header: Write page header to wire
397  *
398  * If this is the first page of a new block, it also writes the block identification
399  *
400  * Returns: Number of bytes written
401  *
402  * @f: QEMUFile where to send the data
403  * @block: block that contains the page we want to send
404  * @offset: offset inside the block for the page
405  *          in the lower bits, it contains flags
406  */
407 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
408 {
409     size_t size, len;
410 
411     qemu_put_be64(f, offset);
412     size = 8;
413 
414     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
415         len = strlen(block->idstr);
416         qemu_put_byte(f, len);
417         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
418         size += 1 + len;
419     }
420     return size;
421 }
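/* Sketch of the stream layout save_page_header() produces (derived from the
 * code above, not a formal spec):
 *
 *   be64   offset | RAM_SAVE_FLAG_*         always, 8 bytes
 *   u8     strlen(block->idstr)             only when RAM_SAVE_FLAG_CONTINUE
 *   bytes  block->idstr (no trailing NUL)   is clear, i.e. a new block
 *
 * The return value is the total number of bytes written, which callers add
 * to their bytes_transferred accounting.
 */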
422 
423 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
424  * If guest dirty memory rate is reduced below the rate at which we can
425  * transfer pages to the destination then we should be able to complete
426  * migration. Some workloads dirty memory way too fast and will not effectively
427  * converge, even with auto-converge.
428  */
429 static void mig_throttle_guest_down(void)
430 {
431     MigrationState *s = migrate_get_current();
432     uint64_t pct_initial =
433             s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
434     uint64_t pct_icrement =
435             s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
436 
437     /* We have not started throttling yet. Let's start it. */
438     if (!cpu_throttle_active()) {
439         cpu_throttle_set(pct_initial);
440     } else {
441         /* Throttling already on, just increase the rate */
442         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
443     }
444 }
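/* Hypothetical numbers to illustrate the policy above: with throttle
 * parameters of 20 (initial) and 10 (increment), successive calls ask
 * cpu_throttle_set() for 20%, then 30%, 40%, ... of guest CPU time to be
 * spent sleeping, for as long as migration_bitmap_sync() keeps detecting a
 * dirty rate that outpaces the transfer rate.
 */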
445 
446 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
447  * The important thing is that a stale (not-yet-0'd) page be replaced
448  * by the new data.
449  * As a bonus, if the page wasn't in the cache it gets added so that
450  * when a small write is made into the 0'd page it gets XBZRLE sent
451  */
452 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
453 {
454     if (ram_bulk_stage || !migrate_use_xbzrle()) {
455         return;
456     }
457 
458     /* We don't care if this fails to allocate a new cache page
459      * as long as it updated an old one */
460     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
461                  bitmap_sync_count);
462 }
463 
464 #define ENCODING_FLAG_XBZRLE 0x1
465 
466 /**
467  * save_xbzrle_page: compress and send current page
468  *
469  * Returns: 1 means that we wrote the page
470  *          0 means that page is identical to the one already sent
471  *          -1 means that xbzrle would be longer than normal
472  *
473  * @f: QEMUFile where to send the data
474  * @current_data: pointer to the page's host data; may be redirected to the cached copy
475  * @current_addr: address of the page in the global ram_addr_t space
476  * @block: block that contains the page we want to send
477  * @offset: offset inside the block for the page
478  * @last_stage: if we are at the completion stage
479  * @bytes_transferred: increase it with the number of transferred bytes
480  */
481 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
482                             ram_addr_t current_addr, RAMBlock *block,
483                             ram_addr_t offset, bool last_stage,
484                             uint64_t *bytes_transferred)
485 {
486     int encoded_len = 0, bytes_xbzrle;
487     uint8_t *prev_cached_page;
488 
489     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
490         acct_info.xbzrle_cache_miss++;
491         if (!last_stage) {
492             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
493                              bitmap_sync_count) == -1) {
494                 return -1;
495             } else {
496                 /* update *current_data when the page has been
497                    inserted into cache */
498                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
499             }
500         }
501         return -1;
502     }
503 
504     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
505 
506     /* take a stable copy of the page: the guest may modify it while we encode */
507     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
508 
509     /* XBZRLE encoding (if there is no overflow) */
510     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
511                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
512                                        TARGET_PAGE_SIZE);
513     if (encoded_len == 0) {
514         DPRINTF("Skipping unmodified page\n");
515         return 0;
516     } else if (encoded_len == -1) {
517         DPRINTF("Overflow\n");
518         acct_info.xbzrle_overflows++;
519         /* update data in the cache */
520         if (!last_stage) {
521             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
522             *current_data = prev_cached_page;
523         }
524         return -1;
525     }
526 
527     /* update the cached data so that it matches what the destination now has */
528     if (!last_stage) {
529         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
530     }
531 
532     /* Send XBZRLE based compressed page */
533     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
534     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
535     qemu_put_be16(f, encoded_len);
536     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
537     bytes_xbzrle += encoded_len + 1 + 2;
538     acct_info.xbzrle_pages++;
539     acct_info.xbzrle_bytes += bytes_xbzrle;
540     *bytes_transferred += bytes_xbzrle;
541 
542     return 1;
543 }
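/* The record emitted above for a successfully encoded page is, following the
 * qemu_put_* calls (a sketch, not a formal spec):
 *
 *   page header with RAM_SAVE_FLAG_XBZRLE   (save_page_header)
 *   u8     ENCODING_FLAG_XBZRLE
 *   be16   encoded_len
 *   bytes  encoded_buf[encoded_len]
 *
 * which is where the "encoded_len + 1 + 2" accounting comes from.
 */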
544 
545 /* Called with rcu_read_lock() to protect migration_bitmap
546  * rb: The RAMBlock  to search for dirty pages in
547  * start: Start address (typically so we can continue from previous page)
548  * ram_addr_abs: Pointer into which to store the address of the dirty page
549  *               within the global ram_addr space
550  *
551  * Returns: byte offset within memory region of the start of a dirty page
552  */
553 static inline
554 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
555                                        ram_addr_t start,
556                                        ram_addr_t *ram_addr_abs)
557 {
558     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
559     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
560     uint64_t rb_size = rb->used_length;
561     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
562     unsigned long *bitmap;
563 
564     unsigned long next;
565 
566     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
567     if (ram_bulk_stage && nr > base) {
568         next = nr + 1;
569     } else {
570         next = find_next_bit(bitmap, size, nr);
571     }
572 
573     *ram_addr_abs = next << TARGET_PAGE_BITS;
574     return (next - base) << TARGET_PAGE_BITS;
575 }
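/* Worked example (outside the bulk stage), assuming 4 KiB target pages
 * (TARGET_PAGE_BITS == 12): for a RAMBlock at offset 0x40000000 the bitmap
 * base is 0x40000000 >> 12 = 0x40000, so a search starting at start == 0x2000
 * begins at bit 0x40002.  If the next dirty bit found is 0x40005, the
 * function stores 0x40005000 in *ram_addr_abs and returns the block-relative
 * offset 0x5000.
 */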
576 
577 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
578 {
579     bool ret;
580     int nr = addr >> TARGET_PAGE_BITS;
581     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
582 
583     ret = test_and_clear_bit(nr, bitmap);
584 
585     if (ret) {
586         migration_dirty_pages--;
587     }
588     return ret;
589 }
590 
591 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
592 {
593     unsigned long *bitmap;
594     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
595     migration_dirty_pages +=
596         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
597 }
598 
599 /* FIXME: there are too many global variables used in the migration process. */
600 static int64_t start_time;
601 static int64_t bytes_xfer_prev;
602 static int64_t num_dirty_pages_period;
603 static uint64_t xbzrle_cache_miss_prev;
604 static uint64_t iterations_prev;
605 
606 static void migration_bitmap_sync_init(void)
607 {
608     start_time = 0;
609     bytes_xfer_prev = 0;
610     num_dirty_pages_period = 0;
611     xbzrle_cache_miss_prev = 0;
612     iterations_prev = 0;
613 }
614 
615 static void migration_bitmap_sync(void)
616 {
617     RAMBlock *block;
618     uint64_t num_dirty_pages_init = migration_dirty_pages;
619     MigrationState *s = migrate_get_current();
620     int64_t end_time;
621     int64_t bytes_xfer_now;
622 
623     bitmap_sync_count++;
624 
625     if (!bytes_xfer_prev) {
626         bytes_xfer_prev = ram_bytes_transferred();
627     }
628 
629     if (!start_time) {
630         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
631     }
632 
633     trace_migration_bitmap_sync_start();
634     address_space_sync_dirty_bitmap(&address_space_memory);
635 
636     qemu_mutex_lock(&migration_bitmap_mutex);
637     rcu_read_lock();
638     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
639         migration_bitmap_sync_range(block->offset, block->used_length);
640     }
641     rcu_read_unlock();
642     qemu_mutex_unlock(&migration_bitmap_mutex);
643 
644     trace_migration_bitmap_sync_end(migration_dirty_pages
645                                     - num_dirty_pages_init);
646     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
647     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
648 
649     /* more than 1 second = 1000 milliseconds */
650     if (end_time > start_time + 1000) {
651         if (migrate_auto_converge()) {
652             /* The following detection logic can be refined later. For now:
653                Check to see if the dirtied bytes is 50% more than the approx.
654                amount of bytes that just got transferred since the last time we
655                were in this routine. If that happens twice, start or increase
656                throttling */
657             bytes_xfer_now = ram_bytes_transferred();
658 
659             if (s->dirty_pages_rate &&
660                (num_dirty_pages_period * TARGET_PAGE_SIZE >
661                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
662                (dirty_rate_high_cnt++ >= 2)) {
663                     trace_migration_throttle();
664                     dirty_rate_high_cnt = 0;
665                     mig_throttle_guest_down();
666              }
667              bytes_xfer_prev = bytes_xfer_now;
668         }
669 
670         if (migrate_use_xbzrle()) {
671             if (iterations_prev != acct_info.iterations) {
672                 acct_info.xbzrle_cache_miss_rate =
673                    (double)(acct_info.xbzrle_cache_miss -
674                             xbzrle_cache_miss_prev) /
675                    (acct_info.iterations - iterations_prev);
676             }
677             iterations_prev = acct_info.iterations;
678             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
679         }
680         s->dirty_pages_rate = num_dirty_pages_period * 1000
681             / (end_time - start_time);
682         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
683         start_time = end_time;
684         num_dirty_pages_period = 0;
685     }
686     s->dirty_sync_count = bitmap_sync_count;
687     if (migrate_use_events()) {
688         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
689     }
690 }
691 
692 /**
693  * save_zero_page: Send the zero page to the stream
694  *
695  * Returns: Number of pages written.
696  *
697  * @f: QEMUFile where to send the data
698  * @block: block that contains the page we want to send
699  * @offset: offset inside the block for the page
700  * @p: pointer to the page
701  * @bytes_transferred: increase it with the number of transferred bytes
702  */
703 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
704                           uint8_t *p, uint64_t *bytes_transferred)
705 {
706     int pages = -1;
707 
708     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
709         acct_info.dup_pages++;
710         *bytes_transferred += save_page_header(f, block,
711                                                offset | RAM_SAVE_FLAG_COMPRESS);
712         qemu_put_byte(f, 0);
713         *bytes_transferred += 1;
714         pages = 1;
715     }
716 
717     return pages;
718 }
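/* On the wire a zero page is just the page header with RAM_SAVE_FLAG_COMPRESS
 * set followed by a single byte 0 (the flag historically means "page filled
 * with the following byte"), so a 4 KiB page of zeroes costs 9 bytes plus the
 * block name on the first page of a new block.
 */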
719 
720 /**
721  * ram_save_page: Send the given page to the stream
722  *
723  * Returns: Number of pages written.
724  *          < 0 - error
725  *          >=0 - Number of pages written - this might legally be 0
726  *                if xbzrle noticed the page was the same.
727  *
728  * @f: QEMUFile where to send the data
729  * @pss: data about the page we want to send (the block that contains it
730  *       and the offset of the page inside the block)
731  * @last_stage: if we are at the completion stage
732  * @bytes_transferred: increase it with the number of transferred bytes
733  */
734 static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
735                          bool last_stage, uint64_t *bytes_transferred)
736 {
737     int pages = -1;
738     uint64_t bytes_xmit;
739     ram_addr_t current_addr;
740     uint8_t *p;
741     int ret;
742     bool send_async = true;
743     RAMBlock *block = pss->block;
744     ram_addr_t offset = pss->offset;
745 
746     p = block->host + offset;
747 
748     /* When in doubt, send the page as a normal page */
749     bytes_xmit = 0;
750     ret = ram_control_save_page(f, block->offset,
751                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
752     if (bytes_xmit) {
753         *bytes_transferred += bytes_xmit;
754         pages = 1;
755     }
756 
757     XBZRLE_cache_lock();
758 
759     current_addr = block->offset + offset;
760 
761     if (block == last_sent_block) {
762         offset |= RAM_SAVE_FLAG_CONTINUE;
763     }
764     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
765         if (ret != RAM_SAVE_CONTROL_DELAYED) {
766             if (bytes_xmit > 0) {
767                 acct_info.norm_pages++;
768             } else if (bytes_xmit == 0) {
769                 acct_info.dup_pages++;
770             }
771         }
772     } else {
773         pages = save_zero_page(f, block, offset, p, bytes_transferred);
774         if (pages > 0) {
775             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
776              * page would be stale
777              */
778             xbzrle_cache_zero_page(current_addr);
779         } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
780             pages = save_xbzrle_page(f, &p, current_addr, block,
781                                      offset, last_stage, bytes_transferred);
782             if (!last_stage) {
783                 /* Can't send this cached data async, since the cache page
784                  * might get updated before it gets to the wire
785                  */
786                 send_async = false;
787             }
788         }
789     }
790 
791     /* XBZRLE overflow or normal page */
792     if (pages == -1) {
793         *bytes_transferred += save_page_header(f, block,
794                                                offset | RAM_SAVE_FLAG_PAGE);
795         if (send_async) {
796             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
797         } else {
798             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
799         }
800         *bytes_transferred += TARGET_PAGE_SIZE;
801         pages = 1;
802         acct_info.norm_pages++;
803     }
804 
805     XBZRLE_cache_unlock();
806 
807     return pages;
808 }
809 
810 static int do_compress_ram_page(CompressParam *param)
811 {
812     int bytes_sent, blen;
813     uint8_t *p;
814     RAMBlock *block = param->block;
815     ram_addr_t offset = param->offset;
816 
817     p = block->host + (offset & TARGET_PAGE_MASK);
818 
819     bytes_sent = save_page_header(param->file, block, offset |
820                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
821     blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
822                                      migrate_compress_level());
823     bytes_sent += blen;
824 
825     return bytes_sent;
826 }
827 
828 static inline void start_compression(CompressParam *param)
829 {
830     param->done = false;
831     qemu_mutex_lock(&param->mutex);
832     param->start = true;
833     qemu_cond_signal(&param->cond);
834     qemu_mutex_unlock(&param->mutex);
835 }
836 
837 static inline void start_decompression(DecompressParam *param)
838 {
839     qemu_mutex_lock(&param->mutex);
840     param->start = true;
841     qemu_cond_signal(&param->cond);
842     qemu_mutex_unlock(&param->mutex);
843 }
844 
845 static uint64_t bytes_transferred;
846 
847 static void flush_compressed_data(QEMUFile *f)
848 {
849     int idx, len, thread_count;
850 
851     if (!migrate_use_compression()) {
852         return;
853     }
854     thread_count = migrate_compress_threads();
855     for (idx = 0; idx < thread_count; idx++) {
856         if (!comp_param[idx].done) {
857             qemu_mutex_lock(comp_done_lock);
858             while (!comp_param[idx].done && !quit_comp_thread) {
859                 qemu_cond_wait(comp_done_cond, comp_done_lock);
860             }
861             qemu_mutex_unlock(comp_done_lock);
862         }
863         if (!quit_comp_thread) {
864             len = qemu_put_qemu_file(f, comp_param[idx].file);
865             bytes_transferred += len;
866         }
867     }
868 }
869 
870 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
871                                        ram_addr_t offset)
872 {
873     param->block = block;
874     param->offset = offset;
875 }
876 
877 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
878                                            ram_addr_t offset,
879                                            uint64_t *bytes_transferred)
880 {
881     int idx, thread_count, bytes_xmit = -1, pages = -1;
882 
883     thread_count = migrate_compress_threads();
884     qemu_mutex_lock(comp_done_lock);
885     while (true) {
886         for (idx = 0; idx < thread_count; idx++) {
887             if (comp_param[idx].done) {
888                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
889                 set_compress_params(&comp_param[idx], block, offset);
890                 start_compression(&comp_param[idx]);
891                 pages = 1;
892                 acct_info.norm_pages++;
893                 *bytes_transferred += bytes_xmit;
894                 break;
895             }
896         }
897         if (pages > 0) {
898             break;
899         } else {
900             qemu_cond_wait(comp_done_cond, comp_done_lock);
901         }
902     }
903     qemu_mutex_unlock(comp_done_lock);
904 
905     return pages;
906 }
907 
908 /**
909  * ram_save_compressed_page: compress the given page and send it to the stream
910  *
911  * Returns: Number of pages written.
912  *
913  * @f: QEMUFile where to send the data
914  * @pss: data about the page we want to send (the block that contains it
915  *       and the offset of the page inside the block)
916  * @last_stage: if we are at the completion stage
917  * @bytes_transferred: increase it with the number of transferred bytes
918  */
919 static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
920                                     bool last_stage,
921                                     uint64_t *bytes_transferred)
922 {
923     int pages = -1;
924     uint64_t bytes_xmit;
925     uint8_t *p;
926     int ret;
927     RAMBlock *block = pss->block;
928     ram_addr_t offset = pss->offset;
929 
930     p = block->host + offset;
931 
932     bytes_xmit = 0;
933     ret = ram_control_save_page(f, block->offset,
934                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
935     if (bytes_xmit) {
936         *bytes_transferred += bytes_xmit;
937         pages = 1;
938     }
939     if (block == last_sent_block) {
940         offset |= RAM_SAVE_FLAG_CONTINUE;
941     }
942     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
943         if (ret != RAM_SAVE_CONTROL_DELAYED) {
944             if (bytes_xmit > 0) {
945                 acct_info.norm_pages++;
946             } else if (bytes_xmit == 0) {
947                 acct_info.dup_pages++;
948             }
949         }
950     } else {
951         /* When starting the process of a new block, the first page of
952          * the block should be sent out before other pages in the same
953          * block, and all the pages in the previous block should have been
954          * sent out already.  Keeping this order is important, because the
955          * 'cont' flag is used to avoid resending the block name.
956          */
957         if (block != last_sent_block) {
958             flush_compressed_data(f);
959             pages = save_zero_page(f, block, offset, p, bytes_transferred);
960             if (pages == -1) {
961                 set_compress_params(&comp_param[0], block, offset);
962                 /* Use the qemu thread to compress the data to make sure the
963                  * first page is sent out before other pages
964                  */
965                 bytes_xmit = do_compress_ram_page(&comp_param[0]);
966                 acct_info.norm_pages++;
967                 qemu_put_qemu_file(f, comp_param[0].file);
968                 *bytes_transferred += bytes_xmit;
969                 pages = 1;
970             }
971         } else {
972             pages = save_zero_page(f, block, offset, p, bytes_transferred);
973             if (pages == -1) {
974                 pages = compress_page_with_multi_thread(f, block, offset,
975                                                         bytes_transferred);
976             }
977         }
978     }
979 
980     return pages;
981 }
982 
983 /*
984  * Find the next dirty page and update any state associated with
985  * the search process.
986  *
987  * Returns: True if a page is found
988  *
989  * @f: Current migration stream.
990  * @pss: Data about the state of the current dirty page scan.
991  * @*again: Set to false if the search has scanned the whole of RAM
992  * *ram_addr_abs: Pointer into which to store the address of the dirty page
993  *               within the global ram_addr space
994  */
995 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
996                              bool *again, ram_addr_t *ram_addr_abs)
997 {
998     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
999                                               ram_addr_abs);
1000     if (pss->complete_round && pss->block == last_seen_block &&
1001         pss->offset >= last_offset) {
1002         /*
1003          * We've been once around the RAM and haven't found anything.
1004          * Give up.
1005          */
1006         *again = false;
1007         return false;
1008     }
1009     if (pss->offset >= pss->block->used_length) {
1010         /* Didn't find anything in this RAM Block */
1011         pss->offset = 0;
1012         pss->block = QLIST_NEXT_RCU(pss->block, next);
1013         if (!pss->block) {
1014             /* Hit the end of the list */
1015             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1016             /* Flag that we've looped */
1017             pss->complete_round = true;
1018             ram_bulk_stage = false;
1019             if (migrate_use_xbzrle()) {
1020                 /* If xbzrle is on, stop using the data compression at this
1021                  * point. In theory, xbzrle can do better than compression.
1022                  */
1023                 flush_compressed_data(f);
1024                 compression_switch = false;
1025             }
1026         }
1027         /* Didn't find anything this time, but try again on the new block */
1028         *again = true;
1029         return false;
1030     } else {
1031         /* Can go around again, but... */
1032         *again = true;
1033         /* We've found something so probably don't need to go around again */
1034         return true;
1035     }
1036 }
1037 
1038 /*
1039  * Helper for 'get_queued_page' - gets a page off the queue
1040  *      ms:      MigrationState in
1041  * *offset:      Used to return the offset within the RAMBlock
1042  * ram_addr_abs: global offset in the dirty/sent bitmaps
1043  *
1044  * Returns:      block (or NULL if none available)
1045  */
1046 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1047                               ram_addr_t *ram_addr_abs)
1048 {
1049     RAMBlock *block = NULL;
1050 
1051     qemu_mutex_lock(&ms->src_page_req_mutex);
1052     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1053         struct MigrationSrcPageRequest *entry =
1054                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1055         block = entry->rb;
1056         *offset = entry->offset;
1057         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1058                         TARGET_PAGE_MASK;
1059 
1060         if (entry->len > TARGET_PAGE_SIZE) {
1061             entry->len -= TARGET_PAGE_SIZE;
1062             entry->offset += TARGET_PAGE_SIZE;
1063         } else {
1064             memory_region_unref(block->mr);
1065             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1066             g_free(entry);
1067         }
1068     }
1069     qemu_mutex_unlock(&ms->src_page_req_mutex);
1070 
1071     return block;
1072 }
1073 
1074 /*
1075  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1076  * that are already sent (!dirty)
1077  *
1078  *      ms:      MigrationState in
1079  *     pss:      PageSearchStatus structure updated with found block/offset
1080  * ram_addr_abs: global offset in the dirty/sent bitmaps
1081  *
1082  * Returns:      true if a queued page is found
1083  */
1084 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1085                             ram_addr_t *ram_addr_abs)
1086 {
1087     RAMBlock  *block;
1088     ram_addr_t offset;
1089     bool dirty;
1090 
1091     do {
1092         block = unqueue_page(ms, &offset, ram_addr_abs);
1093         /*
1094          * We're sending this page, and since it's postcopy nothing else
1095          * will dirty it, and we must make sure it doesn't get sent again
1096          * even if this queue request was received after the background
1097          * search already sent it.
1098          */
1099         if (block) {
1100             unsigned long *bitmap;
1101             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1102             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1103             if (!dirty) {
1104                 trace_get_queued_page_not_dirty(
1105                     block->idstr, (uint64_t)offset,
1106                     (uint64_t)*ram_addr_abs,
1107                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1108                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1109             } else {
1110                 trace_get_queued_page(block->idstr,
1111                                       (uint64_t)offset,
1112                                       (uint64_t)*ram_addr_abs);
1113             }
1114         }
1115 
1116     } while (block && !dirty);
1117 
1118     if (block) {
1119         /*
1120          * As soon as we start servicing pages out of order, then we have
1121          * to kill the bulk stage, since the bulk stage assumes
1122          * (in migration_bitmap_find_dirty) that every page is
1123          * dirty, and that's no longer true.
1124          */
1125         ram_bulk_stage = false;
1126 
1127         /*
1128          * We want the background search to continue from the queued page
1129          * since the guest is likely to want other pages near to the page
1130          * it just requested.
1131          */
1132         pss->block = block;
1133         pss->offset = offset;
1134     }
1135 
1136     return !!block;
1137 }
1138 
1139 /**
1140  * flush_page_queue: Flush any remaining pages in the ram request queue
1141  *    it should be empty at the end anyway, but in error cases there may be
1142  *    some left.
1143  *
1144  * ms: MigrationState
1145  */
1146 void flush_page_queue(MigrationState *ms)
1147 {
1148     struct MigrationSrcPageRequest *mspr, *next_mspr;
1149     /* This queue generally should be empty - but in the case of a failed
1150      * migration it might have some droppings left in it.
1151      */
1152     rcu_read_lock();
1153     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1154         memory_region_unref(mspr->rb->mr);
1155         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1156         g_free(mspr);
1157     }
1158     rcu_read_unlock();
1159 }
1160 
1161 /**
1162  * Queue the pages for transmission, e.g. a request from postcopy destination
1163  *   ms: MigrationState in which the queue is held
1164  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1165  *   start: Offset from the start of the RAMBlock
1166  *   len: Length (in bytes) to send
1167  *   Return: 0 on success
1168  */
1169 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1170                          ram_addr_t start, ram_addr_t len)
1171 {
1172     RAMBlock *ramblock;
1173 
1174     rcu_read_lock();
1175     if (!rbname) {
1176         /* Reuse last RAMBlock */
1177         ramblock = ms->last_req_rb;
1178 
1179         if (!ramblock) {
1180             /*
1181              * Shouldn't happen, we can't reuse the last RAMBlock if
1182              * it's the 1st request.
1183              */
1184             error_report("ram_save_queue_pages no previous block");
1185             goto err;
1186         }
1187     } else {
1188         ramblock = qemu_ram_block_by_name(rbname);
1189 
1190         if (!ramblock) {
1191             /* We shouldn't be asked for a non-existent RAMBlock */
1192             error_report("ram_save_queue_pages no block '%s'", rbname);
1193             goto err;
1194         }
1195         ms->last_req_rb = ramblock;
1196     }
1197     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1198     if (start+len > ramblock->used_length) {
1199         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1200                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1201                      __func__, start, len, ramblock->used_length);
1202         goto err;
1203     }
1204 
1205     struct MigrationSrcPageRequest *new_entry =
1206         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1207     new_entry->rb = ramblock;
1208     new_entry->offset = start;
1209     new_entry->len = len;
1210 
1211     memory_region_ref(ramblock->mr);
1212     qemu_mutex_lock(&ms->src_page_req_mutex);
1213     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1214     qemu_mutex_unlock(&ms->src_page_req_mutex);
1215     rcu_read_unlock();
1216 
1217     return 0;
1218 
1219 err:
1220     rcu_read_unlock();
1221     return -1;
1222 }
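/* Sketch of a caller, as seen from the source's return-path handling of a
 * postcopy page request (names as in migration.c of this era; treat this as
 * illustrative rather than authoritative):
 *
 *     // destination faulted on a page and sent MIG_RP_MSG_REQ_PAGES;
 *     // the source ends up doing roughly:
 *     if (ram_save_queue_pages(ms, rbname, start, len)) {
 *         mark_source_rp_bad(ms);
 *     }
 *
 * The queued request is then picked up by get_queued_page() above, ahead of
 * the normal dirty-bitmap scan.
 */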
1223 
1224 /**
1225  * ram_save_target_page: Save one target page
1226  *
1227  *
1228  * @f: QEMUFile where to send the data
1229  * @pss: data about the page we want to send (the block that contains it
1230  *       and the offset of the page inside the block)
1231  * @last_stage: if we are at the completion stage
1232  * @bytes_transferred: increase it with the number of transferred bytes
1233  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1234  *
1235  * Returns: Number of pages written.
1236  */
1237 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1238                                 PageSearchStatus *pss,
1239                                 bool last_stage,
1240                                 uint64_t *bytes_transferred,
1241                                 ram_addr_t dirty_ram_abs)
1242 {
1243     int res = 0;
1244 
1245     /* Check whether the page is dirty and, if it is, send it */
1246     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1247         unsigned long *unsentmap;
1248         if (compression_switch && migrate_use_compression()) {
1249             res = ram_save_compressed_page(f, pss,
1250                                            last_stage,
1251                                            bytes_transferred);
1252         } else {
1253             res = ram_save_page(f, pss, last_stage,
1254                                 bytes_transferred);
1255         }
1256 
1257         if (res < 0) {
1258             return res;
1259         }
1260         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1261         if (unsentmap) {
1262             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1263         }
1264         /* Only update last_sent_block if a block was actually sent; xbzrle
1265          * might have decided the page was identical so didn't bother writing
1266          * to the stream.
1267          */
1268         if (res > 0) {
1269             last_sent_block = pss->block;
1270         }
1271     }
1272 
1273     return res;
1274 }
1275 
1276 /**
1277  * ram_save_host_page: Starting at *offset send pages up to the end
1278  *                     of the current host page.  It's valid for the initial
1279  *                     offset to point into the middle of a host page
1280  *                     in which case the remainder of the hostpage is sent.
1281  *                     Only dirty target pages are sent.
1282  *
1283  * Returns: Number of pages written.
1284  *
1285  * @f: QEMUFile where to send the data
1286  * @pss: data about the page we want to send (the block that contains it
1287  *       and the offset of the page inside the block); pss->offset is
1288  *       updated to the last target page sent
1289  * @last_stage: if we are at the completion stage
1290  * @bytes_transferred: increase it with the number of transferred bytes
1291  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1292  */
1293 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1294                               PageSearchStatus *pss,
1295                               bool last_stage,
1296                               uint64_t *bytes_transferred,
1297                               ram_addr_t dirty_ram_abs)
1298 {
1299     int tmppages, pages = 0;
1300     do {
1301         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1302                                         bytes_transferred, dirty_ram_abs);
1303         if (tmppages < 0) {
1304             return tmppages;
1305         }
1306 
1307         pages += tmppages;
1308         pss->offset += TARGET_PAGE_SIZE;
1309         dirty_ram_abs += TARGET_PAGE_SIZE;
1310     } while (pss->offset & (qemu_host_page_size - 1));
1311 
1312     /* The offset we leave with is the last one we looked at */
1313     pss->offset -= TARGET_PAGE_SIZE;
1314     return pages;
1315 }
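/* Example: with 64 KiB host pages (e.g. an ARM64 or POWER host configured
 * with 64K pages) and 4 KiB target pages, one call covers up to 16 target
 * pages; with matching host and target page sizes the loop body runs exactly
 * once.
 */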
1316 
1317 /**
1318  * ram_find_and_save_block: Finds a dirty page and sends it to f
1319  *
1320  * Called within an RCU critical section.
1321  *
1322  * Returns:  The number of pages written
1323  *           0 means no dirty pages
1324  *
1325  * @f: QEMUFile where to send the data
1326  * @last_stage: if we are at the completion stage
1327  * @bytes_transferred: increase it with the number of transferred bytes
1328  *
1329  * On systems where host-page-size > target-page-size it will send all the
1330  * pages in a host page that are dirty.
1331  */
1332 
1333 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1334                                    uint64_t *bytes_transferred)
1335 {
1336     PageSearchStatus pss;
1337     MigrationState *ms = migrate_get_current();
1338     int pages = 0;
1339     bool again, found;
1340     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1341                                  ram_addr_t space */
1342 
1343     pss.block = last_seen_block;
1344     pss.offset = last_offset;
1345     pss.complete_round = false;
1346 
1347     if (!pss.block) {
1348         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1349     }
1350 
1351     do {
1352         again = true;
1353         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1354 
1355         if (!found) {
1356             /* priority queue empty, so just search for something dirty */
1357             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1358         }
1359 
1360         if (found) {
1361             pages = ram_save_host_page(ms, f, &pss,
1362                                        last_stage, bytes_transferred,
1363                                        dirty_ram_abs);
1364         }
1365     } while (!pages && again);
1366 
1367     last_seen_block = pss.block;
1368     last_offset = pss.offset;
1369 
1370     return pages;
1371 }
1372 
1373 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1374 {
1375     uint64_t pages = size / TARGET_PAGE_SIZE;
1376     if (zero) {
1377         acct_info.dup_pages += pages;
1378     } else {
1379         acct_info.norm_pages += pages;
1380         bytes_transferred += size;
1381         qemu_update_position(f, size);
1382     }
1383 }
1384 
1385 static ram_addr_t ram_save_remaining(void)
1386 {
1387     return migration_dirty_pages;
1388 }
1389 
1390 uint64_t ram_bytes_remaining(void)
1391 {
1392     return ram_save_remaining() * TARGET_PAGE_SIZE;
1393 }
1394 
1395 uint64_t ram_bytes_transferred(void)
1396 {
1397     return bytes_transferred;
1398 }
1399 
1400 uint64_t ram_bytes_total(void)
1401 {
1402     RAMBlock *block;
1403     uint64_t total = 0;
1404 
1405     rcu_read_lock();
1406     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1407         total += block->used_length;
1408     rcu_read_unlock();
1409     return total;
1410 }
1411 
1412 void free_xbzrle_decoded_buf(void)
1413 {
1414     g_free(xbzrle_decoded_buf);
1415     xbzrle_decoded_buf = NULL;
1416 }
1417 
1418 static void migration_bitmap_free(struct BitmapRcu *bmap)
1419 {
1420     g_free(bmap->bmap);
1421     g_free(bmap->unsentmap);
1422     g_free(bmap);
1423 }
1424 
1425 static void ram_migration_cleanup(void *opaque)
1426 {
1427     /* the caller must hold the iothread lock or be in a bottom half, so there
1428      * is no write race against this migration_bitmap
1429      */
1430     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1431     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1432     if (bitmap) {
1433         memory_global_dirty_log_stop();
1434         call_rcu(bitmap, migration_bitmap_free, rcu);
1435     }
1436 
1437     XBZRLE_cache_lock();
1438     if (XBZRLE.cache) {
1439         cache_fini(XBZRLE.cache);
1440         g_free(XBZRLE.encoded_buf);
1441         g_free(XBZRLE.current_buf);
1442         XBZRLE.cache = NULL;
1443         XBZRLE.encoded_buf = NULL;
1444         XBZRLE.current_buf = NULL;
1445     }
1446     XBZRLE_cache_unlock();
1447 }
1448 
1449 static void reset_ram_globals(void)
1450 {
1451     last_seen_block = NULL;
1452     last_sent_block = NULL;
1453     last_offset = 0;
1454     last_version = ram_list.version;
1455     ram_bulk_stage = true;
1456 }
1457 
1458 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1459 
1460 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1461 {
1462     /* called in the QEMU main thread, so there is
1463      * no write race against this migration_bitmap
1464      */
1465     if (migration_bitmap_rcu) {
1466         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1467         bitmap = g_new(struct BitmapRcu, 1);
1468         bitmap->bmap = bitmap_new(new);
1469 
1470         /* prevent bits in the migration bitmap from being set
1471          * by migration_bitmap_sync_range() at the same time.
1472          * It is safe for migration if a bit in migration_bitmap is cleared
1473          * at the same time.
1474          */
1475         qemu_mutex_lock(&migration_bitmap_mutex);
1476         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1477         bitmap_set(bitmap->bmap, old, new - old);
1478 
1479         /* We don't have a way to safely extend the unsentmap
1480          * with RCU; so mark it as missing, and entry to postcopy
1481          * will fail.
1482          */
1483         bitmap->unsentmap = NULL;
1484 
1485         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1486         qemu_mutex_unlock(&migration_bitmap_mutex);
1487         migration_dirty_pages += new - old;
1488         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1489     }
1490 }
1491 
1492 /*
1493  * 'expected' is the value you expect the bitmap mostly to be full
1494  * of; it won't bother printing lines that are all this value.
1495  * If 'todump' is null the migration bitmap is dumped.
1496  */
1497 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1498 {
1499     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1500 
1501     int64_t cur;
1502     int64_t linelen = 128;
1503     char linebuf[129];
1504 
1505     if (!todump) {
1506         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1507     }
1508 
1509     for (cur = 0; cur < ram_pages; cur += linelen) {
1510         int64_t curb;
1511         bool found = false;
1512         /*
1513          * Last line; catch the case where the line length
1514          * is longer than remaining ram
1515          */
1516         if (cur + linelen > ram_pages) {
1517             linelen = ram_pages - cur;
1518         }
1519         for (curb = 0; curb < linelen; curb++) {
1520             bool thisbit = test_bit(cur + curb, todump);
1521             linebuf[curb] = thisbit ? '1' : '.';
1522             found = found || (thisbit != expected);
1523         }
1524         if (found) {
1525             linebuf[curb] = '\0';
1526             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1527         }
1528     }
1529 }
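/* A (shortened) line of output looks like:
 *
 *   0x00000080 : ..........111111..........
 *
 * i.e. the page index of the first column in hex, then one character per
 * page: '1' for a set bit, '.' for a clear one.  Lines consisting entirely of
 * the 'expected' value are suppressed.
 */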
1530 
1531 /* **** functions for postcopy ***** */
1532 
1533 /*
1534  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1535  * Note: At this point the 'unsentmap' is the processed bitmap combined
1536  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1537  * start,length: Indexes into the bitmap for the first bit
1538  *            representing the named block and length in target-pages
1539  */
1540 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1541                                         PostcopyDiscardState *pds,
1542                                         unsigned long start,
1543                                         unsigned long length)
1544 {
1545     unsigned long end = start + length; /* one after the end */
1546     unsigned long current;
1547     unsigned long *unsentmap;
1548 
1549     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1550     for (current = start; current < end; ) {
1551         unsigned long one = find_next_bit(unsentmap, end, current);
1552 
1553         if (one <= end) {
1554             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1555             unsigned long discard_length;
1556 
1557             if (zero >= end) {
1558                 discard_length = end - one;
1559             } else {
1560                 discard_length = zero - one;
1561             }
1562             postcopy_discard_send_range(ms, pds, one, discard_length);
1563             current = one + discard_length;
1564         } else {
1565             current = one;
1566         }
1567     }
1568 
1569     return 0;
1570 }
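/* Worked example (indexes relative to 'start'): if the unsentmap bits are
 * 0011110010..., the first iteration finds a run of four set bits beginning
 * at index 2 and sends it as a single discard range; the next iteration
 * resumes at index 6 and picks up the one-page run at index 8.
 */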
1571 
1572 /*
1573  * Utility for the outgoing postcopy code.
1574  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1575  *   passing it bitmap indexes and name.
1576  * Returns: 0 on success
1577  * (qemu_ram_foreach_block ends up passing unscaled lengths
1578  *  which would mean postcopy code would have to deal with target page)
1579  */
1580 static int postcopy_each_ram_send_discard(MigrationState *ms)
1581 {
1582     struct RAMBlock *block;
1583     int ret;
1584 
1585     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1586         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1587         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1588                                                                first,
1589                                                                block->idstr);
1590 
1591         /*
1592          * Postcopy sends chunks of bitmap over the wire, but it
1593          * just needs indexes at this point; this avoids any
1594          * target-page-size specific code in the discard path.
1595          */
1596         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1597                                     block->used_length >> TARGET_PAGE_BITS);
1598         postcopy_discard_send_finish(ms, pds);
1599         if (ret) {
1600             return ret;
1601         }
1602     }
1603 
1604     return 0;
1605 }
1606 
1607 /*
1608  * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1609  *   the two bitmaps, which are similar but one is inverted.
1610  *
1611  * We search for runs of target-pages that don't start or end on a
1612  * host page boundary;
1613  * unsent_pass=true: Cleans up partially unsent host pages by searching
1614  *                 the unsentmap
1615  * unsent_pass=false: Cleans up partially dirty host pages by searching
1616  *                 the main migration bitmap
1617  *
1618  */
1619 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1620                                           RAMBlock *block,
1621                                           PostcopyDiscardState *pds)
1622 {
1623     unsigned long *bitmap;
1624     unsigned long *unsentmap;
1625     unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1626     unsigned long first = block->offset >> TARGET_PAGE_BITS;
1627     unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1628     unsigned long last = first + (len - 1);
1629     unsigned long run_start;
1630 
1631     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1632     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1633 
1634     if (unsent_pass) {
1635         /* Find a sent page */
1636         run_start = find_next_zero_bit(unsentmap, last + 1, first);
1637     } else {
1638         /* Find a dirty page */
1639         run_start = find_next_bit(bitmap, last + 1, first);
1640     }
1641 
1642     while (run_start <= last) {
1643         bool do_fixup = false;
1644         unsigned long fixup_start_addr;
1645         unsigned long host_offset;
1646 
1647         /*
1648          * If the start of this run of pages is in the middle of a host
1649          * page, then we need to fixup this host page.
1650          */
1651         host_offset = run_start % host_ratio;
1652         if (host_offset) {
1653             do_fixup = true;
1654             run_start -= host_offset;
1655             fixup_start_addr = run_start;
1656             /* For the next pass */
1657             run_start = run_start + host_ratio;
1658         } else {
1659             /* Find the end of this run */
1660             unsigned long run_end;
1661             if (unsent_pass) {
1662                 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1663             } else {
1664                 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1665             }
1666             /*
1667              * If the end isn't at the start of a host page, then the
1668              * run doesn't finish at the end of a host page
1669              * and we need to discard.
1670              */
1671             host_offset = run_end % host_ratio;
1672             if (host_offset) {
1673                 do_fixup = true;
1674                 fixup_start_addr = run_end - host_offset;
1675                 /*
1676                  * This host page has gone, the next loop iteration starts
1677                  * from after the fixup
1678                  */
1679                 run_start = fixup_start_addr + host_ratio;
1680             } else {
1681                 /*
1682                  * No discards on this iteration, next loop starts from
1683                  * next sent/dirty page
1684                  */
1685                 run_start = run_end + 1;
1686             }
1687         }
1688 
1689         if (do_fixup) {
1690             unsigned long page;
1691 
1692             /* Tell the destination to discard this page */
1693             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1694                 /* For the unsent_pass we:
1695                  *     discard partially sent pages
1696                  * For the !unsent_pass (dirty) we:
1697                  *     discard partially dirty pages that were sent
1698                  *     (any partially sent pages were already discarded
1699                  *     by the previous unsent_pass)
1700                  */
1701                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1702                                             host_ratio);
1703             }
1704 
1705             /* Clean up the bitmap */
1706             for (page = fixup_start_addr;
1707                  page < fixup_start_addr + host_ratio; page++) {
1708                 /* All pages in this host page are now not sent */
1709                 set_bit(page, unsentmap);
1710 
1711                 /*
1712                  * Remark them as dirty, updating the count for any pages
1713                  * that weren't previously dirty.
1714                  */
1715                 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1716             }
1717         }
1718 
1719         if (unsent_pass) {
1720             /* Find the next sent page for the next iteration */
1721             run_start = find_next_zero_bit(unsentmap, last + 1,
1722                                            run_start);
1723         } else {
1724             /* Find the next dirty page for the next iteration */
1725             run_start = find_next_bit(bitmap, last + 1, run_start);
1726         }
1727     }
1728 }
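
/*
 * Worked example (hypothetical sizes): with 4K target pages and a 16K host
 * page (host_ratio == 4), a dirty run found at target page 6 starts in the
 * middle of the host page covering target pages 4-7, so fixup_start_addr
 * becomes 4: all four pages are marked unsent and dirty again, and a discard
 * for that host page is sent (in the dirty pass, only if it had already been
 * sent), ensuring the whole host page is retransmitted as a unit.
 */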
1729 
1730 /*
1731  * Utility for the outgoing postcopy code.
1732  *
1733  * Discard any partially sent host-page size chunks, mark any partially
1734  * dirty host-page size chunks as all dirty.
1735  *
1736  * Returns: 0 on success
1737  */
1738 static int postcopy_chunk_hostpages(MigrationState *ms)
1739 {
1740     struct RAMBlock *block;
1741 
1742     if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1743         /* Easy case - TPS==HPS - nothing to be done */
1744         return 0;
1745     }
1746 
1747     /* Easiest way to make sure we don't resume in the middle of a host-page */
1748     last_seen_block = NULL;
1749     last_sent_block = NULL;
1750     last_offset     = 0;
1751 
1752     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1753         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1754 
1755         PostcopyDiscardState *pds =
1756                          postcopy_discard_send_init(ms, first, block->idstr);
1757 
1758         /* First pass: Discard all partially sent host pages */
1759         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1760         /*
1761          * Second pass: Ensure that all partially dirty host pages are made
1762          * fully dirty.
1763          */
1764         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1765 
1766         postcopy_discard_send_finish(ms, pds);
1767     } /* ram_list loop */
1768 
1769     return 0;
1770 }
1771 
1772 /*
1773  * Transmit the set of pages to be discarded after precopy to the target;
1774  * these are pages that:
1775  *     a) have been previously transmitted but are now dirty again
1776  *     b) have never been transmitted; this ensures that any pages on the
1777  *        destination that have been mapped by background tasks get
1778  *        discarded (transparent huge pages are the specific concern)
1779  * Hopefully this is pretty sparse
1780  */
1781 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1782 {
1783     int ret;
1784     unsigned long *bitmap, *unsentmap;
1785 
1786     rcu_read_lock();
1787 
1788     /* This should be our last sync, the src is now paused */
1789     migration_bitmap_sync();
1790 
1791     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1792     if (!unsentmap) {
1793         /* We don't have a safe way to resize the unsentmap, so
1794          * if the bitmap was resized it will be NULL at this
1795          * point.
1796          */
1797         error_report("migration ram resized during precopy phase");
1798         rcu_read_unlock();
1799         return -EINVAL;
1800     }
1801 
1802     /* Deal with TPS != HPS */
1803     ret = postcopy_chunk_hostpages(ms);
1804     if (ret) {
1805         rcu_read_unlock();
1806         return ret;
1807     }
1808 
1809     /*
1810      * Update the unsentmap to be unsentmap = unsentmap | dirty
1811      */
1812     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1813     bitmap_or(unsentmap, unsentmap, bitmap,
1814                last_ram_offset() >> TARGET_PAGE_BITS);
1815 
1816 
1817     trace_ram_postcopy_send_discard_bitmap();
1818 #ifdef DEBUG_POSTCOPY
1819     ram_debug_dump_bitmap(unsentmap, true);
1820 #endif
1821 
1822     ret = postcopy_each_ram_send_discard(ms);
1823     rcu_read_unlock();
1824 
1825     return ret;
1826 }
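
/*
 * Net effect of the function above: a page stays clear in the unsentmap only
 * if it was both sent and is still clean; every other page is discarded on
 * the destination, so it can be supplied again during the postcopy phase
 * instead of relying on a possibly stale precopy copy.
 */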
1827 
1828 /*
1829  * At the start of the postcopy phase of migration, any now-dirty
1830  * precopied pages are discarded.
1831  *
1832  * start, length describe a byte address range within the RAMBlock
1833  *
1834  * Returns 0 on success.
1835  */
1836 int ram_discard_range(MigrationIncomingState *mis,
1837                       const char *block_name,
1838                       uint64_t start, size_t length)
1839 {
1840     int ret = -1;
1841 
1842     rcu_read_lock();
1843     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1844 
1845     if (!rb) {
1846         error_report("ram_discard_range: Failed to find block '%s'",
1847                      block_name);
1848         goto err;
1849     }
1850 
1851     uint8_t *host_startaddr = rb->host + start;
1852 
1853     if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1854         error_report("ram_discard_range: Unaligned start address: %p",
1855                      host_startaddr);
1856         goto err;
1857     }
1858 
1859     if ((start + length) <= rb->used_length) {
1860         uint8_t *host_endaddr = host_startaddr + length;
1861         if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1862             error_report("ram_discard_range: Unaligned end address: %p",
1863                          host_endaddr);
1864             goto err;
1865         }
1866         ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1867     } else {
1868         error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1869                      "/%zx/" RAM_ADDR_FMT")",
1870                      block_name, start, length, rb->used_length);
1871     }
1872 
1873 err:
1874     rcu_read_unlock();
1875 
1876     return ret;
1877 }
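
/*
 * Both the start and the end of the range must land on host-page boundaries
 * within the block's host mapping.  For example (hypothetical values,
 * assuming a 4K host page size, a page-aligned block host address and a
 * sufficiently large block): start = 0x3000 / length = 0x1800 is rejected
 * with "Unaligned end address", while start = 0x3000 / length = 0x2000 is
 * passed through to postcopy_ram_discard_range().
 */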
1878 
1879 
1880 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1881  * long-running RCU critical section.  When rcu-reclaims in the code
1882  * start to become numerous it will be necessary to reduce the
1883  * granularity of these critical sections.
1884  */
1885 
1886 static int ram_save_setup(QEMUFile *f, void *opaque)
1887 {
1888     RAMBlock *block;
1889     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1890 
1891     dirty_rate_high_cnt = 0;
1892     bitmap_sync_count = 0;
1893     migration_bitmap_sync_init();
1894     qemu_mutex_init(&migration_bitmap_mutex);
1895 
1896     if (migrate_use_xbzrle()) {
1897         XBZRLE_cache_lock();
1898         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1899                                   TARGET_PAGE_SIZE,
1900                                   TARGET_PAGE_SIZE);
1901         if (!XBZRLE.cache) {
1902             XBZRLE_cache_unlock();
1903             error_report("Error creating cache");
1904             return -1;
1905         }
1906         XBZRLE_cache_unlock();
1907 
1908         /* We prefer not to abort if there is no memory */
1909         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1910         if (!XBZRLE.encoded_buf) {
1911             error_report("Error allocating encoded_buf");
1912             return -1;
1913         }
1914 
1915         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1916         if (!XBZRLE.current_buf) {
1917             error_report("Error allocating current_buf");
1918             g_free(XBZRLE.encoded_buf);
1919             XBZRLE.encoded_buf = NULL;
1920             return -1;
1921         }
1922 
1923         acct_clear();
1924     }
1925 
1926     /* For memory_global_dirty_log_start below.  */
1927     qemu_mutex_lock_iothread();
1928 
1929     qemu_mutex_lock_ramlist();
1930     rcu_read_lock();
1931     bytes_transferred = 0;
1932     reset_ram_globals();
1933 
1934     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1935     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1936     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1937     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1938 
1939     if (migrate_postcopy_ram()) {
1940         migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1941         bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1942     }
1943 
1944     /*
1945      * Count the total number of pages used by ram blocks not including any
1946      * gaps due to alignment or unplugs.
1947      */
1948     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1949 
1950     memory_global_dirty_log_start();
1951     migration_bitmap_sync();
1952     qemu_mutex_unlock_ramlist();
1953     qemu_mutex_unlock_iothread();
1954 
1955     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1956 
1957     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1958         qemu_put_byte(f, strlen(block->idstr));
1959         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1960         qemu_put_be64(f, block->used_length);
1961     }
1962 
1963     rcu_read_unlock();
1964 
1965     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1966     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1967 
1968     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1969 
1970     return 0;
1971 }
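
/*
 * Sketch of the setup section the function above emits:
 *   be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each RAMBlock:
 *     u8:    strlen(idstr)
 *     bytes: idstr (not NUL terminated)
 *     be64:  used_length
 *   be64: RAM_SAVE_FLAG_EOS
 * ram_load() parses exactly this layout in its RAM_SAVE_FLAG_MEM_SIZE case.
 */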
1972 
1973 static int ram_save_iterate(QEMUFile *f, void *opaque)
1974 {
1975     int ret;
1976     int i;
1977     int64_t t0;
1978     int pages_sent = 0;
1979 
1980     rcu_read_lock();
1981     if (ram_list.version != last_version) {
1982         reset_ram_globals();
1983     }
1984 
1985     /* Read version before ram_list.blocks */
1986     smp_rmb();
1987 
1988     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1989 
1990     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1991     i = 0;
1992     while ((ret = qemu_file_rate_limit(f)) == 0) {
1993         int pages;
1994 
1995         pages = ram_find_and_save_block(f, false, &bytes_transferred);
1996         /* no more pages to send */
1997         if (pages == 0) {
1998             break;
1999         }
2000         pages_sent += pages;
2001         acct_info.iterations++;
2002 
2003         /* we want to check in the 1st loop, just in case it was the 1st
2004            time and we had to sync the dirty bitmap.
2005            qemu_clock_get_ns() is a bit expensive, so we only check once
2006            every 64 iterations.
2007         */
2008         if ((i & 63) == 0) {
2009             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2010             if (t1 > MAX_WAIT) {
2011                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2012                         t1, i);
2013                 break;
2014             }
2015         }
2016         i++;
2017     }
2018     flush_compressed_data(f);
2019     rcu_read_unlock();
2020 
2021     /*
2022      * Must occur before EOS (or any QEMUFile operation)
2023      * because of RDMA protocol.
2024      */
2025     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2026 
2027     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2028     bytes_transferred += 8;
2029 
2030     ret = qemu_file_get_error(f);
2031     if (ret < 0) {
2032         return ret;
2033     }
2034 
2035     return pages_sent;
2036 }
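
/*
 * Return value of the function above: a negative value is a QEMUFile error;
 * otherwise it is the number of pages sent in this round.  The rate limit
 * (qemu_file_rate_limit()) and the MAX_WAIT check bound how long a single
 * round can run.
 */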
2037 
2038 /* Called with iothread lock */
2039 static int ram_save_complete(QEMUFile *f, void *opaque)
2040 {
2041     rcu_read_lock();
2042 
2043     if (!migration_in_postcopy(migrate_get_current())) {
2044         migration_bitmap_sync();
2045     }
2046 
2047     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2048 
2049     /* try transferring iterative blocks of memory */
2050 
2051     /* flush all remaining blocks regardless of rate limiting */
2052     while (true) {
2053         int pages;
2054 
2055         pages = ram_find_and_save_block(f, true, &bytes_transferred);
2056         /* no more blocks to send */
2057         if (pages == 0) {
2058             break;
2059         }
2060     }
2061 
2062     flush_compressed_data(f);
2063     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2064 
2065     rcu_read_unlock();
2066 
2067     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2068 
2069     return 0;
2070 }
2071 
2072 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2073                              uint64_t *non_postcopiable_pending,
2074                              uint64_t *postcopiable_pending)
2075 {
2076     uint64_t remaining_size;
2077 
2078     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2079 
2080     if (!migration_in_postcopy(migrate_get_current()) &&
2081         remaining_size < max_size) {
2082         qemu_mutex_lock_iothread();
2083         rcu_read_lock();
2084         migration_bitmap_sync();
2085         rcu_read_unlock();
2086         qemu_mutex_unlock_iothread();
2087         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2088     }
2089 
2090     /* We can do postcopy, and all the data is postcopiable */
2091     *postcopiable_pending += remaining_size;
2092 }
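
/*
 * Note that everything still to be sent is accounted as postcopiable:
 * *non_postcopiable_pending is left untouched by the function above.
 */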
2093 
2094 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2095 {
2096     unsigned int xh_len;
2097     int xh_flags;
2098     uint8_t *loaded_data;
2099 
2100     if (!xbzrle_decoded_buf) {
2101         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2102     }
2103     loaded_data = xbzrle_decoded_buf;
2104 
2105     /* extract RLE header */
2106     xh_flags = qemu_get_byte(f);
2107     xh_len = qemu_get_be16(f);
2108 
2109     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2110         error_report("Failed to load XBZRLE page - wrong compression!");
2111         return -1;
2112     }
2113 
2114     if (xh_len > TARGET_PAGE_SIZE) {
2115         error_report("Failed to load XBZRLE page - len overflow!");
2116         return -1;
2117     }
2118     /* load data and decode */
2119     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2120 
2121     /* decode RLE */
2122     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2123                              TARGET_PAGE_SIZE) == -1) {
2124         error_report("Failed to load XBZRLE page - decode error!");
2125         return -1;
2126     }
2127 
2128     return 0;
2129 }
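
/*
 * Wire format consumed above: a one-byte header (must be
 * ENCODING_FLAG_XBZRLE), a be16 length of the encoded data (at most
 * TARGET_PAGE_SIZE), then the encoded bytes, which xbzrle_decode_buffer()
 * applies on top of the existing contents of 'host' (the previously
 * received version of the page).
 */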
2130 
2131 /*
2132  * Read a RAMBlock ID from the stream f and return the matching RAMBlock.
2133  *
2134  * Must be called from within an RCU critical section; the returned
2135  * pointer is into the RCU-protected ram_list.
2136  *
2137  * f: Stream to read from
2138  * flags: Page flags (mostly to see if it's a continuation of previous block)
2139  */
2140 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2141                                               int flags)
2142 {
2143     static RAMBlock *block = NULL;
2144     char id[256];
2145     uint8_t len;
2146 
2147     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2148         if (!block) {
2149             error_report("Ack, bad migration stream!");
2150             return NULL;
2151         }
2152         return block;
2153     }
2154 
2155     len = qemu_get_byte(f);
2156     qemu_get_buffer(f, (uint8_t *)id, len);
2157     id[len] = 0;
2158 
2159     block = qemu_ram_block_by_name(id);
2160     if (!block) {
2161         error_report("Can't find block %s", id);
2162         return NULL;
2163     }
2164 
2165     return block;
2166 }
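
/*
 * Stream layout handled above: when RAM_SAVE_FLAG_CONTINUE is set no block
 * name is present and the previously returned block is reused; otherwise a
 * one-byte length followed by that many idstr bytes identifies the RAMBlock.
 */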
2167 
2168 static inline void *host_from_ram_block_offset(RAMBlock *block,
2169                                                ram_addr_t offset)
2170 {
2171     if (!offset_in_ramblock(block, offset)) {
2172         return NULL;
2173     }
2174 
2175     return block->host + offset;
2176 }
2177 
2178 /*
2179  * If a page (or a whole RDMA chunk) has been
2180  * determined to be zero, then zap it.
2181  */
2182 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2183 {
2184     if (ch != 0 || !is_zero_range(host, size)) {
2185         memset(host, ch, size);
2186     }
2187 }
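
/*
 * In particular, a zero page (ch == 0) that is already zero on the
 * destination is left untouched, which avoids dirtying (and hence
 * allocating) the page at all.
 */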
2188 
2189 static void *do_data_decompress(void *opaque)
2190 {
2191     DecompressParam *param = opaque;
2192     unsigned long pagesize;
2193 
2194     while (!quit_decomp_thread) {
2195         qemu_mutex_lock(&param->mutex);
2196         while (!param->start && !quit_decomp_thread) {
2197             qemu_cond_wait(&param->cond, &param->mutex);
2198             pagesize = TARGET_PAGE_SIZE;
2199             if (!quit_decomp_thread) {
2200                 /* uncompress() can fail in some cases, especially when
2201                  * the page was dirtied while being compressed.  That's not
2202                  * a problem: the dirty page will be retransferred and
2203                  * uncompress() won't corrupt the data in other pages.
2204                  */
2205                 uncompress((Bytef *)param->des, &pagesize,
2206                            (const Bytef *)param->compbuf, param->len);
2207             }
2208             param->start = false;
2209         }
2210         qemu_mutex_unlock(&param->mutex);
2211     }
2212 
2213     return NULL;
2214 }
2215 
2216 void migrate_decompress_threads_create(void)
2217 {
2218     int i, thread_count;
2219 
2220     thread_count = migrate_decompress_threads();
2221     decompress_threads = g_new0(QemuThread, thread_count);
2222     decomp_param = g_new0(DecompressParam, thread_count);
2223     quit_decomp_thread = false;
2224     for (i = 0; i < thread_count; i++) {
2225         qemu_mutex_init(&decomp_param[i].mutex);
2226         qemu_cond_init(&decomp_param[i].cond);
2227         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2228         qemu_thread_create(decompress_threads + i, "decompress",
2229                            do_data_decompress, decomp_param + i,
2230                            QEMU_THREAD_JOINABLE);
2231     }
2232 }
2233 
2234 void migrate_decompress_threads_join(void)
2235 {
2236     int i, thread_count;
2237 
2238     quit_decomp_thread = true;
2239     thread_count = migrate_decompress_threads();
2240     for (i = 0; i < thread_count; i++) {
2241         qemu_mutex_lock(&decomp_param[i].mutex);
2242         qemu_cond_signal(&decomp_param[i].cond);
2243         qemu_mutex_unlock(&decomp_param[i].mutex);
2244     }
2245     for (i = 0; i < thread_count; i++) {
2246         qemu_thread_join(decompress_threads + i);
2247         qemu_mutex_destroy(&decomp_param[i].mutex);
2248         qemu_cond_destroy(&decomp_param[i].cond);
2249         g_free(decomp_param[i].compbuf);
2250     }
2251     g_free(decompress_threads);
2252     g_free(decomp_param);
2253     decompress_threads = NULL;
2254     decomp_param = NULL;
2255 }
2256 
2257 static void decompress_data_with_multi_threads(QEMUFile *f,
2258                                                void *host, int len)
2259 {
2260     int idx, thread_count;
2261 
2262     thread_count = migrate_decompress_threads();
2263     while (true) {
2264         for (idx = 0; idx < thread_count; idx++) {
2265             if (!decomp_param[idx].start) {
2266                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2267                 decomp_param[idx].des = host;
2268                 decomp_param[idx].len = len;
2269                 start_decompression(&decomp_param[idx]);
2270                 break;
2271             }
2272         }
2273         if (idx < thread_count) {
2274             break;
2275         }
2276     }
2277 }
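
/*
 * Note that the loop above busy-waits: it keeps rescanning the thread slots
 * until one of them has start == false, then hands that thread the
 * compressed buffer via start_decompression().
 */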
2278 
2279 /*
2280  * Allocate data structures etc needed by incoming migration with postcopy-ram.
2281  * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
2282  */
2283 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2284 {
2285     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2286 
2287     return postcopy_ram_incoming_init(mis, ram_pages);
2288 }
2289 
2290 /*
2291  * Called in postcopy mode by ram_load().
2292  * rcu_read_lock is taken prior to this being called.
2293  */
2294 static int ram_load_postcopy(QEMUFile *f)
2295 {
2296     int flags = 0, ret = 0;
2297     bool place_needed = false;
2298     bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2299     MigrationIncomingState *mis = migration_incoming_get_current();
2300     /* Temporary page that is later 'placed' */
2301     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2302     void *last_host = NULL;
2303     bool all_zero = false;
2304 
2305     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2306         ram_addr_t addr;
2307         void *host = NULL;
2308         void *page_buffer = NULL;
2309         void *place_source = NULL;
2310         uint8_t ch;
2311 
2312         addr = qemu_get_be64(f);
2313         flags = addr & ~TARGET_PAGE_MASK;
2314         addr &= TARGET_PAGE_MASK;
2315 
2316         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2317         place_needed = false;
2318         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2319             RAMBlock *block = ram_block_from_stream(f, flags);
2320 
2321             host = host_from_ram_block_offset(block, addr);
2322             if (!host) {
2323                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2324                 ret = -EINVAL;
2325                 break;
2326             }
2327             page_buffer = host;
2328             /*
2329              * Postcopy requires that we place whole host pages atomically.
2330              * To make it atomic, the data is read into a temporary page
2331              * that's moved into place later.
2332              * The migration protocol uses (possibly smaller) target pages;
2333              * however, the source ensures it always sends all the components
2334              * of a host page in order.
2335              */
2336             page_buffer = postcopy_host_page +
2337                           ((uintptr_t)host & ~qemu_host_page_mask);
2338             /* 1st TP in the HP: assume all-zero so the place can be optimised */
2339             if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2340                 all_zero = true;
2341             } else {
2342                 /* not the 1st TP within the HP */
2343                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2344                     error_report("Non-sequential target page %p/%p",
2345                                   host, last_host);
2346                     ret = -EINVAL;
2347                     break;
2348                 }
2349             }
2350 
2351 
2352             /*
2353              * If it's the last part of a host page then we place the host
2354              * page
2355              */
2356             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2357                                      ~qemu_host_page_mask) == 0;
2358             place_source = postcopy_host_page;
2359         }
2360         last_host = host;
2361 
2362         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2363         case RAM_SAVE_FLAG_COMPRESS:
2364             ch = qemu_get_byte(f);
2365             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2366             if (ch) {
2367                 all_zero = false;
2368             }
2369             break;
2370 
2371         case RAM_SAVE_FLAG_PAGE:
2372             all_zero = false;
2373             if (!place_needed || !matching_page_sizes) {
2374                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2375             } else {
2376                 /* Avoid a copy out of the qemu_file buffer; the postcopy
2377                  * place operation will copy the page later anyway.  Only
2378                  * possible when the page is read in one go (matching sizes)
2379                  */
2380                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2381                                          TARGET_PAGE_SIZE);
2382             }
2383             break;
2384         case RAM_SAVE_FLAG_EOS:
2385             /* normal exit */
2386             break;
2387         default:
2388             error_report("Unknown combination of migration flags: %#x"
2389                          " (postcopy mode)", flags);
2390             ret = -EINVAL;
2391         }
2392 
2393         if (place_needed) {
2394             /* This gets called at the last target page in the host page */
2395             if (all_zero) {
2396                 ret = postcopy_place_page_zero(mis,
2397                                                host + TARGET_PAGE_SIZE -
2398                                                qemu_host_page_size);
2399             } else {
2400                 ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2401                                                qemu_host_page_size,
2402                                                place_source);
2403             }
2404         }
2405         if (!ret) {
2406             ret = qemu_file_get_error(f);
2407         }
2408     }
2409 
2410     return ret;
2411 }
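
/*
 * Worked example (hypothetical sizes): with 4K target pages and a 16K host
 * page, the four target pages of a host page arrive in order and are copied
 * into postcopy_host_page; place_needed only becomes true for the fourth
 * one, at which point postcopy_place_page() (or postcopy_place_page_zero()
 * if every target page of the host page was sent as a zero page) maps the
 * whole 16K into the guest atomically.
 */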
2412 
2413 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2414 {
2415     int flags = 0, ret = 0;
2416     static uint64_t seq_iter;
2417     int len = 0;
2418     /*
2419      * If the system is running in postcopy mode, page inserts to host memory
2420      * must be atomic
2421      */
2422     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2423 
2424     seq_iter++;
2425 
2426     if (version_id != 4) {
2427         ret = -EINVAL;
2428     }
2429 
2430     /* This RCU critical section can be very long running.
2431      * When RCU reclaims in the code start to become numerous,
2432      * it will be necessary to reduce the granularity of this
2433      * critical section.
2434      */
2435     rcu_read_lock();
2436 
2437     if (postcopy_running) {
2438         ret = ram_load_postcopy(f);
2439     }
2440 
2441     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2442         ram_addr_t addr, total_ram_bytes;
2443         void *host = NULL;
2444         uint8_t ch;
2445 
2446         addr = qemu_get_be64(f);
2447         flags = addr & ~TARGET_PAGE_MASK;
2448         addr &= TARGET_PAGE_MASK;
2449 
2450         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2451                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2452             RAMBlock *block = ram_block_from_stream(f, flags);
2453 
2454             host = host_from_ram_block_offset(block, addr);
2455             if (!host) {
2456                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2457                 ret = -EINVAL;
2458                 break;
2459             }
2460         }
2461 
2462         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2463         case RAM_SAVE_FLAG_MEM_SIZE:
2464             /* Synchronize RAM block list */
2465             total_ram_bytes = addr;
2466             while (!ret && total_ram_bytes) {
2467                 RAMBlock *block;
2468                 char id[256];
2469                 ram_addr_t length;
2470 
2471                 len = qemu_get_byte(f);
2472                 qemu_get_buffer(f, (uint8_t *)id, len);
2473                 id[len] = 0;
2474                 length = qemu_get_be64(f);
2475 
2476                 block = qemu_ram_block_by_name(id);
2477                 if (block) {
2478                     if (length != block->used_length) {
2479                         Error *local_err = NULL;
2480 
2481                         ret = qemu_ram_resize(block->offset, length,
2482                                               &local_err);
2483                         if (local_err) {
2484                             error_report_err(local_err);
2485                         }
2486                     }
2487                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2488                                           block->idstr);
2489                 } else {
2490                     error_report("Unknown ramblock \"%s\", cannot "
2491                                  "accept migration", id);
2492                     ret = -EINVAL;
2493                 }
2494 
2495                 total_ram_bytes -= length;
2496             }
2497             break;
2498 
2499         case RAM_SAVE_FLAG_COMPRESS:
2500             ch = qemu_get_byte(f);
2501             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2502             break;
2503 
2504         case RAM_SAVE_FLAG_PAGE:
2505             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2506             break;
2507 
2508         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2509             len = qemu_get_be32(f);
2510             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2511                 error_report("Invalid compressed data length: %d", len);
2512                 ret = -EINVAL;
2513                 break;
2514             }
2515             decompress_data_with_multi_threads(f, host, len);
2516             break;
2517 
2518         case RAM_SAVE_FLAG_XBZRLE:
2519             if (load_xbzrle(f, addr, host) < 0) {
2520                 error_report("Failed to decompress XBZRLE page at "
2521                              RAM_ADDR_FMT, addr);
2522                 ret = -EINVAL;
2523                 break;
2524             }
2525             break;
2526         case RAM_SAVE_FLAG_EOS:
2527             /* normal exit */
2528             break;
2529         default:
2530             if (flags & RAM_SAVE_FLAG_HOOK) {
2531                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2532             } else {
2533                 error_report("Unknown combination of migration flags: %#x",
2534                              flags);
2535                 ret = -EINVAL;
2536             }
2537         }
2538         if (!ret) {
2539             ret = qemu_file_get_error(f);
2540         }
2541     }
2542 
2543     rcu_read_unlock();
2544     DPRINTF("Completed load of VM with exit code %d seq iteration "
2545             "%" PRIu64 "\n", ret, seq_iter);
2546     return ret;
2547 }
2548 
2549 static SaveVMHandlers savevm_ram_handlers = {
2550     .save_live_setup = ram_save_setup,
2551     .save_live_iterate = ram_save_iterate,
2552     .save_live_complete_postcopy = ram_save_complete,
2553     .save_live_complete_precopy = ram_save_complete,
2554     .save_live_pending = ram_save_pending,
2555     .load_state = ram_load,
2556     .cleanup = ram_migration_cleanup,
2557 };
2558 
2559 void ram_mig_init(void)
2560 {
2561     qemu_mutex_init(&XBZRLE.lock);
2562     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2563 }
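
/*
 * register_savevm_live(NULL, "ram", 0, 4, ...) registers the handlers above
 * under section name "ram", instance 0, version 4; the same version that
 * ram_load() insists on when checking version_id.
 */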
2564