xref: /qemu/migration/ram.c (revision c3e31eaa21bc038c146cb196f7762a972eb9de5b)
1 /*
2  * QEMU System Emulator
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2011-2015 Red Hat Inc
6  *
7  * Authors:
8  *  Juan Quintela <quintela@redhat.com>
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "migration/migration.h"
39 #include "migration/postcopy-ram.h"
40 #include "exec/address-spaces.h"
41 #include "migration/page_cache.h"
42 #include "qemu/error-report.h"
43 #include "trace.h"
44 #include "exec/ram_addr.h"
45 #include "qemu/rcu_queue.h"
46 #include "migration/colo.h"
47 
48 static int dirty_rate_high_cnt;
49 
50 static uint64_t bitmap_sync_count;
51 
52 /***********************************************************/
53 /* ram save/restore */
54 
55 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
56 #define RAM_SAVE_FLAG_COMPRESS 0x02
57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
58 #define RAM_SAVE_FLAG_PAGE     0x08
59 #define RAM_SAVE_FLAG_EOS      0x10
60 #define RAM_SAVE_FLAG_CONTINUE 0x20
61 #define RAM_SAVE_FLAG_XBZRLE   0x40
62 /* 0x80 is reserved in migration.h; start with 0x100 next */
63 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
64 
65 static uint8_t *ZERO_TARGET_PAGE;
66 
67 static inline bool is_zero_range(uint8_t *p, uint64_t size)
68 {
69     return buffer_is_zero(p, size);
70 }
71 
72 /* This struct contains the XBZRLE cache and a static page
73    used by the compression */
74 static struct {
75     /* buffer used for XBZRLE encoding */
76     uint8_t *encoded_buf;
77     /* buffer for storing page content */
78     uint8_t *current_buf;
79     /* Cache for XBZRLE, Protected by lock. */
80     PageCache *cache;
81     QemuMutex lock;
82 } XBZRLE;
83 
84 /* buffer used for XBZRLE decoding */
85 static uint8_t *xbzrle_decoded_buf;
86 
87 static void XBZRLE_cache_lock(void)
88 {
89     if (migrate_use_xbzrle())
90         qemu_mutex_lock(&XBZRLE.lock);
91 }
92 
93 static void XBZRLE_cache_unlock(void)
94 {
95     if (migrate_use_xbzrle())
96         qemu_mutex_unlock(&XBZRLE.lock);
97 }
98 
99 /*
100  * Called from qmp_migrate_set_cache_size in the main thread, possibly while
101  * a migration is in progress.
102  * A running migration may be using the cache and might finish during this
103  * call, hence changes to the cache are protected by XBZRLE.lock.
104  */
105 int64_t xbzrle_cache_resize(int64_t new_size)
106 {
107     PageCache *new_cache;
108     int64_t ret;
109 
110     if (new_size < TARGET_PAGE_SIZE) {
111         return -1;
112     }
113 
114     XBZRLE_cache_lock();
115 
116     if (XBZRLE.cache != NULL) {
117         if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
118             goto out_new_size;
119         }
120         new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
121                                         TARGET_PAGE_SIZE);
122         if (!new_cache) {
123             error_report("Error creating cache");
124             ret = -1;
125             goto out;
126         }
127 
128         cache_fini(XBZRLE.cache);
129         XBZRLE.cache = new_cache;
130     }
131 
132 out_new_size:
133     ret = pow2floor(new_size);
134 out:
135     XBZRLE_cache_unlock();
136     return ret;
137 }
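/*
 * Illustrative usage sketch (not part of the original file): the effective
 * cache size is rounded down to a power of two, so a caller asking for a
 * 5 MiB cache gets 4 MiB back (assuming a 4 KiB TARGET_PAGE_SIZE):
 *
 *   int64_t got = xbzrle_cache_resize(5 * 1024 * 1024);
 *   // got == 4 * 1024 * 1024 (pow2floor of the request)
 *   // got == -1 if the request was smaller than TARGET_PAGE_SIZE
 */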
138 
139 /* accounting for migration statistics */
140 typedef struct AccountingInfo {
141     uint64_t dup_pages;
142     uint64_t skipped_pages;
143     uint64_t norm_pages;
144     uint64_t iterations;
145     uint64_t xbzrle_bytes;
146     uint64_t xbzrle_pages;
147     uint64_t xbzrle_cache_miss;
148     double xbzrle_cache_miss_rate;
149     uint64_t xbzrle_overflows;
150 } AccountingInfo;
151 
152 static AccountingInfo acct_info;
153 
154 static void acct_clear(void)
155 {
156     memset(&acct_info, 0, sizeof(acct_info));
157 }
158 
159 uint64_t dup_mig_bytes_transferred(void)
160 {
161     return acct_info.dup_pages * TARGET_PAGE_SIZE;
162 }
163 
164 uint64_t dup_mig_pages_transferred(void)
165 {
166     return acct_info.dup_pages;
167 }
168 
169 uint64_t skipped_mig_bytes_transferred(void)
170 {
171     return acct_info.skipped_pages * TARGET_PAGE_SIZE;
172 }
173 
174 uint64_t skipped_mig_pages_transferred(void)
175 {
176     return acct_info.skipped_pages;
177 }
178 
179 uint64_t norm_mig_bytes_transferred(void)
180 {
181     return acct_info.norm_pages * TARGET_PAGE_SIZE;
182 }
183 
184 uint64_t norm_mig_pages_transferred(void)
185 {
186     return acct_info.norm_pages;
187 }
188 
189 uint64_t xbzrle_mig_bytes_transferred(void)
190 {
191     return acct_info.xbzrle_bytes;
192 }
193 
194 uint64_t xbzrle_mig_pages_transferred(void)
195 {
196     return acct_info.xbzrle_pages;
197 }
198 
199 uint64_t xbzrle_mig_pages_cache_miss(void)
200 {
201     return acct_info.xbzrle_cache_miss;
202 }
203 
204 double xbzrle_mig_cache_miss_rate(void)
205 {
206     return acct_info.xbzrle_cache_miss_rate;
207 }
208 
209 uint64_t xbzrle_mig_pages_overflow(void)
210 {
211     return acct_info.xbzrle_overflows;
212 }
213 
214 /* This is the last block that we have visited searching for dirty pages
215  */
216 static RAMBlock *last_seen_block;
217 /* This is the last block from where we have sent data */
218 static RAMBlock *last_sent_block;
219 static ram_addr_t last_offset;
220 static QemuMutex migration_bitmap_mutex;
221 static uint64_t migration_dirty_pages;
222 static uint32_t last_version;
223 static bool ram_bulk_stage;
224 
225 /* used by the search for pages to send */
226 struct PageSearchStatus {
227     /* Current block being searched */
228     RAMBlock    *block;
229     /* Current offset to search from */
230     ram_addr_t   offset;
231     /* Set once we wrap around */
232     bool         complete_round;
233 };
234 typedef struct PageSearchStatus PageSearchStatus;
235 
236 static struct BitmapRcu {
237     struct rcu_head rcu;
238     /* Main migration bitmap */
239     unsigned long *bmap;
240     /* Bitmap of pages that haven't been sent even once.
241      * Only maintained and used in postcopy at the moment,
242      * where it's used to send the dirty bitmap at the start
243      * of the postcopy phase.
244      */
245     unsigned long *unsentmap;
246 } *migration_bitmap_rcu;
247 
248 struct CompressParam {
249     bool done;
250     bool quit;
251     QEMUFile *file;
252     QemuMutex mutex;
253     QemuCond cond;
254     RAMBlock *block;
255     ram_addr_t offset;
256 };
257 typedef struct CompressParam CompressParam;
258 
259 struct DecompressParam {
260     bool done;
261     bool quit;
262     QemuMutex mutex;
263     QemuCond cond;
264     void *des;
265     uint8_t *compbuf;
266     int len;
267 };
268 typedef struct DecompressParam DecompressParam;
269 
270 static CompressParam *comp_param;
271 static QemuThread *compress_threads;
272 /* comp_done_cond is used to wake up the migration thread when
273  * one of the compression threads has finished the compression.
274  * comp_done_lock is used to co-work with comp_done_cond.
275  */
276 static QemuMutex comp_done_lock;
277 static QemuCond comp_done_cond;
278 /* The empty QEMUFileOps will be used by file in CompressParam */
279 static const QEMUFileOps empty_ops = { };
280 
281 static bool compression_switch;
282 static DecompressParam *decomp_param;
283 static QemuThread *decompress_threads;
284 static QemuMutex decomp_done_lock;
285 static QemuCond decomp_done_cond;
286 
287 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
288                                 ram_addr_t offset);
289 
290 static void *do_data_compress(void *opaque)
291 {
292     CompressParam *param = opaque;
293     RAMBlock *block;
294     ram_addr_t offset;
295 
296     qemu_mutex_lock(&param->mutex);
297     while (!param->quit) {
298         if (param->block) {
299             block = param->block;
300             offset = param->offset;
301             param->block = NULL;
302             qemu_mutex_unlock(&param->mutex);
303 
304             do_compress_ram_page(param->file, block, offset);
305 
306             qemu_mutex_lock(&comp_done_lock);
307             param->done = true;
308             qemu_cond_signal(&comp_done_cond);
309             qemu_mutex_unlock(&comp_done_lock);
310 
311             qemu_mutex_lock(&param->mutex);
312         } else {
313             qemu_cond_wait(&param->cond, &param->mutex);
314         }
315     }
316     qemu_mutex_unlock(&param->mutex);
317 
318     return NULL;
319 }
320 
321 static inline void terminate_compression_threads(void)
322 {
323     int idx, thread_count;
324 
325     thread_count = migrate_compress_threads();
326     for (idx = 0; idx < thread_count; idx++) {
327         qemu_mutex_lock(&comp_param[idx].mutex);
328         comp_param[idx].quit = true;
329         qemu_cond_signal(&comp_param[idx].cond);
330         qemu_mutex_unlock(&comp_param[idx].mutex);
331     }
332 }
333 
334 void migrate_compress_threads_join(void)
335 {
336     int i, thread_count;
337 
338     if (!migrate_use_compression()) {
339         return;
340     }
341     terminate_compression_threads();
342     thread_count = migrate_compress_threads();
343     for (i = 0; i < thread_count; i++) {
344         qemu_thread_join(compress_threads + i);
345         qemu_fclose(comp_param[i].file);
346         qemu_mutex_destroy(&comp_param[i].mutex);
347         qemu_cond_destroy(&comp_param[i].cond);
348     }
349     qemu_mutex_destroy(&comp_done_lock);
350     qemu_cond_destroy(&comp_done_cond);
351     g_free(compress_threads);
352     g_free(comp_param);
353     compress_threads = NULL;
354     comp_param = NULL;
355 }
356 
357 void migrate_compress_threads_create(void)
358 {
359     int i, thread_count;
360 
361     if (!migrate_use_compression()) {
362         return;
363     }
364     compression_switch = true;
365     thread_count = migrate_compress_threads();
366     compress_threads = g_new0(QemuThread, thread_count);
367     comp_param = g_new0(CompressParam, thread_count);
368     qemu_cond_init(&comp_done_cond);
369     qemu_mutex_init(&comp_done_lock);
370     for (i = 0; i < thread_count; i++) {
371         /* comp_param[i].file is just used as a dummy buffer to save data,
372          * set its ops to empty.
373          */
374         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
375         comp_param[i].done = true;
376         comp_param[i].quit = false;
377         qemu_mutex_init(&comp_param[i].mutex);
378         qemu_cond_init(&comp_param[i].cond);
379         qemu_thread_create(compress_threads + i, "compress",
380                            do_data_compress, comp_param + i,
381                            QEMU_THREAD_JOINABLE);
382     }
383 }
384 
385 /**
386  * save_page_header: Write page header to wire
387  *
388  * If this is the 1st block, it also writes the block identification
389  *
390  * Returns: Number of bytes written
391  *
392  * @f: QEMUFile where to send the data
393  * @block: block that contains the page we want to send
394  * @offset: offset inside the block for the page
395  *          in the lower bits, it contains flags
396  */
397 static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
398 {
399     size_t size, len;
400 
401     qemu_put_be64(f, offset);
402     size = 8;
403 
404     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
405         len = strlen(block->idstr);
406         qemu_put_byte(f, len);
407         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
408         size += 1 + len;
409     }
410     return size;
411 }
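/*
 * Sketch of the wire layout produced above (derived from the code, for
 * illustration only):
 *
 *   8 bytes  big-endian (page offset | flag bits)
 *   1 byte   idstr length   -- only present when the
 *   n bytes  idstr          -- CONTINUE flag is not set
 *
 * so the returned size is 8, or 8 + 1 + strlen(block->idstr).
 */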
412 
413 /* Reduce amount of guest cpu execution to hopefully slow down memory writes.
414  * If guest dirty memory rate is reduced below the rate at which we can
415  * transfer pages to the destination then we should be able to complete
416  * migration. Some workloads dirty memory way too fast and will not effectively
417  * converge, even with auto-converge.
418  */
419 static void mig_throttle_guest_down(void)
420 {
421     MigrationState *s = migrate_get_current();
422     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
423     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
424 
425     /* We have not started throttling yet. Let's start it. */
426     if (!cpu_throttle_active()) {
427         cpu_throttle_set(pct_initial);
428     } else {
429         /* Throttling already on, just increase the rate */
430         cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
431     }
432 }
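/*
 * Worked example (illustrative, assuming cpu_throttle_initial=20 and
 * cpu_throttle_increment=10, which are example values only): the first call
 * throttles the guest to 20%, and each further call while throttling is
 * already active raises it to 30%, 40%, ... until the migration converges
 * or completes.
 */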
433 
434 /* Update the xbzrle cache to reflect a page that's been sent as all 0.
435  * The important thing is that a stale (not-yet-0'd) page be replaced
436  * by the new data.
437  * As a bonus, if the page wasn't in the cache it gets added so that
438  * when a small write is made into the 0'd page it gets XBZRLE sent
439  */
440 static void xbzrle_cache_zero_page(ram_addr_t current_addr)
441 {
442     if (ram_bulk_stage || !migrate_use_xbzrle()) {
443         return;
444     }
445 
446     /* We don't care if this fails to allocate a new cache page
447      * as long as it updated an old one */
448     cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
449                  bitmap_sync_count);
450 }
451 
452 #define ENCODING_FLAG_XBZRLE 0x1
453 
454 /**
455  * save_xbzrle_page: compress and send current page
456  *
457  * Returns: 1 means that we wrote the page
458  *          0 means that page is identical to the one already sent
459  *          -1 means that xbzrle would be longer than normal
460  *
461  * @f: QEMUFile where to send the data
462  * @current_data: pointer to the page data; may be updated to point at the cached copy
463  * @current_addr: address of the page, used as the XBZRLE cache key
464  * @block: block that contains the page we want to send
465  * @offset: offset inside the block for the page
466  * @last_stage: if we are at the completion stage
467  * @bytes_transferred: increase it with the number of transferred bytes
468  */
469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
470                             ram_addr_t current_addr, RAMBlock *block,
471                             ram_addr_t offset, bool last_stage,
472                             uint64_t *bytes_transferred)
473 {
474     int encoded_len = 0, bytes_xbzrle;
475     uint8_t *prev_cached_page;
476 
477     if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
478         acct_info.xbzrle_cache_miss++;
479         if (!last_stage) {
480             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
481                              bitmap_sync_count) == -1) {
482                 return -1;
483             } else {
484                 /* update *current_data when the page has been
485                    inserted into cache */
486                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
487             }
488         }
489         return -1;
490     }
491 
492     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
493 
494     /* save current buffer into memory */
495     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
496 
497     /* XBZRLE encoding (if there is no overflow) */
498     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
499                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
500                                        TARGET_PAGE_SIZE);
501     if (encoded_len == 0) {
502         trace_save_xbzrle_page_skipping();
503         return 0;
504     } else if (encoded_len == -1) {
505         trace_save_xbzrle_page_overflow();
506         acct_info.xbzrle_overflows++;
507         /* update data in the cache */
508         if (!last_stage) {
509             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
510             *current_data = prev_cached_page;
511         }
512         return -1;
513     }
514 
515     /* Update the cache so it matches the data just sent to the destination */
516     if (!last_stage) {
517         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
518     }
519 
520     /* Send XBZRLE based compressed page */
521     bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
522     qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
523     qemu_put_be16(f, encoded_len);
524     qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
525     bytes_xbzrle += encoded_len + 1 + 2;
526     acct_info.xbzrle_pages++;
527     acct_info.xbzrle_bytes += bytes_xbzrle;
528     *bytes_transferred += bytes_xbzrle;
529 
530     return 1;
531 }
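/*
 * Wire layout of an XBZRLE page as emitted above (illustration only):
 *
 *   save_page_header(offset | RAM_SAVE_FLAG_XBZRLE)
 *   1 byte   ENCODING_FLAG_XBZRLE
 *   2 bytes  big-endian encoded_len
 *   encoded_len bytes of XBZRLE-encoded delta against the cached copy
 *
 * so bytes_xbzrle accounts for the header plus 1 + 2 + encoded_len.
 */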
532 
533 /* Called with rcu_read_lock() to protect migration_bitmap
534  * rb: The RAMBlock  to search for dirty pages in
535  * start: Start address (typically so we can continue from previous page)
536  * ram_addr_abs: Pointer into which to store the address of the dirty page
537  *               within the global ram_addr space
538  *
539  * Returns: byte offset within memory region of the start of a dirty page
540  */
541 static inline
542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
543                                        ram_addr_t start,
544                                        ram_addr_t *ram_addr_abs)
545 {
546     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
547     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
548     uint64_t rb_size = rb->used_length;
549     unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
550     unsigned long *bitmap;
551 
552     unsigned long next;
553 
554     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
555     if (ram_bulk_stage && nr > base) {
556         next = nr + 1;
557     } else {
558         next = find_next_bit(bitmap, size, nr);
559     }
560 
561     *ram_addr_abs = next << TARGET_PAGE_BITS;
562     return (next - base) << TARGET_PAGE_BITS;
563 }
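/*
 * Illustrative example of the index arithmetic above (assuming 4 KiB target
 * pages; values chosen only for illustration): for a RAMBlock at
 * rb->offset == 0x40000000, base == 0x40000.  If the first dirty bit found
 * is nr == 0x40003, the function stores 0x40003000 (global ram_addr) into
 * *ram_addr_abs and returns 0x3000, the byte offset within the block.
 */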
564 
565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
566 {
567     bool ret;
568     int nr = addr >> TARGET_PAGE_BITS;
569     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
570 
571     ret = test_and_clear_bit(nr, bitmap);
572 
573     if (ret) {
574         migration_dirty_pages--;
575     }
576     return ret;
577 }
578 
579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
580 {
581     unsigned long *bitmap;
582     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
583     migration_dirty_pages +=
584         cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
585 }
586 
587 /* Fix me: there are too many global variables used in the migration process. */
588 static int64_t start_time;
589 static int64_t bytes_xfer_prev;
590 static int64_t num_dirty_pages_period;
591 static uint64_t xbzrle_cache_miss_prev;
592 static uint64_t iterations_prev;
593 
594 static void migration_bitmap_sync_init(void)
595 {
596     start_time = 0;
597     bytes_xfer_prev = 0;
598     num_dirty_pages_period = 0;
599     xbzrle_cache_miss_prev = 0;
600     iterations_prev = 0;
601 }
602 
603 /* Returns a summary bitmap of the page sizes of all RAMBlocks;
604  * for VMs with just normal pages this is equivalent to the
605  * host page size.  If it's got some huge pages then it's the OR
606  * of all the different page sizes.
607  */
608 uint64_t ram_pagesize_summary(void)
609 {
610     RAMBlock *block;
611     uint64_t summary = 0;
612 
613     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
614         summary |= block->page_size;
615     }
616 
617     return summary;
618 }
619 
620 static void migration_bitmap_sync(void)
621 {
622     RAMBlock *block;
623     uint64_t num_dirty_pages_init = migration_dirty_pages;
624     MigrationState *s = migrate_get_current();
625     int64_t end_time;
626     int64_t bytes_xfer_now;
627 
628     bitmap_sync_count++;
629 
630     if (!bytes_xfer_prev) {
631         bytes_xfer_prev = ram_bytes_transferred();
632     }
633 
634     if (!start_time) {
635         start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
636     }
637 
638     trace_migration_bitmap_sync_start();
639     memory_global_dirty_log_sync();
640 
641     qemu_mutex_lock(&migration_bitmap_mutex);
642     rcu_read_lock();
643     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
644         migration_bitmap_sync_range(block->offset, block->used_length);
645     }
646     rcu_read_unlock();
647     qemu_mutex_unlock(&migration_bitmap_mutex);
648 
649     trace_migration_bitmap_sync_end(migration_dirty_pages
650                                     - num_dirty_pages_init);
651     num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
652     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
653 
654     /* more than 1 second = 1000 milliseconds */
655     if (end_time > start_time + 1000) {
656         if (migrate_auto_converge()) {
657             /* The following detection logic can be refined later. For now:
658                check whether the bytes dirtied in this period exceed half of
659                the bytes that got transferred since the last time we were in
660                this routine. If that happens twice, start or increase
661                throttling */
662             bytes_xfer_now = ram_bytes_transferred();
663 
664             if (s->dirty_pages_rate &&
665                (num_dirty_pages_period * TARGET_PAGE_SIZE >
666                    (bytes_xfer_now - bytes_xfer_prev)/2) &&
667                (dirty_rate_high_cnt++ >= 2)) {
668                     trace_migration_throttle();
669                     dirty_rate_high_cnt = 0;
670                     mig_throttle_guest_down();
671              }
672              bytes_xfer_prev = bytes_xfer_now;
673         }
674 
675         if (migrate_use_xbzrle()) {
676             if (iterations_prev != acct_info.iterations) {
677                 acct_info.xbzrle_cache_miss_rate =
678                    (double)(acct_info.xbzrle_cache_miss -
679                             xbzrle_cache_miss_prev) /
680                    (acct_info.iterations - iterations_prev);
681             }
682             iterations_prev = acct_info.iterations;
683             xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
684         }
685         s->dirty_pages_rate = num_dirty_pages_period * 1000
686             / (end_time - start_time);
687         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
688         start_time = end_time;
689         num_dirty_pages_period = 0;
690     }
691     s->dirty_sync_count = bitmap_sync_count;
692     if (migrate_use_events()) {
693         qapi_event_send_migration_pass(bitmap_sync_count, NULL);
694     }
695 }
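/*
 * Worked example of the rate computation above (illustration only, assuming
 * 4 KiB target pages): if 5000 pages were dirtied over a 1000 ms window,
 * dirty_pages_rate = 5000 * 1000 / 1000 = 5000 pages/s and
 * dirty_bytes_rate = 5000 * 4096 ~= 20 MB/s; auto-converge compares the
 * dirtied bytes of the period against half of the bytes transferred in it.
 */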
696 
697 /**
698  * save_zero_page: Send the zero page to the stream
699  *
700  * Returns: Number of pages written.
701  *
702  * @f: QEMUFile where to send the data
703  * @block: block that contains the page we want to send
704  * @offset: offset inside the block for the page
705  * @p: pointer to the page
706  * @bytes_transferred: increase it with the number of transferred bytes
707  */
708 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
709                           uint8_t *p, uint64_t *bytes_transferred)
710 {
711     int pages = -1;
712 
713     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
714         acct_info.dup_pages++;
715         *bytes_transferred += save_page_header(f, block,
716                                                offset | RAM_SAVE_FLAG_COMPRESS);
717         qemu_put_byte(f, 0);
718         *bytes_transferred += 1;
719         pages = 1;
720     }
721 
722     return pages;
723 }
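/*
 * Wire layout of a zero page as emitted above (illustration only): the usual
 * page header with RAM_SAVE_FLAG_COMPRESS set, followed by a single byte 0
 * (the fill value), i.e. one byte of payload instead of TARGET_PAGE_SIZE.
 */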
724 
725 static void ram_release_pages(MigrationState *ms, const char *block_name,
726                               uint64_t offset, int pages)
727 {
728     if (!migrate_release_ram() || !migration_in_postcopy(ms)) {
729         return;
730     }
731 
732     ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS);
733 }
734 
735 /**
736  * ram_save_page: Send the given page to the stream
737  *
738  * Returns: Number of pages written.
739  *          < 0 - error
740  *          >=0 - Number of pages written - this might legally be 0
741  *                if xbzrle noticed the page was the same.
742  *
743  * @ms: The current migration state.
744  * @f: QEMUFile where to send the data
745  * @pss: PageSearchStatus describing the block and offset of the page
746  *       we want to send
747  * @last_stage: if we are at the completion stage
748  * @bytes_transferred: increase it with the number of transferred bytes
749  */
750 static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
751                          bool last_stage, uint64_t *bytes_transferred)
752 {
753     int pages = -1;
754     uint64_t bytes_xmit;
755     ram_addr_t current_addr;
756     uint8_t *p;
757     int ret;
758     bool send_async = true;
759     RAMBlock *block = pss->block;
760     ram_addr_t offset = pss->offset;
761 
762     p = block->host + offset;
763 
764     /* When in doubt, send the page as a normal page */
765     bytes_xmit = 0;
766     ret = ram_control_save_page(f, block->offset,
767                            offset, TARGET_PAGE_SIZE, &bytes_xmit);
768     if (bytes_xmit) {
769         *bytes_transferred += bytes_xmit;
770         pages = 1;
771     }
772 
773     XBZRLE_cache_lock();
774 
775     current_addr = block->offset + offset;
776 
777     if (block == last_sent_block) {
778         offset |= RAM_SAVE_FLAG_CONTINUE;
779     }
780     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
781         if (ret != RAM_SAVE_CONTROL_DELAYED) {
782             if (bytes_xmit > 0) {
783                 acct_info.norm_pages++;
784             } else if (bytes_xmit == 0) {
785                 acct_info.dup_pages++;
786             }
787         }
788     } else {
789         pages = save_zero_page(f, block, offset, p, bytes_transferred);
790         if (pages > 0) {
791             /* Must let xbzrle know, otherwise a previous (now 0'd) cached
792              * page would be stale
793              */
794             xbzrle_cache_zero_page(current_addr);
795             ram_release_pages(ms, block->idstr, pss->offset, pages);
796         } else if (!ram_bulk_stage &&
797                    !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
798             pages = save_xbzrle_page(f, &p, current_addr, block,
799                                      offset, last_stage, bytes_transferred);
800             if (!last_stage) {
801                 /* Can't send this cached data async, since the cache page
802                  * might get updated before it gets to the wire
803                  */
804                 send_async = false;
805             }
806         }
807     }
808 
809     /* XBZRLE overflow or normal page */
810     if (pages == -1) {
811         *bytes_transferred += save_page_header(f, block,
812                                                offset | RAM_SAVE_FLAG_PAGE);
813         if (send_async) {
814             qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
815                                   migrate_release_ram() &&
816                                   migration_in_postcopy(ms));
817         } else {
818             qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
819         }
820         *bytes_transferred += TARGET_PAGE_SIZE;
821         pages = 1;
822         acct_info.norm_pages++;
823     }
824 
825     XBZRLE_cache_unlock();
826 
827     return pages;
828 }
829 
830 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
831                                 ram_addr_t offset)
832 {
833     int bytes_sent, blen;
834     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
835 
836     bytes_sent = save_page_header(f, block, offset |
837                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
838     blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
839                                      migrate_compress_level());
840     if (blen < 0) {
841         bytes_sent = 0;
842         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
843         error_report("compressed data failed!");
844     } else {
845         bytes_sent += blen;
846         ram_release_pages(migrate_get_current(), block->idstr,
847                           offset & TARGET_PAGE_MASK, 1);
848     }
849 
850     return bytes_sent;
851 }
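/*
 * Note on the data path above (descriptive only): the header and the
 * zlib-compressed payload produced by qemu_put_compression_data() are
 * written to the per-thread dummy QEMUFile (param->file), not directly to
 * the migration stream; flush_compressed_data() and
 * compress_page_with_multi_thread() later copy that buffer into the real
 * stream with qemu_put_qemu_file().
 */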
852 
853 static uint64_t bytes_transferred;
854 
855 static void flush_compressed_data(QEMUFile *f)
856 {
857     int idx, len, thread_count;
858 
859     if (!migrate_use_compression()) {
860         return;
861     }
862     thread_count = migrate_compress_threads();
863 
864     qemu_mutex_lock(&comp_done_lock);
865     for (idx = 0; idx < thread_count; idx++) {
866         while (!comp_param[idx].done) {
867             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
868         }
869     }
870     qemu_mutex_unlock(&comp_done_lock);
871 
872     for (idx = 0; idx < thread_count; idx++) {
873         qemu_mutex_lock(&comp_param[idx].mutex);
874         if (!comp_param[idx].quit) {
875             len = qemu_put_qemu_file(f, comp_param[idx].file);
876             bytes_transferred += len;
877         }
878         qemu_mutex_unlock(&comp_param[idx].mutex);
879     }
880 }
881 
882 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
883                                        ram_addr_t offset)
884 {
885     param->block = block;
886     param->offset = offset;
887 }
888 
889 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
890                                            ram_addr_t offset,
891                                            uint64_t *bytes_transferred)
892 {
893     int idx, thread_count, bytes_xmit = -1, pages = -1;
894 
895     thread_count = migrate_compress_threads();
896     qemu_mutex_lock(&comp_done_lock);
897     while (true) {
898         for (idx = 0; idx < thread_count; idx++) {
899             if (comp_param[idx].done) {
900                 comp_param[idx].done = false;
901                 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
902                 qemu_mutex_lock(&comp_param[idx].mutex);
903                 set_compress_params(&comp_param[idx], block, offset);
904                 qemu_cond_signal(&comp_param[idx].cond);
905                 qemu_mutex_unlock(&comp_param[idx].mutex);
906                 pages = 1;
907                 acct_info.norm_pages++;
908                 *bytes_transferred += bytes_xmit;
909                 break;
910             }
911         }
912         if (pages > 0) {
913             break;
914         } else {
915             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
916         }
917     }
918     qemu_mutex_unlock(&comp_done_lock);
919 
920     return pages;
921 }
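/*
 * Illustrative flow of the hand-off above: the migration thread scans for a
 * worker with done == true, drains that worker's buffered output into the
 * real stream (qemu_put_qemu_file), hands it the next (block, offset) via
 * set_compress_params(), signals its condition variable, and returns; if no
 * worker is idle it sleeps on comp_done_cond until one finishes.
 */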
922 
923 /**
924  * ram_save_compressed_page: compress the given page and send it to the stream
925  *
926  * Returns: Number of pages written.
927  *
928  * @ms: The current migration state.
929  * @f: QEMUFile where to send the data
930  * @pss: PageSearchStatus describing the block and offset of the page
931  *       we want to send
932  * @last_stage: if we are at the completion stage
933  * @bytes_transferred: increase it with the number of transferred bytes
934  */
935 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f,
936                                     PageSearchStatus *pss, bool last_stage,
937                                     uint64_t *bytes_transferred)
938 {
939     int pages = -1;
940     uint64_t bytes_xmit = 0;
941     uint8_t *p;
942     int ret, blen;
943     RAMBlock *block = pss->block;
944     ram_addr_t offset = pss->offset;
945 
946     p = block->host + offset;
947 
948     ret = ram_control_save_page(f, block->offset,
949                                 offset, TARGET_PAGE_SIZE, &bytes_xmit);
950     if (bytes_xmit) {
951         *bytes_transferred += bytes_xmit;
952         pages = 1;
953     }
954     if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
955         if (ret != RAM_SAVE_CONTROL_DELAYED) {
956             if (bytes_xmit > 0) {
957                 acct_info.norm_pages++;
958             } else if (bytes_xmit == 0) {
959                 acct_info.dup_pages++;
960             }
961         }
962     } else {
963         /* When starting the process of a new block, the first page of
964          * the block should be sent out before other pages in the same
965          * block, and all the pages in the last block should have been sent
966          * out already. Keeping this order is important, because the 'cont'
967          * flag is used to avoid resending the block name.
968          */
969         if (block != last_sent_block) {
970             flush_compressed_data(f);
971             pages = save_zero_page(f, block, offset, p, bytes_transferred);
972             if (pages == -1) {
973                 /* Make sure the first page is sent out before other pages */
974                 bytes_xmit = save_page_header(f, block, offset |
975                                               RAM_SAVE_FLAG_COMPRESS_PAGE);
976                 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
977                                                  migrate_compress_level());
978                 if (blen > 0) {
979                     *bytes_transferred += bytes_xmit + blen;
980                     acct_info.norm_pages++;
981                     pages = 1;
982                 } else {
983                     qemu_file_set_error(f, blen);
984                     error_report("compressed data failed!");
985                 }
986             }
987             if (pages > 0) {
988                 ram_release_pages(ms, block->idstr, pss->offset, pages);
989             }
990         } else {
991             offset |= RAM_SAVE_FLAG_CONTINUE;
992             pages = save_zero_page(f, block, offset, p, bytes_transferred);
993             if (pages == -1) {
994                 pages = compress_page_with_multi_thread(f, block, offset,
995                                                         bytes_transferred);
996             } else {
997                 ram_release_pages(ms, block->idstr, pss->offset, pages);
998             }
999         }
1000     }
1001 
1002     return pages;
1003 }
1004 
1005 /*
1006  * Find the next dirty page and update any state associated with
1007  * the search process.
1008  *
1009  * Returns: True if a page is found
1010  *
1011  * @f: Current migration stream.
1012  * @pss: Data about the state of the current dirty page scan.
1013  * @again: Set to false if the search has scanned the whole of RAM
1014  * @ram_addr_abs: Pointer into which to store the address of the dirty page
1015  *               within the global ram_addr space
1016  */
1017 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
1018                              bool *again, ram_addr_t *ram_addr_abs)
1019 {
1020     pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
1021                                               ram_addr_abs);
1022     if (pss->complete_round && pss->block == last_seen_block &&
1023         pss->offset >= last_offset) {
1024         /*
1025          * We've been once around the RAM and haven't found anything.
1026          * Give up.
1027          */
1028         *again = false;
1029         return false;
1030     }
1031     if (pss->offset >= pss->block->used_length) {
1032         /* Didn't find anything in this RAM Block */
1033         pss->offset = 0;
1034         pss->block = QLIST_NEXT_RCU(pss->block, next);
1035         if (!pss->block) {
1036             /* Hit the end of the list */
1037             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1038             /* Flag that we've looped */
1039             pss->complete_round = true;
1040             ram_bulk_stage = false;
1041             if (migrate_use_xbzrle()) {
1042                 /* If xbzrle is on, stop using the data compression at this
1043                  * point. In theory, xbzrle can do better than compression.
1044                  */
1045                 flush_compressed_data(f);
1046                 compression_switch = false;
1047             }
1048         }
1049         /* Didn't find anything this time, but try again on the new block */
1050         *again = true;
1051         return false;
1052     } else {
1053         /* Can go around again, but... */
1054         *again = true;
1055         /* We've found something so probably don't need to */
1056         return true;
1057     }
1058 }
1059 
1060 /*
1061  * Helper for 'get_queued_page' - gets a page off the queue
1062  *      ms:      MigrationState in
1063  * *offset:      Used to return the offset within the RAMBlock
1064  * ram_addr_abs: global offset in the dirty/sent bitmaps
1065  *
1066  * Returns:      block (or NULL if none available)
1067  */
1068 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1069                               ram_addr_t *ram_addr_abs)
1070 {
1071     RAMBlock *block = NULL;
1072 
1073     qemu_mutex_lock(&ms->src_page_req_mutex);
1074     if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1075         struct MigrationSrcPageRequest *entry =
1076                                 QSIMPLEQ_FIRST(&ms->src_page_requests);
1077         block = entry->rb;
1078         *offset = entry->offset;
1079         *ram_addr_abs = (entry->offset + entry->rb->offset) &
1080                         TARGET_PAGE_MASK;
1081 
1082         if (entry->len > TARGET_PAGE_SIZE) {
1083             entry->len -= TARGET_PAGE_SIZE;
1084             entry->offset += TARGET_PAGE_SIZE;
1085         } else {
1086             memory_region_unref(block->mr);
1087             QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1088             g_free(entry);
1089         }
1090     }
1091     qemu_mutex_unlock(&ms->src_page_req_mutex);
1092 
1093     return block;
1094 }
1095 
1096 /*
1097  * Unqueue a page from the queue fed by postcopy page requests; skips pages
1098  * that are already sent (!dirty)
1099  *
1100  *      ms:      MigrationState in
1101  *     pss:      PageSearchStatus structure updated with found block/offset
1102  * ram_addr_abs: global offset in the dirty/sent bitmaps
1103  *
1104  * Returns:      true if a queued page is found
1105  */
1106 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1107                             ram_addr_t *ram_addr_abs)
1108 {
1109     RAMBlock  *block;
1110     ram_addr_t offset;
1111     bool dirty;
1112 
1113     do {
1114         block = unqueue_page(ms, &offset, ram_addr_abs);
1115         /*
1116          * We're sending this page, and since it's postcopy nothing else
1117          * will dirty it, and we must make sure it doesn't get sent again
1118          * even if this queue request was received after the background
1119          * search already sent it.
1120          */
1121         if (block) {
1122             unsigned long *bitmap;
1123             bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1124             dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1125             if (!dirty) {
1126                 trace_get_queued_page_not_dirty(
1127                     block->idstr, (uint64_t)offset,
1128                     (uint64_t)*ram_addr_abs,
1129                     test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1130                          atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1131             } else {
1132                 trace_get_queued_page(block->idstr,
1133                                       (uint64_t)offset,
1134                                       (uint64_t)*ram_addr_abs);
1135             }
1136         }
1137 
1138     } while (block && !dirty);
1139 
1140     if (block) {
1141         /*
1142          * As soon as we start servicing pages out of order, then we have
1143          * to kill the bulk stage, since the bulk stage assumes
1144          * (in migration_bitmap_find_dirty) that every page is dirty,
1145          * and that's no longer true.
1146          */
1147         ram_bulk_stage = false;
1148 
1149         /*
1150          * We want the background search to continue from the queued page
1151          * since the guest is likely to want other pages near to the page
1152          * it just requested.
1153          */
1154         pss->block = block;
1155         pss->offset = offset;
1156     }
1157 
1158     return !!block;
1159 }
1160 
1161 /**
1162  * flush_page_queue: Flush any remaining pages in the ram request queue;
1163  *    it should be empty at the end anyway, but in error cases there may
1164  *    be some left.
1165  *
1166  * ms: MigrationState
1167  */
1168 void flush_page_queue(MigrationState *ms)
1169 {
1170     struct MigrationSrcPageRequest *mspr, *next_mspr;
1171     /* This queue should generally be empty - but in the case of a failed
1172      * migration it might have some entries left over.
1173      */
1174     rcu_read_lock();
1175     QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1176         memory_region_unref(mspr->rb->mr);
1177         QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1178         g_free(mspr);
1179     }
1180     rcu_read_unlock();
1181 }
1182 
1183 /**
1184  * Queue the pages for transmission, e.g. a request from postcopy destination
1185  *   ms: MigrationState in which the queue is held
1186  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1187  *   start: Offset from the start of the RAMBlock
1188  *   len: Length (in bytes) to send
1189  *   Return: 0 on success
1190  */
1191 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1192                          ram_addr_t start, ram_addr_t len)
1193 {
1194     RAMBlock *ramblock;
1195 
1196     ms->postcopy_requests++;
1197     rcu_read_lock();
1198     if (!rbname) {
1199         /* Reuse last RAMBlock */
1200         ramblock = ms->last_req_rb;
1201 
1202         if (!ramblock) {
1203             /*
1204              * Shouldn't happen, we can't reuse the last RAMBlock if
1205              * it's the 1st request.
1206              */
1207             error_report("ram_save_queue_pages no previous block");
1208             goto err;
1209         }
1210     } else {
1211         ramblock = qemu_ram_block_by_name(rbname);
1212 
1213         if (!ramblock) {
1214             /* We shouldn't be asked for a non-existent RAMBlock */
1215             error_report("ram_save_queue_pages no block '%s'", rbname);
1216             goto err;
1217         }
1218         ms->last_req_rb = ramblock;
1219     }
1220     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1221     if (start + len > ramblock->used_length) {
1222         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1223                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1224                      __func__, start, len, ramblock->used_length);
1225         goto err;
1226     }
1227 
1228     struct MigrationSrcPageRequest *new_entry =
1229         g_malloc0(sizeof(struct MigrationSrcPageRequest));
1230     new_entry->rb = ramblock;
1231     new_entry->offset = start;
1232     new_entry->len = len;
1233 
1234     memory_region_ref(ramblock->mr);
1235     qemu_mutex_lock(&ms->src_page_req_mutex);
1236     QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1237     qemu_mutex_unlock(&ms->src_page_req_mutex);
1238     rcu_read_unlock();
1239 
1240     return 0;
1241 
1242 err:
1243     rcu_read_unlock();
1244     return -1;
1245 }
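/*
 * Illustrative usage sketch (values are examples only): a postcopy page
 * request for block "pc.ram", start 0x200000, len 0x1000 appends one
 * MigrationSrcPageRequest to ms->src_page_requests; unqueue_page() later
 * pops it one TARGET_PAGE_SIZE chunk at a time, dropping the entry (and its
 * memory region reference) once len is exhausted.
 */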
1246 
1247 /**
1248  * ram_save_target_page: Save one target page
1249  *
1250  * @ms: The current migration state
1251  * @f: QEMUFile where to send the data
1252  * @pss: PageSearchStatus describing the block and offset of the page
1253  *       we want to send
1254  * @last_stage: if we are at the completion stage
1255  * @bytes_transferred: increase it with the number of transferred bytes
1256  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1257  *
1258  * Returns: Number of pages written.
1259  */
1260 static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1261                                 PageSearchStatus *pss,
1262                                 bool last_stage,
1263                                 uint64_t *bytes_transferred,
1264                                 ram_addr_t dirty_ram_abs)
1265 {
1266     int res = 0;
1267 
1268     /* Check if the page is dirty and, if it is, send it */
1269     if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1270         unsigned long *unsentmap;
1271         if (compression_switch && migrate_use_compression()) {
1272             res = ram_save_compressed_page(ms, f, pss,
1273                                            last_stage,
1274                                            bytes_transferred);
1275         } else {
1276             res = ram_save_page(ms, f, pss, last_stage,
1277                                 bytes_transferred);
1278         }
1279 
1280         if (res < 0) {
1281             return res;
1282         }
1283         unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1284         if (unsentmap) {
1285             clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1286         }
1287         /* Only update last_sent_block if a block was actually sent; xbzrle
1288          * might have decided the page was identical so didn't bother writing
1289          * to the stream.
1290          */
1291         if (res > 0) {
1292             last_sent_block = pss->block;
1293         }
1294     }
1295 
1296     return res;
1297 }
1298 
1299 /**
1300  * ram_save_host_page: Starting at *offset send pages up to the end
1301  *                     of the current host page.  It's valid for the initial
1302  *                     offset to point into the middle of a host page
1303  *                     in which case the remainder of the hostpage is sent.
1304  *                     Only dirty target pages are sent.
1305  *                     Note that the host page size may be a huge page for this
1306  *                     block.
1307  *
1308  * Returns: Number of pages written.
1309  *
1310  * @f: QEMUFile where to send the data
1311  * @pss: PageSearchStatus describing the block and offset of the page;
1312  *       pss->offset is updated to the last target page sent within the
1313  *       host page
1314  * @last_stage: if we are at the completion stage
1315  * @bytes_transferred: increase it with the number of transferred bytes
1316  * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1317  */
1318 static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1319                               PageSearchStatus *pss,
1320                               bool last_stage,
1321                               uint64_t *bytes_transferred,
1322                               ram_addr_t dirty_ram_abs)
1323 {
1324     int tmppages, pages = 0;
1325     size_t pagesize = qemu_ram_pagesize(pss->block);
1326 
1327     do {
1328         tmppages = ram_save_target_page(ms, f, pss, last_stage,
1329                                         bytes_transferred, dirty_ram_abs);
1330         if (tmppages < 0) {
1331             return tmppages;
1332         }
1333 
1334         pages += tmppages;
1335         pss->offset += TARGET_PAGE_SIZE;
1336         dirty_ram_abs += TARGET_PAGE_SIZE;
1337     } while (pss->offset & (pagesize - 1));
1338 
1339     /* The offset we leave with is the last one we looked at */
1340     pss->offset -= TARGET_PAGE_SIZE;
1341     return pages;
1342 }
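/*
 * Illustrative example (assuming a 2 MiB host huge page and 4 KiB target
 * pages): starting from a host-page-aligned offset, the loop above calls
 * ram_save_target_page() for up to 512 consecutive target pages and stops
 * once pss->offset is host-page aligned again, leaving pss->offset on the
 * last target page it examined.
 */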
1343 
1344 /**
1345  * ram_find_and_save_block: Finds a dirty page and sends it to f
1346  *
1347  * Called within an RCU critical section.
1348  *
1349  * Returns:  The number of pages written
1350  *           0 means no dirty pages
1351  *
1352  * @f: QEMUFile where to send the data
1353  * @last_stage: if we are at the completion stage
1354  * @bytes_transferred: increase it with the number of transferred bytes
1355  *
1356  * On systems where host-page-size > target-page-size it will send all the
1357  * pages in a host page that are dirty.
1358  */
1359 
1360 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1361                                    uint64_t *bytes_transferred)
1362 {
1363     PageSearchStatus pss;
1364     MigrationState *ms = migrate_get_current();
1365     int pages = 0;
1366     bool again, found;
1367     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1368                                  ram_addr_t space */
1369 
1370     /* No dirty page as there is zero RAM */
1371     if (!ram_bytes_total()) {
1372         return pages;
1373     }
1374 
1375     pss.block = last_seen_block;
1376     pss.offset = last_offset;
1377     pss.complete_round = false;
1378 
1379     if (!pss.block) {
1380         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1381     }
1382 
1383     do {
1384         again = true;
1385         found = get_queued_page(ms, &pss, &dirty_ram_abs);
1386 
1387         if (!found) {
1388             /* priority queue empty, so just search for something dirty */
1389             found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1390         }
1391 
1392         if (found) {
1393             pages = ram_save_host_page(ms, f, &pss,
1394                                        last_stage, bytes_transferred,
1395                                        dirty_ram_abs);
1396         }
1397     } while (!pages && again);
1398 
1399     last_seen_block = pss.block;
1400     last_offset = pss.offset;
1401 
1402     return pages;
1403 }
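/*
 * Descriptive note on the scan order above: pages explicitly requested by
 * the postcopy destination (get_queued_page) take priority; otherwise the
 * linear scan (find_dirty_block) continues from last_seen_block/last_offset,
 * and the loop ends either when a host page's worth of data has been sent or
 * when a full pass over RAM finds nothing dirty.
 */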
1404 
1405 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1406 {
1407     uint64_t pages = size / TARGET_PAGE_SIZE;
1408     if (zero) {
1409         acct_info.dup_pages += pages;
1410     } else {
1411         acct_info.norm_pages += pages;
1412         bytes_transferred += size;
1413         qemu_update_position(f, size);
1414     }
1415 }
1416 
1417 static ram_addr_t ram_save_remaining(void)
1418 {
1419     return migration_dirty_pages;
1420 }
1421 
1422 uint64_t ram_bytes_remaining(void)
1423 {
1424     return ram_save_remaining() * TARGET_PAGE_SIZE;
1425 }
1426 
1427 uint64_t ram_bytes_transferred(void)
1428 {
1429     return bytes_transferred;
1430 }
1431 
1432 uint64_t ram_bytes_total(void)
1433 {
1434     RAMBlock *block;
1435     uint64_t total = 0;
1436 
1437     rcu_read_lock();
1438     QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1439         total += block->used_length;
1440     rcu_read_unlock();
1441     return total;
1442 }
1443 
1444 void free_xbzrle_decoded_buf(void)
1445 {
1446     g_free(xbzrle_decoded_buf);
1447     xbzrle_decoded_buf = NULL;
1448 }
1449 
1450 static void migration_bitmap_free(struct BitmapRcu *bmap)
1451 {
1452     g_free(bmap->bmap);
1453     g_free(bmap->unsentmap);
1454     g_free(bmap);
1455 }
1456 
1457 static void ram_migration_cleanup(void *opaque)
1458 {
1459     /* The caller holds the iothread lock or is in a bottom half, so there
1460      * is no writing race against this migration_bitmap
1461      */
1462     struct BitmapRcu *bitmap = migration_bitmap_rcu;
1463     atomic_rcu_set(&migration_bitmap_rcu, NULL);
1464     if (bitmap) {
1465         memory_global_dirty_log_stop();
1466         call_rcu(bitmap, migration_bitmap_free, rcu);
1467     }
1468 
1469     XBZRLE_cache_lock();
1470     if (XBZRLE.cache) {
1471         cache_fini(XBZRLE.cache);
1472         g_free(XBZRLE.encoded_buf);
1473         g_free(XBZRLE.current_buf);
1474         g_free(ZERO_TARGET_PAGE);
1475         XBZRLE.cache = NULL;
1476         XBZRLE.encoded_buf = NULL;
1477         XBZRLE.current_buf = NULL;
1478     }
1479     XBZRLE_cache_unlock();
1480 }
1481 
1482 static void reset_ram_globals(void)
1483 {
1484     last_seen_block = NULL;
1485     last_sent_block = NULL;
1486     last_offset = 0;
1487     last_version = ram_list.version;
1488     ram_bulk_stage = true;
1489 }
1490 
1491 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1492 
1493 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1494 {
1495     /* called in qemu main thread, so there is
1496      * no writing race against this migration_bitmap
1497      */
1498     if (migration_bitmap_rcu) {
1499         struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1500         bitmap = g_new(struct BitmapRcu, 1);
1501         bitmap->bmap = bitmap_new(new);
1502 
1503         /* Prevent bits in the migration bitmap from being set
1504          * by migration_bitmap_sync_range() at the same time.
1505          * It is safe for migration if bits in migration_bitmap are
1506          * cleared at the same time.
1507          */
1508         qemu_mutex_lock(&migration_bitmap_mutex);
1509         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1510         bitmap_set(bitmap->bmap, old, new - old);
1511 
1512         /* We don't have a way to safely extend the unsentmap
1513          * with RCU, so mark it as missing; entry to postcopy
1514          * will fail.
1515          */
1516         bitmap->unsentmap = NULL;
1517 
1518         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1519         qemu_mutex_unlock(&migration_bitmap_mutex);
1520         migration_dirty_pages += new - old;
1521         call_rcu(old_bitmap, migration_bitmap_free, rcu);
1522     }
1523 }
1524 
1525 /*
1526  * 'expected' is the value you expect the bitmap mostly to be full
1527  * of; it won't bother printing lines that are all this value.
1528  * If 'todump' is null the migration bitmap is dumped.
1529  */
1530 void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1531 {
1532     int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1533 
1534     int64_t cur;
1535     int64_t linelen = 128;
1536     char linebuf[129];
1537 
1538     if (!todump) {
1539         todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1540     }
1541 
1542     for (cur = 0; cur < ram_pages; cur += linelen) {
1543         int64_t curb;
1544         bool found = false;
1545         /*
1546          * Last line; catch the case where the line length
1547          * is longer than remaining ram
1548          */
1549         if (cur + linelen > ram_pages) {
1550             linelen = ram_pages - cur;
1551         }
1552         for (curb = 0; curb < linelen; curb++) {
1553             bool thisbit = test_bit(cur + curb, todump);
1554             linebuf[curb] = thisbit ? '1' : '.';
1555             found = found || (thisbit != expected);
1556         }
1557         if (found) {
1558             linebuf[curb] = '\0';
1559             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1560         }
1561     }
1562 }
1563 
1564 /* **** functions for postcopy ***** */
1565 
1566 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1567 {
1568     struct RAMBlock *block;
1569     unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1570 
1571     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1572         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1573         unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
1574         unsigned long run_start = find_next_zero_bit(bitmap, range, first);
1575 
1576         while (run_start < range) {
1577             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1578             ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
1579                               (run_end - run_start) << TARGET_PAGE_BITS);
1580             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1581         }
1582     }
1583 }
1584 
1585 /*
1586  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1587  * Note: At this point the 'unsentmap' is the processed bitmap combined
1588  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1589  * start: index into the bitmap of the first bit representing this block
1590  * length: length of the block's region in the bitmap, in target pages
1591  */
1592 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1593                                         PostcopyDiscardState *pds,
1594                                         unsigned long start,
1595                                         unsigned long length)
1596 {
1597     unsigned long end = start + length; /* one after the end */
1598     unsigned long current;
1599     unsigned long *unsentmap;
1600 
1601     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1602     for (current = start; current < end; ) {
1603         unsigned long one = find_next_bit(unsentmap, end, current);
1604 
1605         if (one <= end) {
1606             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1607             unsigned long discard_length;
1608 
1609             if (zero >= end) {
1610                 discard_length = end - one;
1611             } else {
1612                 discard_length = zero - one;
1613             }
1614             if (discard_length) {
1615                 postcopy_discard_send_range(ms, pds, one, discard_length);
1616             }
1617             current = one + discard_length;
1618         } else {
1619             current = one;
1620         }
1621     }
1622 
1623     return 0;
1624 }
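
/*
 * Worked example (illustrative numbers, block assumed to start at bitmap
 * index 0): with start = 0, length = 8 and an unsentmap of
 *
 *     page:  0 1 2 3 4 5 6 7
 *     bit:   0 0 1 1 1 0 0 1
 *
 * the loop above finds the runs [2,5) and [7,8) and issues
 * postcopy_discard_send_range(ms, pds, 2, 3) followed by
 * postcopy_discard_send_range(ms, pds, 7, 1).
 */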
1625 
1626 /*
1627  * Utility for the outgoing postcopy code.
1628  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1629  *   passing it bitmap indexes and name.
1630  * Returns: 0 on success
1631  * (qemu_ram_foreach_block ends up passing unscaled lengths, which
1632  *  would mean the postcopy code would have to handle target-page scaling)
1633  */
1634 static int postcopy_each_ram_send_discard(MigrationState *ms)
1635 {
1636     struct RAMBlock *block;
1637     int ret;
1638 
1639     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1640         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1641         PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1642                                                                first,
1643                                                                block->idstr);
1644 
1645         /*
1646          * Postcopy sends chunks of bitmap over the wire, but at this
1647          * point it only needs bitmap indexes, which keeps target-page
1648          * specific code out of this loop.
1649          */
1650         ret = postcopy_send_discard_bm_ram(ms, pds, first,
1651                                     block->used_length >> TARGET_PAGE_BITS);
1652         postcopy_discard_send_finish(ms, pds);
1653         if (ret) {
1654             return ret;
1655         }
1656     }
1657 
1658     return 0;
1659 }
1660 
1661 /*
1662  * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1663  *   the two bitmaps, which are similar but one is inverted.
1664  *
1665  * We search for runs of target-pages that don't start or end on a
1666  * host page boundary;
1667  * unsent_pass=true: Cleans up partially unsent host pages by searching
1668  *                 the unsentmap
1669  * unsent_pass=false: Cleans up partially dirty host pages by searching
1670  *                 the main migration bitmap
1671  *
1672  */
1673 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1674                                           RAMBlock *block,
1675                                           PostcopyDiscardState *pds)
1676 {
1677     unsigned long *bitmap;
1678     unsigned long *unsentmap;
1679     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1680     unsigned long first = block->offset >> TARGET_PAGE_BITS;
1681     unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1682     unsigned long last = first + (len - 1);
1683     unsigned long run_start;
1684 
1685     if (block->page_size == TARGET_PAGE_SIZE) {
1686         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1687         return;
1688     }
1689 
1690     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1691     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1692 
1693     if (unsent_pass) {
1694         /* Find a sent page */
1695         run_start = find_next_zero_bit(unsentmap, last + 1, first);
1696     } else {
1697         /* Find a dirty page */
1698         run_start = find_next_bit(bitmap, last + 1, first);
1699     }
1700 
1701     while (run_start <= last) {
1702         bool do_fixup = false;
1703         unsigned long fixup_start_addr;
1704         unsigned long host_offset;
1705 
1706         /*
1707          * If the start of this run of pages is in the middle of a host
1708          * page, then we need to fixup this host page.
1709          */
1710         host_offset = run_start % host_ratio;
1711         if (host_offset) {
1712             do_fixup = true;
1713             run_start -= host_offset;
1714             fixup_start_addr = run_start;
1715             /* For the next pass */
1716             run_start = run_start + host_ratio;
1717         } else {
1718             /* Find the end of this run */
1719             unsigned long run_end;
1720             if (unsent_pass) {
1721                 run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1722             } else {
1723                 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1724             }
1725             /*
1726              * If the end isn't at the start of a host page, then the
1727              * run doesn't finish at the end of a host page
1728              * and we need to discard.
1729              */
1730             host_offset = run_end % host_ratio;
1731             if (host_offset) {
1732                 do_fixup = true;
1733                 fixup_start_addr = run_end - host_offset;
1734                 /*
1735                  * This host page has gone, the next loop iteration starts
1736                  * from after the fixup
1737                  */
1738                 run_start = fixup_start_addr + host_ratio;
1739             } else {
1740                 /*
1741                  * No discards on this iteration, next loop starts from
1742                  * next sent/dirty page
1743                  */
1744                 run_start = run_end + 1;
1745             }
1746         }
1747 
1748         if (do_fixup) {
1749             unsigned long page;
1750 
1751             /* Tell the destination to discard this page */
1752             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1753                 /* For the unsent_pass we:
1754                  *     discard partially sent pages
1755                  * For the !unsent_pass (dirty) we:
1756                  *     discard partially dirty pages that were sent
1757                  *     (any partially sent pages were already discarded
1758                  *     by the previous unsent_pass)
1759                  */
1760                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1761                                             host_ratio);
1762             }
1763 
1764             /* Clean up the bitmap */
1765             for (page = fixup_start_addr;
1766                  page < fixup_start_addr + host_ratio; page++) {
1767                 /* All pages in this host page are now not sent */
1768                 set_bit(page, unsentmap);
1769 
1770                 /*
1771                  * Remark them as dirty, updating the count for any pages
1772                  * that weren't previously dirty.
1773                  */
1774                 migration_dirty_pages += !test_and_set_bit(page, bitmap);
1775             }
1776         }
1777 
1778         if (unsent_pass) {
1779             /* Find the next sent page for the next iteration */
1780             run_start = find_next_zero_bit(unsentmap, last + 1,
1781                                            run_start);
1782         } else {
1783             /* Find the next dirty page for the next iteration */
1784             run_start = find_next_bit(bitmap, last + 1, run_start);
1785         }
1786     }
1787 }
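
/*
 * Worked example (illustrative numbers, block assumed to start at bitmap
 * index 0): for a hugetlbfs-backed block with 2 MiB host pages and 4 KiB
 * target pages, host_ratio is 512.  If a run starts at target-page index
 * 1000, then host_offset = 1000 % 512 = 488, so the run begins in the
 * middle of a host page; the fixup rewinds to fixup_start_addr = 512,
 * re-marks the whole host page [512, 1024) as unsent and dirty, and queues
 * a discard for it (in the dirty pass only if the page had already been
 * sent).
 */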
1788 
1789 /*
1790  * Utility for the outgoing postcopy code.
1791  *
1792  * Discard any partially sent host-page size chunks, mark any partially
1793  * dirty host-page size chunks as all dirty.  In this case the host-page
1794  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1795  *
1796  * Returns: 0 on success
1797  */
1798 static int postcopy_chunk_hostpages(MigrationState *ms)
1799 {
1800     struct RAMBlock *block;
1801 
1802     /* Easiest way to make sure we don't resume in the middle of a host-page */
1803     last_seen_block = NULL;
1804     last_sent_block = NULL;
1805     last_offset     = 0;
1806 
1807     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1808         unsigned long first = block->offset >> TARGET_PAGE_BITS;
1809 
1810         PostcopyDiscardState *pds =
1811                          postcopy_discard_send_init(ms, first, block->idstr);
1812 
1813         /* First pass: Discard all partially sent host pages */
1814         postcopy_chunk_hostpages_pass(ms, true, block, pds);
1815         /*
1816          * Second pass: Ensure that all partially dirty host pages are made
1817          * fully dirty.
1818          */
1819         postcopy_chunk_hostpages_pass(ms, false, block, pds);
1820 
1821         postcopy_discard_send_finish(ms, pds);
1822     } /* ram_list loop */
1823 
1824     return 0;
1825 }
1826 
1827 /*
1828  * Transmit the set of pages to be discarded after precopy to the target;
1829  * these are pages that:
1830  *     a) Have been previously transmitted but are now dirty again
1831  *     b) Have never been transmitted; this ensures that any pages on the
1832  *        destination that have been mapped by background tasks get
1833  *        discarded (transparent huge pages are the specific concern)
1834  * Hopefully this is pretty sparse
1835  */
1836 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1837 {
1838     int ret;
1839     unsigned long *bitmap, *unsentmap;
1840 
1841     rcu_read_lock();
1842 
1843     /* This should be our last sync, the src is now paused */
1844     migration_bitmap_sync();
1845 
1846     unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1847     if (!unsentmap) {
1848         /* We don't have a safe way to resize the sentmap, so
1849          * if the bitmap was resized it will be NULL at this
1850          * point.
1851          */
1852         error_report("migration ram resized during precopy phase");
1853         rcu_read_unlock();
1854         return -EINVAL;
1855     }
1856 
1857     /* Deal with TPS != HPS and huge pages */
1858     ret = postcopy_chunk_hostpages(ms);
1859     if (ret) {
1860         rcu_read_unlock();
1861         return ret;
1862     }
1863 
1864     /*
1865      * Update the unsentmap to be unsentmap = unsentmap | dirty
1866      */
1867     bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1868     bitmap_or(unsentmap, unsentmap, bitmap,
1869                last_ram_offset() >> TARGET_PAGE_BITS);
1870 
1871 
1872     trace_ram_postcopy_send_discard_bitmap();
1873 #ifdef DEBUG_POSTCOPY
1874     ram_debug_dump_bitmap(unsentmap, true);
1875 #endif
1876 
1877     ret = postcopy_each_ram_send_discard(ms);
1878     rcu_read_unlock();
1879 
1880     return ret;
1881 }
1882 
1883 /*
1884  * At the start of the postcopy phase of migration, any now-dirty
1885  * precopied pages are discarded.
1886  *
1887  * start, length describe a byte address range within the RAMBlock
1888  *
1889  * Returns 0 on success.
1890  */
1891 int ram_discard_range(MigrationIncomingState *mis,
1892                       const char *block_name,
1893                       uint64_t start, size_t length)
1894 {
1895     int ret = -1;
1896 
1897     trace_ram_discard_range(block_name, start, length);
1898 
1899     rcu_read_lock();
1900     RAMBlock *rb = qemu_ram_block_by_name(block_name);
1901 
1902     if (!rb) {
1903         error_report("ram_discard_range: Failed to find block '%s'",
1904                      block_name);
1905         goto err;
1906     }
1907 
1908     ret = ram_block_discard_range(rb, start, length);
1909 
1910 err:
1911     rcu_read_unlock();
1912 
1913     return ret;
1914 }
1915 
1916 static int ram_save_init_globals(void)
1917 {
1918     int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1919 
1920     dirty_rate_high_cnt = 0;
1921     bitmap_sync_count = 0;
1922     migration_bitmap_sync_init();
1923     qemu_mutex_init(&migration_bitmap_mutex);
1924 
1925     if (migrate_use_xbzrle()) {
1926         XBZRLE_cache_lock();
1927         ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1928         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1929                                   TARGET_PAGE_SIZE,
1930                                   TARGET_PAGE_SIZE);
1931         if (!XBZRLE.cache) {
1932             XBZRLE_cache_unlock();
1933             error_report("Error creating cache");
1934             return -1;
1935         }
1936         XBZRLE_cache_unlock();
1937 
1938         /* We prefer not to abort if there is no memory */
1939         XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1940         if (!XBZRLE.encoded_buf) {
1941             error_report("Error allocating encoded_buf");
1942             return -1;
1943         }
1944 
1945         XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1946         if (!XBZRLE.current_buf) {
1947             error_report("Error allocating current_buf");
1948             g_free(XBZRLE.encoded_buf);
1949             XBZRLE.encoded_buf = NULL;
1950             return -1;
1951         }
1952 
1953         acct_clear();
1954     }
1955 
1956     /* For memory_global_dirty_log_start below.  */
1957     qemu_mutex_lock_iothread();
1958 
1959     qemu_mutex_lock_ramlist();
1960     rcu_read_lock();
1961     bytes_transferred = 0;
1962     reset_ram_globals();
1963 
1964     migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1965     /* Skip setting bitmap if there is no RAM */
1966     if (ram_bytes_total()) {
1967         ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1968         migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1969         bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1970 
1971         if (migrate_postcopy_ram()) {
1972             migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1973             bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1974         }
1975     }
1976 
1977     /*
1978      * Count the total number of pages used by ram blocks not including any
1979      * gaps due to alignment or unplugs.
1980      */
1981     migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1982 
1983     memory_global_dirty_log_start();
1984     migration_bitmap_sync();
1985     qemu_mutex_unlock_ramlist();
1986     qemu_mutex_unlock_iothread();
1987     rcu_read_unlock();
1988 
1989     return 0;
1990 }
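
/*
 * Sizing sketch (illustrative numbers): with last_ram_offset() at 4 GiB
 * and 4 KiB target pages, ram_bitmap_pages is 4 GiB / 4 KiB = 1048576
 * bits, so each bitmap allocated above (bmap, plus unsentmap when postcopy
 * is enabled) costs roughly 128 KiB of host memory.
 */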
1991 
1992 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1993  * long-running RCU critical section.  When RCU reclaims in the code
1994  * start to become numerous it will be necessary to reduce the
1995  * granularity of these critical sections.
1996  */
1997 
1998 static int ram_save_setup(QEMUFile *f, void *opaque)
1999 {
2000     RAMBlock *block;
2001 
2002     /* migration has already set up the bitmap, reuse it. */
2003     if (!migration_in_colo_state()) {
2004         if (ram_save_init_globals() < 0) {
2005             return -1;
2006         }
2007     }
2008 
2009     rcu_read_lock();
2010 
2011     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2012 
2013     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2014         qemu_put_byte(f, strlen(block->idstr));
2015         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2016         qemu_put_be64(f, block->used_length);
2017         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2018             qemu_put_be64(f, block->page_size);
2019         }
2020     }
2021 
2022     rcu_read_unlock();
2023 
2024     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2025     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2026 
2027     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2028 
2029     return 0;
2030 }
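
/*
 * Sketch of the stream section written by ram_save_setup() above, as it
 * is parsed by the RAM_SAVE_FLAG_MEM_SIZE case in ram_load() (any RDMA
 * hook traffic from the ram_control_*_iterate() calls is not shown):
 *
 *     be64: ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *     for each RAMBlock:
 *         byte : strlen(idstr)
 *         bytes: idstr (not NUL terminated)
 *         be64 : used_length
 *         be64 : page_size (only when postcopy is on and the block's page
 *                           size differs from qemu_host_page_size)
 *     be64: RAM_SAVE_FLAG_EOS
 */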
2031 
2032 static int ram_save_iterate(QEMUFile *f, void *opaque)
2033 {
2034     int ret;
2035     int i;
2036     int64_t t0;
2037     int done = 0;
2038 
2039     rcu_read_lock();
2040     if (ram_list.version != last_version) {
2041         reset_ram_globals();
2042     }
2043 
2044     /* Read version before ram_list.blocks */
2045     smp_rmb();
2046 
2047     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2048 
2049     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2050     i = 0;
2051     while ((ret = qemu_file_rate_limit(f)) == 0) {
2052         int pages;
2053 
2054         pages = ram_find_and_save_block(f, false, &bytes_transferred);
2055         /* no more pages to send */
2056         if (pages == 0) {
2057             done = 1;
2058             break;
2059         }
2060         acct_info.iterations++;
2061 
2062         /* we want to check in the 1st loop, just in case it was the 1st
2063            time and we had to sync the dirty bitmap.
2064            qemu_clock_get_ns() is a bit expensive, so we only check every
2065            few iterations
2066         */
2067         if ((i & 63) == 0) {
2068             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2069             if (t1 > MAX_WAIT) {
2070                 trace_ram_save_iterate_big_wait(t1, i);
2071                 break;
2072             }
2073         }
2074         i++;
2075     }
2076     flush_compressed_data(f);
2077     rcu_read_unlock();
2078 
2079     /*
2080      * Must occur before EOS (or any QEMUFile operation)
2081      * because of RDMA protocol.
2082      */
2083     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2084 
2085     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2086     bytes_transferred += 8;
2087 
2088     ret = qemu_file_get_error(f);
2089     if (ret < 0) {
2090         return ret;
2091     }
2092 
2093     return done;
2094 }
2095 
2096 /* Called with iothread lock */
2097 static int ram_save_complete(QEMUFile *f, void *opaque)
2098 {
2099     rcu_read_lock();
2100 
2101     if (!migration_in_postcopy(migrate_get_current())) {
2102         migration_bitmap_sync();
2103     }
2104 
2105     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2106 
2107     /* try transferring iterative blocks of memory */
2108 
2109     /* flush all remaining blocks regardless of rate limiting */
2110     while (true) {
2111         int pages;
2112 
2113         pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2114                                         &bytes_transferred);
2115         /* no more blocks to send */
2116         if (pages == 0) {
2117             break;
2118         }
2119     }
2120 
2121     flush_compressed_data(f);
2122     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2123 
2124     rcu_read_unlock();
2125 
2126     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2127 
2128     return 0;
2129 }
2130 
2131 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2132                              uint64_t *non_postcopiable_pending,
2133                              uint64_t *postcopiable_pending)
2134 {
2135     uint64_t remaining_size;
2136 
2137     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2138 
2139     if (!migration_in_postcopy(migrate_get_current()) &&
2140         remaining_size < max_size) {
2141         qemu_mutex_lock_iothread();
2142         rcu_read_lock();
2143         migration_bitmap_sync();
2144         rcu_read_unlock();
2145         qemu_mutex_unlock_iothread();
2146         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2147     }
2148 
2149     /* We can do postcopy, and all the data is postcopiable */
2150     *postcopiable_pending += remaining_size;
2151 }
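
/*
 * Example (illustrative numbers): with 4 KiB target pages and 10000 dirty
 * pages remaining, remaining_size is roughly 40 MB.  When that is already
 * below max_size (and we are not in postcopy) the dirty bitmap is re-synced
 * under the iothread lock to refresh the estimate; either way the whole
 * amount is reported as postcopiable pending work.
 */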
2152 
2153 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2154 {
2155     unsigned int xh_len;
2156     int xh_flags;
2157     uint8_t *loaded_data;
2158 
2159     if (!xbzrle_decoded_buf) {
2160         xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2161     }
2162     loaded_data = xbzrle_decoded_buf;
2163 
2164     /* extract RLE header */
2165     xh_flags = qemu_get_byte(f);
2166     xh_len = qemu_get_be16(f);
2167 
2168     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2169         error_report("Failed to load XBZRLE page - wrong compression!");
2170         return -1;
2171     }
2172 
2173     if (xh_len > TARGET_PAGE_SIZE) {
2174         error_report("Failed to load XBZRLE page - len overflow!");
2175         return -1;
2176     }
2177     /* load data and decode */
2178     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2179 
2180     /* decode RLE */
2181     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2182                              TARGET_PAGE_SIZE) == -1) {
2183         error_report("Failed to load XBZRLE page - decode error!");
2184         return -1;
2185     }
2186 
2187     return 0;
2188 }
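
/*
 * XBZRLE page wire format as read above (sketch):
 *
 *     byte : xh_flags  (must be ENCODING_FLAG_XBZRLE)
 *     be16 : xh_len    (encoded length, at most TARGET_PAGE_SIZE)
 *     bytes: xh_len bytes of XBZRLE data, decoded against the current
 *            contents of the destination page at 'host'
 */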
2189 
2190 /* Must be called from within an RCU critical section.
2191  * Returns a pointer from within the RCU-protected ram_list.
2192  *
2193  * Read a RAMBlock ID from the stream f.
2194  *
2195  * f: Stream to read from
2196  * flags: Page flags (mostly to see if it's a continuation of previous block)
2197  */
2199 static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2200                                               int flags)
2201 {
2202     static RAMBlock *block = NULL;
2203     char id[256];
2204     uint8_t len;
2205 
2206     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2207         if (!block) {
2208             error_report("Ack, bad migration stream!");
2209             return NULL;
2210         }
2211         return block;
2212     }
2213 
2214     len = qemu_get_byte(f);
2215     qemu_get_buffer(f, (uint8_t *)id, len);
2216     id[len] = 0;
2217 
2218     block = qemu_ram_block_by_name(id);
2219     if (!block) {
2220         error_report("Can't find block %s", id);
2221         return NULL;
2222     }
2223 
2224     return block;
2225 }
2226 
2227 static inline void *host_from_ram_block_offset(RAMBlock *block,
2228                                                ram_addr_t offset)
2229 {
2230     if (!offset_in_ramblock(block, offset)) {
2231         return NULL;
2232     }
2233 
2234     return block->host + offset;
2235 }
2236 
2237 /*
2238  * If a page (or a whole RDMA chunk) has been
2239  * determined to be zero, then zap it.
2240  */
2241 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2242 {
2243     if (ch != 0 || !is_zero_range(host, size)) {
2244         memset(host, ch, size);
2245     }
2246 }
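
/*
 * For example, when ch == 0 and the destination page already reads as all
 * zeroes, the memset() is skipped, so a page that was never touched does
 * not have to be written to (and therefore allocated) on the destination.
 */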
2247 
2248 static void *do_data_decompress(void *opaque)
2249 {
2250     DecompressParam *param = opaque;
2251     unsigned long pagesize;
2252     uint8_t *des;
2253     int len;
2254 
2255     qemu_mutex_lock(&param->mutex);
2256     while (!param->quit) {
2257         if (param->des) {
2258             des = param->des;
2259             len = param->len;
2260             param->des = 0;
2261             qemu_mutex_unlock(&param->mutex);
2262 
2263             pagesize = TARGET_PAGE_SIZE;
2264             /* uncompress() can fail in some cases, especially when the
2265              * page is dirtied while it is being compressed.  That is not
2266              * a problem because the dirty page will be retransferred and
2267              * uncompress() won't corrupt the data in other pages.
2268              */
2269             uncompress((Bytef *)des, &pagesize,
2270                        (const Bytef *)param->compbuf, len);
2271 
2272             qemu_mutex_lock(&decomp_done_lock);
2273             param->done = true;
2274             qemu_cond_signal(&decomp_done_cond);
2275             qemu_mutex_unlock(&decomp_done_lock);
2276 
2277             qemu_mutex_lock(&param->mutex);
2278         } else {
2279             qemu_cond_wait(&param->cond, &param->mutex);
2280         }
2281     }
2282     qemu_mutex_unlock(&param->mutex);
2283 
2284     return NULL;
2285 }
2286 
2287 static void wait_for_decompress_done(void)
2288 {
2289     int idx, thread_count;
2290 
2291     if (!migrate_use_compression()) {
2292         return;
2293     }
2294 
2295     thread_count = migrate_decompress_threads();
2296     qemu_mutex_lock(&decomp_done_lock);
2297     for (idx = 0; idx < thread_count; idx++) {
2298         while (!decomp_param[idx].done) {
2299             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2300         }
2301     }
2302     qemu_mutex_unlock(&decomp_done_lock);
2303 }
2304 
2305 void migrate_decompress_threads_create(void)
2306 {
2307     int i, thread_count;
2308 
2309     thread_count = migrate_decompress_threads();
2310     decompress_threads = g_new0(QemuThread, thread_count);
2311     decomp_param = g_new0(DecompressParam, thread_count);
2312     qemu_mutex_init(&decomp_done_lock);
2313     qemu_cond_init(&decomp_done_cond);
2314     for (i = 0; i < thread_count; i++) {
2315         qemu_mutex_init(&decomp_param[i].mutex);
2316         qemu_cond_init(&decomp_param[i].cond);
2317         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2318         decomp_param[i].done = true;
2319         decomp_param[i].quit = false;
2320         qemu_thread_create(decompress_threads + i, "decompress",
2321                            do_data_decompress, decomp_param + i,
2322                            QEMU_THREAD_JOINABLE);
2323     }
2324 }
2325 
2326 void migrate_decompress_threads_join(void)
2327 {
2328     int i, thread_count;
2329 
2330     thread_count = migrate_decompress_threads();
2331     for (i = 0; i < thread_count; i++) {
2332         qemu_mutex_lock(&decomp_param[i].mutex);
2333         decomp_param[i].quit = true;
2334         qemu_cond_signal(&decomp_param[i].cond);
2335         qemu_mutex_unlock(&decomp_param[i].mutex);
2336     }
2337     for (i = 0; i < thread_count; i++) {
2338         qemu_thread_join(decompress_threads + i);
2339         qemu_mutex_destroy(&decomp_param[i].mutex);
2340         qemu_cond_destroy(&decomp_param[i].cond);
2341         g_free(decomp_param[i].compbuf);
2342     }
2343     g_free(decompress_threads);
2344     g_free(decomp_param);
2345     decompress_threads = NULL;
2346     decomp_param = NULL;
2347 }
2348 
2349 static void decompress_data_with_multi_threads(QEMUFile *f,
2350                                                void *host, int len)
2351 {
2352     int idx, thread_count;
2353 
2354     thread_count = migrate_decompress_threads();
2355     qemu_mutex_lock(&decomp_done_lock);
2356     while (true) {
2357         for (idx = 0; idx < thread_count; idx++) {
2358             if (decomp_param[idx].done) {
2359                 decomp_param[idx].done = false;
2360                 qemu_mutex_lock(&decomp_param[idx].mutex);
2361                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2362                 decomp_param[idx].des = host;
2363                 decomp_param[idx].len = len;
2364                 qemu_cond_signal(&decomp_param[idx].cond);
2365                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2366                 break;
2367             }
2368         }
2369         if (idx < thread_count) {
2370             break;
2371         } else {
2372             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2373         }
2374     }
2375     qemu_mutex_unlock(&decomp_done_lock);
2376 }
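
/*
 * Hand-off between the load thread and a decompress thread (sketch of the
 * locking above and in do_data_decompress()):
 *
 *   1. The load thread takes decomp_done_lock, picks an idx whose done flag
 *      is set, clears it, and under decomp_param[idx].mutex fills in
 *      compbuf/des/len and signals decomp_param[idx].cond.
 *   2. do_data_decompress() wakes up, drops its mutex and runs uncompress()
 *      into the destination page.
 *   3. It then takes decomp_done_lock, sets done again and signals
 *      decomp_done_cond, on which both decompress_data_with_multi_threads()
 *      (when every thread is busy) and wait_for_decompress_done() sleep.
 */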
2377 
2378 /*
2379  * Allocate data structures etc needed by incoming migration with postcopy-ram.
2380  * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
2381  */
2382 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2383 {
2384     size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2385 
2386     return postcopy_ram_incoming_init(mis, ram_pages);
2387 }
2388 
2389 /*
2390  * Called in postcopy mode by ram_load().
2391  * rcu_read_lock is taken prior to this being called.
2392  */
2393 static int ram_load_postcopy(QEMUFile *f)
2394 {
2395     int flags = 0, ret = 0;
2396     bool place_needed = false;
2397     bool matching_page_sizes = false;
2398     MigrationIncomingState *mis = migration_incoming_get_current();
2399     /* Temporary page that is later 'placed' */
2400     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2401     void *last_host = NULL;
2402     bool all_zero = false;
2403 
2404     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2405         ram_addr_t addr;
2406         void *host = NULL;
2407         void *page_buffer = NULL;
2408         void *place_source = NULL;
2409         RAMBlock *block = NULL;
2410         uint8_t ch;
2411 
2412         addr = qemu_get_be64(f);
2413         flags = addr & ~TARGET_PAGE_MASK;
2414         addr &= TARGET_PAGE_MASK;
2415 
2416         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2417         place_needed = false;
2418         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2419             block = ram_block_from_stream(f, flags);
2420 
2421             host = host_from_ram_block_offset(block, addr);
2422             if (!host) {
2423                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2424                 ret = -EINVAL;
2425                 break;
2426             }
2427             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2428             /*
2429              * Postcopy requires that we place whole host pages atomically;
2430              * these may be huge pages for RAMBlocks that are backed by
2431              * hugetlbfs.
2432              * To make it atomic, the data is read into a temporary page
2433              * that's moved into place later.
2434              * The migration protocol uses possibly smaller target pages;
2435              * however, the source ensures it always sends all the components
2436              * of a host page in order.
2437              */
2438             page_buffer = postcopy_host_page +
2439                           ((uintptr_t)host & (block->page_size - 1));
2440             /* If all TP are zero then we can optimise the place */
2441             if (!((uintptr_t)host & (block->page_size - 1))) {
2442                 all_zero = true;
2443             } else {
2444                 /* not the 1st TP within the HP */
2445                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2446                     error_report("Non-sequential target page %p/%p",
2447                                   host, last_host);
2448                     ret = -EINVAL;
2449                     break;
2450                 }
2451             }
2452 
2454             /*
2455              * If it's the last part of a host page then we place the host
2456              * page
2457              */
2458             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2459                                      (block->page_size - 1)) == 0;
2460             place_source = postcopy_host_page;
2461         }
2462         last_host = host;
2463 
2464         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2465         case RAM_SAVE_FLAG_COMPRESS:
2466             ch = qemu_get_byte(f);
2467             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2468             if (ch) {
2469                 all_zero = false;
2470             }
2471             break;
2472 
2473         case RAM_SAVE_FLAG_PAGE:
2474             all_zero = false;
2475             if (!place_needed || !matching_page_sizes) {
2476                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2477             } else {
2478                 /* Avoids an extra qemu_file copy; postcopy will copy the
2479                  * page into place later anyway.  We can only do this when
2480                  * the read is done in one go (matching page sizes)
2481                  */
2482                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2483                                          TARGET_PAGE_SIZE);
2484             }
2485             break;
2486         case RAM_SAVE_FLAG_EOS:
2487             /* normal exit */
2488             break;
2489         default:
2490             error_report("Unknown combination of migration flags: %#x"
2491                          " (postcopy mode)", flags);
2492             ret = -EINVAL;
2493         }
2494 
2495         if (place_needed) {
2496             /* This gets called at the last target page in the host page */
2497             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2498 
2499             if (all_zero) {
2500                 ret = postcopy_place_page_zero(mis, place_dest,
2501                                                block->page_size);
2502             } else {
2503                 ret = postcopy_place_page(mis, place_dest,
2504                                           place_source, block->page_size);
2505             }
2506         }
2507         if (!ret) {
2508             ret = qemu_file_get_error(f);
2509         }
2510     }
2511 
2512     return ret;
2513 }
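
/*
 * Worked example (illustrative numbers): for a hugetlbfs RAMBlock with a
 * 2 MiB page size and 4 KiB target pages, ram_load_postcopy() accumulates
 * 512 consecutive target pages into postcopy_host_page; place_needed only
 * becomes true for the last one, and place_dest (host + TARGET_PAGE_SIZE -
 * block->page_size) then points back to the start of the huge page, so the
 * whole 2 MiB page is placed atomically in a single call.
 */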
2514 
2515 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2516 {
2517     int flags = 0, ret = 0;
2518     static uint64_t seq_iter;
2519     int len = 0;
2520     /*
2521      * If system is running in postcopy mode, page inserts to host memory must
2522      * be atomic
2523      */
2524     bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2525     /* ADVISE is earlier; it shows the source has the postcopy capability on */
2526     bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2527 
2528     seq_iter++;
2529 
2530     if (version_id != 4) {
2531         ret = -EINVAL;
2532     }
2533 
2534     /* This RCU critical section can be very long running.
2535      * When RCU reclaims in the code start to become numerous,
2536      * it will be necessary to reduce the granularity of this
2537      * critical section.
2538      */
2539     rcu_read_lock();
2540 
2541     if (postcopy_running) {
2542         ret = ram_load_postcopy(f);
2543     }
2544 
2545     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2546         ram_addr_t addr, total_ram_bytes;
2547         void *host = NULL;
2548         uint8_t ch;
2549 
2550         addr = qemu_get_be64(f);
2551         flags = addr & ~TARGET_PAGE_MASK;
2552         addr &= TARGET_PAGE_MASK;
2553 
2554         if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2555                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2556             RAMBlock *block = ram_block_from_stream(f, flags);
2557 
2558             host = host_from_ram_block_offset(block, addr);
2559             if (!host) {
2560                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2561                 ret = -EINVAL;
2562                 break;
2563             }
2564         }
2565 
2566         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2567         case RAM_SAVE_FLAG_MEM_SIZE:
2568             /* Synchronize RAM block list */
2569             total_ram_bytes = addr;
2570             while (!ret && total_ram_bytes) {
2571                 RAMBlock *block;
2572                 char id[256];
2573                 ram_addr_t length;
2574 
2575                 len = qemu_get_byte(f);
2576                 qemu_get_buffer(f, (uint8_t *)id, len);
2577                 id[len] = 0;
2578                 length = qemu_get_be64(f);
2579 
2580                 block = qemu_ram_block_by_name(id);
2581                 if (block) {
2582                     if (length != block->used_length) {
2583                         Error *local_err = NULL;
2584 
2585                         ret = qemu_ram_resize(block, length,
2586                                               &local_err);
2587                         if (local_err) {
2588                             error_report_err(local_err);
2589                         }
2590                     }
2591                     /* For postcopy we need to check hugepage sizes match */
2592                     if (postcopy_advised &&
2593                         block->page_size != qemu_host_page_size) {
2594                         uint64_t remote_page_size = qemu_get_be64(f);
2595                         if (remote_page_size != block->page_size) {
2596                             error_report("Mismatched RAM page size %s "
2597                                          "(local) %zd != %" PRId64,
2598                                          id, block->page_size,
2599                                          remote_page_size);
2600                             ret = -EINVAL;
2601                         }
2602                     }
2603                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2604                                           block->idstr);
2605                 } else {
2606                     error_report("Unknown ramblock \"%s\", cannot "
2607                                  "accept migration", id);
2608                     ret = -EINVAL;
2609                 }
2610 
2611                 total_ram_bytes -= length;
2612             }
2613             break;
2614 
2615         case RAM_SAVE_FLAG_COMPRESS:
2616             ch = qemu_get_byte(f);
2617             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2618             break;
2619 
2620         case RAM_SAVE_FLAG_PAGE:
2621             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2622             break;
2623 
2624         case RAM_SAVE_FLAG_COMPRESS_PAGE:
2625             len = qemu_get_be32(f);
2626             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2627                 error_report("Invalid compressed data length: %d", len);
2628                 ret = -EINVAL;
2629                 break;
2630             }
2631             decompress_data_with_multi_threads(f, host, len);
2632             break;
2633 
2634         case RAM_SAVE_FLAG_XBZRLE:
2635             if (load_xbzrle(f, addr, host) < 0) {
2636                 error_report("Failed to decompress XBZRLE page at "
2637                              RAM_ADDR_FMT, addr);
2638                 ret = -EINVAL;
2639                 break;
2640             }
2641             break;
2642         case RAM_SAVE_FLAG_EOS:
2643             /* normal exit */
2644             break;
2645         default:
2646             if (flags & RAM_SAVE_FLAG_HOOK) {
2647                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2648             } else {
2649                 error_report("Unknown combination of migration flags: %#x",
2650                              flags);
2651                 ret = -EINVAL;
2652             }
2653         }
2654         if (!ret) {
2655             ret = qemu_file_get_error(f);
2656         }
2657     }
2658 
2659     wait_for_decompress_done();
2660     rcu_read_unlock();
2661     trace_ram_load_complete(ret, seq_iter);
2662     return ret;
2663 }
2664 
2665 static SaveVMHandlers savevm_ram_handlers = {
2666     .save_live_setup = ram_save_setup,
2667     .save_live_iterate = ram_save_iterate,
2668     .save_live_complete_postcopy = ram_save_complete,
2669     .save_live_complete_precopy = ram_save_complete,
2670     .save_live_pending = ram_save_pending,
2671     .load_state = ram_load,
2672     .cleanup = ram_migration_cleanup,
2673 };
2674 
2675 void ram_mig_init(void)
2676 {
2677     qemu_mutex_init(&XBZRLE.lock);
2678     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2679 }
2680