1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 #include "qemu/osdep.h" 29 #include "qemu-common.h" 30 #include "cpu.h" 31 #include <zlib.h> 32 #include "qapi-event.h" 33 #include "qemu/cutils.h" 34 #include "qemu/bitops.h" 35 #include "qemu/bitmap.h" 36 #include "qemu/timer.h" 37 #include "qemu/main-loop.h" 38 #include "migration/migration.h" 39 #include "migration/postcopy-ram.h" 40 #include "exec/address-spaces.h" 41 #include "migration/page_cache.h" 42 #include "qemu/error-report.h" 43 #include "trace.h" 44 #include "exec/ram_addr.h" 45 #include "qemu/rcu_queue.h" 46 #include "migration/colo.h" 47 48 static int dirty_rate_high_cnt; 49 50 static uint64_t bitmap_sync_count; 51 52 /***********************************************************/ 53 /* ram save/restore */ 54 55 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ 56 #define RAM_SAVE_FLAG_COMPRESS 0x02 57 #define RAM_SAVE_FLAG_MEM_SIZE 0x04 58 #define RAM_SAVE_FLAG_PAGE 0x08 59 #define RAM_SAVE_FLAG_EOS 0x10 60 #define RAM_SAVE_FLAG_CONTINUE 0x20 61 #define RAM_SAVE_FLAG_XBZRLE 0x40 62 /* 0x80 is reserved in migration.h start with 0x100 next */ 63 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 64 65 static uint8_t *ZERO_TARGET_PAGE; 66 67 static inline bool is_zero_range(uint8_t *p, uint64_t size) 68 { 69 return buffer_is_zero(p, size); 70 } 71 72 /* struct contains XBZRLE cache and a static page 73 used by the compression */ 74 static struct { 75 /* buffer used for XBZRLE encoding */ 76 uint8_t *encoded_buf; 77 /* buffer for storing page content */ 78 uint8_t *current_buf; 79 /* Cache for XBZRLE, Protected by lock. */ 80 PageCache *cache; 81 QemuMutex lock; 82 } XBZRLE; 83 84 /* buffer used for XBZRLE decoding */ 85 static uint8_t *xbzrle_decoded_buf; 86 87 static void XBZRLE_cache_lock(void) 88 { 89 if (migrate_use_xbzrle()) 90 qemu_mutex_lock(&XBZRLE.lock); 91 } 92 93 static void XBZRLE_cache_unlock(void) 94 { 95 if (migrate_use_xbzrle()) 96 qemu_mutex_unlock(&XBZRLE.lock); 97 } 98 99 /* 100 * called from qmp_migrate_set_cache_size in main thread, possibly while 101 * a migration is in progress. 102 * A running migration maybe using the cache and might finish during this 103 * call, hence changes to the cache are protected by XBZRLE.lock(). 
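 * Note: the value returned is the requested size rounded down to a power of
 * two (pow2floor), so it may be smaller than @new_size; e.g. asking for
 * 1.5 MiB results in a return value of 1 MiB.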
104 */ 105 int64_t xbzrle_cache_resize(int64_t new_size) 106 { 107 PageCache *new_cache; 108 int64_t ret; 109 110 if (new_size < TARGET_PAGE_SIZE) { 111 return -1; 112 } 113 114 XBZRLE_cache_lock(); 115 116 if (XBZRLE.cache != NULL) { 117 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) { 118 goto out_new_size; 119 } 120 new_cache = cache_init(new_size / TARGET_PAGE_SIZE, 121 TARGET_PAGE_SIZE); 122 if (!new_cache) { 123 error_report("Error creating cache"); 124 ret = -1; 125 goto out; 126 } 127 128 cache_fini(XBZRLE.cache); 129 XBZRLE.cache = new_cache; 130 } 131 132 out_new_size: 133 ret = pow2floor(new_size); 134 out: 135 XBZRLE_cache_unlock(); 136 return ret; 137 } 138 139 /* accounting for migration statistics */ 140 typedef struct AccountingInfo { 141 uint64_t dup_pages; 142 uint64_t skipped_pages; 143 uint64_t norm_pages; 144 uint64_t iterations; 145 uint64_t xbzrle_bytes; 146 uint64_t xbzrle_pages; 147 uint64_t xbzrle_cache_miss; 148 double xbzrle_cache_miss_rate; 149 uint64_t xbzrle_overflows; 150 } AccountingInfo; 151 152 static AccountingInfo acct_info; 153 154 static void acct_clear(void) 155 { 156 memset(&acct_info, 0, sizeof(acct_info)); 157 } 158 159 uint64_t dup_mig_bytes_transferred(void) 160 { 161 return acct_info.dup_pages * TARGET_PAGE_SIZE; 162 } 163 164 uint64_t dup_mig_pages_transferred(void) 165 { 166 return acct_info.dup_pages; 167 } 168 169 uint64_t skipped_mig_bytes_transferred(void) 170 { 171 return acct_info.skipped_pages * TARGET_PAGE_SIZE; 172 } 173 174 uint64_t skipped_mig_pages_transferred(void) 175 { 176 return acct_info.skipped_pages; 177 } 178 179 uint64_t norm_mig_bytes_transferred(void) 180 { 181 return acct_info.norm_pages * TARGET_PAGE_SIZE; 182 } 183 184 uint64_t norm_mig_pages_transferred(void) 185 { 186 return acct_info.norm_pages; 187 } 188 189 uint64_t xbzrle_mig_bytes_transferred(void) 190 { 191 return acct_info.xbzrle_bytes; 192 } 193 194 uint64_t xbzrle_mig_pages_transferred(void) 195 { 196 return acct_info.xbzrle_pages; 197 } 198 199 uint64_t xbzrle_mig_pages_cache_miss(void) 200 { 201 return acct_info.xbzrle_cache_miss; 202 } 203 204 double xbzrle_mig_cache_miss_rate(void) 205 { 206 return acct_info.xbzrle_cache_miss_rate; 207 } 208 209 uint64_t xbzrle_mig_pages_overflow(void) 210 { 211 return acct_info.xbzrle_overflows; 212 } 213 214 /* This is the last block that we have visited serching for dirty pages 215 */ 216 static RAMBlock *last_seen_block; 217 /* This is the last block from where we have sent data */ 218 static RAMBlock *last_sent_block; 219 static ram_addr_t last_offset; 220 static QemuMutex migration_bitmap_mutex; 221 static uint64_t migration_dirty_pages; 222 static uint32_t last_version; 223 static bool ram_bulk_stage; 224 225 /* used by the search for pages to send */ 226 struct PageSearchStatus { 227 /* Current block being searched */ 228 RAMBlock *block; 229 /* Current offset to search from */ 230 ram_addr_t offset; 231 /* Set once we wrap around */ 232 bool complete_round; 233 }; 234 typedef struct PageSearchStatus PageSearchStatus; 235 236 static struct BitmapRcu { 237 struct rcu_head rcu; 238 /* Main migration bitmap */ 239 unsigned long *bmap; 240 /* bitmap of pages that haven't been sent even once 241 * only maintained and used in postcopy at the moment 242 * where it's used to send the dirtymap at the start 243 * of the postcopy phase 244 */ 245 unsigned long *unsentmap; 246 } *migration_bitmap_rcu; 247 248 struct CompressParam { 249 bool done; 250 bool quit; 251 QEMUFile *file; 252 QemuMutex mutex; 253 QemuCond 
cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used together with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset);

static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            do_compress_ram_page(param->file, block, offset);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        comp_param[idx].quit = true;
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}

void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}

void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
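         * Each worker compresses into its own private QEMUFile buffer;
         * flush_compressed_data() later copies those buffers into the real
         * migration stream with qemu_put_qemu_file().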
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

/**
 * save_page_header: Write page header to wire
 *
 * If this is the first block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
    }
    return size;
}

/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 * If guest dirty memory rate is reduced below the rate at which we can
 * transfer pages to the destination then we should be able to complete
 * migration. Some workloads dirty memory way too fast and will not effectively
 * converge, even with auto-converge.
 */
static void mig_throttle_guest_down(void)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
    }
}

/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
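 * (cache_insert() below replaces whatever is currently cached for this
 * address with ZERO_TARGET_PAGE, tagged with the current bitmap_sync_count.)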
437 * As a bonus, if the page wasn't in the cache it gets added so that 438 * when a small write is made into the 0'd page it gets XBZRLE sent 439 */ 440 static void xbzrle_cache_zero_page(ram_addr_t current_addr) 441 { 442 if (ram_bulk_stage || !migrate_use_xbzrle()) { 443 return; 444 } 445 446 /* We don't care if this fails to allocate a new cache page 447 * as long as it updated an old one */ 448 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE, 449 bitmap_sync_count); 450 } 451 452 #define ENCODING_FLAG_XBZRLE 0x1 453 454 /** 455 * save_xbzrle_page: compress and send current page 456 * 457 * Returns: 1 means that we wrote the page 458 * 0 means that page is identical to the one already sent 459 * -1 means that xbzrle would be longer than normal 460 * 461 * @f: QEMUFile where to send the data 462 * @current_data: 463 * @current_addr: 464 * @block: block that contains the page we want to send 465 * @offset: offset inside the block for the page 466 * @last_stage: if we are at the completion stage 467 * @bytes_transferred: increase it with the number of transferred bytes 468 */ 469 static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, 470 ram_addr_t current_addr, RAMBlock *block, 471 ram_addr_t offset, bool last_stage, 472 uint64_t *bytes_transferred) 473 { 474 int encoded_len = 0, bytes_xbzrle; 475 uint8_t *prev_cached_page; 476 477 if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) { 478 acct_info.xbzrle_cache_miss++; 479 if (!last_stage) { 480 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 481 bitmap_sync_count) == -1) { 482 return -1; 483 } else { 484 /* update *current_data when the page has been 485 inserted into cache */ 486 *current_data = get_cached_data(XBZRLE.cache, current_addr); 487 } 488 } 489 return -1; 490 } 491 492 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 493 494 /* save current buffer into memory */ 495 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 496 497 /* XBZRLE encoding (if there is no overflow) */ 498 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 499 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 500 TARGET_PAGE_SIZE); 501 if (encoded_len == 0) { 502 trace_save_xbzrle_page_skipping(); 503 return 0; 504 } else if (encoded_len == -1) { 505 trace_save_xbzrle_page_overflow(); 506 acct_info.xbzrle_overflows++; 507 /* update data in the cache */ 508 if (!last_stage) { 509 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); 510 *current_data = prev_cached_page; 511 } 512 return -1; 513 } 514 515 /* we need to update the data in the cache, in order to get the same data */ 516 if (!last_stage) { 517 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 518 } 519 520 /* Send XBZRLE based compressed page */ 521 bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE); 522 qemu_put_byte(f, ENCODING_FLAG_XBZRLE); 523 qemu_put_be16(f, encoded_len); 524 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len); 525 bytes_xbzrle += encoded_len + 1 + 2; 526 acct_info.xbzrle_pages++; 527 acct_info.xbzrle_bytes += bytes_xbzrle; 528 *bytes_transferred += bytes_xbzrle; 529 530 return 1; 531 } 532 533 /* Called with rcu_read_lock() to protect migration_bitmap 534 * rb: The RAMBlock to search for dirty pages in 535 * start: Start address (typically so we can continue from previous page) 536 * ram_addr_abs: Pointer into which to store the address of the dirty page 537 * within the global ram_addr space 538 * 539 * Returns: byte offset within memory region of 
the start of a dirty page 540 */ 541 static inline 542 ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb, 543 ram_addr_t start, 544 ram_addr_t *ram_addr_abs) 545 { 546 unsigned long base = rb->offset >> TARGET_PAGE_BITS; 547 unsigned long nr = base + (start >> TARGET_PAGE_BITS); 548 uint64_t rb_size = rb->used_length; 549 unsigned long size = base + (rb_size >> TARGET_PAGE_BITS); 550 unsigned long *bitmap; 551 552 unsigned long next; 553 554 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 555 if (ram_bulk_stage && nr > base) { 556 next = nr + 1; 557 } else { 558 next = find_next_bit(bitmap, size, nr); 559 } 560 561 *ram_addr_abs = next << TARGET_PAGE_BITS; 562 return (next - base) << TARGET_PAGE_BITS; 563 } 564 565 static inline bool migration_bitmap_clear_dirty(ram_addr_t addr) 566 { 567 bool ret; 568 int nr = addr >> TARGET_PAGE_BITS; 569 unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 570 571 ret = test_and_clear_bit(nr, bitmap); 572 573 if (ret) { 574 migration_dirty_pages--; 575 } 576 return ret; 577 } 578 579 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) 580 { 581 unsigned long *bitmap; 582 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 583 migration_dirty_pages += 584 cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length); 585 } 586 587 /* Fix me: there are too many global variables used in migration process. */ 588 static int64_t start_time; 589 static int64_t bytes_xfer_prev; 590 static int64_t num_dirty_pages_period; 591 static uint64_t xbzrle_cache_miss_prev; 592 static uint64_t iterations_prev; 593 594 static void migration_bitmap_sync_init(void) 595 { 596 start_time = 0; 597 bytes_xfer_prev = 0; 598 num_dirty_pages_period = 0; 599 xbzrle_cache_miss_prev = 0; 600 iterations_prev = 0; 601 } 602 603 /* Returns a summary bitmap of the page sizes of all RAMBlocks; 604 * for VMs with just normal pages this is equivalent to the 605 * host page size. If it's got some huge pages then it's the OR 606 * of all the different page sizes. 607 */ 608 uint64_t ram_pagesize_summary(void) 609 { 610 RAMBlock *block; 611 uint64_t summary = 0; 612 613 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 614 summary |= block->page_size; 615 } 616 617 return summary; 618 } 619 620 static void migration_bitmap_sync(void) 621 { 622 RAMBlock *block; 623 uint64_t num_dirty_pages_init = migration_dirty_pages; 624 MigrationState *s = migrate_get_current(); 625 int64_t end_time; 626 int64_t bytes_xfer_now; 627 628 bitmap_sync_count++; 629 630 if (!bytes_xfer_prev) { 631 bytes_xfer_prev = ram_bytes_transferred(); 632 } 633 634 if (!start_time) { 635 start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 636 } 637 638 trace_migration_bitmap_sync_start(); 639 memory_global_dirty_log_sync(); 640 641 qemu_mutex_lock(&migration_bitmap_mutex); 642 rcu_read_lock(); 643 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 644 migration_bitmap_sync_range(block->offset, block->used_length); 645 } 646 rcu_read_unlock(); 647 qemu_mutex_unlock(&migration_bitmap_mutex); 648 649 trace_migration_bitmap_sync_end(migration_dirty_pages 650 - num_dirty_pages_init); 651 num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; 652 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 653 654 /* more than 1 second = 1000 millisecons */ 655 if (end_time > start_time + 1000) { 656 if (migrate_auto_converge()) { 657 /* The following detection logic can be refined later. For now: 658 Check to see if the dirtied bytes is 50% more than the approx. 
659 amount of bytes that just got transferred since the last time we 660 were in this routine. If that happens twice, start or increase 661 throttling */ 662 bytes_xfer_now = ram_bytes_transferred(); 663 664 if (s->dirty_pages_rate && 665 (num_dirty_pages_period * TARGET_PAGE_SIZE > 666 (bytes_xfer_now - bytes_xfer_prev)/2) && 667 (dirty_rate_high_cnt++ >= 2)) { 668 trace_migration_throttle(); 669 dirty_rate_high_cnt = 0; 670 mig_throttle_guest_down(); 671 } 672 bytes_xfer_prev = bytes_xfer_now; 673 } 674 675 if (migrate_use_xbzrle()) { 676 if (iterations_prev != acct_info.iterations) { 677 acct_info.xbzrle_cache_miss_rate = 678 (double)(acct_info.xbzrle_cache_miss - 679 xbzrle_cache_miss_prev) / 680 (acct_info.iterations - iterations_prev); 681 } 682 iterations_prev = acct_info.iterations; 683 xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss; 684 } 685 s->dirty_pages_rate = num_dirty_pages_period * 1000 686 / (end_time - start_time); 687 s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE; 688 start_time = end_time; 689 num_dirty_pages_period = 0; 690 } 691 s->dirty_sync_count = bitmap_sync_count; 692 if (migrate_use_events()) { 693 qapi_event_send_migration_pass(bitmap_sync_count, NULL); 694 } 695 } 696 697 /** 698 * save_zero_page: Send the zero page to the stream 699 * 700 * Returns: Number of pages written. 701 * 702 * @f: QEMUFile where to send the data 703 * @block: block that contains the page we want to send 704 * @offset: offset inside the block for the page 705 * @p: pointer to the page 706 * @bytes_transferred: increase it with the number of transferred bytes 707 */ 708 static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, 709 uint8_t *p, uint64_t *bytes_transferred) 710 { 711 int pages = -1; 712 713 if (is_zero_range(p, TARGET_PAGE_SIZE)) { 714 acct_info.dup_pages++; 715 *bytes_transferred += save_page_header(f, block, 716 offset | RAM_SAVE_FLAG_COMPRESS); 717 qemu_put_byte(f, 0); 718 *bytes_transferred += 1; 719 pages = 1; 720 } 721 722 return pages; 723 } 724 725 static void ram_release_pages(MigrationState *ms, const char *block_name, 726 uint64_t offset, int pages) 727 { 728 if (!migrate_release_ram() || !migration_in_postcopy(ms)) { 729 return; 730 } 731 732 ram_discard_range(NULL, block_name, offset, pages << TARGET_PAGE_BITS); 733 } 734 735 /** 736 * ram_save_page: Send the given page to the stream 737 * 738 * Returns: Number of pages written. 739 * < 0 - error 740 * >=0 - Number of pages written - this might legally be 0 741 * if xbzrle noticed the page was the same. 742 * 743 * @ms: The current migration state. 
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(MigrationState *ms, QEMUFile *f, PageSearchStatus *pss,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    uint8_t *p;
    int ret;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = pss->offset;

    p = block->host + offset;

    /* When in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
            ram_release_pages(ms, block->idstr, pss->offset, pages);
        } else if (!ram_bulk_stage &&
                   !migration_in_postcopy(ms) && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy(ms));
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
                                ram_addr_t offset)
{
    int bytes_sent, blen;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(f, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    if (blen < 0) {
        bytes_sent = 0;
        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
        error_report("compressed data failed!");
    } else {
        bytes_sent += blen;
        ram_release_pages(migrate_get_current(), block->idstr,
                          offset & TARGET_PAGE_MASK, 1);
    }

    return bytes_sent;
}

static uint64_t bytes_transferred;

static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();

    qemu_mutex_lock(&comp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!comp_param[idx].done) {
qemu_cond_wait(&comp_done_cond, &comp_done_lock); 868 } 869 } 870 qemu_mutex_unlock(&comp_done_lock); 871 872 for (idx = 0; idx < thread_count; idx++) { 873 qemu_mutex_lock(&comp_param[idx].mutex); 874 if (!comp_param[idx].quit) { 875 len = qemu_put_qemu_file(f, comp_param[idx].file); 876 bytes_transferred += len; 877 } 878 qemu_mutex_unlock(&comp_param[idx].mutex); 879 } 880 } 881 882 static inline void set_compress_params(CompressParam *param, RAMBlock *block, 883 ram_addr_t offset) 884 { 885 param->block = block; 886 param->offset = offset; 887 } 888 889 static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, 890 ram_addr_t offset, 891 uint64_t *bytes_transferred) 892 { 893 int idx, thread_count, bytes_xmit = -1, pages = -1; 894 895 thread_count = migrate_compress_threads(); 896 qemu_mutex_lock(&comp_done_lock); 897 while (true) { 898 for (idx = 0; idx < thread_count; idx++) { 899 if (comp_param[idx].done) { 900 comp_param[idx].done = false; 901 bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file); 902 qemu_mutex_lock(&comp_param[idx].mutex); 903 set_compress_params(&comp_param[idx], block, offset); 904 qemu_cond_signal(&comp_param[idx].cond); 905 qemu_mutex_unlock(&comp_param[idx].mutex); 906 pages = 1; 907 acct_info.norm_pages++; 908 *bytes_transferred += bytes_xmit; 909 break; 910 } 911 } 912 if (pages > 0) { 913 break; 914 } else { 915 qemu_cond_wait(&comp_done_cond, &comp_done_lock); 916 } 917 } 918 qemu_mutex_unlock(&comp_done_lock); 919 920 return pages; 921 } 922 923 /** 924 * ram_save_compressed_page: compress the given page and send it to the stream 925 * 926 * Returns: Number of pages written. 927 * 928 * @ms: The current migration state. 929 * @f: QEMUFile where to send the data 930 * @block: block that contains the page we want to send 931 * @offset: offset inside the block for the page 932 * @last_stage: if we are at the completion stage 933 * @bytes_transferred: increase it with the number of transferred bytes 934 */ 935 static int ram_save_compressed_page(MigrationState *ms, QEMUFile *f, 936 PageSearchStatus *pss, bool last_stage, 937 uint64_t *bytes_transferred) 938 { 939 int pages = -1; 940 uint64_t bytes_xmit = 0; 941 uint8_t *p; 942 int ret, blen; 943 RAMBlock *block = pss->block; 944 ram_addr_t offset = pss->offset; 945 946 p = block->host + offset; 947 948 ret = ram_control_save_page(f, block->offset, 949 offset, TARGET_PAGE_SIZE, &bytes_xmit); 950 if (bytes_xmit) { 951 *bytes_transferred += bytes_xmit; 952 pages = 1; 953 } 954 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { 955 if (ret != RAM_SAVE_CONTROL_DELAYED) { 956 if (bytes_xmit > 0) { 957 acct_info.norm_pages++; 958 } else if (bytes_xmit == 0) { 959 acct_info.dup_pages++; 960 } 961 } 962 } else { 963 /* When starting the process of a new block, the first page of 964 * the block should be sent out before other pages in the same 965 * block, and all the pages in last block should have been sent 966 * out, keeping this order is important, because the 'cont' flag 967 * is used to avoid resending the block name. 
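         * (The 'cont' flag is RAM_SAVE_FLAG_CONTINUE; see save_page_header(),
         * which only emits the block idstr when that flag is clear.)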
968 */ 969 if (block != last_sent_block) { 970 flush_compressed_data(f); 971 pages = save_zero_page(f, block, offset, p, bytes_transferred); 972 if (pages == -1) { 973 /* Make sure the first page is sent out before other pages */ 974 bytes_xmit = save_page_header(f, block, offset | 975 RAM_SAVE_FLAG_COMPRESS_PAGE); 976 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE, 977 migrate_compress_level()); 978 if (blen > 0) { 979 *bytes_transferred += bytes_xmit + blen; 980 acct_info.norm_pages++; 981 pages = 1; 982 } else { 983 qemu_file_set_error(f, blen); 984 error_report("compressed data failed!"); 985 } 986 } 987 if (pages > 0) { 988 ram_release_pages(ms, block->idstr, pss->offset, pages); 989 } 990 } else { 991 offset |= RAM_SAVE_FLAG_CONTINUE; 992 pages = save_zero_page(f, block, offset, p, bytes_transferred); 993 if (pages == -1) { 994 pages = compress_page_with_multi_thread(f, block, offset, 995 bytes_transferred); 996 } else { 997 ram_release_pages(ms, block->idstr, pss->offset, pages); 998 } 999 } 1000 } 1001 1002 return pages; 1003 } 1004 1005 /* 1006 * Find the next dirty page and update any state associated with 1007 * the search process. 1008 * 1009 * Returns: True if a page is found 1010 * 1011 * @f: Current migration stream. 1012 * @pss: Data about the state of the current dirty page scan. 1013 * @*again: Set to false if the search has scanned the whole of RAM 1014 * *ram_addr_abs: Pointer into which to store the address of the dirty page 1015 * within the global ram_addr space 1016 */ 1017 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, 1018 bool *again, ram_addr_t *ram_addr_abs) 1019 { 1020 pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset, 1021 ram_addr_abs); 1022 if (pss->complete_round && pss->block == last_seen_block && 1023 pss->offset >= last_offset) { 1024 /* 1025 * We've been once around the RAM and haven't found anything. 1026 * Give up. 1027 */ 1028 *again = false; 1029 return false; 1030 } 1031 if (pss->offset >= pss->block->used_length) { 1032 /* Didn't find anything in this RAM Block */ 1033 pss->offset = 0; 1034 pss->block = QLIST_NEXT_RCU(pss->block, next); 1035 if (!pss->block) { 1036 /* Hit the end of the list */ 1037 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1038 /* Flag that we've looped */ 1039 pss->complete_round = true; 1040 ram_bulk_stage = false; 1041 if (migrate_use_xbzrle()) { 1042 /* If xbzrle is on, stop using the data compression at this 1043 * point. In theory, xbzrle can do better than compression. 1044 */ 1045 flush_compressed_data(f); 1046 compression_switch = false; 1047 } 1048 } 1049 /* Didn't find anything this time, but try again on the new block */ 1050 *again = true; 1051 return false; 1052 } else { 1053 /* Can go around again, but... 
*/ 1054 *again = true; 1055 /* We've found something so probably don't need to */ 1056 return true; 1057 } 1058 } 1059 1060 /* 1061 * Helper for 'get_queued_page' - gets a page off the queue 1062 * ms: MigrationState in 1063 * *offset: Used to return the offset within the RAMBlock 1064 * ram_addr_abs: global offset in the dirty/sent bitmaps 1065 * 1066 * Returns: block (or NULL if none available) 1067 */ 1068 static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, 1069 ram_addr_t *ram_addr_abs) 1070 { 1071 RAMBlock *block = NULL; 1072 1073 qemu_mutex_lock(&ms->src_page_req_mutex); 1074 if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) { 1075 struct MigrationSrcPageRequest *entry = 1076 QSIMPLEQ_FIRST(&ms->src_page_requests); 1077 block = entry->rb; 1078 *offset = entry->offset; 1079 *ram_addr_abs = (entry->offset + entry->rb->offset) & 1080 TARGET_PAGE_MASK; 1081 1082 if (entry->len > TARGET_PAGE_SIZE) { 1083 entry->len -= TARGET_PAGE_SIZE; 1084 entry->offset += TARGET_PAGE_SIZE; 1085 } else { 1086 memory_region_unref(block->mr); 1087 QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); 1088 g_free(entry); 1089 } 1090 } 1091 qemu_mutex_unlock(&ms->src_page_req_mutex); 1092 1093 return block; 1094 } 1095 1096 /* 1097 * Unqueue a page from the queue fed by postcopy page requests; skips pages 1098 * that are already sent (!dirty) 1099 * 1100 * ms: MigrationState in 1101 * pss: PageSearchStatus structure updated with found block/offset 1102 * ram_addr_abs: global offset in the dirty/sent bitmaps 1103 * 1104 * Returns: true if a queued page is found 1105 */ 1106 static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, 1107 ram_addr_t *ram_addr_abs) 1108 { 1109 RAMBlock *block; 1110 ram_addr_t offset; 1111 bool dirty; 1112 1113 do { 1114 block = unqueue_page(ms, &offset, ram_addr_abs); 1115 /* 1116 * We're sending this page, and since it's postcopy nothing else 1117 * will dirty it, and we must make sure it doesn't get sent again 1118 * even if this queue request was received after the background 1119 * search already sent it. 1120 */ 1121 if (block) { 1122 unsigned long *bitmap; 1123 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1124 dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap); 1125 if (!dirty) { 1126 trace_get_queued_page_not_dirty( 1127 block->idstr, (uint64_t)offset, 1128 (uint64_t)*ram_addr_abs, 1129 test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, 1130 atomic_rcu_read(&migration_bitmap_rcu)->unsentmap)); 1131 } else { 1132 trace_get_queued_page(block->idstr, 1133 (uint64_t)offset, 1134 (uint64_t)*ram_addr_abs); 1135 } 1136 } 1137 1138 } while (block && !dirty); 1139 1140 if (block) { 1141 /* 1142 * As soon as we start servicing pages out of order, then we have 1143 * to kill the bulk stage, since the bulk stage assumes 1144 * in (migration_bitmap_find_and_reset_dirty) that every page is 1145 * dirty, that's no longer true. 1146 */ 1147 ram_bulk_stage = false; 1148 1149 /* 1150 * We want the background search to continue from the queued page 1151 * since the guest is likely to want other pages near to the page 1152 * it just requested. 1153 */ 1154 pss->block = block; 1155 pss->offset = offset; 1156 } 1157 1158 return !!block; 1159 } 1160 1161 /** 1162 * flush_page_queue: Flush any remaining pages in the ram request queue 1163 * it should be empty at the end anyway, but in error cases there may be 1164 * some left. 
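 * Each queued entry holds a reference on its RAMBlock's MemoryRegion
 * (taken in ram_save_queue_pages()), so memory_region_unref() must be
 * called here before the entry is freed.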
 *
 * ms: MigrationState
 */
void flush_page_queue(MigrationState *ms)
{
    struct MigrationSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    rcu_read_lock();
    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
        g_free(mspr);
    }
    rcu_read_unlock();
}

/**
 * Queue the pages for transmission, e.g. a request from postcopy destination
 * ms: MigrationState in which the queue is held
 * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
 * start: Offset from the start of the RAMBlock
 * len: Length (in bytes) to send
 * Return: 0 on success
 */
int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                         ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;

    ms->postcopy_requests++;
    rcu_read_lock();
    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = ms->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            goto err;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            goto err;
        }
        ms->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        goto err;
    }

    struct MigrationSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct MigrationSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&ms->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
    qemu_mutex_unlock(&ms->src_page_req_mutex);
    rcu_read_unlock();

    return 0;

err:
    rcu_read_unlock();
    return -1;
}

/**
 * ram_save_target_page: Save one target page
 *
 * @ms: The current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send (pss->block and pss->offset)
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 *
 * Returns: Number of pages written.
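 *          A return of 0 means the page was either no longer dirty or was
 *          found identical by xbzrle, so nothing was put on the wire.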
 */
static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
                                PageSearchStatus *pss,
                                bool last_stage,
                                uint64_t *bytes_transferred,
                                ram_addr_t dirty_ram_abs)
{
    int res = 0;

    /* Check if the page is dirty and, if so, send it */
    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
        unsigned long *unsentmap;
        if (compression_switch && migrate_use_compression()) {
            res = ram_save_compressed_page(ms, f, pss,
                                           last_stage,
                                           bytes_transferred);
        } else {
            res = ram_save_page(ms, f, pss, last_stage,
                                bytes_transferred);
        }

        if (res < 0) {
            return res;
        }
        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
        if (unsentmap) {
            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
        }
        /* Only update last_sent_block if a block was actually sent; xbzrle
         * might have decided the page was identical so didn't bother writing
         * to the stream.
         */
        if (res > 0) {
            last_sent_block = pss->block;
        }
    }

    return res;
}

/**
 * ram_save_host_page: Starting at *offset send pages up to the end
 *                     of the current host page.  It's valid for the initial
 *                     offset to point into the middle of a host page
 *                     in which case the remainder of the hostpage is sent.
 *                     Only dirty target pages are sent.
 *                     Note that the host page size may be a huge page for this
 *                     block.
 *
 * Returns: Number of pages written.
 *
 * @ms: The current migration state
 * @f: QEMUFile where to send the data
 * @pss: data about the page we want to send; pss->offset is updated to the
 *       last target page sent
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
 */
static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
                              PageSearchStatus *pss,
                              bool last_stage,
                              uint64_t *bytes_transferred,
                              ram_addr_t dirty_ram_abs)
{
    int tmppages, pages = 0;
    size_t pagesize = qemu_ram_pagesize(pss->block);

    do {
        tmppages = ram_save_target_page(ms, f, pss, last_stage,
                                        bytes_transferred, dirty_ram_abs);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->offset += TARGET_PAGE_SIZE;
        dirty_ram_abs += TARGET_PAGE_SIZE;
    } while (pss->offset & (pagesize - 1));

    /* The offset we leave with is the last one we looked at */
    pss->offset -= TARGET_PAGE_SIZE;
    return pages;
}

/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns: The number of pages written
 *          0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
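 *
 * Queued postcopy requests (get_queued_page()) are serviced before the
 * linear scan of the dirty bitmap (find_dirty_block()).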
1358 */ 1359 1360 static int ram_find_and_save_block(QEMUFile *f, bool last_stage, 1361 uint64_t *bytes_transferred) 1362 { 1363 PageSearchStatus pss; 1364 MigrationState *ms = migrate_get_current(); 1365 int pages = 0; 1366 bool again, found; 1367 ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in 1368 ram_addr_t space */ 1369 1370 /* No dirty page as there is zero RAM */ 1371 if (!ram_bytes_total()) { 1372 return pages; 1373 } 1374 1375 pss.block = last_seen_block; 1376 pss.offset = last_offset; 1377 pss.complete_round = false; 1378 1379 if (!pss.block) { 1380 pss.block = QLIST_FIRST_RCU(&ram_list.blocks); 1381 } 1382 1383 do { 1384 again = true; 1385 found = get_queued_page(ms, &pss, &dirty_ram_abs); 1386 1387 if (!found) { 1388 /* priority queue empty, so just search for something dirty */ 1389 found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); 1390 } 1391 1392 if (found) { 1393 pages = ram_save_host_page(ms, f, &pss, 1394 last_stage, bytes_transferred, 1395 dirty_ram_abs); 1396 } 1397 } while (!pages && again); 1398 1399 last_seen_block = pss.block; 1400 last_offset = pss.offset; 1401 1402 return pages; 1403 } 1404 1405 void acct_update_position(QEMUFile *f, size_t size, bool zero) 1406 { 1407 uint64_t pages = size / TARGET_PAGE_SIZE; 1408 if (zero) { 1409 acct_info.dup_pages += pages; 1410 } else { 1411 acct_info.norm_pages += pages; 1412 bytes_transferred += size; 1413 qemu_update_position(f, size); 1414 } 1415 } 1416 1417 static ram_addr_t ram_save_remaining(void) 1418 { 1419 return migration_dirty_pages; 1420 } 1421 1422 uint64_t ram_bytes_remaining(void) 1423 { 1424 return ram_save_remaining() * TARGET_PAGE_SIZE; 1425 } 1426 1427 uint64_t ram_bytes_transferred(void) 1428 { 1429 return bytes_transferred; 1430 } 1431 1432 uint64_t ram_bytes_total(void) 1433 { 1434 RAMBlock *block; 1435 uint64_t total = 0; 1436 1437 rcu_read_lock(); 1438 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) 1439 total += block->used_length; 1440 rcu_read_unlock(); 1441 return total; 1442 } 1443 1444 void free_xbzrle_decoded_buf(void) 1445 { 1446 g_free(xbzrle_decoded_buf); 1447 xbzrle_decoded_buf = NULL; 1448 } 1449 1450 static void migration_bitmap_free(struct BitmapRcu *bmap) 1451 { 1452 g_free(bmap->bmap); 1453 g_free(bmap->unsentmap); 1454 g_free(bmap); 1455 } 1456 1457 static void ram_migration_cleanup(void *opaque) 1458 { 1459 /* caller have hold iothread lock or is in a bh, so there is 1460 * no writing race against this migration_bitmap 1461 */ 1462 struct BitmapRcu *bitmap = migration_bitmap_rcu; 1463 atomic_rcu_set(&migration_bitmap_rcu, NULL); 1464 if (bitmap) { 1465 memory_global_dirty_log_stop(); 1466 call_rcu(bitmap, migration_bitmap_free, rcu); 1467 } 1468 1469 XBZRLE_cache_lock(); 1470 if (XBZRLE.cache) { 1471 cache_fini(XBZRLE.cache); 1472 g_free(XBZRLE.encoded_buf); 1473 g_free(XBZRLE.current_buf); 1474 g_free(ZERO_TARGET_PAGE); 1475 XBZRLE.cache = NULL; 1476 XBZRLE.encoded_buf = NULL; 1477 XBZRLE.current_buf = NULL; 1478 } 1479 XBZRLE_cache_unlock(); 1480 } 1481 1482 static void reset_ram_globals(void) 1483 { 1484 last_seen_block = NULL; 1485 last_sent_block = NULL; 1486 last_offset = 0; 1487 last_version = ram_list.version; 1488 ram_bulk_stage = true; 1489 } 1490 1491 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 1492 1493 void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) 1494 { 1495 /* called in qemu main thread, so there is 1496 * no writing race against this migration_bitmap 1497 */ 1498 if (migration_bitmap_rcu) { 1499 struct 
BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
        bitmap = g_new(struct BitmapRcu, 1);
        bitmap->bmap = bitmap_new(new);

        /* Prevent migration_bitmap_sync_range() from setting bits in the
         * old bitmap while we copy it.  Having bits cleared concurrently
         * is safe for migration; having them set concurrently is not.
         */
        qemu_mutex_lock(&migration_bitmap_mutex);
        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
        bitmap_set(bitmap->bmap, old, new - old);

        /* We don't have a way to safely extend the unsentmap
         * with RCU; so mark it as missing, entry to postcopy
         * will fail.
         */
        bitmap->unsentmap = NULL;

        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
        qemu_mutex_unlock(&migration_bitmap_mutex);
        migration_dirty_pages += new - old;
        call_rcu(old_bitmap, migration_bitmap_free, rcu);
    }
}

/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
{
    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    if (!todump) {
        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
    }

    for (cur = 0; cur < ram_pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > ram_pages) {
            linelen = ram_pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}

/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;
    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        unsigned long first = block->offset >> TARGET_PAGE_BITS;
        unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
        unsigned long run_start = find_next_zero_bit(bitmap, range, first);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
                              (run_end - run_start) << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/*
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 * Note: At this point the 'unsentmap' is the processed bitmap combined
 *       with the dirtymap; so a '1' means it's either dirty or unsent.
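 *       (ram_postcopy_send_discard_bitmap() builds that combined bitmap by
 *       OR-ing the dirty bitmap into the unsentmap before calling down here.)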
1589 * start,length: Indexes into the bitmap for the first bit 1590 * representing the named block and length in target-pages 1591 */ 1592 static int postcopy_send_discard_bm_ram(MigrationState *ms, 1593 PostcopyDiscardState *pds, 1594 unsigned long start, 1595 unsigned long length) 1596 { 1597 unsigned long end = start + length; /* one after the end */ 1598 unsigned long current; 1599 unsigned long *unsentmap; 1600 1601 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1602 for (current = start; current < end; ) { 1603 unsigned long one = find_next_bit(unsentmap, end, current); 1604 1605 if (one <= end) { 1606 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); 1607 unsigned long discard_length; 1608 1609 if (zero >= end) { 1610 discard_length = end - one; 1611 } else { 1612 discard_length = zero - one; 1613 } 1614 if (discard_length) { 1615 postcopy_discard_send_range(ms, pds, one, discard_length); 1616 } 1617 current = one + discard_length; 1618 } else { 1619 current = one; 1620 } 1621 } 1622 1623 return 0; 1624 } 1625 1626 /* 1627 * Utility for the outgoing postcopy code. 1628 * Calls postcopy_send_discard_bm_ram for each RAMBlock 1629 * passing it bitmap indexes and name. 1630 * Returns: 0 on success 1631 * (qemu_ram_foreach_block ends up passing unscaled lengths 1632 * which would mean postcopy code would have to deal with target page) 1633 */ 1634 static int postcopy_each_ram_send_discard(MigrationState *ms) 1635 { 1636 struct RAMBlock *block; 1637 int ret; 1638 1639 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1640 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1641 PostcopyDiscardState *pds = postcopy_discard_send_init(ms, 1642 first, 1643 block->idstr); 1644 1645 /* 1646 * Postcopy sends chunks of bitmap over the wire, but it 1647 * just needs indexes at this point, avoids it having 1648 * target page specific code. 1649 */ 1650 ret = postcopy_send_discard_bm_ram(ms, pds, first, 1651 block->used_length >> TARGET_PAGE_BITS); 1652 postcopy_discard_send_finish(ms, pds); 1653 if (ret) { 1654 return ret; 1655 } 1656 } 1657 1658 return 0; 1659 } 1660 1661 /* 1662 * Helper for postcopy_chunk_hostpages; it's called twice to cleanup 1663 * the two bitmaps, that are similar, but one is inverted. 
1664 * 1665 * We search for runs of target-pages that don't start or end on a 1666 * host page boundary; 1667 * unsent_pass=true: Cleans up partially unsent host pages by searching 1668 * the unsentmap 1669 * unsent_pass=false: Cleans up partially dirty host pages by searching 1670 * the main migration bitmap 1671 * 1672 */ 1673 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, 1674 RAMBlock *block, 1675 PostcopyDiscardState *pds) 1676 { 1677 unsigned long *bitmap; 1678 unsigned long *unsentmap; 1679 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 1680 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1681 unsigned long len = block->used_length >> TARGET_PAGE_BITS; 1682 unsigned long last = first + (len - 1); 1683 unsigned long run_start; 1684 1685 if (block->page_size == TARGET_PAGE_SIZE) { 1686 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 1687 return; 1688 } 1689 1690 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1691 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1692 1693 if (unsent_pass) { 1694 /* Find a sent page */ 1695 run_start = find_next_zero_bit(unsentmap, last + 1, first); 1696 } else { 1697 /* Find a dirty page */ 1698 run_start = find_next_bit(bitmap, last + 1, first); 1699 } 1700 1701 while (run_start <= last) { 1702 bool do_fixup = false; 1703 unsigned long fixup_start_addr; 1704 unsigned long host_offset; 1705 1706 /* 1707 * If the start of this run of pages is in the middle of a host 1708 * page, then we need to fixup this host page. 1709 */ 1710 host_offset = run_start % host_ratio; 1711 if (host_offset) { 1712 do_fixup = true; 1713 run_start -= host_offset; 1714 fixup_start_addr = run_start; 1715 /* For the next pass */ 1716 run_start = run_start + host_ratio; 1717 } else { 1718 /* Find the end of this run */ 1719 unsigned long run_end; 1720 if (unsent_pass) { 1721 run_end = find_next_bit(unsentmap, last + 1, run_start + 1); 1722 } else { 1723 run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1); 1724 } 1725 /* 1726 * If the end isn't at the start of a host page, then the 1727 * run doesn't finish at the end of a host page 1728 * and we need to discard. 1729 */ 1730 host_offset = run_end % host_ratio; 1731 if (host_offset) { 1732 do_fixup = true; 1733 fixup_start_addr = run_end - host_offset; 1734 /* 1735 * This host page has gone, the next loop iteration starts 1736 * from after the fixup 1737 */ 1738 run_start = fixup_start_addr + host_ratio; 1739 } else { 1740 /* 1741 * No discards on this iteration, next loop starts from 1742 * next sent/dirty page 1743 */ 1744 run_start = run_end + 1; 1745 } 1746 } 1747 1748 if (do_fixup) { 1749 unsigned long page; 1750 1751 /* Tell the destination to discard this page */ 1752 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { 1753 /* For the unsent_pass we: 1754 * discard partially sent pages 1755 * For the !unsent_pass (dirty) we: 1756 * discard partially dirty pages that were sent 1757 * (any partially sent pages were already discarded 1758 * by the previous unsent_pass) 1759 */ 1760 postcopy_discard_send_range(ms, pds, fixup_start_addr, 1761 host_ratio); 1762 } 1763 1764 /* Clean up the bitmap */ 1765 for (page = fixup_start_addr; 1766 page < fixup_start_addr + host_ratio; page++) { 1767 /* All pages in this host page are now not sent */ 1768 set_bit(page, unsentmap); 1769 1770 /* 1771 * Remark them as dirty, updating the count for any pages 1772 * that weren't previously dirty. 
1773 */ 1774 migration_dirty_pages += !test_and_set_bit(page, bitmap); 1775 } 1776 } 1777 1778 if (unsent_pass) { 1779 /* Find the next sent page for the next iteration */ 1780 run_start = find_next_zero_bit(unsentmap, last + 1, 1781 run_start); 1782 } else { 1783 /* Find the next dirty page for the next iteration */ 1784 run_start = find_next_bit(bitmap, last + 1, run_start); 1785 } 1786 } 1787 } 1788 1789 /* 1790 * Utility for the outgoing postcopy code. 1791 * 1792 * Discard any partially sent host-page size chunks, mark any partially 1793 * dirty host-page size chunks as all dirty. In this case the host-page 1794 * is the host-page for the particular RAMBlock, i.e. it might be a huge page 1795 * 1796 * Returns: 0 on success 1797 */ 1798 static int postcopy_chunk_hostpages(MigrationState *ms) 1799 { 1800 struct RAMBlock *block; 1801 1802 /* Easiest way to make sure we don't resume in the middle of a host-page */ 1803 last_seen_block = NULL; 1804 last_sent_block = NULL; 1805 last_offset = 0; 1806 1807 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { 1808 unsigned long first = block->offset >> TARGET_PAGE_BITS; 1809 1810 PostcopyDiscardState *pds = 1811 postcopy_discard_send_init(ms, first, block->idstr); 1812 1813 /* First pass: Discard all partially sent host pages */ 1814 postcopy_chunk_hostpages_pass(ms, true, block, pds); 1815 /* 1816 * Second pass: Ensure that all partially dirty host pages are made 1817 * fully dirty. 1818 */ 1819 postcopy_chunk_hostpages_pass(ms, false, block, pds); 1820 1821 postcopy_discard_send_finish(ms, pds); 1822 } /* ram_list loop */ 1823 1824 return 0; 1825 } 1826 1827 /* 1828 * Transmit the set of pages to be discarded after precopy to the target 1829 * these are pages that: 1830 * a) Have been previously transmitted but are now dirty again 1831 * b) Pages that have never been transmitted, this ensures that 1832 * any pages on the destination that have been mapped by background 1833 * tasks get discarded (transparent huge pages is the specific concern) 1834 * Hopefully this is pretty sparse 1835 */ 1836 int ram_postcopy_send_discard_bitmap(MigrationState *ms) 1837 { 1838 int ret; 1839 unsigned long *bitmap, *unsentmap; 1840 1841 rcu_read_lock(); 1842 1843 /* This should be our last sync, the src is now paused */ 1844 migration_bitmap_sync(); 1845 1846 unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; 1847 if (!unsentmap) { 1848 /* We don't have a safe way to resize the sentmap, so 1849 * if the bitmap was resized it will be NULL at this 1850 * point. 1851 */ 1852 error_report("migration ram resized during precopy phase"); 1853 rcu_read_unlock(); 1854 return -EINVAL; 1855 } 1856 1857 /* Deal with TPS != HPS and huge pages */ 1858 ret = postcopy_chunk_hostpages(ms); 1859 if (ret) { 1860 rcu_read_unlock(); 1861 return ret; 1862 } 1863 1864 /* 1865 * Update the unsentmap to be unsentmap = unsentmap | dirty 1866 */ 1867 bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; 1868 bitmap_or(unsentmap, unsentmap, bitmap, 1869 last_ram_offset() >> TARGET_PAGE_BITS); 1870 1871 1872 trace_ram_postcopy_send_discard_bitmap(); 1873 #ifdef DEBUG_POSTCOPY 1874 ram_debug_dump_bitmap(unsentmap, true); 1875 #endif 1876 1877 ret = postcopy_each_ram_send_discard(ms); 1878 rcu_read_unlock(); 1879 1880 return ret; 1881 } 1882 1883 /* 1884 * At the start of the postcopy phase of migration, any now-dirty 1885 * precopied pages are discarded. 1886 * 1887 * start, length describe a byte address range within the RAMBlock 1888 * 1889 * Returns 0 on success. 
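 * (and a negative value on failure; the MigrationIncomingState argument is
 * not used here, the block is looked up by name instead.)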
/*
 * At the start of the postcopy phase of migration, any now-dirty
 * precopied pages are discarded.
 *
 * start, length describe a byte address range within the RAMBlock
 *
 * Returns 0 on success.
 */
int ram_discard_range(MigrationIncomingState *mis,
                      const char *block_name,
                      uint64_t start, size_t length)
{
    int ret = -1;

    trace_ram_discard_range(block_name, start, length);

    rcu_read_lock();
    RAMBlock *rb = qemu_ram_block_by_name(block_name);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'",
                     block_name);
        goto err;
    }

    ret = ram_block_discard_range(rb, start, length);

err:
    rcu_read_unlock();

    return ret;
}

static int ram_save_init_globals(void)
{
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();
    qemu_mutex_init(&migration_bitmap_mutex);

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();

    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
        migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
        bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);

        if (migrate_postcopy_ram()) {
            migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
            bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
        }
    }

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
    rcu_read_unlock();

    return 0;
}
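/*
 * Sizing note (illustrative; the RAM size and page size are example values,
 * not taken from this file): with 4GB of guest RAM and a 4KB target page,
 * ram_bitmap_pages above is 4GB >> 12 = 1M bits, i.e. roughly 128KB for the
 * migration bitmap and, when postcopy is enabled, another 128KB for the
 * unsentmap.  Both bitmaps start fully set: every page begins life as
 * dirty/unsent so the first pass sends all of RAM.
 */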
/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;

    /* migration has already set up the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_save_init_globals() < 0) {
            return -1;
        }
    }

    rcu_read_lock();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
            qemu_put_be64(f, block->page_size);
        }
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
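/*
 * Stream layout sketch (informational, reconstructed from the qemu_put_*
 * calls in ram_save_setup() above):
 *
 *   be64 : ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE
 *   for each RAMBlock:
 *       byte  : strlen(idstr)
 *       bytes : idstr (not NUL-terminated)
 *       be64  : used_length
 *       be64  : page_size   (only when postcopy is enabled and the block's
 *                            page size differs from qemu_host_page_size)
 *   be64 : RAM_SAVE_FLAG_EOS
 *
 * ram_load() parses the same record under RAM_SAVE_FLAG_MEM_SIZE.
 */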
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int done = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            done = 1;
            break;
        }
        acct_info.iterations++;

        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check once
           every few iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                trace_ram_save_iterate_big_wait(t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return done;
}
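/*
 * Timing note (informational): the (i & 63) test above samples the clock on
 * iteration 0 and then once every 64 saved pages, so the common-case loop
 * body is just one ram_find_and_save_block() call.  The MAX_WAIT comparison
 * bounds how long a single save_live_iterate call can run when the
 * bandwidth limit never triggers, and the final bytes_transferred += 8
 * accounts for the 8-byte RAM_SAVE_FLAG_EOS marker written just before it.
 */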
/* Called with iothread lock */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    if (!migration_in_postcopy(migrate_get_current())) {
        migration_bitmap_sync();
    }

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
                                        &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);

    rcu_read_unlock();

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}

static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *non_postcopiable_pending,
                             uint64_t *postcopiable_pending)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy(migrate_get_current()) &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }

    /* We can do postcopy, and all the data is postcopiable */
    *postcopiable_pending += remaining_size;
}

static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }
    loaded_data = xbzrle_decoded_buf;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
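/*
 * XBZRLE record layout sketch (informational, matching the reads in
 * load_xbzrle() above):
 *
 *   byte  : xh_flags  -- must be ENCODING_FLAG_XBZRLE
 *   be16  : xh_len    -- length of the encoded delta, <= TARGET_PAGE_SIZE
 *   bytes : xh_len bytes of XBZRLE-encoded delta, decoded on top of the
 *           previous content of the page already present at 'host'
 */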
/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
/*
 * Read a RAMBlock ID from the stream f.
 *
 * f: Stream to read from
 * flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
                                              int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;
            /* uncompress() can fail in some cases, especially when the
             * page is dirtied while it is being compressed.  That's not
             * a problem because the dirty page will be retransmitted and
             * uncompress() won't corrupt the data in other pages.
             */
            uncompress((Bytef *)des, &pagesize,
                       (const Bytef *)param->compbuf, len);

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
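/*
 * Hand-off protocol sketch (informational, derived from do_data_decompress()
 * above and decompress_data_with_multi_threads() below): the load thread
 * picks a worker whose 'done' flag is set, clears it, fills
 * compbuf/des/len under that worker's mutex and signals its condition
 * variable.  The worker decompresses without holding any lock, then sets
 * 'done' again under decomp_done_lock and signals decomp_done_cond, so the
 * load thread can hand it the next page and wait_for_decompress_done() can
 * drain all workers before the load finishes.
 */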
static void wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}

void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}

/*
 * Allocate data structures etc needed by incoming migration with postcopy-ram.
 * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;

    return postcopy_ram_incoming_init(mis, ram_pages);
}

/*
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matching_page_sizes = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = postcopy_get_tmp_page(mis);
    void *last_host = NULL;
    bool all_zero = false;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        place_needed = false;
        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target pages;
             * however, the source ensures it always sends all the components
             * of a host page in order.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            /* If all TP are zero then we can optimise the place */
            if (!((uintptr_t)host & (block->page_size - 1))) {
                all_zero = true;
            } else {
                /* not the 1st TP within the HP */
                if (host != (last_host + TARGET_PAGE_SIZE)) {
                    error_report("Non-sequential target page %p/%p",
                                 host, last_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
                            (block->page_size - 1)) == 0;
            place_source = postcopy_host_page;
        }
        last_host = host;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_COMPRESS:
            ch = qemu_get_byte(f);
            memset(page_buffer, ch, TARGET_PAGE_SIZE);
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!place_needed || !matching_page_sizes) {
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /* Avoids the qemu_file copy during postcopy, which is
                 * going to do a copy later; can only do it when we
                 * do this read in one go (matching page sizes)
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            error_report("Unknown combination of migration flags: %#x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
        }

        if (place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block->page_size);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block->page_size);
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    return ret;
}
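/*
 * Placement example (illustrative only; 2MB/4KB are sample sizes, not taken
 * from this file): for a hugetlbfs-backed block with a 2MB host page and a
 * 4KB target page, the source sends the 512 target pages of one host page
 * back to back.  Each one is copied to its offset inside
 * postcopy_host_page; only when the 512th arrives does place_needed become
 * true, and the whole 2MB buffer is handed to postcopy_place_page() (or the
 * zero-page variant) so the host page appears atomically in guest memory.
 */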
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
    /* ADVISE is earlier; it shows that the source has the postcopy capability on */
    bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();

    if (postcopy_running) {
        ret = ram_load_postcopy(f);
    }

    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_COMPRESS:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    wait_for_decompress_done();
    rcu_read_unlock();
    trace_ram_load_complete(ret, seq_iter);
    return ret;
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cleanup = ram_migration_cleanup,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}
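/*
 * Call-order summary (informational): once registered above as live-savevm
 * section "ram" (instance 0, version 4), the migration core drives the
 * handlers roughly as
 *
 *   save_live_setup -> save_live_pending / save_live_iterate (repeated)
 *                   -> save_live_complete_precopy or _postcopy
 *
 * on the source, while ram_load() consumes the same stream (version 4 only)
 * on the destination.
 */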