1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram.h" 37 #include "migration.h" 38 #include "migration-stats.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qapi-commands-migration.h" 49 #include "qapi/qmp/qerror.h" 50 #include "trace.h" 51 #include "system/ram_addr.h" 52 #include "exec/target_page.h" 53 #include "qemu/rcu_queue.h" 54 #include "migration/colo.h" 55 #include "system/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "system/runstate.h" 60 #include "rdma.h" 61 #include "options.h" 62 #include "system/dirtylimit.h" 63 #include "system/kvm.h" 64 65 #include "hw/boards.h" /* for machine_dump_guest_core() */ 66 67 #if defined(__linux__) 68 #include "qemu/userfaultfd.h" 69 #endif /* defined(__linux__) */ 70 71 /***********************************************************/ 72 /* ram save/restore */ 73 74 /* 75 * mapped-ram migration supports O_DIRECT, so we need to make sure the 76 * userspace buffer, the IO operation size and the file offset are 77 * aligned according to the underlying device's block size. The first 78 * two are already aligned to page size, but we need to add padding to 79 * the file to align the offset. We cannot read the block size 80 * dynamically because the migration file can be moved between 81 * different systems, so use 1M to cover most block sizes and to keep 82 * the file offset aligned at page size as well. 83 */ 84 #define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000 85 86 /* 87 * When doing mapped-ram migration, this is the amount we read from 88 * the pages region in the migration file at a time. 89 */ 90 #define MAPPED_RAM_LOAD_BUF_SIZE 0x100000 91 92 XBZRLECacheStats xbzrle_counters; 93 94 /* 95 * This structure locates a specific location of a guest page. 
In QEMU, 96 * it's described in a tuple of (ramblock, offset). 97 */ 98 struct PageLocation { 99 RAMBlock *block; 100 unsigned long offset; 101 }; 102 typedef struct PageLocation PageLocation; 103 104 /** 105 * PageLocationHint: describes a hint to a page location 106 * 107 * @valid: set if the hint is valid and to be consumed 108 * @location: the hint content 109 * 110 * In postcopy preempt mode, the urgent channel may provide hints to the 111 * background channel, so that QEMU source can try to migrate whatever is 112 * right after the requested urgent pages. 113 * 114 * This is based on the assumption that the VM (already running on the 115 * destination side) tends to access the memory with spatial locality. 116 * This is also the default behavior of vanilla postcopy (preempt off). 117 */ 118 struct PageLocationHint { 119 bool valid; 120 PageLocation location; 121 }; 122 typedef struct PageLocationHint PageLocationHint; 123 124 /* used by the search for pages to send */ 125 struct PageSearchStatus { 126 /* The migration channel used for a specific host page */ 127 QEMUFile *pss_channel; 128 /* Last block from where we have sent data */ 129 RAMBlock *last_sent_block; 130 /* Current block being searched */ 131 RAMBlock *block; 132 /* Current page to search from */ 133 unsigned long page; 134 /* Set once we wrap around */ 135 bool complete_round; 136 /* Whether we're sending a host page */ 137 bool host_page_sending; 138 /* The start/end of current host page. Invalid if host_page_sending==false */ 139 unsigned long host_page_start; 140 unsigned long host_page_end; 141 }; 142 typedef struct PageSearchStatus PageSearchStatus; 143 144 /* struct contains XBZRLE cache and a static page 145 used by the compression */ 146 static struct { 147 /* buffer used for XBZRLE encoding */ 148 uint8_t *encoded_buf; 149 /* buffer for storing page content */ 150 uint8_t *current_buf; 151 /* Cache for XBZRLE, protected by lock. */ 152 PageCache *cache; 153 QemuMutex lock; 154 /* it will store a page full of zeros */ 155 uint8_t *zero_target_page; 156 /* buffer used for XBZRLE decoding */ 157 uint8_t *decoded_buf; 158 } XBZRLE; 159 160 static void XBZRLE_cache_lock(void) 161 { 162 if (migrate_xbzrle()) { 163 qemu_mutex_lock(&XBZRLE.lock); 164 } 165 } 166 167 static void XBZRLE_cache_unlock(void) 168 { 169 if (migrate_xbzrle()) { 170 qemu_mutex_unlock(&XBZRLE.lock); 171 } 172 } 173 174 /** 175 * xbzrle_cache_resize: resize the xbzrle cache 176 * 177 * This function is called from migrate_params_apply in the main 178 * thread, possibly while a migration is in progress. A running 179 * migration may be using the cache and might finish during this call, 180 * hence changes to the cache are protected by XBZRLE.lock().
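 *
 * For illustration, a minimal usage sketch (not taken from an actual call
 * site; assumes MiB from qemu/units.h):
 *
 *     Error *err = NULL;
 *
 *     if (xbzrle_cache_resize(128 * MiB, &err) < 0) {
 *         error_report_err(err);
 *     }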
181 * 182 * Returns 0 for success or -1 for error 183 * 184 * @new_size: new cache size 185 * @errp: set *errp if the check failed, with reason 186 */ 187 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 188 { 189 PageCache *new_cache; 190 int64_t ret = 0; 191 192 /* Check for truncation */ 193 if (new_size != (size_t)new_size) { 194 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 195 "exceeding address space"); 196 return -1; 197 } 198 199 if (new_size == migrate_xbzrle_cache_size()) { 200 /* nothing to do */ 201 return 0; 202 } 203 204 XBZRLE_cache_lock(); 205 206 if (XBZRLE.cache != NULL) { 207 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 208 if (!new_cache) { 209 ret = -1; 210 goto out; 211 } 212 213 cache_fini(XBZRLE.cache); 214 XBZRLE.cache = new_cache; 215 } 216 out: 217 XBZRLE_cache_unlock(); 218 return ret; 219 } 220 221 static bool postcopy_preempt_active(void) 222 { 223 return migrate_postcopy_preempt() && migration_in_postcopy(); 224 } 225 226 bool migrate_ram_is_ignored(RAMBlock *block) 227 { 228 MigMode mode = migrate_mode(); 229 return !qemu_ram_is_migratable(block) || 230 mode == MIG_MODE_CPR_TRANSFER || 231 (migrate_ignore_shared() && qemu_ram_is_shared(block) 232 && qemu_ram_is_named_file(block)); 233 } 234 235 #undef RAMBLOCK_FOREACH 236 237 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 238 { 239 RAMBlock *block; 240 int ret = 0; 241 242 RCU_READ_LOCK_GUARD(); 243 244 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 245 ret = func(block, opaque); 246 if (ret) { 247 break; 248 } 249 } 250 return ret; 251 } 252 253 static void ramblock_recv_map_init(void) 254 { 255 RAMBlock *rb; 256 257 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 258 assert(!rb->receivedmap); 259 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 260 } 261 } 262 263 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 264 { 265 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 266 rb->receivedmap); 267 } 268 269 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 270 { 271 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 272 } 273 274 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 275 { 276 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 277 } 278 279 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 280 size_t nr) 281 { 282 bitmap_set_atomic(rb->receivedmap, 283 ramblock_recv_bitmap_offset(host_addr, rb), 284 nr); 285 } 286 287 void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) 288 { 289 set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 290 } 291 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 292 293 /* 294 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 295 * 296 * Returns >0 if success with sent bytes, or <0 if error. 297 */ 298 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 299 const char *block_name) 300 { 301 RAMBlock *block = qemu_ram_block_by_name(block_name); 302 unsigned long *le_bitmap, nbits; 303 uint64_t size; 304 305 if (!block) { 306 error_report("%s: invalid block name: %s", __func__, block_name); 307 return -1; 308 } 309 310 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 311 312 /* 313 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 314 * machines we may need 4 more bytes for padding (see below 315 * comment). So extend it a bit before hand. 
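 *
 * Worked example of the sizing below, assuming 4 KiB target pages: a
 * RAMBlock of 1 GiB + 4 KiB has nbits = 262145, the raw bitmap takes
 * DIV_ROUND_UP(262145, 8) = 32769 bytes, and ROUND_UP(32769, 8) pads the
 * size sent on the wire to 32776 bytes. A 32-bit host allocates the
 * bitmap in 4-byte longs, which is exactly why one extra long is added
 * to the allocation below.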
316 */ 317 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 318 319 /* 320 * Always use little endian when sending the bitmap. This is 321 * required that when source and destination VMs are not using the 322 * same endianness. (Note: big endian won't work.) 323 */ 324 bitmap_to_le(le_bitmap, block->receivedmap, nbits); 325 326 /* Size of the bitmap, in bytes */ 327 size = DIV_ROUND_UP(nbits, 8); 328 329 /* 330 * size is always aligned to 8 bytes for 64bit machines, but it 331 * may not be true for 32bit machines. We need this padding to 332 * make sure the migration can survive even between 32bit and 333 * 64bit machines. 334 */ 335 size = ROUND_UP(size, 8); 336 337 qemu_put_be64(file, size); 338 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size); 339 g_free(le_bitmap); 340 /* 341 * Mark as an end, in case the middle part is screwed up due to 342 * some "mysterious" reason. 343 */ 344 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING); 345 int ret = qemu_fflush(file); 346 if (ret) { 347 return ret; 348 } 349 350 return size + sizeof(size); 351 } 352 353 /* 354 * An outstanding page request, on the source, having been received 355 * and queued 356 */ 357 struct RAMSrcPageRequest { 358 RAMBlock *rb; 359 hwaddr offset; 360 hwaddr len; 361 362 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req; 363 }; 364 365 /* State of RAM for migration */ 366 struct RAMState { 367 /* 368 * PageSearchStatus structures for the channels when send pages. 369 * Protected by the bitmap_mutex. 370 */ 371 PageSearchStatus pss[RAM_CHANNEL_MAX]; 372 /* UFFD file descriptor, used in 'write-tracking' migration */ 373 int uffdio_fd; 374 /* total ram size in bytes */ 375 uint64_t ram_bytes_total; 376 /* Last block that we have visited searching for dirty pages */ 377 RAMBlock *last_seen_block; 378 /* Last dirty target page we have sent */ 379 ram_addr_t last_page; 380 /* last ram version we have seen */ 381 uint32_t last_version; 382 /* How many times we have dirty too many pages */ 383 int dirty_rate_high_cnt; 384 /* these variables are used for bitmap sync */ 385 /* last time we did a full bitmap_sync */ 386 int64_t time_last_bitmap_sync; 387 /* bytes transferred at start_time */ 388 uint64_t bytes_xfer_prev; 389 /* number of dirty pages since start_time */ 390 uint64_t num_dirty_pages_period; 391 /* xbzrle misses since the beginning of the period */ 392 uint64_t xbzrle_cache_miss_prev; 393 /* Amount of xbzrle pages since the beginning of the period */ 394 uint64_t xbzrle_pages_prev; 395 /* Amount of xbzrle encoded bytes since the beginning of the period */ 396 uint64_t xbzrle_bytes_prev; 397 /* Are we really using XBZRLE (e.g., after the first round). 
*/ 398 bool xbzrle_started; 399 /* Are we on the last stage of migration */ 400 bool last_stage; 401 402 /* total handled target pages at the beginning of period */ 403 uint64_t target_page_count_prev; 404 /* total handled target pages since start */ 405 uint64_t target_page_count; 406 /* number of dirty bits in the bitmap */ 407 uint64_t migration_dirty_pages; 408 /* 409 * Protects: 410 * - dirty/clear bitmap 411 * - migration_dirty_pages 412 * - pss structures 413 */ 414 QemuMutex bitmap_mutex; 415 /* The RAMBlock used in the last src_page_requests */ 416 RAMBlock *last_req_rb; 417 /* Queue of outstanding page requests from the destination */ 418 QemuMutex src_page_req_mutex; 419 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 420 421 /* 422 * This is only used when postcopy is in recovery phase, to communicate 423 * between the migration thread and the return path thread on dirty 424 * bitmap synchronizations. This field is unused in other stages of 425 * RAM migration. 426 */ 427 unsigned int postcopy_bmap_sync_requested; 428 /* 429 * Page hint during postcopy when preempt mode is on. Return path 430 * thread sets it, while background migration thread consumes it. 431 * 432 * Protected by @bitmap_mutex. 433 */ 434 PageLocationHint page_hint; 435 }; 436 typedef struct RAMState RAMState; 437 438 static RAMState *ram_state; 439 440 static NotifierWithReturnList precopy_notifier_list; 441 442 /* Whether postcopy has queued requests? */ 443 static bool postcopy_has_request(RAMState *rs) 444 { 445 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 446 } 447 448 void precopy_infrastructure_init(void) 449 { 450 notifier_with_return_list_init(&precopy_notifier_list); 451 } 452 453 void precopy_add_notifier(NotifierWithReturn *n) 454 { 455 notifier_with_return_list_add(&precopy_notifier_list, n); 456 } 457 458 void precopy_remove_notifier(NotifierWithReturn *n) 459 { 460 notifier_with_return_remove(n); 461 } 462 463 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 464 { 465 PrecopyNotifyData pnd; 466 pnd.reason = reason; 467 468 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp); 469 } 470 471 uint64_t ram_bytes_remaining(void) 472 { 473 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 474 0; 475 } 476 477 void ram_transferred_add(uint64_t bytes) 478 { 479 if (runstate_is_running()) { 480 stat64_add(&mig_stats.precopy_bytes, bytes); 481 } else if (migration_in_postcopy()) { 482 stat64_add(&mig_stats.postcopy_bytes, bytes); 483 } else { 484 stat64_add(&mig_stats.downtime_bytes, bytes); 485 } 486 } 487 488 static int ram_save_host_page_urgent(PageSearchStatus *pss); 489 490 /* NOTE: page is the PFN not real ram_addr_t. */ 491 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 492 { 493 pss->block = rb; 494 pss->page = page; 495 pss->complete_round = false; 496 } 497 498 /* 499 * Check whether two PSSs are actively sending the same page. Return true 500 * if it is, false otherwise. 
501 */ 502 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 503 { 504 return pss1->host_page_sending && pss2->host_page_sending && 505 (pss1->host_page_start == pss2->host_page_start); 506 } 507 508 /** 509 * save_page_header: write page header to wire 510 * 511 * If this is the 1st block, it also writes the block identification 512 * 513 * Returns the number of bytes written 514 * 515 * @pss: current PSS channel status 516 * @block: block that contains the page we want to send 517 * @offset: offset inside the block for the page 518 * in the lower bits, it contains flags 519 */ 520 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 521 RAMBlock *block, ram_addr_t offset) 522 { 523 size_t size, len; 524 bool same_block = (block == pss->last_sent_block); 525 526 if (same_block) { 527 offset |= RAM_SAVE_FLAG_CONTINUE; 528 } 529 qemu_put_be64(f, offset); 530 size = 8; 531 532 if (!same_block) { 533 len = strlen(block->idstr); 534 qemu_put_byte(f, len); 535 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 536 size += 1 + len; 537 pss->last_sent_block = block; 538 } 539 return size; 540 } 541 542 /** 543 * mig_throttle_guest_down: throttle down the guest 544 * 545 * Reduce amount of guest cpu execution to hopefully slow down memory 546 * writes. If guest dirty memory rate is reduced below the rate at 547 * which we can transfer pages to the destination then we should be 548 * able to complete migration. Some workloads dirty memory way too 549 * fast and will not effectively converge, even with auto-converge. 550 */ 551 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 552 uint64_t bytes_dirty_threshold) 553 { 554 uint64_t pct_initial = migrate_cpu_throttle_initial(); 555 uint64_t pct_increment = migrate_cpu_throttle_increment(); 556 bool pct_tailslow = migrate_cpu_throttle_tailslow(); 557 int pct_max = migrate_max_cpu_throttle(); 558 559 uint64_t throttle_now = cpu_throttle_get_percentage(); 560 uint64_t cpu_now, cpu_ideal, throttle_inc; 561 562 /* We have not started throttling yet. Let's start it. */ 563 if (!cpu_throttle_active()) { 564 cpu_throttle_set(pct_initial); 565 } else { 566 /* Throttling already on, just increase the rate */ 567 if (!pct_tailslow) { 568 throttle_inc = pct_increment; 569 } else { 570 /* Compute the ideal CPU percentage used by Guest, which may 571 * make the dirty rate match the dirty rate threshold. */ 572 cpu_now = 100 - throttle_now; 573 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 574 bytes_dirty_period); 575 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 576 } 577 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 578 } 579 } 580 581 void mig_throttle_counter_reset(void) 582 { 583 RAMState *rs = ram_state; 584 585 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 586 rs->num_dirty_pages_period = 0; 587 rs->bytes_xfer_prev = migration_transferred_bytes(); 588 } 589 590 /** 591 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 592 * 593 * @current_addr: address for the zero page 594 * 595 * Update the xbzrle cache to reflect a page that's been sent as all 0. 596 * The important thing is that a stale (not-yet-0'd) page be replaced 597 * by the new data. 598 * As a bonus, if the page wasn't in the cache it gets added so that 599 * when a small write is made into the 0'd page it gets XBZRLE sent. 
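 *
 * For illustration, the zero-page path below (see save_zero_page()) does
 * roughly:
 *
 *     XBZRLE_cache_lock();
 *     xbzrle_cache_zero_page(pss->block->offset + offset);
 *     XBZRLE_cache_unlock();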
600 */ 601 static void xbzrle_cache_zero_page(ram_addr_t current_addr) 602 { 603 /* We don't care if this fails to allocate a new cache page 604 * as long as it updated an old one */ 605 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 606 stat64_get(&mig_stats.dirty_sync_count)); 607 } 608 609 #define ENCODING_FLAG_XBZRLE 0x1 610 611 /** 612 * save_xbzrle_page: compress and send current page 613 * 614 * Returns: 1 means that we wrote the page 615 * 0 means that page is identical to the one already sent 616 * -1 means that xbzrle would be longer than normal 617 * 618 * @rs: current RAM state 619 * @pss: current PSS channel 620 * @current_data: pointer to the address of the page contents 621 * @current_addr: addr of the page 622 * @block: block that contains the page we want to send 623 * @offset: offset inside the block for the page 624 */ 625 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 626 uint8_t **current_data, ram_addr_t current_addr, 627 RAMBlock *block, ram_addr_t offset) 628 { 629 int encoded_len = 0, bytes_xbzrle; 630 uint8_t *prev_cached_page; 631 QEMUFile *file = pss->pss_channel; 632 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 633 634 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 635 xbzrle_counters.cache_miss++; 636 if (!rs->last_stage) { 637 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 638 generation) == -1) { 639 return -1; 640 } else { 641 /* update *current_data when the page has been 642 inserted into cache */ 643 *current_data = get_cached_data(XBZRLE.cache, current_addr); 644 } 645 } 646 return -1; 647 } 648 649 /* 650 * Reaching here means the page has hit the xbzrle cache, no matter what 651 * encoding result it is (normal encoding, overflow or skipping the page), 652 * count the page as encoded. This is used to calculate the encoding rate. 653 * 654 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 655 * 2nd page turns out to be skipped (i.e. no new bytes written to the 656 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 657 * skipped page included. In this way, the encoding rate can tell if the 658 * guest page is good for xbzrle encoding. 659 */ 660 xbzrle_counters.pages++; 661 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 662 663 /* save current buffer into memory */ 664 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 665 666 /* XBZRLE encoding (if there is no overflow) */ 667 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 668 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 669 TARGET_PAGE_SIZE); 670 671 /* 672 * Update the cache contents, so that it corresponds to the data 673 * sent, in all cases except where we skip the page. 674 */ 675 if (!rs->last_stage && encoded_len != 0) { 676 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 677 /* 678 * In the case where we couldn't compress, ensure that the caller 679 * sends the data from the cache, since the guest might have 680 * changed the RAM since we copied it. 
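 *
 * (For illustration: encoded_len == -1 below means the delta did not fit
 * into one target page, i.e. overflow; the page is then accounted as an
 * overflow and sent as a normal page from this cached copy.)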
681 */ 682 *current_data = prev_cached_page; 683 } 684 685 if (encoded_len == 0) { 686 trace_save_xbzrle_page_skipping(); 687 return 0; 688 } else if (encoded_len == -1) { 689 trace_save_xbzrle_page_overflow(); 690 xbzrle_counters.overflow++; 691 xbzrle_counters.bytes += TARGET_PAGE_SIZE; 692 return -1; 693 } 694 695 /* Send XBZRLE based compressed page */ 696 bytes_xbzrle = save_page_header(pss, pss->pss_channel, block, 697 offset | RAM_SAVE_FLAG_XBZRLE); 698 qemu_put_byte(file, ENCODING_FLAG_XBZRLE); 699 qemu_put_be16(file, encoded_len); 700 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len); 701 bytes_xbzrle += encoded_len + 1 + 2; 702 /* 703 * The xbzrle encoded bytes don't count the 8 byte header with 704 * RAM_SAVE_FLAG_CONTINUE. 705 */ 706 xbzrle_counters.bytes += bytes_xbzrle - 8; 707 ram_transferred_add(bytes_xbzrle); 708 709 return 1; 710 } 711 712 /** 713 * pss_find_next_dirty: find the next dirty page of current ramblock 714 * 715 * This function updates pss->page to point to the next dirty page index 716 * within the ramblock to migrate, or the end of ramblock when nothing 717 * found. Note that when pss->host_page_sending==true it means we're 718 * during sending a host page, so we won't look for dirty page that is 719 * outside the host page boundary. 720 * 721 * @pss: the current page search status 722 */ 723 static void pss_find_next_dirty(PageSearchStatus *pss) 724 { 725 RAMBlock *rb = pss->block; 726 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 727 unsigned long *bitmap = rb->bmap; 728 729 if (migrate_ram_is_ignored(rb)) { 730 /* Points directly to the end, so we know no dirty page */ 731 pss->page = size; 732 return; 733 } 734 735 /* 736 * If during sending a host page, only look for dirty pages within the 737 * current host page being send. 738 */ 739 if (pss->host_page_sending) { 740 assert(pss->host_page_end); 741 size = MIN(size, pss->host_page_end); 742 } 743 744 pss->page = find_next_bit(bitmap, size, pss->page); 745 } 746 747 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb, 748 unsigned long page) 749 { 750 uint8_t shift; 751 hwaddr size, start; 752 753 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) { 754 return; 755 } 756 757 shift = rb->clear_bmap_shift; 758 /* 759 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this 760 * can make things easier sometimes since then start address 761 * of the small chunk will always be 64 pages aligned so the 762 * bitmap will always be aligned to unsigned long. We should 763 * even be able to remove this restriction but I'm simply 764 * keeping it. 765 */ 766 assert(shift >= 6); 767 768 size = 1ULL << (TARGET_PAGE_BITS + shift); 769 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size); 770 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page); 771 memory_region_clear_dirty_bitmap(rb->mr, start, size); 772 } 773 774 static void 775 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb, 776 unsigned long start, 777 unsigned long npages) 778 { 779 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift; 780 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages); 781 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages); 782 783 /* 784 * Clear pages from start to start + npages - 1, so the end boundary is 785 * exclusive. 
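 *
 * For example, with clear_bmap_shift == 6 (the minimum asserted above),
 * chunk_pages == 64, so a call with start == 100 and npages == 30 rounds
 * to the chunk range [64, 192) and clears the chunks starting at pages
 * 64 and 128.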
786 */ 787 for (i = chunk_start; i < chunk_end; i += chunk_pages) { 788 migration_clear_memory_region_dirty_bitmap(rb, i); 789 } 790 } 791 792 /* 793 * colo_bitmap_find_dirty: find contiguous dirty pages from start 794 * 795 * Returns the page offset within memory region of the start of the contiguous 796 * dirty pages 797 * 798 * @rs: current RAM state 799 * @rb: RAMBlock where to search for dirty pages 800 * @start: page where we start the search 801 * @num: the number of contiguous dirty pages 802 */ 803 static inline 804 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb, 805 unsigned long start, unsigned long *num) 806 { 807 unsigned long size = rb->used_length >> TARGET_PAGE_BITS; 808 unsigned long *bitmap = rb->bmap; 809 unsigned long first, next; 810 811 *num = 0; 812 813 if (migrate_ram_is_ignored(rb)) { 814 return size; 815 } 816 817 first = find_next_bit(bitmap, size, start); 818 if (first >= size) { 819 return first; 820 } 821 next = find_next_zero_bit(bitmap, size, first + 1); 822 assert(next >= first); 823 *num = next - first; 824 return first; 825 } 826 827 static inline bool migration_bitmap_clear_dirty(RAMState *rs, 828 RAMBlock *rb, 829 unsigned long page) 830 { 831 bool ret; 832 833 /* 834 * Clear dirty bitmap if needed. This _must_ be called before we 835 * send any of the pages in the chunk because we need to make sure 836 * we can capture further page content changes when we sync dirty 837 * log the next time. So as long as we are going to send any of 838 * the pages in the chunk we clear the remote dirty bitmap for all. 839 * Clearing it earlier won't be a problem, but too late will. 840 */ 841 migration_clear_memory_region_dirty_bitmap(rb, page); 842 843 ret = test_and_clear_bit(page, rb->bmap); 844 if (ret) { 845 rs->migration_dirty_pages--; 846 } 847 848 return ret; 849 } 850 851 static void dirty_bitmap_clear_section(MemoryRegionSection *section, 852 void *opaque) 853 { 854 const hwaddr offset = section->offset_within_region; 855 const hwaddr size = int128_get64(section->size); 856 const unsigned long start = offset >> TARGET_PAGE_BITS; 857 const unsigned long npages = size >> TARGET_PAGE_BITS; 858 RAMBlock *rb = section->mr->ram_block; 859 uint64_t *cleared_bits = opaque; 860 861 /* 862 * We don't grab ram_state->bitmap_mutex because we expect to run 863 * only when starting migration or during postcopy recovery where 864 * we don't have concurrent access. 865 */ 866 if (!migration_in_postcopy() && !migrate_background_snapshot()) { 867 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages); 868 } 869 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages); 870 bitmap_clear(rb->bmap, start, npages); 871 } 872 873 /* 874 * Exclude all dirty pages from migration that fall into a discarded range as 875 * managed by a RamDiscardManager responsible for the mapped memory region of 876 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps. 877 * 878 * Discarded pages ("logically unplugged") have undefined content and must 879 * not get migrated, because even reading these pages for migration might 880 * result in undesired behavior. 881 * 882 * Returns the number of cleared bits in the RAMBlock dirty bitmap. 883 * 884 * Note: The result is only stable while migrating (precopy/postcopy).
885 */ 886 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb) 887 { 888 uint64_t cleared_bits = 0; 889 890 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) { 891 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 892 MemoryRegionSection section = { 893 .mr = rb->mr, 894 .offset_within_region = 0, 895 .size = int128_make64(qemu_ram_get_used_length(rb)), 896 }; 897 898 ram_discard_manager_replay_discarded(rdm, &section, 899 dirty_bitmap_clear_section, 900 &cleared_bits); 901 } 902 return cleared_bits; 903 } 904 905 /* 906 * Check if a host-page aligned page falls into a discarded range as managed by 907 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock. 908 * 909 * Note: The result is only stable while migrating (precopy/postcopy). 910 */ 911 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start) 912 { 913 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 914 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 915 MemoryRegionSection section = { 916 .mr = rb->mr, 917 .offset_within_region = start, 918 .size = int128_make64(qemu_ram_pagesize(rb)), 919 }; 920 921 return !ram_discard_manager_is_populated(rdm, &section); 922 } 923 return false; 924 } 925 926 /* Called with RCU critical section */ 927 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb) 928 { 929 uint64_t new_dirty_pages = 930 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length); 931 932 rs->migration_dirty_pages += new_dirty_pages; 933 rs->num_dirty_pages_period += new_dirty_pages; 934 } 935 936 /** 937 * ram_pagesize_summary: calculate all the pagesizes of a VM 938 * 939 * Returns a summary bitmap of the page sizes of all RAMBlocks 940 * 941 * For VMs with just normal pages this is equivalent to the host page 942 * size. If it's got some huge pages then it's the OR of all the 943 * different page sizes.
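 *
 * For example, a VM backed by 4 KiB pages plus one 2 MiB hugetlb
 * RAMBlock yields 0x1000 | 0x200000 == 0x201000.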
944 */ 945 uint64_t ram_pagesize_summary(void) 946 { 947 RAMBlock *block; 948 uint64_t summary = 0; 949 950 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 951 summary |= block->page_size; 952 } 953 954 return summary; 955 } 956 957 uint64_t ram_get_total_transferred_pages(void) 958 { 959 return stat64_get(&mig_stats.normal_pages) + 960 stat64_get(&mig_stats.zero_pages) + 961 xbzrle_counters.pages; 962 } 963 964 static void migration_update_rates(RAMState *rs, int64_t end_time) 965 { 966 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 967 968 /* calculate period counters */ 969 stat64_set(&mig_stats.dirty_pages_rate, 970 rs->num_dirty_pages_period * 1000 / 971 (end_time - rs->time_last_bitmap_sync)); 972 973 if (!page_count) { 974 return; 975 } 976 977 if (migrate_xbzrle()) { 978 double encoded_size, unencoded_size; 979 980 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 981 rs->xbzrle_cache_miss_prev) / page_count; 982 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 983 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 984 TARGET_PAGE_SIZE; 985 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 986 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 987 xbzrle_counters.encoding_rate = 0; 988 } else { 989 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 990 } 991 rs->xbzrle_pages_prev = xbzrle_counters.pages; 992 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 993 } 994 } 995 996 /* 997 * Enable dirty-limit to throttle down the guest 998 */ 999 static void migration_dirty_limit_guest(void) 1000 { 1001 /* 1002 * dirty page rate quota for all vCPUs fetched from 1003 * migration parameter 'vcpu_dirty_limit' 1004 */ 1005 static int64_t quota_dirtyrate; 1006 MigrationState *s = migrate_get_current(); 1007 1008 /* 1009 * If dirty limit already enabled and migration parameter 1010 * vcpu-dirty-limit untouched. 1011 */ 1012 if (dirtylimit_in_service() && 1013 quota_dirtyrate == s->parameters.vcpu_dirty_limit) { 1014 return; 1015 } 1016 1017 quota_dirtyrate = s->parameters.vcpu_dirty_limit; 1018 1019 /* 1020 * Set all vCPU a quota dirtyrate, note that the second 1021 * parameter will be ignored if setting all vCPU for the vm 1022 */ 1023 qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL); 1024 trace_migration_dirty_limit_guest(quota_dirtyrate); 1025 } 1026 1027 static void migration_trigger_throttle(RAMState *rs) 1028 { 1029 uint64_t threshold = migrate_throttle_trigger_threshold(); 1030 uint64_t bytes_xfer_period = 1031 migration_transferred_bytes() - rs->bytes_xfer_prev; 1032 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1033 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1034 1035 /* 1036 * The following detection logic can be refined later. For now: 1037 * Check to see if the ratio between dirtied bytes and the approx. 1038 * amount of bytes that just got transferred since the last time 1039 * we were in this routine reaches the threshold. If that happens 1040 * twice, start or increase throttling. 
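 *
 * For example, with a threshold of 50 and 200 MiB transferred in the
 * period, bytes_dirty_threshold is 100 MiB; dirtying more than that in
 * two such periods triggers auto-converge throttling (or the dirty-limit
 * path, if that capability is set instead).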
1041 */ 1042 if ((bytes_dirty_period > bytes_dirty_threshold) && 1043 (++rs->dirty_rate_high_cnt >= 2)) { 1044 rs->dirty_rate_high_cnt = 0; 1045 if (migrate_auto_converge()) { 1046 trace_migration_throttle(); 1047 mig_throttle_guest_down(bytes_dirty_period, 1048 bytes_dirty_threshold); 1049 } else if (migrate_dirty_limit()) { 1050 migration_dirty_limit_guest(); 1051 } 1052 } 1053 } 1054 1055 static void migration_bitmap_sync(RAMState *rs, bool last_stage) 1056 { 1057 RAMBlock *block; 1058 int64_t end_time; 1059 1060 stat64_add(&mig_stats.dirty_sync_count, 1); 1061 1062 if (!rs->time_last_bitmap_sync) { 1063 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1064 } 1065 1066 trace_migration_bitmap_sync_start(); 1067 memory_global_dirty_log_sync(last_stage); 1068 1069 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 1070 WITH_RCU_READ_LOCK_GUARD() { 1071 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1072 ramblock_sync_dirty_bitmap(rs, block); 1073 } 1074 stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining()); 1075 } 1076 } 1077 1078 memory_global_after_dirty_log_sync(); 1079 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period); 1080 1081 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 1082 1083 /* more than 1 second = 1000 millisecons */ 1084 if (end_time > rs->time_last_bitmap_sync + 1000) { 1085 migration_trigger_throttle(rs); 1086 1087 migration_update_rates(rs, end_time); 1088 1089 rs->target_page_count_prev = rs->target_page_count; 1090 1091 /* reset period counters */ 1092 rs->time_last_bitmap_sync = end_time; 1093 rs->num_dirty_pages_period = 0; 1094 rs->bytes_xfer_prev = migration_transferred_bytes(); 1095 } 1096 if (migrate_events()) { 1097 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 1098 qapi_event_send_migration_pass(generation); 1099 } 1100 } 1101 1102 void migration_bitmap_sync_precopy(bool last_stage) 1103 { 1104 Error *local_err = NULL; 1105 assert(ram_state); 1106 1107 /* 1108 * The current notifier usage is just an optimization to migration, so we 1109 * don't stop the normal migration process in the error case. 1110 */ 1111 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) { 1112 error_report_err(local_err); 1113 local_err = NULL; 1114 } 1115 1116 migration_bitmap_sync(ram_state, last_stage); 1117 1118 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) { 1119 error_report_err(local_err); 1120 } 1121 } 1122 1123 void ram_release_page(const char *rbname, uint64_t offset) 1124 { 1125 if (!migrate_release_ram() || !migration_in_postcopy()) { 1126 return; 1127 } 1128 1129 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE); 1130 } 1131 1132 /** 1133 * save_zero_page: send the zero page to the stream 1134 * 1135 * Returns the number of pages written. 
1136 * 1137 * @rs: current RAM state 1138 * @pss: current PSS channel 1139 * @offset: offset inside the block for the page 1140 */ 1141 static int save_zero_page(RAMState *rs, PageSearchStatus *pss, 1142 ram_addr_t offset) 1143 { 1144 uint8_t *p = pss->block->host + offset; 1145 QEMUFile *file = pss->pss_channel; 1146 int len = 0; 1147 1148 if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { 1149 return 0; 1150 } 1151 1152 if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { 1153 return 0; 1154 } 1155 1156 stat64_add(&mig_stats.zero_pages, 1); 1157 1158 if (migrate_mapped_ram()) { 1159 /* zero pages are not transferred with mapped-ram */ 1160 clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap); 1161 return 1; 1162 } 1163 1164 len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO); 1165 qemu_put_byte(file, 0); 1166 len += 1; 1167 ram_release_page(pss->block->idstr, offset); 1168 ram_transferred_add(len); 1169 1170 /* 1171 * Must let xbzrle know, otherwise a previous (now 0'd) cached 1172 * page would be stale. 1173 */ 1174 if (rs->xbzrle_started) { 1175 XBZRLE_cache_lock(); 1176 xbzrle_cache_zero_page(pss->block->offset + offset); 1177 XBZRLE_cache_unlock(); 1178 } 1179 1180 return len; 1181 } 1182 1183 /* 1184 * directly send the page to the stream 1185 * 1186 * Returns the number of pages written. 1187 * 1188 * @pss: current PSS channel 1189 * @block: block that contains the page we want to send 1190 * @offset: offset inside the block for the page 1191 * @buf: the page to be sent 1192 * @async: send to page asyncly 1193 */ 1194 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block, 1195 ram_addr_t offset, uint8_t *buf, bool async) 1196 { 1197 QEMUFile *file = pss->pss_channel; 1198 1199 if (migrate_mapped_ram()) { 1200 qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE, 1201 block->pages_offset + offset); 1202 set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap); 1203 } else { 1204 ram_transferred_add(save_page_header(pss, pss->pss_channel, block, 1205 offset | RAM_SAVE_FLAG_PAGE)); 1206 if (async) { 1207 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE, 1208 migrate_release_ram() && 1209 migration_in_postcopy()); 1210 } else { 1211 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE); 1212 } 1213 } 1214 ram_transferred_add(TARGET_PAGE_SIZE); 1215 stat64_add(&mig_stats.normal_pages, 1); 1216 return 1; 1217 } 1218 1219 /** 1220 * ram_save_page: send the given page to the stream 1221 * 1222 * Returns the number of pages written. 1223 * < 0 - error 1224 * >=0 - Number of pages written - this might legally be 0 1225 * if xbzrle noticed the page was the same. 
1226 * 1227 * @rs: current RAM state 1228 * @block: block that contains the page we want to send 1229 * @offset: offset inside the block for the page 1230 */ 1231 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1232 { 1233 int pages = -1; 1234 uint8_t *p; 1235 bool send_async = true; 1236 RAMBlock *block = pss->block; 1237 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1238 ram_addr_t current_addr = block->offset + offset; 1239 1240 p = block->host + offset; 1241 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1242 1243 XBZRLE_cache_lock(); 1244 if (rs->xbzrle_started && !migration_in_postcopy()) { 1245 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1246 block, offset); 1247 if (!rs->last_stage) { 1248 /* Can't send this cached data async, since the cache page 1249 * might get updated before it gets to the wire 1250 */ 1251 send_async = false; 1252 } 1253 } 1254 1255 /* XBZRLE overflow or normal page */ 1256 if (pages == -1) { 1257 pages = save_normal_page(pss, block, offset, p, send_async); 1258 } 1259 1260 XBZRLE_cache_unlock(); 1261 1262 return pages; 1263 } 1264 1265 static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) 1266 { 1267 if (!multifd_queue_page(block, offset)) { 1268 return -1; 1269 } 1270 1271 return 1; 1272 } 1273 1274 1275 #define PAGE_ALL_CLEAN 0 1276 #define PAGE_TRY_AGAIN 1 1277 #define PAGE_DIRTY_FOUND 2 1278 /** 1279 * find_dirty_block: find the next dirty page and update any state 1280 * associated with the search process. 1281 * 1282 * Returns: 1283 * <0: An error happened 1284 * PAGE_ALL_CLEAN: no dirty page found, give up 1285 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1286 * PAGE_DIRTY_FOUND: dirty page found 1287 * 1288 * @rs: current RAM state 1289 * @pss: data about the state of the current dirty page scan 1290 * @again: set to false if the search has scanned the whole of RAM 1291 */ 1292 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1293 { 1294 /* Update pss->page for the next dirty bit in ramblock */ 1295 pss_find_next_dirty(pss); 1296 1297 if (pss->complete_round && pss->block == rs->last_seen_block && 1298 pss->page >= rs->last_page) { 1299 /* 1300 * We've been once around the RAM and haven't found anything. 1301 * Give up. 1302 */ 1303 return PAGE_ALL_CLEAN; 1304 } 1305 if (!offset_in_ramblock(pss->block, 1306 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1307 /* Didn't find anything in this RAM Block */ 1308 pss->page = 0; 1309 pss->block = QLIST_NEXT_RCU(pss->block, next); 1310 if (!pss->block) { 1311 if (multifd_ram_sync_per_round()) { 1312 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; 1313 int ret = multifd_ram_flush_and_sync(f); 1314 if (ret < 0) { 1315 return ret; 1316 } 1317 } 1318 1319 /* Hit the end of the list */ 1320 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1321 /* Flag that we've looped */ 1322 pss->complete_round = true; 1323 /* After the first round, enable XBZRLE. 
*/ 1324 if (migrate_xbzrle()) { 1325 rs->xbzrle_started = true; 1326 } 1327 } 1328 /* Didn't find anything this time, but try again on the new block */ 1329 return PAGE_TRY_AGAIN; 1330 } else { 1331 /* We've found something */ 1332 return PAGE_DIRTY_FOUND; 1333 } 1334 } 1335 1336 /** 1337 * unqueue_page: gets a page of the queue 1338 * 1339 * Helper for 'get_queued_page' - gets a page off the queue 1340 * 1341 * Returns the block of the page (or NULL if none available) 1342 * 1343 * @rs: current RAM state 1344 * @offset: used to return the offset within the RAMBlock 1345 */ 1346 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1347 { 1348 struct RAMSrcPageRequest *entry; 1349 RAMBlock *block = NULL; 1350 1351 if (!postcopy_has_request(rs)) { 1352 return NULL; 1353 } 1354 1355 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1356 1357 /* 1358 * This should _never_ change even after we take the lock, because no one 1359 * should be taking anything off the request list other than us. 1360 */ 1361 assert(postcopy_has_request(rs)); 1362 1363 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1364 block = entry->rb; 1365 *offset = entry->offset; 1366 1367 if (entry->len > TARGET_PAGE_SIZE) { 1368 entry->len -= TARGET_PAGE_SIZE; 1369 entry->offset += TARGET_PAGE_SIZE; 1370 } else { 1371 memory_region_unref(block->mr); 1372 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1373 g_free(entry); 1374 migration_consume_urgent_request(); 1375 } 1376 1377 return block; 1378 } 1379 1380 #if defined(__linux__) 1381 /** 1382 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1383 * is found, return RAM block pointer and page offset 1384 * 1385 * Returns pointer to the RAMBlock containing faulting page, 1386 * NULL if no write faults are pending 1387 * 1388 * @rs: current RAM state 1389 * @offset: page offset from the beginning of the block 1390 */ 1391 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1392 { 1393 struct uffd_msg uffd_msg; 1394 void *page_address; 1395 RAMBlock *block; 1396 int res; 1397 1398 if (!migrate_background_snapshot()) { 1399 return NULL; 1400 } 1401 1402 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1403 if (res <= 0) { 1404 return NULL; 1405 } 1406 1407 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1408 block = qemu_ram_block_from_host(page_address, false, offset); 1409 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1410 return block; 1411 } 1412 1413 /** 1414 * ram_save_release_protection: release UFFD write protection after 1415 * a range of pages has been saved 1416 * 1417 * @rs: current RAM state 1418 * @pss: page-search-status structure 1419 * @start_page: index of the first page in the range relative to pss->block 1420 * 1421 * Returns 0 on success, negative value in case of an error 1422 */ 1423 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1424 unsigned long start_page) 1425 { 1426 int res = 0; 1427 1428 /* Check if page is from UFFD-managed region. */ 1429 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1430 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1431 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1432 1433 /* Flush async buffers before un-protect. */ 1434 qemu_fflush(pss->pss_channel); 1435 /* Un-protect memory range. 
*/ 1436 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1437 false, false); 1438 } 1439 1440 return res; 1441 } 1442 1443 /* ram_write_tracking_available: check if kernel supports required UFFD features 1444 * 1445 * Returns true if supports, false otherwise 1446 */ 1447 bool ram_write_tracking_available(void) 1448 { 1449 uint64_t uffd_features; 1450 int res; 1451 1452 res = uffd_query_features(&uffd_features); 1453 return (res == 0 && 1454 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1455 } 1456 1457 /* ram_write_tracking_compatible: check if guest configuration is 1458 * compatible with 'write-tracking' 1459 * 1460 * Returns true if compatible, false otherwise 1461 */ 1462 bool ram_write_tracking_compatible(void) 1463 { 1464 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1465 int uffd_fd; 1466 RAMBlock *block; 1467 bool ret = false; 1468 1469 /* Open UFFD file descriptor */ 1470 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1471 if (uffd_fd < 0) { 1472 return false; 1473 } 1474 1475 RCU_READ_LOCK_GUARD(); 1476 1477 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1478 uint64_t uffd_ioctls; 1479 1480 /* Nothing to do with read-only and MMIO-writable regions */ 1481 if (block->mr->readonly || block->mr->rom_device) { 1482 continue; 1483 } 1484 /* Try to register block memory via UFFD-IO to track writes */ 1485 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1486 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1487 goto out; 1488 } 1489 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1490 goto out; 1491 } 1492 } 1493 ret = true; 1494 1495 out: 1496 uffd_close_fd(uffd_fd); 1497 return ret; 1498 } 1499 1500 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1501 ram_addr_t size) 1502 { 1503 const ram_addr_t end = offset + size; 1504 1505 /* 1506 * We read one byte of each page; this will preallocate page tables if 1507 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1508 * where no page was populated yet. This might require adaption when 1509 * supporting other mappings, like shmem. 1510 */ 1511 for (; offset < end; offset += block->page_size) { 1512 char tmp = *((char *)block->host + offset); 1513 1514 /* Don't optimize the read out */ 1515 asm volatile("" : "+r" (tmp)); 1516 } 1517 } 1518 1519 static inline int populate_read_section(MemoryRegionSection *section, 1520 void *opaque) 1521 { 1522 const hwaddr size = int128_get64(section->size); 1523 hwaddr offset = section->offset_within_region; 1524 RAMBlock *block = section->mr->ram_block; 1525 1526 populate_read_range(block, offset, size); 1527 return 0; 1528 } 1529 1530 /* 1531 * ram_block_populate_read: preallocate page tables and populate pages in the 1532 * RAM block by reading a byte of each page. 1533 * 1534 * Since it's solely used for userfault_fd WP feature, here we just 1535 * hardcode page size to qemu_real_host_page_size. 1536 * 1537 * @block: RAM block to populate 1538 */ 1539 static void ram_block_populate_read(RAMBlock *rb) 1540 { 1541 /* 1542 * Skip populating all pages that fall into a discarded range as managed by 1543 * a RamDiscardManager responsible for the mapped memory region of the 1544 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1545 * must not get populated automatically. 
We don't have to track 1546 * modifications via userfaultfd WP reliably, because these pages will 1547 * not be part of the migration stream either way -- see 1548 * ramblock_dirty_bitmap_exclude_discarded_pages(). 1549 * 1550 * Note: The result is only stable while migrating (precopy/postcopy). 1551 */ 1552 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1553 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1554 MemoryRegionSection section = { 1555 .mr = rb->mr, 1556 .offset_within_region = 0, 1557 .size = rb->mr->size, 1558 }; 1559 1560 ram_discard_manager_replay_populated(rdm, §ion, 1561 populate_read_section, NULL); 1562 } else { 1563 populate_read_range(rb, 0, rb->used_length); 1564 } 1565 } 1566 1567 /* 1568 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking 1569 */ 1570 void ram_write_tracking_prepare(void) 1571 { 1572 RAMBlock *block; 1573 1574 RCU_READ_LOCK_GUARD(); 1575 1576 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1577 /* Nothing to do with read-only and MMIO-writable regions */ 1578 if (block->mr->readonly || block->mr->rom_device) { 1579 continue; 1580 } 1581 1582 /* 1583 * Populate pages of the RAM block before enabling userfault_fd 1584 * write protection. 1585 * 1586 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with 1587 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip 1588 * pages with pte_none() entries in page table. 1589 */ 1590 ram_block_populate_read(block); 1591 } 1592 } 1593 1594 static inline int uffd_protect_section(MemoryRegionSection *section, 1595 void *opaque) 1596 { 1597 const hwaddr size = int128_get64(section->size); 1598 const hwaddr offset = section->offset_within_region; 1599 RAMBlock *rb = section->mr->ram_block; 1600 int uffd_fd = (uintptr_t)opaque; 1601 1602 return uffd_change_protection(uffd_fd, rb->host + offset, size, true, 1603 false); 1604 } 1605 1606 static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd) 1607 { 1608 assert(rb->flags & RAM_UF_WRITEPROTECT); 1609 1610 /* See ram_block_populate_read() */ 1611 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) { 1612 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr); 1613 MemoryRegionSection section = { 1614 .mr = rb->mr, 1615 .offset_within_region = 0, 1616 .size = rb->mr->size, 1617 }; 1618 1619 return ram_discard_manager_replay_populated(rdm, §ion, 1620 uffd_protect_section, 1621 (void *)(uintptr_t)uffd_fd); 1622 } 1623 return uffd_change_protection(uffd_fd, rb->host, 1624 rb->used_length, true, false); 1625 } 1626 1627 /* 1628 * ram_write_tracking_start: start UFFD-WP memory tracking 1629 * 1630 * Returns 0 for success or negative value in case of error 1631 */ 1632 int ram_write_tracking_start(void) 1633 { 1634 int uffd_fd; 1635 RAMState *rs = ram_state; 1636 RAMBlock *block; 1637 1638 /* Open UFFD file descriptor */ 1639 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true); 1640 if (uffd_fd < 0) { 1641 return uffd_fd; 1642 } 1643 rs->uffdio_fd = uffd_fd; 1644 1645 RCU_READ_LOCK_GUARD(); 1646 1647 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1648 /* Nothing to do with read-only and MMIO-writable regions */ 1649 if (block->mr->readonly || block->mr->rom_device) { 1650 continue; 1651 } 1652 1653 /* Register block memory with UFFD to track writes */ 1654 if (uffd_register_memory(rs->uffdio_fd, block->host, 1655 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) { 1656 goto fail; 1657 } 1658 block->flags |= RAM_UF_WRITEPROTECT; 1659 memory_region_ref(block->mr); 1660 1661 /* Apply 
UFFD write protection to the block memory range */ 1662 if (ram_block_uffd_protect(block, uffd_fd)) { 1663 goto fail; 1664 } 1665 1666 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1667 block->host, block->max_length); 1668 } 1669 1670 return 0; 1671 1672 fail: 1673 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1674 1675 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1676 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1677 continue; 1678 } 1679 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1680 /* Cleanup flags and remove reference */ 1681 block->flags &= ~RAM_UF_WRITEPROTECT; 1682 memory_region_unref(block->mr); 1683 } 1684 1685 uffd_close_fd(uffd_fd); 1686 rs->uffdio_fd = -1; 1687 return -1; 1688 } 1689 1690 /** 1691 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1692 */ 1693 void ram_write_tracking_stop(void) 1694 { 1695 RAMState *rs = ram_state; 1696 RAMBlock *block; 1697 1698 RCU_READ_LOCK_GUARD(); 1699 1700 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1701 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1702 continue; 1703 } 1704 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1705 1706 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1707 block->host, block->max_length); 1708 1709 /* Cleanup flags and remove reference */ 1710 block->flags &= ~RAM_UF_WRITEPROTECT; 1711 memory_region_unref(block->mr); 1712 } 1713 1714 /* Finally close UFFD file descriptor */ 1715 uffd_close_fd(rs->uffdio_fd); 1716 rs->uffdio_fd = -1; 1717 } 1718 1719 #else 1720 /* No target OS support, stubs just fail or ignore */ 1721 1722 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1723 { 1724 (void) rs; 1725 (void) offset; 1726 1727 return NULL; 1728 } 1729 1730 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1731 unsigned long start_page) 1732 { 1733 (void) rs; 1734 (void) pss; 1735 (void) start_page; 1736 1737 return 0; 1738 } 1739 1740 bool ram_write_tracking_available(void) 1741 { 1742 return false; 1743 } 1744 1745 bool ram_write_tracking_compatible(void) 1746 { 1747 g_assert_not_reached(); 1748 } 1749 1750 int ram_write_tracking_start(void) 1751 { 1752 g_assert_not_reached(); 1753 } 1754 1755 void ram_write_tracking_stop(void) 1756 { 1757 g_assert_not_reached(); 1758 } 1759 #endif /* defined(__linux__) */ 1760 1761 /** 1762 * get_queued_page: unqueue a page from the postcopy requests 1763 * 1764 * Skips pages that are already sent (!dirty) 1765 * 1766 * Returns true if a queued page is found 1767 * 1768 * @rs: current RAM state 1769 * @pss: data about the state of the current dirty page scan 1770 */ 1771 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1772 { 1773 RAMBlock *block; 1774 ram_addr_t offset; 1775 bool dirty = false; 1776 1777 do { 1778 block = unqueue_page(rs, &offset); 1779 /* 1780 * We're sending this page, and since it's postcopy nothing else 1781 * will dirty it, and we must make sure it doesn't get sent again 1782 * even if this queue request was received after the background 1783 * search already sent it. 
1784 */ 1785 if (block) { 1786 unsigned long page; 1787 1788 page = offset >> TARGET_PAGE_BITS; 1789 dirty = test_bit(page, block->bmap); 1790 if (!dirty) { 1791 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset, 1792 page); 1793 } else { 1794 trace_get_queued_page(block->idstr, (uint64_t)offset, page); 1795 } 1796 } 1797 1798 } while (block && !dirty); 1799 1800 if (!block) { 1801 /* 1802 * Poll write faults too if background snapshot is enabled; that's 1803 * when we have vcpus got blocked by the write protected pages. 1804 */ 1805 block = poll_fault_page(rs, &offset); 1806 } 1807 1808 if (block) { 1809 /* 1810 * We want the background search to continue from the queued page 1811 * since the guest is likely to want other pages near to the page 1812 * it just requested. 1813 */ 1814 pss->block = block; 1815 pss->page = offset >> TARGET_PAGE_BITS; 1816 1817 /* 1818 * This unqueued page would break the "one round" check, even is 1819 * really rare. 1820 */ 1821 pss->complete_round = false; 1822 } 1823 1824 return !!block; 1825 } 1826 1827 /** 1828 * migration_page_queue_free: drop any remaining pages in the ram 1829 * request queue 1830 * 1831 * It should be empty at the end anyway, but in error cases there may 1832 * be some left. in case that there is any page left, we drop it. 1833 * 1834 */ 1835 static void migration_page_queue_free(RAMState *rs) 1836 { 1837 struct RAMSrcPageRequest *mspr, *next_mspr; 1838 /* This queue generally should be empty - but in the case of a failed 1839 * migration might have some droppings in. 1840 */ 1841 RCU_READ_LOCK_GUARD(); 1842 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) { 1843 memory_region_unref(mspr->rb->mr); 1844 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1845 g_free(mspr); 1846 } 1847 } 1848 1849 /** 1850 * ram_save_queue_pages: queue the page for transmission 1851 * 1852 * A request from postcopy destination for example. 1853 * 1854 * Returns zero on success or negative on error 1855 * 1856 * @rbname: Name of the RAMBLock of the request. NULL means the 1857 * same that last one. 1858 * @start: starting address from the start of the RAMBlock 1859 * @len: length (in bytes) to send 1860 */ 1861 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len, 1862 Error **errp) 1863 { 1864 RAMBlock *ramblock; 1865 RAMState *rs = ram_state; 1866 1867 stat64_add(&mig_stats.postcopy_requests, 1); 1868 RCU_READ_LOCK_GUARD(); 1869 1870 if (!rbname) { 1871 /* Reuse last RAMBlock */ 1872 ramblock = rs->last_req_rb; 1873 1874 if (!ramblock) { 1875 /* 1876 * Shouldn't happen, we can't reuse the last RAMBlock if 1877 * it's the 1st request. 1878 */ 1879 error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block"); 1880 return -1; 1881 } 1882 } else { 1883 ramblock = qemu_ram_block_by_name(rbname); 1884 1885 if (!ramblock) { 1886 /* We shouldn't be asked for a non-existent RAMBlock */ 1887 error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname); 1888 return -1; 1889 } 1890 rs->last_req_rb = ramblock; 1891 } 1892 trace_ram_save_queue_pages(ramblock->idstr, start, len); 1893 if (!offset_in_ramblock(ramblock, start + len - 1)) { 1894 error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, " 1895 "start=" RAM_ADDR_FMT " len=" 1896 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, 1897 start, len, ramblock->used_length); 1898 return -1; 1899 } 1900 1901 /* 1902 * When with postcopy preempt, we send back the page directly in the 1903 * rp-return thread. 
1904 */ 1905 if (postcopy_preempt_active()) { 1906 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1907 size_t page_size = qemu_ram_pagesize(ramblock); 1908 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1909 int ret = 0; 1910 1911 qemu_mutex_lock(&rs->bitmap_mutex); 1912 1913 pss_init(pss, ramblock, page_start); 1914 /* 1915 * Always use the preempt channel, and make sure it's there. It's 1916 * safe to access without lock, because when rp-thread is running 1917 * we should be the only one who operates on the qemufile 1918 */ 1919 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1920 assert(pss->pss_channel); 1921 1922 /* 1923 * It must be either one or multiple of host page size. Just 1924 * assert; if something wrong we're mostly split brain anyway. 1925 */ 1926 assert(len % page_size == 0); 1927 while (len) { 1928 if (ram_save_host_page_urgent(pss)) { 1929 error_setg(errp, "ram_save_host_page_urgent() failed: " 1930 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 1931 ramblock->idstr, start); 1932 ret = -1; 1933 break; 1934 } 1935 /* 1936 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 1937 * will automatically be moved and point to the next host page 1938 * we're going to send, so no need to update here. 1939 * 1940 * Normally QEMU never sends >1 host page in requests, so 1941 * logically we don't even need that as the loop should only 1942 * run once, but just to be consistent. 1943 */ 1944 len -= page_size; 1945 }; 1946 qemu_mutex_unlock(&rs->bitmap_mutex); 1947 1948 return ret; 1949 } 1950 1951 struct RAMSrcPageRequest *new_entry = 1952 g_new0(struct RAMSrcPageRequest, 1); 1953 new_entry->rb = ramblock; 1954 new_entry->offset = start; 1955 new_entry->len = len; 1956 1957 memory_region_ref(ramblock->mr); 1958 qemu_mutex_lock(&rs->src_page_req_mutex); 1959 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1960 migration_make_urgent_request(); 1961 qemu_mutex_unlock(&rs->src_page_req_mutex); 1962 1963 return 0; 1964 } 1965 1966 /** 1967 * ram_save_target_page: save one target page to the precopy thread 1968 * OR to multifd workers. 1969 * 1970 * @rs: current RAM state 1971 * @pss: data about the page we want to send 1972 */ 1973 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 1974 { 1975 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1976 int res; 1977 1978 /* Hand over to RDMA first */ 1979 if (migrate_rdma()) { 1980 res = rdma_control_save_page(pss->pss_channel, pss->block->offset, 1981 offset, TARGET_PAGE_SIZE); 1982 1983 if (res == RAM_SAVE_CONTROL_DELAYED) { 1984 res = 1; 1985 } 1986 return res; 1987 } 1988 1989 if (!migrate_multifd() 1990 || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { 1991 if (save_zero_page(rs, pss, offset)) { 1992 return 1; 1993 } 1994 } 1995 1996 if (migrate_multifd()) { 1997 RAMBlock *block = pss->block; 1998 return ram_save_multifd_page(block, offset); 1999 } 2000 2001 return ram_save_page(rs, pss); 2002 } 2003 2004 /* Should be called before sending a host page */ 2005 static void pss_host_page_prepare(PageSearchStatus *pss) 2006 { 2007 /* How many guest pages are there in one host page? */ 2008 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2009 2010 pss->host_page_sending = true; 2011 if (guest_pfns <= 1) { 2012 /* 2013 * This covers both when guest psize == host psize, or when guest 2014 * has larger psize than the host (guest_pfns==0). 
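 * (In the former case guest_pfns is exactly 1 and the host page is a single target page.)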
2015 * 2016 * For the latter, we always send one whole guest page per 2017 * iteration of the host page (example: an Alpha VM on x86 host 2018 * will have guest psize 8K while host psize 4K). 2019 */ 2020 pss->host_page_start = pss->page; 2021 pss->host_page_end = pss->page + 1; 2022 } else { 2023 /* 2024 * The host page spans over multiple guest pages, we send them 2025 * within the same host page iteration. 2026 */ 2027 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2028 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2029 } 2030 } 2031 2032 /* 2033 * Whether the page pointed by PSS is within the host page being sent. 2034 * Must be called after a previous pss_host_page_prepare(). 2035 */ 2036 static bool pss_within_range(PageSearchStatus *pss) 2037 { 2038 ram_addr_t ram_addr; 2039 2040 assert(pss->host_page_sending); 2041 2042 /* Over host-page boundary? */ 2043 if (pss->page >= pss->host_page_end) { 2044 return false; 2045 } 2046 2047 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2048 2049 return offset_in_ramblock(pss->block, ram_addr); 2050 } 2051 2052 static void pss_host_page_finish(PageSearchStatus *pss) 2053 { 2054 pss->host_page_sending = false; 2055 /* This is not needed, but just to reset it */ 2056 pss->host_page_start = pss->host_page_end = 0; 2057 } 2058 2059 static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss) 2060 { 2061 PageLocationHint *hint = &rs->page_hint; 2062 2063 /* If there's a pending hint not consumed, don't bother */ 2064 if (hint->valid) { 2065 return; 2066 } 2067 2068 /* Provide a hint to the background stream otherwise */ 2069 hint->location.block = pss->block; 2070 hint->location.offset = pss->page; 2071 hint->valid = true; 2072 } 2073 2074 /* 2075 * Send an urgent host page specified by `pss'. Need to be called with 2076 * bitmap_mutex held. 2077 * 2078 * Returns 0 if save host page succeeded, false otherwise. 2079 */ 2080 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2081 { 2082 bool page_dirty, sent = false; 2083 RAMState *rs = ram_state; 2084 int ret = 0; 2085 2086 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2087 pss_host_page_prepare(pss); 2088 2089 /* 2090 * If precopy is sending the same page, let it be done in precopy, or 2091 * we could send the same page in two channels and none of them will 2092 * receive the whole page. 2093 */ 2094 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2095 trace_postcopy_preempt_hit(pss->block->idstr, 2096 pss->page << TARGET_PAGE_BITS); 2097 return 0; 2098 } 2099 2100 do { 2101 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2102 2103 if (page_dirty) { 2104 /* Be strict to return code; it must be 1, or what else? */ 2105 if (ram_save_target_page(rs, pss) != 1) { 2106 error_report_once("%s: ram_save_target_page failed", __func__); 2107 ret = -1; 2108 goto out; 2109 } 2110 sent = true; 2111 } 2112 pss_find_next_dirty(pss); 2113 } while (pss_within_range(pss)); 2114 out: 2115 pss_host_page_finish(pss); 2116 /* For urgent requests, flush immediately if sent */ 2117 if (sent) { 2118 qemu_fflush(pss->pss_channel); 2119 ram_page_hint_update(rs, pss); 2120 } 2121 return ret; 2122 } 2123 2124 /** 2125 * ram_save_host_page: save a whole host page 2126 * 2127 * Starting at *offset send pages up to the end of the current host 2128 * page. It's valid for the initial offset to point into the middle of 2129 * a host page in which case the remainder of the hostpage is sent. 
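 * (For example, a 2MB hugepage-backed block with 4KB target pages has 512 target pages per host page; starting halfway through sends at most the remaining 256.)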
2130 * Only dirty target pages are sent. Note that the host page size may 2131 * be a huge page for this block. 2132 * 2133 * The saving stops at the boundary of the used_length of the block 2134 * if the RAMBlock size isn't a multiple of the host page size. 2135 * 2136 * The caller must hold ram_state.bitmap_mutex when calling this 2137 * function. Note that this function can temporarily release the lock, but 2138 * it makes sure the lock is held again by the time it returns. 2139 * 2140 * Returns the number of pages written or negative on error 2141 * 2142 * @rs: current RAM state 2143 * @pss: data about the page we want to send 2144 */ 2145 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2146 { 2147 bool page_dirty, preempt_active = postcopy_preempt_active(); 2148 int tmppages, pages = 0; 2149 size_t pagesize_bits = 2150 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2151 unsigned long start_page = pss->page; 2152 int res; 2153 2154 if (migrate_ram_is_ignored(pss->block)) { 2155 error_report("block %s should not be migrated !", pss->block->idstr); 2156 return 0; 2157 } 2158 2159 /* Update host page boundary information */ 2160 pss_host_page_prepare(pss); 2161 2162 do { 2163 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2164 2165 /* Check whether the page is dirty and, if so, send it */ 2166 if (page_dirty) { 2167 /* 2168 * Properly yield the lock only in postcopy preempt mode 2169 * because both migration thread and rp-return thread can 2170 * operate on the bitmaps. 2171 */ 2172 if (preempt_active) { 2173 qemu_mutex_unlock(&rs->bitmap_mutex); 2174 } 2175 tmppages = ram_save_target_page(rs, pss); 2176 if (tmppages >= 0) { 2177 pages += tmppages; 2178 /* 2179 * Allow rate limiting to happen in the middle of huge pages if 2180 * something is sent in the current iteration. 2181 */ 2182 if (pagesize_bits > 1 && tmppages > 0) { 2183 migration_rate_limit(); 2184 } 2185 } 2186 if (preempt_active) { 2187 qemu_mutex_lock(&rs->bitmap_mutex); 2188 } 2189 } else { 2190 tmppages = 0; 2191 } 2192 2193 if (tmppages < 0) { 2194 pss_host_page_finish(pss); 2195 return tmppages; 2196 } 2197 2198 pss_find_next_dirty(pss); 2199 } while (pss_within_range(pss)); 2200 2201 pss_host_page_finish(pss); 2202 2203 res = ram_save_release_protection(rs, pss, start_page); 2204 return (res < 0 ? res : pages); 2205 } 2206 2207 static bool ram_page_hint_valid(RAMState *rs) 2208 { 2209 /* There's only a page hint during postcopy preempt mode */ 2210 if (!postcopy_preempt_active()) { 2211 return false; 2212 } 2213 2214 return rs->page_hint.valid; 2215 } 2216 2217 static void ram_page_hint_collect(RAMState *rs, RAMBlock **block, 2218 unsigned long *page) 2219 { 2220 PageLocationHint *hint = &rs->page_hint; 2221 2222 assert(hint->valid); 2223 2224 *block = hint->location.block; 2225 *page = hint->location.offset; 2226 2227 /* Mark the hint consumed */ 2228 hint->valid = false; 2229 } 2230 2231 /** 2232 * ram_find_and_save_block: finds a dirty page and sends it to f 2233 * 2234 * Called within an RCU critical section. 2235 * 2236 * Returns the number of pages written where zero means no dirty pages, 2237 * or negative on error 2238 * 2239 * @rs: current RAM state 2240 * 2241 * On systems where host-page-size > target-page-size it will send all the 2242 * pages in a host page that are dirty.
2243 */ 2244 static int ram_find_and_save_block(RAMState *rs) 2245 { 2246 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2247 unsigned long next_page; 2248 RAMBlock *next_block; 2249 int pages = 0; 2250 2251 /* No dirty page as there is zero RAM */ 2252 if (!rs->ram_bytes_total) { 2253 return pages; 2254 } 2255 2256 /* 2257 * Always keep last_seen_block/last_page valid during this procedure, 2258 * because find_dirty_block() relies on these values (e.g., we compare 2259 * last_seen_block with pss.block to see whether we searched all the 2260 * ramblocks) to detect the completion of migration. Having NULL value 2261 * of last_seen_block can conditionally cause below loop to run forever. 2262 */ 2263 if (!rs->last_seen_block) { 2264 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2265 rs->last_page = 0; 2266 } 2267 2268 if (ram_page_hint_valid(rs)) { 2269 ram_page_hint_collect(rs, &next_block, &next_page); 2270 } else { 2271 next_block = rs->last_seen_block; 2272 next_page = rs->last_page; 2273 } 2274 2275 pss_init(pss, next_block, next_page); 2276 2277 while (true){ 2278 if (!get_queued_page(rs, pss)) { 2279 /* priority queue empty, so just search for something dirty */ 2280 int res = find_dirty_block(rs, pss); 2281 if (res != PAGE_DIRTY_FOUND) { 2282 if (res == PAGE_ALL_CLEAN) { 2283 break; 2284 } else if (res == PAGE_TRY_AGAIN) { 2285 continue; 2286 } else if (res < 0) { 2287 pages = res; 2288 break; 2289 } 2290 } 2291 } 2292 pages = ram_save_host_page(rs, pss); 2293 if (pages) { 2294 break; 2295 } 2296 } 2297 2298 rs->last_seen_block = pss->block; 2299 rs->last_page = pss->page; 2300 2301 return pages; 2302 } 2303 2304 static uint64_t ram_bytes_total_with_ignored(void) 2305 { 2306 RAMBlock *block; 2307 uint64_t total = 0; 2308 2309 RCU_READ_LOCK_GUARD(); 2310 2311 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2312 total += block->used_length; 2313 } 2314 return total; 2315 } 2316 2317 uint64_t ram_bytes_total(void) 2318 { 2319 RAMBlock *block; 2320 uint64_t total = 0; 2321 2322 RCU_READ_LOCK_GUARD(); 2323 2324 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2325 total += block->used_length; 2326 } 2327 return total; 2328 } 2329 2330 static void xbzrle_load_setup(void) 2331 { 2332 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2333 } 2334 2335 static void xbzrle_load_cleanup(void) 2336 { 2337 g_free(XBZRLE.decoded_buf); 2338 XBZRLE.decoded_buf = NULL; 2339 } 2340 2341 static void ram_state_cleanup(RAMState **rsp) 2342 { 2343 if (*rsp) { 2344 migration_page_queue_free(*rsp); 2345 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2346 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2347 g_free(*rsp); 2348 *rsp = NULL; 2349 } 2350 } 2351 2352 static void xbzrle_cleanup(void) 2353 { 2354 XBZRLE_cache_lock(); 2355 if (XBZRLE.cache) { 2356 cache_fini(XBZRLE.cache); 2357 g_free(XBZRLE.encoded_buf); 2358 g_free(XBZRLE.current_buf); 2359 g_free(XBZRLE.zero_target_page); 2360 XBZRLE.cache = NULL; 2361 XBZRLE.encoded_buf = NULL; 2362 XBZRLE.current_buf = NULL; 2363 XBZRLE.zero_target_page = NULL; 2364 } 2365 XBZRLE_cache_unlock(); 2366 } 2367 2368 static void ram_bitmaps_destroy(void) 2369 { 2370 RAMBlock *block; 2371 2372 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2373 g_free(block->clear_bmap); 2374 block->clear_bmap = NULL; 2375 g_free(block->bmap); 2376 block->bmap = NULL; 2377 g_free(block->file_bmap); 2378 block->file_bmap = NULL; 2379 } 2380 } 2381 2382 static void ram_save_cleanup(void *opaque) 2383 { 2384 RAMState **rsp = opaque; 2385 2386 /* We don't use dirty log with background snapshots */ 2387 if 
(!migrate_background_snapshot()) { 2388 /* caller have hold BQL or is in a bh, so there is 2389 * no writing race against the migration bitmap 2390 */ 2391 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2392 /* 2393 * do not stop dirty log without starting it, since 2394 * memory_global_dirty_log_stop will assert that 2395 * memory_global_dirty_log_start/stop used in pairs 2396 */ 2397 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2398 } 2399 } 2400 2401 ram_bitmaps_destroy(); 2402 2403 xbzrle_cleanup(); 2404 multifd_ram_save_cleanup(); 2405 ram_state_cleanup(rsp); 2406 } 2407 2408 static void ram_page_hint_reset(PageLocationHint *hint) 2409 { 2410 hint->location.block = NULL; 2411 hint->location.offset = 0; 2412 hint->valid = false; 2413 } 2414 2415 static void ram_state_reset(RAMState *rs) 2416 { 2417 int i; 2418 2419 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2420 rs->pss[i].last_sent_block = NULL; 2421 } 2422 2423 rs->last_seen_block = NULL; 2424 rs->last_page = 0; 2425 rs->last_version = ram_list.version; 2426 rs->xbzrle_started = false; 2427 2428 ram_page_hint_reset(&rs->page_hint); 2429 } 2430 2431 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2432 2433 /* **** functions for postcopy ***** */ 2434 2435 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2436 { 2437 struct RAMBlock *block; 2438 2439 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2440 unsigned long *bitmap = block->bmap; 2441 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2442 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2443 2444 while (run_start < range) { 2445 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2446 ram_discard_range(block->idstr, 2447 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2448 ((ram_addr_t)(run_end - run_start)) 2449 << TARGET_PAGE_BITS); 2450 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2451 } 2452 } 2453 } 2454 2455 /** 2456 * postcopy_send_discard_bm_ram: discard a RAMBlock 2457 * 2458 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2459 * 2460 * @ms: current migration state 2461 * @block: RAMBlock to discard 2462 */ 2463 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2464 { 2465 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2466 unsigned long current; 2467 unsigned long *bitmap = block->bmap; 2468 2469 for (current = 0; current < end; ) { 2470 unsigned long one = find_next_bit(bitmap, end, current); 2471 unsigned long zero, discard_length; 2472 2473 if (one >= end) { 2474 break; 2475 } 2476 2477 zero = find_next_zero_bit(bitmap, end, one + 1); 2478 2479 if (zero >= end) { 2480 discard_length = end - one; 2481 } else { 2482 discard_length = zero - one; 2483 } 2484 postcopy_discard_send_range(ms, one, discard_length); 2485 current = one + discard_length; 2486 } 2487 } 2488 2489 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2490 2491 /** 2492 * postcopy_each_ram_send_discard: discard all RAMBlocks 2493 * 2494 * Utility for the outgoing postcopy code. 2495 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2496 * passing it bitmap indexes and name. 
2497 * (qemu_ram_foreach_block ends up passing unscaled lengths 2498 * which would mean postcopy code would have to deal with target page) 2499 * 2500 * @ms: current migration state 2501 */ 2502 static void postcopy_each_ram_send_discard(MigrationState *ms) 2503 { 2504 struct RAMBlock *block; 2505 2506 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2507 postcopy_discard_send_init(ms, block->idstr); 2508 2509 /* 2510 * Deal with TPS != HPS and huge pages. It discard any partially sent 2511 * host-page size chunks, mark any partially dirty host-page size 2512 * chunks as all dirty. In this case the host-page is the host-page 2513 * for the particular RAMBlock, i.e. it might be a huge page. 2514 */ 2515 postcopy_chunk_hostpages_pass(ms, block); 2516 2517 /* 2518 * Postcopy sends chunks of bitmap over the wire, but it 2519 * just needs indexes at this point, avoids it having 2520 * target page specific code. 2521 */ 2522 postcopy_send_discard_bm_ram(ms, block); 2523 postcopy_discard_send_finish(ms); 2524 } 2525 } 2526 2527 /** 2528 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2529 * 2530 * Helper for postcopy_chunk_hostpages; it's called twice to 2531 * canonicalize the two bitmaps, that are similar, but one is 2532 * inverted. 2533 * 2534 * Postcopy requires that all target pages in a hostpage are dirty or 2535 * clean, not a mix. This function canonicalizes the bitmaps. 2536 * 2537 * @ms: current migration state 2538 * @block: block that contains the page we want to canonicalize 2539 */ 2540 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2541 { 2542 RAMState *rs = ram_state; 2543 unsigned long *bitmap = block->bmap; 2544 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2545 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2546 unsigned long run_start; 2547 2548 if (block->page_size == TARGET_PAGE_SIZE) { 2549 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2550 return; 2551 } 2552 2553 /* Find a dirty page */ 2554 run_start = find_next_bit(bitmap, pages, 0); 2555 2556 while (run_start < pages) { 2557 2558 /* 2559 * If the start of this run of pages is in the middle of a host 2560 * page, then we need to fixup this host page. 2561 */ 2562 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2563 /* Find the end of this run */ 2564 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2565 /* 2566 * If the end isn't at the start of a host page, then the 2567 * run doesn't finish at the end of a host page 2568 * and we need to discard. 2569 */ 2570 } 2571 2572 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2573 unsigned long page; 2574 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2575 host_ratio); 2576 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2577 2578 /* Clean up the bitmap */ 2579 for (page = fixup_start_addr; 2580 page < fixup_start_addr + host_ratio; page++) { 2581 /* 2582 * Remark them as dirty, updating the count for any pages 2583 * that weren't previously dirty. 
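 * (test_and_set_bit() returns the previous bit value, so the increment below only counts pages that flip from clean to dirty.)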
2584 */ 2585 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2586 } 2587 } 2588 2589 /* Find the next dirty page for the next iteration */ 2590 run_start = find_next_bit(bitmap, pages, run_start); 2591 } 2592 } 2593 2594 /** 2595 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2596 * 2597 * Transmit the set of pages to be discarded after precopy to the target; 2598 * these are pages that: 2599 * a) Have been previously transmitted but are now dirty again 2600 * b) Pages that have never been transmitted; this ensures that 2601 * any pages on the destination that have been mapped by background 2602 * tasks get discarded (transparent huge pages are the specific concern) 2603 * Hopefully this is pretty sparse. 2604 * 2605 * @ms: current migration state 2606 */ 2607 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2608 { 2609 RAMState *rs = ram_state; 2610 2611 RCU_READ_LOCK_GUARD(); 2612 2613 /* This should be our last sync, the src is now paused */ 2614 migration_bitmap_sync(rs, false); 2615 2616 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2617 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2618 rs->last_seen_block = NULL; 2619 rs->last_page = 0; 2620 2621 postcopy_each_ram_send_discard(ms); 2622 2623 trace_ram_postcopy_send_discard_bitmap(); 2624 } 2625 2626 /** 2627 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2628 * 2629 * Returns zero on success 2630 * 2631 * @rbname: name of the RAMBlock of the request. NULL means the 2632 * same as the last one. 2633 * @start: RAMBlock starting page 2634 * @length: RAMBlock size 2635 */ 2636 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2637 { 2638 trace_ram_discard_range(rbname, start, length); 2639 2640 RCU_READ_LOCK_GUARD(); 2641 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2642 2643 if (!rb) { 2644 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2645 return -1; 2646 } 2647 2648 /* 2649 * On the source VM, we don't need to update the received bitmap since 2650 * we don't even have one. 2651 */ 2652 if (rb->receivedmap) { 2653 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2654 length >> qemu_target_page_bits()); 2655 } 2656 2657 return ram_block_discard_range(rb, start, length); 2658 } 2659 2660 /* 2661 * For every allocation, we will try not to crash the VM if the 2662 * allocation fails.
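 * The g_try_* allocators used below return NULL on failure instead of aborting, so each failure is reported through errp and whatever was already allocated is freed.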
2663 */ 2664 static bool xbzrle_init(Error **errp) 2665 { 2666 if (!migrate_xbzrle()) { 2667 return true; 2668 } 2669 2670 XBZRLE_cache_lock(); 2671 2672 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2673 if (!XBZRLE.zero_target_page) { 2674 error_setg(errp, "%s: Error allocating zero page", __func__); 2675 goto err_out; 2676 } 2677 2678 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2679 TARGET_PAGE_SIZE, errp); 2680 if (!XBZRLE.cache) { 2681 goto free_zero_page; 2682 } 2683 2684 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2685 if (!XBZRLE.encoded_buf) { 2686 error_setg(errp, "%s: Error allocating encoded_buf", __func__); 2687 goto free_cache; 2688 } 2689 2690 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2691 if (!XBZRLE.current_buf) { 2692 error_setg(errp, "%s: Error allocating current_buf", __func__); 2693 goto free_encoded_buf; 2694 } 2695 2696 /* We are all good */ 2697 XBZRLE_cache_unlock(); 2698 return true; 2699 2700 free_encoded_buf: 2701 g_free(XBZRLE.encoded_buf); 2702 XBZRLE.encoded_buf = NULL; 2703 free_cache: 2704 cache_fini(XBZRLE.cache); 2705 XBZRLE.cache = NULL; 2706 free_zero_page: 2707 g_free(XBZRLE.zero_target_page); 2708 XBZRLE.zero_target_page = NULL; 2709 err_out: 2710 XBZRLE_cache_unlock(); 2711 return false; 2712 } 2713 2714 static bool ram_state_init(RAMState **rsp, Error **errp) 2715 { 2716 *rsp = g_try_new0(RAMState, 1); 2717 2718 if (!*rsp) { 2719 error_setg(errp, "%s: Init ramstate fail", __func__); 2720 return false; 2721 } 2722 2723 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2724 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2725 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2726 (*rsp)->ram_bytes_total = ram_bytes_total(); 2727 2728 /* 2729 * Count the total number of pages used by ram blocks not including any 2730 * gaps due to alignment or unplugs. 2731 * This must match with the initial values of dirty bitmap. 2732 */ 2733 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2734 ram_state_reset(*rsp); 2735 2736 return true; 2737 } 2738 2739 static void ram_list_init_bitmaps(void) 2740 { 2741 MigrationState *ms = migrate_get_current(); 2742 RAMBlock *block; 2743 unsigned long pages; 2744 uint8_t shift; 2745 2746 /* Skip setting bitmap if there is no RAM */ 2747 if (ram_bytes_total()) { 2748 shift = ms->clear_bitmap_shift; 2749 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2750 error_report("clear_bitmap_shift (%u) too big, using " 2751 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2752 shift = CLEAR_BITMAP_SHIFT_MAX; 2753 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2754 error_report("clear_bitmap_shift (%u) too small, using " 2755 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2756 shift = CLEAR_BITMAP_SHIFT_MIN; 2757 } 2758 2759 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2760 pages = block->max_length >> TARGET_PAGE_BITS; 2761 /* 2762 * The initial dirty bitmap for migration must be set with all 2763 * ones to make sure we'll migrate every guest RAM page to 2764 * destination. 2765 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2766 * new migration after a failed migration, ram_list. 2767 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2768 * guest memory. 
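 * bitmap_new() returns an all-zero bitmap, hence the explicit bitmap_set() below so that every page starts out marked dirty.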
2769 */ 2770 block->bmap = bitmap_new(pages); 2771 bitmap_set(block->bmap, 0, pages); 2772 if (migrate_mapped_ram()) { 2773 block->file_bmap = bitmap_new(pages); 2774 } 2775 block->clear_bmap_shift = shift; 2776 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2777 } 2778 } 2779 } 2780 2781 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2782 { 2783 unsigned long pages; 2784 RAMBlock *rb; 2785 2786 RCU_READ_LOCK_GUARD(); 2787 2788 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2789 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2790 rs->migration_dirty_pages -= pages; 2791 } 2792 } 2793 2794 static bool ram_init_bitmaps(RAMState *rs, Error **errp) 2795 { 2796 bool ret = true; 2797 2798 qemu_mutex_lock_ramlist(); 2799 2800 WITH_RCU_READ_LOCK_GUARD() { 2801 ram_list_init_bitmaps(); 2802 /* We don't use dirty log with background snapshots */ 2803 if (!migrate_background_snapshot()) { 2804 ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp); 2805 if (!ret) { 2806 goto out_unlock; 2807 } 2808 migration_bitmap_sync_precopy(false); 2809 } 2810 } 2811 out_unlock: 2812 qemu_mutex_unlock_ramlist(); 2813 2814 if (!ret) { 2815 ram_bitmaps_destroy(); 2816 return false; 2817 } 2818 2819 /* 2820 * After an eventual first bitmap sync, fixup the initial bitmap 2821 * containing all 1s to exclude any discarded pages from migration. 2822 */ 2823 migration_bitmap_clear_discarded_pages(rs); 2824 return true; 2825 } 2826 2827 static int ram_init_all(RAMState **rsp, Error **errp) 2828 { 2829 if (!ram_state_init(rsp, errp)) { 2830 return -1; 2831 } 2832 2833 if (!xbzrle_init(errp)) { 2834 ram_state_cleanup(rsp); 2835 return -1; 2836 } 2837 2838 if (!ram_init_bitmaps(*rsp, errp)) { 2839 return -1; 2840 } 2841 2842 return 0; 2843 } 2844 2845 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2846 { 2847 RAMBlock *block; 2848 uint64_t pages = 0; 2849 2850 /* 2851 * Postcopy is not using xbzrle/compression, so no need for that. 2852 * Also, since source are already halted, we don't need to care 2853 * about dirty page logging as well. 2854 */ 2855 2856 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2857 pages += bitmap_count_one(block->bmap, 2858 block->used_length >> TARGET_PAGE_BITS); 2859 } 2860 2861 /* This may not be aligned with current bitmaps. Recalculate. */ 2862 rs->migration_dirty_pages = pages; 2863 2864 ram_state_reset(rs); 2865 2866 /* Update RAMState cache of output QEMUFile */ 2867 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2868 2869 trace_ram_state_resume_prepare(pages); 2870 } 2871 2872 /* 2873 * This function clears bits of the free pages reported by the caller from the 2874 * migration dirty bitmap. @addr is the host address corresponding to the 2875 * start of the continuous guest free pages, and @len is the total bytes of 2876 * those pages. 2877 */ 2878 void qemu_guest_free_page_hint(void *addr, size_t len) 2879 { 2880 RAMBlock *block; 2881 ram_addr_t offset; 2882 size_t used_len, start, npages; 2883 2884 /* This function is currently expected to be used during live migration */ 2885 if (!migration_is_running()) { 2886 return; 2887 } 2888 2889 for (; len > 0; len -= used_len, addr += used_len) { 2890 block = qemu_ram_block_from_host(addr, false, &offset); 2891 if (unlikely(!block || offset >= block->used_length)) { 2892 /* 2893 * The implementation might not support RAMBlock resize during 2894 * live migration, but it could happen in theory with future 2895 * updates. So we add a check here to capture that case. 
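 * (qemu_ram_block_from_host() returning NULL, or an offset beyond used_length, is how such an unexpected resize would show up here.)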
2896 */ 2897 error_report_once("%s unexpected error", __func__); 2898 return; 2899 } 2900 2901 if (len <= block->used_length - offset) { 2902 used_len = len; 2903 } else { 2904 used_len = block->used_length - offset; 2905 } 2906 2907 start = offset >> TARGET_PAGE_BITS; 2908 npages = used_len >> TARGET_PAGE_BITS; 2909 2910 qemu_mutex_lock(&ram_state->bitmap_mutex); 2911 /* 2912 * The skipped free pages are equavalent to be sent from clear_bmap's 2913 * perspective, so clear the bits from the memory region bitmap which 2914 * are initially set. Otherwise those skipped pages will be sent in 2915 * the next round after syncing from the memory region bitmap. 2916 */ 2917 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2918 ram_state->migration_dirty_pages -= 2919 bitmap_count_one_with_offset(block->bmap, start, npages); 2920 bitmap_clear(block->bmap, start, npages); 2921 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2922 } 2923 } 2924 2925 #define MAPPED_RAM_HDR_VERSION 1 2926 struct MappedRamHeader { 2927 uint32_t version; 2928 /* 2929 * The target's page size, so we know how many pages are in the 2930 * bitmap. 2931 */ 2932 uint64_t page_size; 2933 /* 2934 * The offset in the migration file where the pages bitmap is 2935 * stored. 2936 */ 2937 uint64_t bitmap_offset; 2938 /* 2939 * The offset in the migration file where the actual pages (data) 2940 * are stored. 2941 */ 2942 uint64_t pages_offset; 2943 } QEMU_PACKED; 2944 typedef struct MappedRamHeader MappedRamHeader; 2945 2946 static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) 2947 { 2948 g_autofree MappedRamHeader *header = NULL; 2949 size_t header_size, bitmap_size; 2950 long num_pages; 2951 2952 header = g_new0(MappedRamHeader, 1); 2953 header_size = sizeof(MappedRamHeader); 2954 2955 num_pages = block->used_length >> TARGET_PAGE_BITS; 2956 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 2957 2958 /* 2959 * Save the file offsets of where the bitmap and the pages should 2960 * go as they are written at the end of migration and during the 2961 * iterative phase, respectively. 
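 * The resulting per-RAMBlock layout in the file is therefore: header | dirty bitmap | padding up to MAPPED_RAM_FILE_OFFSET_ALIGNMENT | pages.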
2962 */ 2963 block->bitmap_offset = qemu_get_offset(file) + header_size; 2964 block->pages_offset = ROUND_UP(block->bitmap_offset + 2965 bitmap_size, 2966 MAPPED_RAM_FILE_OFFSET_ALIGNMENT); 2967 2968 header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); 2969 header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); 2970 header->bitmap_offset = cpu_to_be64(block->bitmap_offset); 2971 header->pages_offset = cpu_to_be64(block->pages_offset); 2972 2973 qemu_put_buffer(file, (uint8_t *) header, header_size); 2974 2975 /* prepare offset for next ramblock */ 2976 qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); 2977 } 2978 2979 static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, 2980 Error **errp) 2981 { 2982 size_t ret, header_size = sizeof(MappedRamHeader); 2983 2984 ret = qemu_get_buffer(file, (uint8_t *)header, header_size); 2985 if (ret != header_size) { 2986 error_setg(errp, "Could not read whole mapped-ram migration header " 2987 "(expected %zd, got %zd bytes)", header_size, ret); 2988 return false; 2989 } 2990 2991 /* migration stream is big-endian */ 2992 header->version = be32_to_cpu(header->version); 2993 2994 if (header->version > MAPPED_RAM_HDR_VERSION) { 2995 error_setg(errp, "Migration mapped-ram capability version not " 2996 "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, 2997 header->version); 2998 return false; 2999 } 3000 3001 header->page_size = be64_to_cpu(header->page_size); 3002 header->bitmap_offset = be64_to_cpu(header->bitmap_offset); 3003 header->pages_offset = be64_to_cpu(header->pages_offset); 3004 3005 return true; 3006 } 3007 3008 /* 3009 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3010 * long-running RCU critical section. When rcu-reclaims in the code 3011 * start to become numerous it will be necessary to reduce the 3012 * granularity of these critical sections. 3013 */ 3014 3015 /** 3016 * ram_save_setup: Setup RAM for migration 3017 * 3018 * Returns zero to indicate success and negative for error 3019 * 3020 * @f: QEMUFile where to send the data 3021 * @opaque: RAMState pointer 3022 * @errp: pointer to Error*, to store an error if it happens. 3023 */ 3024 static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) 3025 { 3026 RAMState **rsp = opaque; 3027 RAMBlock *block; 3028 int ret, max_hg_page_size; 3029 3030 /* migration has already setup the bitmap, reuse it. */ 3031 if (!migration_in_colo_state()) { 3032 if (ram_init_all(rsp, errp) != 0) { 3033 return -1; 3034 } 3035 } 3036 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3037 3038 /* 3039 * ??? Mirrors the previous value of qemu_host_page_size, 3040 * but is this really what was intended for the migration? 
3041 */ 3042 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 3043 3044 WITH_RCU_READ_LOCK_GUARD() { 3045 qemu_put_be64(f, ram_bytes_total_with_ignored() 3046 | RAM_SAVE_FLAG_MEM_SIZE); 3047 3048 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3049 qemu_put_byte(f, strlen(block->idstr)); 3050 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3051 qemu_put_be64(f, block->used_length); 3052 if (migrate_postcopy_ram() && 3053 block->page_size != max_hg_page_size) { 3054 qemu_put_be64(f, block->page_size); 3055 } 3056 if (migrate_ignore_shared()) { 3057 qemu_put_be64(f, block->mr->addr); 3058 } 3059 3060 if (migrate_mapped_ram()) { 3061 mapped_ram_setup_ramblock(f, block); 3062 } 3063 } 3064 } 3065 3066 ret = rdma_registration_start(f, RAM_CONTROL_SETUP); 3067 if (ret < 0) { 3068 error_setg(errp, "%s: failed to start RDMA registration", __func__); 3069 qemu_file_set_error(f, ret); 3070 return ret; 3071 } 3072 3073 ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); 3074 if (ret < 0) { 3075 error_setg(errp, "%s: failed to stop RDMA registration", __func__); 3076 qemu_file_set_error(f, ret); 3077 return ret; 3078 } 3079 3080 if (migrate_multifd()) { 3081 multifd_ram_save_setup(); 3082 } 3083 3084 /* 3085 * This operation is unfortunate.. 3086 * 3087 * For legacy QEMUs using per-section sync 3088 * ======================================= 3089 * 3090 * This must exist because the EOS below requires the SYNC messages 3091 * per-channel to work. 3092 * 3093 * For modern QEMUs using per-round sync 3094 * ===================================== 3095 * 3096 * Logically such sync is not needed, and recv threads should not run 3097 * until setup ready (using things like channels_ready on src). Then 3098 * we should be all fine. 3099 * 3100 * However even if we add channels_ready to recv side in new QEMUs, old 3101 * QEMU won't have them so this sync will still be needed to make sure 3102 * multifd recv threads won't start processing guest pages early before 3103 * ram_load_setup() is properly done. 3104 * 3105 * Let's stick with this. Fortunately the overhead is low to sync 3106 * during setup because the VM is running, so at least it's not 3107 * accounted as part of downtime. 3108 */ 3109 bql_unlock(); 3110 ret = multifd_ram_flush_and_sync(f); 3111 bql_lock(); 3112 if (ret < 0) { 3113 error_setg(errp, "%s: multifd synchronization failed", __func__); 3114 return ret; 3115 } 3116 3117 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3118 ret = qemu_fflush(f); 3119 if (ret < 0) { 3120 error_setg_errno(errp, -ret, "%s failed", __func__); 3121 } 3122 return ret; 3123 } 3124 3125 static void ram_save_file_bmap(QEMUFile *f) 3126 { 3127 RAMBlock *block; 3128 3129 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3130 long num_pages = block->used_length >> TARGET_PAGE_BITS; 3131 long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3132 3133 qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, 3134 block->bitmap_offset); 3135 ram_transferred_add(bitmap_size); 3136 3137 /* 3138 * Free the bitmap here to catch any synchronization issues 3139 * with multifd channels. No channels should be sending pages 3140 * after we've written the bitmap to file. 
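 * (Any late ramblock_set_file_bmap_atomic() call would then dereference a NULL bitmap and crash, making such a bug immediately visible.)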
3141 */ 3142 g_free(block->file_bmap); 3143 block->file_bmap = NULL; 3144 } 3145 } 3146 3147 void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) 3148 { 3149 if (set) { 3150 set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3151 } else { 3152 clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3153 } 3154 } 3155 3156 /** 3157 * ram_save_iterate: iterative stage for migration 3158 * 3159 * Returns zero to indicate success and negative for error 3160 * 3161 * @f: QEMUFile where to send the data 3162 * @opaque: RAMState pointer 3163 */ 3164 static int ram_save_iterate(QEMUFile *f, void *opaque) 3165 { 3166 RAMState **temp = opaque; 3167 RAMState *rs = *temp; 3168 int ret = 0; 3169 int i; 3170 int64_t t0; 3171 int done = 0; 3172 3173 /* 3174 * We'll hold this lock for a while, but that's okay for two reasons. 3175 * Firstly, the only other thread that may take it is the one calling 3176 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3177 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3178 * guarantees that we release it on a regular basis. 3179 */ 3180 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 3181 WITH_RCU_READ_LOCK_GUARD() { 3182 if (ram_list.version != rs->last_version) { 3183 ram_state_reset(rs); 3184 } 3185 3186 /* Read version before ram_list.blocks */ 3187 smp_rmb(); 3188 3189 ret = rdma_registration_start(f, RAM_CONTROL_ROUND); 3190 if (ret < 0) { 3191 qemu_file_set_error(f, ret); 3192 goto out; 3193 } 3194 3195 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3196 i = 0; 3197 while ((ret = migration_rate_exceeded(f)) == 0 || 3198 postcopy_has_request(rs)) { 3199 int pages; 3200 3201 if (qemu_file_get_error(f)) { 3202 break; 3203 } 3204 3205 pages = ram_find_and_save_block(rs); 3206 /* no more pages to send */ 3207 if (pages == 0) { 3208 done = 1; 3209 break; 3210 } 3211 3212 if (pages < 0) { 3213 qemu_file_set_error(f, pages); 3214 break; 3215 } 3216 3217 rs->target_page_count += pages; 3218 3219 /* 3220 * We want to check in the 1st loop, just in case it was the 1st 3221 * time and we had to sync the dirty bitmap. 3222 * qemu_clock_get_ns() is a bit expensive, so we only check every 3223 * so many iterations. 3224 */ 3225 if ((i & 63) == 0) { 3226 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3227 1000000; 3228 if (t1 > MAX_WAIT) { 3229 trace_ram_save_iterate_big_wait(t1, i); 3230 break; 3231 } 3232 } 3233 i++; 3234 } 3235 } 3236 } 3237 3238 /* 3239 * Must occur before EOS (or any QEMUFile operation) 3240 * because of RDMA protocol.
3241 */ 3242 ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); 3243 if (ret < 0) { 3244 qemu_file_set_error(f, ret); 3245 } 3246 3247 out: 3248 if (ret >= 0 && migration_is_running()) { 3249 if (multifd_ram_sync_per_section()) { 3250 ret = multifd_ram_flush_and_sync(f); 3251 if (ret < 0) { 3252 return ret; 3253 } 3254 } 3255 3256 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3257 ram_transferred_add(8); 3258 ret = qemu_fflush(f); 3259 } 3260 if (ret < 0) { 3261 return ret; 3262 } 3263 3264 return done; 3265 } 3266 3267 /** 3268 * ram_save_complete: function called to send the remaining amount of ram 3269 * 3270 * Returns zero to indicate success or negative on error 3271 * 3272 * Called with the BQL 3273 * 3274 * @f: QEMUFile where to send the data 3275 * @opaque: RAMState pointer 3276 */ 3277 static int ram_save_complete(QEMUFile *f, void *opaque) 3278 { 3279 RAMState **temp = opaque; 3280 RAMState *rs = *temp; 3281 int ret = 0; 3282 3283 rs->last_stage = !migration_in_colo_state(); 3284 3285 WITH_RCU_READ_LOCK_GUARD() { 3286 if (!migration_in_postcopy()) { 3287 migration_bitmap_sync_precopy(true); 3288 } 3289 3290 ret = rdma_registration_start(f, RAM_CONTROL_FINISH); 3291 if (ret < 0) { 3292 qemu_file_set_error(f, ret); 3293 return ret; 3294 } 3295 3296 /* try transferring iterative blocks of memory */ 3297 3298 /* flush all remaining blocks regardless of rate limiting */ 3299 qemu_mutex_lock(&rs->bitmap_mutex); 3300 while (true) { 3301 int pages; 3302 3303 pages = ram_find_and_save_block(rs); 3304 /* no more blocks to sent */ 3305 if (pages == 0) { 3306 break; 3307 } 3308 if (pages < 0) { 3309 qemu_mutex_unlock(&rs->bitmap_mutex); 3310 return pages; 3311 } 3312 } 3313 qemu_mutex_unlock(&rs->bitmap_mutex); 3314 3315 ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); 3316 if (ret < 0) { 3317 qemu_file_set_error(f, ret); 3318 return ret; 3319 } 3320 } 3321 3322 if (multifd_ram_sync_per_section()) { 3323 /* 3324 * Only the old dest QEMU will need this sync, because each EOS 3325 * will require one SYNC message on each channel. 
3326 */ 3327 ret = multifd_ram_flush_and_sync(f); 3328 if (ret < 0) { 3329 return ret; 3330 } 3331 } 3332 3333 if (migrate_mapped_ram()) { 3334 ram_save_file_bmap(f); 3335 3336 if (qemu_file_get_error(f)) { 3337 Error *local_err = NULL; 3338 int err = qemu_file_get_error_obj(f, &local_err); 3339 3340 error_reportf_err(local_err, "Failed to write bitmap to file: "); 3341 return -err; 3342 } 3343 } 3344 3345 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3346 return qemu_fflush(f); 3347 } 3348 3349 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3350 uint64_t *can_postcopy) 3351 { 3352 RAMState **temp = opaque; 3353 RAMState *rs = *temp; 3354 3355 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3356 3357 if (migrate_postcopy_ram()) { 3358 /* We can do postcopy, and all the data is postcopiable */ 3359 *can_postcopy += remaining_size; 3360 } else { 3361 *must_precopy += remaining_size; 3362 } 3363 } 3364 3365 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3366 uint64_t *can_postcopy) 3367 { 3368 RAMState **temp = opaque; 3369 RAMState *rs = *temp; 3370 uint64_t remaining_size; 3371 3372 if (!migration_in_postcopy()) { 3373 bql_lock(); 3374 WITH_RCU_READ_LOCK_GUARD() { 3375 migration_bitmap_sync_precopy(false); 3376 } 3377 bql_unlock(); 3378 } 3379 3380 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3381 3382 if (migrate_postcopy_ram()) { 3383 /* We can do postcopy, and all the data is postcopiable */ 3384 *can_postcopy += remaining_size; 3385 } else { 3386 *must_precopy += remaining_size; 3387 } 3388 } 3389 3390 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3391 { 3392 unsigned int xh_len; 3393 int xh_flags; 3394 uint8_t *loaded_data; 3395 3396 /* extract RLE header */ 3397 xh_flags = qemu_get_byte(f); 3398 xh_len = qemu_get_be16(f); 3399 3400 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3401 error_report("Failed to load XBZRLE page - wrong compression!"); 3402 return -1; 3403 } 3404 3405 if (xh_len > TARGET_PAGE_SIZE) { 3406 error_report("Failed to load XBZRLE page - len overflow!"); 3407 return -1; 3408 } 3409 loaded_data = XBZRLE.decoded_buf; 3410 /* load data and decode */ 3411 /* it can change loaded_data to point to an internal buffer */ 3412 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3413 3414 /* decode RLE */ 3415 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3416 TARGET_PAGE_SIZE) == -1) { 3417 error_report("Failed to load XBZRLE page - decode error!"); 3418 return -1; 3419 } 3420 3421 return 0; 3422 } 3423 3424 /** 3425 * ram_block_from_stream: read a RAMBlock id from the migration stream 3426 * 3427 * Must be called from within a rcu critical section. 3428 * 3429 * Returns a pointer from within the RCU-protected ram_list. 
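 * The returned block therefore stays valid only for as long as the RCU read lock is held.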
3430 * 3431 * @mis: the migration incoming state pointer 3432 * @f: QEMUFile where to read the data from 3433 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3434 * @channel: the channel we're using 3435 */ 3436 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3437 QEMUFile *f, int flags, 3438 int channel) 3439 { 3440 RAMBlock *block = mis->last_recv_block[channel]; 3441 char id[256]; 3442 uint8_t len; 3443 3444 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3445 if (!block) { 3446 error_report("Ack, bad migration stream!"); 3447 return NULL; 3448 } 3449 return block; 3450 } 3451 3452 len = qemu_get_byte(f); 3453 qemu_get_buffer(f, (uint8_t *)id, len); 3454 id[len] = 0; 3455 3456 block = qemu_ram_block_by_name(id); 3457 if (!block) { 3458 error_report("Can't find block %s", id); 3459 return NULL; 3460 } 3461 3462 if (migrate_ram_is_ignored(block)) { 3463 error_report("block %s should not be migrated !", id); 3464 return NULL; 3465 } 3466 3467 mis->last_recv_block[channel] = block; 3468 3469 return block; 3470 } 3471 3472 static inline void *host_from_ram_block_offset(RAMBlock *block, 3473 ram_addr_t offset) 3474 { 3475 if (!offset_in_ramblock(block, offset)) { 3476 return NULL; 3477 } 3478 3479 return block->host + offset; 3480 } 3481 3482 static void *host_page_from_ram_block_offset(RAMBlock *block, 3483 ram_addr_t offset) 3484 { 3485 /* Note: Explicitly no check against offset_in_ramblock(). */ 3486 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3487 block->page_size); 3488 } 3489 3490 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3491 ram_addr_t offset) 3492 { 3493 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3494 } 3495 3496 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3497 { 3498 qemu_mutex_lock(&ram_state->bitmap_mutex); 3499 for (int i = 0; i < pages; i++) { 3500 ram_addr_t offset = normal[i]; 3501 ram_state->migration_dirty_pages += !test_and_set_bit( 3502 offset >> TARGET_PAGE_BITS, 3503 block->bmap); 3504 } 3505 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3506 } 3507 3508 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3509 ram_addr_t offset, bool record_bitmap) 3510 { 3511 if (!offset_in_ramblock(block, offset)) { 3512 return NULL; 3513 } 3514 if (!block->colo_cache) { 3515 error_report("%s: colo_cache is NULL in block :%s", 3516 __func__, block->idstr); 3517 return NULL; 3518 } 3519 3520 /* 3521 * During colo checkpoint, we need bitmap of these migrated pages. 3522 * It help us to decide which pages in ram cache should be flushed 3523 * into VM's RAM later. 3524 */ 3525 if (record_bitmap) { 3526 colo_record_bitmap(block, &offset, 1); 3527 } 3528 return block->colo_cache + offset; 3529 } 3530 3531 /** 3532 * ram_handle_zero: handle the zero page case 3533 * 3534 * If a page (or a whole RDMA chunk) has been 3535 * determined to be zero, then zap it. 3536 * 3537 * @host: host address for the zero page 3538 * @ch: what the page is filled from. 
We only support zero 3539 * @size: size of the zero page 3540 */ 3541 void ram_handle_zero(void *host, uint64_t size) 3542 { 3543 if (!buffer_is_zero(host, size)) { 3544 memset(host, 0, size); 3545 } 3546 } 3547 3548 static void colo_init_ram_state(void) 3549 { 3550 Error *local_err = NULL; 3551 3552 if (!ram_state_init(&ram_state, &local_err)) { 3553 error_report_err(local_err); 3554 } 3555 } 3556 3557 /* 3558 * colo cache: this is for secondary VM, we cache the whole 3559 * memory of the secondary VM, it is need to hold the global lock 3560 * to call this helper. 3561 */ 3562 int colo_init_ram_cache(void) 3563 { 3564 RAMBlock *block; 3565 3566 WITH_RCU_READ_LOCK_GUARD() { 3567 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3568 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3569 NULL, false, false); 3570 if (!block->colo_cache) { 3571 error_report("%s: Can't alloc memory for COLO cache of block %s," 3572 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3573 block->used_length); 3574 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3575 if (block->colo_cache) { 3576 qemu_anon_ram_free(block->colo_cache, block->used_length); 3577 block->colo_cache = NULL; 3578 } 3579 } 3580 return -errno; 3581 } 3582 if (!machine_dump_guest_core(current_machine)) { 3583 qemu_madvise(block->colo_cache, block->used_length, 3584 QEMU_MADV_DONTDUMP); 3585 } 3586 } 3587 } 3588 3589 /* 3590 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3591 * with to decide which page in cache should be flushed into SVM's RAM. Here 3592 * we use the same name 'ram_bitmap' as for migration. 3593 */ 3594 if (ram_bytes_total()) { 3595 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3596 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3597 block->bmap = bitmap_new(pages); 3598 } 3599 } 3600 3601 colo_init_ram_state(); 3602 return 0; 3603 } 3604 3605 /* TODO: duplicated with ram_init_bitmaps */ 3606 void colo_incoming_start_dirty_log(void) 3607 { 3608 RAMBlock *block = NULL; 3609 Error *local_err = NULL; 3610 3611 /* For memory_global_dirty_log_start below. */ 3612 bql_lock(); 3613 qemu_mutex_lock_ramlist(); 3614 3615 memory_global_dirty_log_sync(false); 3616 WITH_RCU_READ_LOCK_GUARD() { 3617 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3618 ramblock_sync_dirty_bitmap(ram_state, block); 3619 /* Discard this dirty bitmap record */ 3620 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3621 } 3622 if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, 3623 &local_err)) { 3624 error_report_err(local_err); 3625 } 3626 } 3627 ram_state->migration_dirty_pages = 0; 3628 qemu_mutex_unlock_ramlist(); 3629 bql_unlock(); 3630 } 3631 3632 /* It is need to hold the global lock to call this helper */ 3633 void colo_release_ram_cache(void) 3634 { 3635 RAMBlock *block; 3636 3637 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3638 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3639 g_free(block->bmap); 3640 block->bmap = NULL; 3641 } 3642 3643 WITH_RCU_READ_LOCK_GUARD() { 3644 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3645 if (block->colo_cache) { 3646 qemu_anon_ram_free(block->colo_cache, block->used_length); 3647 block->colo_cache = NULL; 3648 } 3649 } 3650 } 3651 ram_state_cleanup(&ram_state); 3652 } 3653 3654 /** 3655 * ram_load_setup: Setup RAM for migration incoming side 3656 * 3657 * Returns zero to indicate success and negative for error 3658 * 3659 * @f: QEMUFile where to receive the data 3660 * @opaque: RAMState pointer 3661 * @errp: pointer to Error*, to store an error if it happens. 
3662 */ 3663 static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp) 3664 { 3665 xbzrle_load_setup(); 3666 ramblock_recv_map_init(); 3667 3668 return 0; 3669 } 3670 3671 static int ram_load_cleanup(void *opaque) 3672 { 3673 RAMBlock *rb; 3674 3675 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3676 qemu_ram_block_writeback(rb); 3677 } 3678 3679 xbzrle_load_cleanup(); 3680 3681 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3682 g_free(rb->receivedmap); 3683 rb->receivedmap = NULL; 3684 } 3685 3686 return 0; 3687 } 3688 3689 /** 3690 * ram_postcopy_incoming_init: allocate postcopy data structures 3691 * 3692 * Returns 0 for success and negative if there was one error 3693 * 3694 * @mis: current migration incoming state 3695 * 3696 * Allocate data structures etc needed by incoming migration with 3697 * postcopy-ram. postcopy-ram's similarly names 3698 * postcopy_ram_incoming_init does the work. 3699 */ 3700 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3701 { 3702 return postcopy_ram_incoming_init(mis); 3703 } 3704 3705 /** 3706 * ram_load_postcopy: load a page in postcopy case 3707 * 3708 * Returns 0 for success or -errno in case of error 3709 * 3710 * Called in postcopy mode by ram_load(). 3711 * rcu_read_lock is taken prior to this being called. 3712 * 3713 * @f: QEMUFile where to send the data 3714 * @channel: the channel to use for loading 3715 */ 3716 int ram_load_postcopy(QEMUFile *f, int channel) 3717 { 3718 int flags = 0, ret = 0; 3719 bool place_needed = false; 3720 bool matches_target_page_size = false; 3721 MigrationIncomingState *mis = migration_incoming_get_current(); 3722 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3723 3724 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3725 ram_addr_t addr; 3726 void *page_buffer = NULL; 3727 void *place_source = NULL; 3728 RAMBlock *block = NULL; 3729 uint8_t ch; 3730 3731 addr = qemu_get_be64(f); 3732 3733 /* 3734 * If qemu file error, we should stop here, and then "addr" 3735 * may be invalid 3736 */ 3737 ret = qemu_file_get_error(f); 3738 if (ret) { 3739 break; 3740 } 3741 3742 flags = addr & ~TARGET_PAGE_MASK; 3743 addr &= TARGET_PAGE_MASK; 3744 3745 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3746 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { 3747 block = ram_block_from_stream(mis, f, flags, channel); 3748 if (!block) { 3749 ret = -EINVAL; 3750 break; 3751 } 3752 3753 /* 3754 * Relying on used_length is racy and can result in false positives. 3755 * We might place pages beyond used_length in case RAM was shrunk 3756 * while in postcopy, which is fine - trying to place via 3757 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3758 */ 3759 if (!block->host || addr >= block->postcopy_length) { 3760 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3761 ret = -EINVAL; 3762 break; 3763 } 3764 tmp_page->target_pages++; 3765 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3766 /* 3767 * Postcopy requires that we place whole host pages atomically; 3768 * these may be huge pages for RAMBlocks that are backed by 3769 * hugetlbfs. 3770 * To make it atomic, the data is read into a temporary page 3771 * that's moved into place later. 3772 * The migration protocol uses, possibly smaller, target-pages 3773 * however the source ensures it always sends all the components 3774 * of a host page in one chunk. 
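 * tmp_page->target_pages counts how many target pages of the current host page have arrived; the host page is placed once that count reaches block->page_size / TARGET_PAGE_SIZE.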
             */
            page_buffer = tmp_page->tmp_huge_page +
                          host_page_offset_from_ram_block_offset(block, addr);
            /* If all target pages are zero then we can optimise the placement */
            if (tmp_page->target_pages == 1) {
                tmp_page->host_addr =
                    host_page_from_ram_block_offset(block, addr);
            } else if (tmp_page->host_addr !=
                       host_page_from_ram_block_offset(block, addr)) {
                /* not the first target page within the host page */
                error_report("Non-same host page detected on channel %d: "
                             "Target host page %p, received host page %p "
                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
                             channel, tmp_page->host_addr,
                             host_page_from_ram_block_offset(block, addr),
                             block->idstr, addr, tmp_page->target_pages);
                ret = -EINVAL;
                break;
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (tmp_page->target_pages ==
                (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = tmp_page->tmp_huge_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            if (ch != 0) {
                error_report("Found a zero page with value %d", ch);
                ret = -EINVAL;
                break;
            }
            /*
             * We can skip filling page_buffer when this is a zero page
             * and block->page_size == TARGET_PAGE_SIZE.
             */
            if (!matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            tmp_page->all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use the temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer, to make sure the buffer is still valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Detect any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            if (tmp_page->all_zero) {
                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
            } else {
                ret = postcopy_place_page(mis, tmp_page->host_addr,
                                          place_source, block);
            }
            place_needed = false;
            postcopy_temp_page_reset(tmp_page);
        }
    }

    return ret;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush the contents of the RAM cache into the SVM's (secondary VM's) memory.
 * Only flush pages that were dirtied by the PVM (primary VM), the SVM, or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync(false);
    qemu_mutex_lock(&ram_state->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    qemu_mutex_unlock(&ram_state->bitmap_mutex);
    trace_colo_flush_ram_cache_end();
}

static size_t ram_load_multifd_pages(void *host_addr, size_t size,
                                     uint64_t offset)
{
    MultiFDRecvData *data = multifd_get_recv_data();

    data->opaque = host_addr;
    data->file_offset = offset;
    data->size = size;

    if (!multifd_recv()) {
        return 0;
    }

    return size;
}

static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
                                     long num_pages, unsigned long *bitmap,
                                     Error **errp)
{
    ERRP_GUARD();
    unsigned long set_bit_idx, clear_bit_idx;
    ram_addr_t offset;
    void *host;
    size_t read, unread, size;

    for (set_bit_idx = find_first_bit(bitmap, num_pages);
         set_bit_idx < num_pages;
         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {

        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);

        unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
        offset = set_bit_idx << TARGET_PAGE_BITS;

        while (unread > 0) {
            host = host_from_ram_block_offset(block, offset);
            if (!host) {
                error_setg(errp, "page outside of ramblock %s range",
                           block->idstr);
                return false;
            }

            size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE);

            if (migrate_multifd()) {
                read = ram_load_multifd_pages(host, size,
                                              block->pages_offset + offset);
            } else {
                read = qemu_get_buffer_at(f, host, size,
                                          block->pages_offset + offset);
            }

            if (!read) {
                goto err;
            }
            offset += read;
            unread -= read;
        }
    }

    return true;

err:
    qemu_file_get_error_obj(f, errp);
    error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT
                  " from file offset %" PRIx64 ": ", block->idstr, offset,
                  block->pages_offset + offset);
    return false;
}

static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
                                      ram_addr_t length, Error **errp)
{
    g_autofree unsigned long *bitmap = NULL;
    MappedRamHeader header;
    size_t bitmap_size;
    long num_pages;

    if (!mapped_ram_read_header(f, &header, errp)) {
        return;
    }
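
    /*
     * The mapped-ram header read above provides (among the fields used
     * here) the file offsets of this block's dirty bitmap and of its
     * pages region; each dirty page is later read from pages_offset plus
     * its offset within the ramblock, guided by the bitmap loaded below.
     */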
    block->pages_offset = header.pages_offset;

    /*
     * Check the alignment of the file region that contains pages. We
     * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that
     * value to change in the future. Do only a sanity check with page
     * size alignment.
     */
    if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) {
        error_setg(errp,
                   "Error reading ramblock %s pages, region has bad alignment",
                   block->idstr);
        return;
    }

    num_pages = length / header.page_size;
    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

    bitmap = g_malloc0(bitmap_size);
    if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
                           header.bitmap_offset) != bitmap_size) {
        error_setg(errp, "Error reading dirty bitmap");
        return;
    }

    if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) {
        return;
    }

    /* Skip pages array */
    qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
}

static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
{
    int ret = 0;
    /* ADVISE was sent earlier; it shows the source has postcopy enabled */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    int max_hg_page_size;
    Error *local_err = NULL;

    assert(block);

    if (migrate_mapped_ram()) {
        parse_ramblock_mapped_ram(f, block, length, &local_err);
        if (local_err) {
            error_report_err(local_err);
            return -EINVAL;
        }
        return 0;
    }

    if (!qemu_ram_is_migratable(block)) {
        error_report("block %s should not be migrated!", block->idstr);
        return -EINVAL;
    }

    if (length != block->used_length) {
        ret = qemu_ram_resize(block, length, &local_err);
        if (local_err) {
            error_report_err(local_err);
            return ret;
        }
    }

    /*
     * ??? Mirrors the previous value of qemu_host_page_size,
     * but is this really what was intended for the migration?
     */
    max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);

    /* For postcopy we need to check that the hugepage sizes match */
    if (postcopy_advised && migrate_postcopy_ram() &&
        block->page_size != max_hg_page_size) {
        uint64_t remote_page_size = qemu_get_be64(f);
        if (remote_page_size != block->page_size) {
            error_report("Mismatched RAM page size %s "
                         "(local) %zd != %" PRId64, block->idstr,
                         block->page_size, remote_page_size);
            return -EINVAL;
        }
    }
    if (migrate_ignore_shared()) {
        hwaddr addr = qemu_get_be64(f);
        if (migrate_ram_is_ignored(block) &&
            block->mr->addr != addr) {
            error_report("Mismatched GPAs for block %s "
                         "%" PRId64 " != %" PRId64, block->idstr,
                         (uint64_t)addr, (uint64_t)block->mr->addr);
            return -EINVAL;
        }
    }
    ret = rdma_block_notification_handle(f, block->idstr);
    if (ret < 0) {
        qemu_file_set_error(f, ret);
    }

    return ret;
}

static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes)
{
    int ret = 0;

    /* Synchronize RAM block list */
    while (!ret && total_ram_bytes) {
        RAMBlock *block;
        char id[256];
        ram_addr_t length;
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)id, len);
        id[len] = 0;
        length = qemu_get_be64(f);

        block = qemu_ram_block_by_name(id);
        if (block) {
            ret = parse_ramblock(f, block, length);
        } else {
            error_report("Unknown ramblock \"%s\", cannot accept "
                         "migration", id);
            ret = -EINVAL;
        }
        total_ram_bytes -= length;
    }

    return ret;
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
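 *
 * Stream format note (as handled below): each record begins with a be64
 * value whose bits below TARGET_PAGE_SIZE carry RAM_SAVE_FLAG_* flags and
 * whose remaining bits carry the page offset; depending on the flags it
 * may be followed by a ramblock id and page data.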
4146 * 4147 * @f: QEMUFile where to send the data 4148 */ 4149 static int ram_load_precopy(QEMUFile *f) 4150 { 4151 MigrationIncomingState *mis = migration_incoming_get_current(); 4152 int flags = 0, ret = 0, invalid_flags = 0, i = 0; 4153 4154 if (migrate_mapped_ram()) { 4155 invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH | 4156 RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE | 4157 RAM_SAVE_FLAG_ZERO); 4158 } 4159 4160 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4161 ram_addr_t addr; 4162 void *host = NULL, *host_bak = NULL; 4163 uint8_t ch; 4164 4165 /* 4166 * Yield periodically to let main loop run, but an iteration of 4167 * the main loop is expensive, so do it each some iterations 4168 */ 4169 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4170 aio_co_schedule(qemu_get_current_aio_context(), 4171 qemu_coroutine_self()); 4172 qemu_coroutine_yield(); 4173 } 4174 i++; 4175 4176 addr = qemu_get_be64(f); 4177 ret = qemu_file_get_error(f); 4178 if (ret) { 4179 error_report("Getting RAM address failed"); 4180 break; 4181 } 4182 4183 flags = addr & ~TARGET_PAGE_MASK; 4184 addr &= TARGET_PAGE_MASK; 4185 4186 if (flags & invalid_flags) { 4187 error_report("Unexpected RAM flags: %d", flags & invalid_flags); 4188 4189 ret = -EINVAL; 4190 break; 4191 } 4192 4193 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4194 RAM_SAVE_FLAG_XBZRLE)) { 4195 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4196 RAM_CHANNEL_PRECOPY); 4197 4198 host = host_from_ram_block_offset(block, addr); 4199 /* 4200 * After going into COLO stage, we should not load the page 4201 * into SVM's memory directly, we put them into colo_cache firstly. 4202 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4203 * Previously, we copied all these memory in preparing stage of COLO 4204 * while we need to stop VM, which is a time-consuming process. 4205 * Here we optimize it by a trick, back-up every page while in 4206 * migration process while COLO is enabled, though it affects the 4207 * speed of the migration, but it obviously reduce the downtime of 4208 * back-up all SVM'S memory in COLO preparing stage. 4209 */ 4210 if (migration_incoming_colo_enabled()) { 4211 if (migration_incoming_in_colo_state()) { 4212 /* In COLO stage, put all pages into cache temporarily */ 4213 host = colo_cache_from_block_offset(block, addr, true); 4214 } else { 4215 /* 4216 * In migration stage but before COLO stage, 4217 * Put all pages into both cache and SVM's memory. 4218 */ 4219 host_bak = colo_cache_from_block_offset(block, addr, false); 4220 } 4221 } 4222 if (!host) { 4223 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4224 ret = -EINVAL; 4225 break; 4226 } 4227 if (!migration_incoming_in_colo_state()) { 4228 ramblock_recv_bitmap_set(block, host); 4229 } 4230 4231 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4232 } 4233 4234 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4235 case RAM_SAVE_FLAG_MEM_SIZE: 4236 ret = parse_ramblocks(f, addr); 4237 /* 4238 * For mapped-ram migration (to a file) using multifd, we sync 4239 * once and for all here to make sure all tasks we queued to 4240 * multifd threads are completed, so that all the ramblocks 4241 * (including all the guest memory pages within) are fully 4242 * loaded after this sync returns. 
4243 */ 4244 if (migrate_mapped_ram()) { 4245 multifd_recv_sync_main(); 4246 } 4247 break; 4248 4249 case RAM_SAVE_FLAG_ZERO: 4250 ch = qemu_get_byte(f); 4251 if (ch != 0) { 4252 error_report("Found a zero page with value %d", ch); 4253 ret = -EINVAL; 4254 break; 4255 } 4256 ram_handle_zero(host, TARGET_PAGE_SIZE); 4257 break; 4258 4259 case RAM_SAVE_FLAG_PAGE: 4260 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4261 break; 4262 4263 case RAM_SAVE_FLAG_XBZRLE: 4264 if (load_xbzrle(f, addr, host) < 0) { 4265 error_report("Failed to decompress XBZRLE page at " 4266 RAM_ADDR_FMT, addr); 4267 ret = -EINVAL; 4268 break; 4269 } 4270 break; 4271 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 4272 multifd_recv_sync_main(); 4273 break; 4274 case RAM_SAVE_FLAG_EOS: 4275 /* normal exit */ 4276 if (migrate_multifd() && 4277 migrate_multifd_flush_after_each_section() && 4278 /* 4279 * Mapped-ram migration flushes once and for all after 4280 * parsing ramblocks. Always ignore EOS for it. 4281 */ 4282 !migrate_mapped_ram()) { 4283 multifd_recv_sync_main(); 4284 } 4285 break; 4286 case RAM_SAVE_FLAG_HOOK: 4287 ret = rdma_registration_handle(f); 4288 if (ret < 0) { 4289 qemu_file_set_error(f, ret); 4290 } 4291 break; 4292 default: 4293 error_report("Unknown combination of migration flags: 0x%x", flags); 4294 ret = -EINVAL; 4295 } 4296 if (!ret) { 4297 ret = qemu_file_get_error(f); 4298 } 4299 if (!ret && host_bak) { 4300 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4301 } 4302 } 4303 4304 return ret; 4305 } 4306 4307 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4308 { 4309 int ret = 0; 4310 static uint64_t seq_iter; 4311 /* 4312 * If system is running in postcopy mode, page inserts to host memory must 4313 * be atomic 4314 */ 4315 bool postcopy_running = postcopy_is_running(); 4316 4317 seq_iter++; 4318 4319 if (version_id != 4) { 4320 return -EINVAL; 4321 } 4322 4323 /* 4324 * This RCU critical section can be very long running. 4325 * When RCU reclaims in the code start to become numerous, 4326 * it will be necessary to reduce the granularity of this 4327 * critical section. 4328 */ 4329 trace_ram_load_start(); 4330 WITH_RCU_READ_LOCK_GUARD() { 4331 if (postcopy_running) { 4332 /* 4333 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4334 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4335 * service fast page faults. 4336 */ 4337 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4338 } else { 4339 ret = ram_load_precopy(f); 4340 } 4341 } 4342 trace_ram_load_complete(ret, seq_iter); 4343 4344 return ret; 4345 } 4346 4347 static bool ram_has_postcopy(void *opaque) 4348 { 4349 RAMBlock *rb; 4350 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4351 if (ramblock_is_pmem(rb)) { 4352 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4353 "is not supported now!", rb->idstr, rb->host); 4354 return false; 4355 } 4356 } 4357 4358 return migrate_postcopy_ram(); 4359 } 4360 4361 /* Sync all the dirty bitmap with destination VM. 
 */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;

    trace_ram_dirty_bitmap_sync_start();

    qatomic_set(&rs->postcopy_bmap_sync_requested, 0);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        qatomic_inc(&rs->postcopy_bmap_sync_requested);
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
        if (migration_rp_wait(s)) {
            return -1;
        }
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

/*
 * Read the received bitmap and invert it to use as the initial dirty
 * bitmap. This is only used when a postcopy migration is paused and
 * wants to resume from an intermediate point.
 *
 * Returns true on success, false on error.
 */
bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp)
{
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    g_autofree unsigned long *le_bitmap = NULL;
    unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;
    RAMState *rs = ram_state;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_setg(errp, "Reload bitmap in incorrect state %s",
                   MigrationStatus_str(s->state));
        return false;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match that of our ramblock */
    if (size != local_size) {
        error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64
                   " != 0x%"PRIx64")", block->idstr, size, local_size);
        return false;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    if (qemu_file_get_error(file) || size != local_size) {
        error_setg(errp, "read bitmap failed for ramblock '%s': "
                   "(size 0x%"PRIx64", got: 0x%"PRIx64")",
                   block->idstr, local_size, size);
        return false;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64,
                   block->idstr, end_mark);
        return false;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change, so we can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Complement it to form
     * the initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /*
     * We'll recalculate migration_dirty_pages in ram_state_resume_prepare().
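     * It is recomputed there from the per-block bitmaps, so there is no
     * need to update the counter here.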
     */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    qatomic_dec(&rs->postcopy_bmap_sync_requested);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. Always
     * kick the migration thread to check whether all requested bitmaps
     * are reloaded.  NOTE: it's racy to only kick when requested==0,
     * because we don't know whether the migration thread may still be
     * increasing it.
     */
    migration_rp_kick(s);

    return true;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp)
{
    int ret;

    if (migrate_multifd()) {
        /*
         * When multifd is enabled, source QEMU needs to make sure all the
         * pages queued before postcopy starts have been flushed.
         *
         * The load of these pages must happen before switching to postcopy.
         * It's because loading of guest pages (so far) in multifd recv
         * threads is still non-atomic, so the load cannot happen with vCPUs
         * running on the destination side.
         *
         * This flush and sync will guarantee that those pages are loaded
         * _before_ postcopy starts on the destination. The rationale is
         * that this happens before the VM stops (and before source QEMU
         * sends all the rest of the postcopy messages). So when the
         * destination QEMU receives the postcopy messages, it must have
         * received the sync message on the main channel (either
         * RAM_SAVE_FLAG_MULTIFD_FLUSH or RAM_SAVE_FLAG_EOS), and such a
         * message guarantees that all previous guest pages queued in the
         * multifd channels are completely loaded.
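         *
         * See the RAM_SAVE_FLAG_MULTIFD_FLUSH and RAM_SAVE_FLAG_EOS cases
         * in ram_load_precopy() for the destination side of this
         * handshake.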
4519 */ 4520 ret = multifd_ram_flush_and_sync(f); 4521 if (ret < 0) { 4522 error_setg(errp, "%s: multifd flush and sync failed", __func__); 4523 return false; 4524 } 4525 } 4526 4527 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 4528 4529 return true; 4530 } 4531 4532 void postcopy_preempt_shutdown_file(MigrationState *s) 4533 { 4534 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4535 qemu_fflush(s->postcopy_qemufile_src); 4536 } 4537 4538 static SaveVMHandlers savevm_ram_handlers = { 4539 .save_setup = ram_save_setup, 4540 .save_live_iterate = ram_save_iterate, 4541 .save_live_complete_postcopy = ram_save_complete, 4542 .save_live_complete_precopy = ram_save_complete, 4543 .has_postcopy = ram_has_postcopy, 4544 .state_pending_exact = ram_state_pending_exact, 4545 .state_pending_estimate = ram_state_pending_estimate, 4546 .load_state = ram_load, 4547 .save_cleanup = ram_save_cleanup, 4548 .load_setup = ram_load_setup, 4549 .load_cleanup = ram_load_cleanup, 4550 .resume_prepare = ram_resume_prepare, 4551 .save_postcopy_prepare = ram_save_postcopy_prepare, 4552 }; 4553 4554 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4555 size_t old_size, size_t new_size) 4556 { 4557 PostcopyState ps = postcopy_state_get(); 4558 ram_addr_t offset; 4559 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4560 Error *err = NULL; 4561 4562 if (!rb) { 4563 error_report("RAM block not found"); 4564 return; 4565 } 4566 4567 if (migrate_ram_is_ignored(rb)) { 4568 return; 4569 } 4570 4571 if (migration_is_running()) { 4572 /* 4573 * Precopy code on the source cannot deal with the size of RAM blocks 4574 * changing at random points in time - especially after sending the 4575 * RAM block sizes in the migration stream, they must no longer change. 4576 * Abort and indicate a proper reason. 4577 */ 4578 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4579 migrate_set_error(migrate_get_current(), err); 4580 error_free(err); 4581 4582 migration_cancel(); 4583 } 4584 4585 switch (ps) { 4586 case POSTCOPY_INCOMING_ADVISE: 4587 /* 4588 * Update what ram_postcopy_incoming_init()->init_range() does at the 4589 * time postcopy was advised. Syncing RAM blocks with the source will 4590 * result in RAM resizes. 4591 */ 4592 if (old_size < new_size) { 4593 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4594 error_report("RAM block '%s' discard of resized RAM failed", 4595 rb->idstr); 4596 } 4597 } 4598 rb->postcopy_length = new_size; 4599 break; 4600 case POSTCOPY_INCOMING_NONE: 4601 case POSTCOPY_INCOMING_RUNNING: 4602 case POSTCOPY_INCOMING_END: 4603 /* 4604 * Once our guest is running, postcopy does no longer care about 4605 * resizes. When growing, the new memory was not available on the 4606 * source, no handler needed. 4607 */ 4608 break; 4609 default: 4610 error_report("RAM block '%s' resized during postcopy state: %d", 4611 rb->idstr, ps); 4612 exit(-1); 4613 } 4614 } 4615 4616 static RAMBlockNotifier ram_mig_ram_notifier = { 4617 .ram_block_resized = ram_mig_ram_block_resized, 4618 }; 4619 4620 void ram_mig_init(void) 4621 { 4622 qemu_mutex_init(&XBZRLE.lock); 4623 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4624 ram_block_notifier_add(&ram_mig_ram_notifier); 4625 } 4626