/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "system/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "system/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "system/runstate.h"
#include "rdma.h"
#include "options.h"
#include "system/dirtylimit.h"
#include "system/kvm.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * mapped-ram migration supports O_DIRECT, so we need to make sure the
 * userspace buffer, the IO operation size and the file offset are
 * aligned according to the underlying device's block size. The first
 * two are already aligned to page size, but we need to add padding to
 * the file to align the offset. We cannot read the block size
 * dynamically because the migration file can be moved between
 * different systems, so use 1M to cover most block sizes and to keep
 * the file offset aligned at page size as well.
 */
#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000

/*
 * When doing mapped-ram migration, this is the amount we read from
 * the pages region in the migration file at a time.
 */
#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000

XBZRLECacheStats xbzrle_counters;

/*
 * This structure locates a specific location of a guest page. In QEMU,
 * it's described in a tuple of (ramblock, offset).
 */
struct PageLocation {
    RAMBlock *block;
    unsigned long offset;
};
typedef struct PageLocation PageLocation;

/**
 * PageLocationHint: describes a hint to a page location
 *
 * @valid: set if the hint is valid and to be consumed
 * @location: the hint content
 *
 * In postcopy preempt mode, the urgent channel may provide hints to the
 * background channel, so that QEMU source can try to migrate whatever is
 * right after the requested urgent pages.
 *
 * This is based on the assumption that the VM (already running on the
 * destination side) tends to access the memory with spatial locality.
 * This is also the default behavior of vanilla postcopy (preempt off).
 */
struct PageLocationHint {
    bool valid;
    PageLocation location;
};
typedef struct PageLocationHint PageLocationHint;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool migrate_ram_is_ignored(RAMBlock *block)
{
    MigMode mode = migrate_mode();
    return !qemu_ram_is_migratable(block) ||
           mode == MIG_MODE_CPR_TRANSFER ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block)
            && qemu_ram_is_named_file(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset)
{
    set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}
#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

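    /*
     * Illustration (hypothetical numbers, not taken from this code): a 1 GiB
     * RAMBlock with 4 KiB target pages gives nbits = 262144, i.e. a 32 KiB
     * receive bitmap on the wire, preceded by its be64 size and followed by
     * the 64-bit ending marker written below.
     */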
    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when the source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    g_free(le_bitmap);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    int ret = qemu_fflush(file);
    if (ret) {
        return ret;
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /*
     * This is only used when postcopy is in recovery phase, to communicate
     * between the migration thread and the return path thread on dirty
     * bitmap synchronizations. This field is unused in other stages of
     * RAM migration.
     */
    unsigned int postcopy_bmap_sync_requested;
    /*
     * Page hint during postcopy when preempt mode is on. Return path
     * thread sets it, while background migration thread consumes it.
     *
     * Protected by @bitmap_mutex.
     */
    PageLocationHint page_hint;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
}

static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if they are, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}

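/*
 * Illustration of the tailslow path above (made-up numbers): with
 * throttle_now = 20 the guest keeps cpu_now = 80 percent of the CPU; if the
 * dirty threshold is half of the bytes dirtied in the period, cpu_ideal is
 * 80 * 0.5 = 40, so the throttle grows by MIN(80 - 40, pct_increment)
 * rather than by a full pct_increment step.
 */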
void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = migration_transferred_bytes();
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

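/*
 * For reference, an XBZRLE page as emitted by save_xbzrle_page() below is
 * laid out on the wire as: the usual page header (offset with
 * RAM_SAVE_FLAG_XBZRLE set, plus the block idstr when the block changes),
 * one ENCODING_FLAG_XBZRLE byte, a be16 encoded length, and the encoded
 * bytes themselves.
 */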
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * The xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're
 * in the middle of sending a host page, so we won't look for dirty pages
 * that are outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (migrate_ram_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If during sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

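/*
 * For scale (illustrative only, the real value comes from
 * rb->clear_bmap_shift): with 4 KiB target pages a shift of 18 makes each
 * clear chunk above cover 1 << (12 + 18) bytes, i.e. 1 GiB of guest memory
 * per clear_bmap bit.
 */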
static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (migrate_ram_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

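/*
 * For example (illustrative sizes): a guest backed by 4 KiB pages plus
 * 2 MiB hugepages yields a summary of 0x1000 | 0x200000 = 0x201000, from
 * which mixed page sizes can be detected.
 */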
uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
        stat64_get(&mig_stats.zero_pages) +
        xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }
}

/*
 * Enable dirty-limit to throttle down the guest
 */
static void migration_dirty_limit_guest(void)
{
    /*
     * dirty page rate quota for all vCPUs fetched from
     * migration parameter 'vcpu_dirty_limit'
     */
    static int64_t quota_dirtyrate;
    MigrationState *s = migrate_get_current();

    /*
     * If the dirty limit is already enabled and the migration parameter
     * vcpu-dirty-limit is untouched, there is nothing to do.
     */
    if (dirtylimit_in_service() &&
        quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
        return;
    }

    quota_dirtyrate = s->parameters.vcpu_dirty_limit;

    /*
     * Set a quota dirty rate for all vCPUs; note that the second
     * parameter is ignored when setting it for the whole VM.
     */
    qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
    trace_migration_dirty_limit_guest(quota_dirtyrate);
}

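/*
 * Worked example for the trigger below (made-up numbers): with a trigger
 * threshold of 50 and 1 GiB transferred in the last sync period, throttling
 * starts (or tightens) only once more than 512 MiB was dirtied within a
 * period, and only after that has happened twice.
 */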
static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        migration_transferred_bytes() - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /*
     * The following detection logic can be refined later. For now:
     * Check to see if the ratio between dirtied bytes and the approx.
     * amount of bytes that just got transferred since the last time
     * we were in this routine reaches the threshold. If that happens
     * twice, start or increase throttling.
     */
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
        (++rs->dirty_rate_high_cnt >= 2)) {
        rs->dirty_rate_high_cnt = 0;
        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        } else if (migrate_dirty_limit()) {
            migration_dirty_limit_guest();
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
        WITH_RCU_READ_LOCK_GUARD() {
            RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                ramblock_sync_dirty_bitmap(rs, block);
            }
            stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
        }
    }

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = migration_transferred_bytes();
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

void migration_bitmap_sync_precopy(bool last_stage)
{
    Error *local_err = NULL;
    assert(ram_state);

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(ram_state, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
                          ram_addr_t offset)
{
    uint8_t *p = pss->block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
        return 0;
    }

    if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        return 0;
    }

    stat64_add(&mig_stats.zero_pages, 1);

    if (migrate_mapped_ram()) {
        /* zero pages are not transferred with mapped-ram */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
        return 1;
    }

    len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
    qemu_put_byte(file, 0);
    len += 1;
    ram_release_page(pss->block->idstr, offset);
    ram_transferred_add(len);

    /*
     * Must let xbzrle know, otherwise a previous (now 0'd) cached
     * page would be stale.
     */
    if (rs->xbzrle_started) {
        XBZRLE_cache_lock();
        xbzrle_cache_zero_page(pss->block->offset + offset);
        XBZRLE_cache_unlock();
    }

    return len;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    if (migrate_mapped_ram()) {
        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                             offset | RAM_SAVE_FLAG_PAGE));
        if (async) {
            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
        }
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}

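/*
 * Note on the two paths above: with mapped-ram the page content is written
 * at a fixed file offset (block->pages_offset + offset) and recorded in
 * file_bmap, so the file layout mirrors guest memory; with the streaming
 * format the page simply follows its header in the migration stream.
 */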
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_started && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
{
    if (!multifd_queue_page(block, offset)) {
        return -1;
    }

    return 1;
}


#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (multifd_ram_sync_per_round()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_ram_flush_and_sync(f);
                if (ret < 0) {
                    return ret;
                }
            }

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically.
     * We don't have to track modifications via userfaultfd WP reliably,
     * because these pages will not be part of the migration stream either
     * way -- see ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in the page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

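/*
 * Rough shape of the write-tracking (background snapshot) flow implemented
 * in this file: create the userfaultfd, populate every page so the kernel
 * has page table entries to protect, then register and write-protect the
 * blocks below; poll_fault_page() later picks up faulting pages and
 * ram_save_release_protection() drops the protection once a saved range
 * has been flushed.
 */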
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                                               block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    g_assert_not_reached();
}

int ram_write_tracking_start(void)
{
    g_assert_not_reached();
}

void ram_write_tracking_stop(void)
{
    g_assert_not_reached();
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty = false;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when we have vCPUs that got blocked by the write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if it's
         * really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there
 * might be some left.  In case there is any page left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
                         Error **errp)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    stat64_add(&mig_stats.postcopy_requests, 1);
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, "
                   "start=" RAM_ADDR_FMT " len="
                   RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                   start, len, ramblock->used_length);
        return -1;
    }

    /*
     * When with postcopy preempt, we send back the page directly in the
     * rp-return thread.
1904 */ 1905 if (postcopy_preempt_active()) { 1906 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1907 size_t page_size = qemu_ram_pagesize(ramblock); 1908 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1909 int ret = 0; 1910 1911 qemu_mutex_lock(&rs->bitmap_mutex); 1912 1913 pss_init(pss, ramblock, page_start); 1914 /* 1915 * Always use the preempt channel, and make sure it's there. It's 1916 * safe to access without lock, because when rp-thread is running 1917 * we should be the only one who operates on the qemufile 1918 */ 1919 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1920 assert(pss->pss_channel); 1921 1922 /* 1923 * It must be either one or multiple of host page size. Just 1924 * assert; if something wrong we're mostly split brain anyway. 1925 */ 1926 assert(len % page_size == 0); 1927 while (len) { 1928 if (ram_save_host_page_urgent(pss)) { 1929 error_setg(errp, "ram_save_host_page_urgent() failed: " 1930 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 1931 ramblock->idstr, start); 1932 ret = -1; 1933 break; 1934 } 1935 /* 1936 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 1937 * will automatically be moved and point to the next host page 1938 * we're going to send, so no need to update here. 1939 * 1940 * Normally QEMU never sends >1 host page in requests, so 1941 * logically we don't even need that as the loop should only 1942 * run once, but just to be consistent. 1943 */ 1944 len -= page_size; 1945 }; 1946 qemu_mutex_unlock(&rs->bitmap_mutex); 1947 1948 return ret; 1949 } 1950 1951 struct RAMSrcPageRequest *new_entry = 1952 g_new0(struct RAMSrcPageRequest, 1); 1953 new_entry->rb = ramblock; 1954 new_entry->offset = start; 1955 new_entry->len = len; 1956 1957 memory_region_ref(ramblock->mr); 1958 qemu_mutex_lock(&rs->src_page_req_mutex); 1959 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1960 migration_make_urgent_request(); 1961 qemu_mutex_unlock(&rs->src_page_req_mutex); 1962 1963 return 0; 1964 } 1965 1966 /** 1967 * ram_save_target_page: save one target page to the precopy thread 1968 * OR to multifd workers. 1969 * 1970 * @rs: current RAM state 1971 * @pss: data about the page we want to send 1972 */ 1973 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss) 1974 { 1975 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1976 int res; 1977 1978 /* Hand over to RDMA first */ 1979 if (migrate_rdma()) { 1980 res = rdma_control_save_page(pss->pss_channel, pss->block->offset, 1981 offset, TARGET_PAGE_SIZE); 1982 1983 if (res == RAM_SAVE_CONTROL_DELAYED) { 1984 res = 1; 1985 } 1986 return res; 1987 } 1988 1989 if (!migrate_multifd() 1990 || migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { 1991 if (save_zero_page(rs, pss, offset)) { 1992 return 1; 1993 } 1994 } 1995 1996 if (migrate_multifd() && !migration_in_postcopy()) { 1997 return ram_save_multifd_page(pss->block, offset); 1998 } 1999 2000 return ram_save_page(rs, pss); 2001 } 2002 2003 /* Should be called before sending a host page */ 2004 static void pss_host_page_prepare(PageSearchStatus *pss) 2005 { 2006 /* How many guest pages are there in one host page? */ 2007 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2008 2009 pss->host_page_sending = true; 2010 if (guest_pfns <= 1) { 2011 /* 2012 * This covers both when guest psize == host psize, or when guest 2013 * has larger psize than the host (guest_pfns==0). 
2014 * 2015 * For the latter, we always send one whole guest page per 2016 * iteration of the host page (example: an Alpha VM on x86 host 2017 * will have guest psize 8K while host psize 4K). 2018 */ 2019 pss->host_page_start = pss->page; 2020 pss->host_page_end = pss->page + 1; 2021 } else { 2022 /* 2023 * The host page spans over multiple guest pages, we send them 2024 * within the same host page iteration. 2025 */ 2026 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2027 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2028 } 2029 } 2030 2031 /* 2032 * Whether the page pointed by PSS is within the host page being sent. 2033 * Must be called after a previous pss_host_page_prepare(). 2034 */ 2035 static bool pss_within_range(PageSearchStatus *pss) 2036 { 2037 ram_addr_t ram_addr; 2038 2039 assert(pss->host_page_sending); 2040 2041 /* Over host-page boundary? */ 2042 if (pss->page >= pss->host_page_end) { 2043 return false; 2044 } 2045 2046 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2047 2048 return offset_in_ramblock(pss->block, ram_addr); 2049 } 2050 2051 static void pss_host_page_finish(PageSearchStatus *pss) 2052 { 2053 pss->host_page_sending = false; 2054 /* This is not needed, but just to reset it */ 2055 pss->host_page_start = pss->host_page_end = 0; 2056 } 2057 2058 static void ram_page_hint_update(RAMState *rs, PageSearchStatus *pss) 2059 { 2060 PageLocationHint *hint = &rs->page_hint; 2061 2062 /* If there's a pending hint not consumed, don't bother */ 2063 if (hint->valid) { 2064 return; 2065 } 2066 2067 /* Provide a hint to the background stream otherwise */ 2068 hint->location.block = pss->block; 2069 hint->location.offset = pss->page; 2070 hint->valid = true; 2071 } 2072 2073 /* 2074 * Send an urgent host page specified by `pss'. Need to be called with 2075 * bitmap_mutex held. 2076 * 2077 * Returns 0 if save host page succeeded, false otherwise. 2078 */ 2079 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2080 { 2081 bool page_dirty, sent = false; 2082 RAMState *rs = ram_state; 2083 int ret = 0; 2084 2085 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2086 pss_host_page_prepare(pss); 2087 2088 /* 2089 * If precopy is sending the same page, let it be done in precopy, or 2090 * we could send the same page in two channels and none of them will 2091 * receive the whole page. 2092 */ 2093 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2094 trace_postcopy_preempt_hit(pss->block->idstr, 2095 pss->page << TARGET_PAGE_BITS); 2096 return 0; 2097 } 2098 2099 do { 2100 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2101 2102 if (page_dirty) { 2103 /* Be strict to return code; it must be 1, or what else? */ 2104 if (ram_save_target_page(rs, pss) != 1) { 2105 error_report_once("%s: ram_save_target_page failed", __func__); 2106 ret = -1; 2107 goto out; 2108 } 2109 sent = true; 2110 } 2111 pss_find_next_dirty(pss); 2112 } while (pss_within_range(pss)); 2113 out: 2114 pss_host_page_finish(pss); 2115 /* For urgent requests, flush immediately if sent */ 2116 if (sent) { 2117 qemu_fflush(pss->pss_channel); 2118 ram_page_hint_update(rs, pss); 2119 } 2120 return ret; 2121 } 2122 2123 /** 2124 * ram_save_host_page: save a whole host page 2125 * 2126 * Starting at *offset send pages up to the end of the current host 2127 * page. It's valid for the initial offset to point into the middle of 2128 * a host page in which case the remainder of the hostpage is sent. 
2129 * Only dirty target pages are sent. Note that the host page size may 2130 * be a huge page for this block. 2131 * 2132 * The saving stops at the boundary of the used_length of the block 2133 * if the RAMBlock isn't a multiple of the host page size. 2134 * 2135 * The caller must hold ram_state.bitmap_mutex when calling this 2136 * function. Note that this function can temporarily release the lock, but 2137 * it makes sure the lock is held again before it returns. 2138 * 2139 * Returns the number of pages written or negative on error 2140 * 2141 * @rs: current RAM state 2142 * @pss: data about the page we want to send 2143 */ 2144 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2145 { 2146 bool page_dirty, preempt_active = postcopy_preempt_active(); 2147 int tmppages, pages = 0; 2148 size_t pagesize_bits = 2149 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2150 unsigned long start_page = pss->page; 2151 int res; 2152 2153 if (migrate_ram_is_ignored(pss->block)) { 2154 error_report("block %s should not be migrated !", pss->block->idstr); 2155 return 0; 2156 } 2157 2158 /* Update host page boundary information */ 2159 pss_host_page_prepare(pss); 2160 2161 do { 2162 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2163 2164 /* Check whether the page is dirty and, if so, send it */ 2165 if (page_dirty) { 2166 /* 2167 * Properly yield the lock only in postcopy preempt mode, 2168 * because both the migration thread and the rp-return thread can 2169 * operate on the bitmaps. 2170 */ 2171 if (preempt_active) { 2172 qemu_mutex_unlock(&rs->bitmap_mutex); 2173 } 2174 tmppages = ram_save_target_page(rs, pss); 2175 if (tmppages >= 0) { 2176 pages += tmppages; 2177 /* 2178 * Allow rate limiting to happen in the middle of huge pages if 2179 * something is sent in the current iteration. 2180 */ 2181 if (pagesize_bits > 1 && tmppages > 0) { 2182 migration_rate_limit(); 2183 } 2184 } 2185 if (preempt_active) { 2186 qemu_mutex_lock(&rs->bitmap_mutex); 2187 } 2188 } else { 2189 tmppages = 0; 2190 } 2191 2192 if (tmppages < 0) { 2193 pss_host_page_finish(pss); 2194 return tmppages; 2195 } 2196 2197 pss_find_next_dirty(pss); 2198 } while (pss_within_range(pss)); 2199 2200 pss_host_page_finish(pss); 2201 2202 res = ram_save_release_protection(rs, pss, start_page); 2203 return (res < 0 ? res : pages); 2204 } 2205 2206 static bool ram_page_hint_valid(RAMState *rs) 2207 { 2208 /* Page hints only exist in postcopy preempt mode */ 2209 if (!postcopy_preempt_active()) { 2210 return false; 2211 } 2212 2213 return rs->page_hint.valid; 2214 } 2215 2216 static void ram_page_hint_collect(RAMState *rs, RAMBlock **block, 2217 unsigned long *page) 2218 { 2219 PageLocationHint *hint = &rs->page_hint; 2220 2221 assert(hint->valid); 2222 2223 *block = hint->location.block; 2224 *page = hint->location.offset; 2225 2226 /* Mark the hint consumed */ 2227 hint->valid = false; 2228 } 2229 2230 /** 2231 * ram_find_and_save_block: finds a dirty page and sends it to f 2232 * 2233 * Called within an RCU critical section. 2234 * 2235 * Returns the number of pages written where zero means no dirty pages, 2236 * or negative on error 2237 * 2238 * @rs: current RAM state 2239 * 2240 * On systems where host-page-size > target-page-size it will send all the 2241 * pages in a host page that are dirty.
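* Queued postcopy requests take priority over the linear scan; the scan itself resumes from the last position, or from a page hint when postcopy preempt provided one.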
2242 */ 2243 static int ram_find_and_save_block(RAMState *rs) 2244 { 2245 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2246 unsigned long next_page; 2247 RAMBlock *next_block; 2248 int pages = 0; 2249 2250 /* No dirty page as there is zero RAM */ 2251 if (!rs->ram_bytes_total) { 2252 return pages; 2253 } 2254 2255 /* 2256 * Always keep last_seen_block/last_page valid during this procedure, 2257 * because find_dirty_block() relies on these values (e.g., we compare 2258 * last_seen_block with pss.block to see whether we searched all the 2259 * ramblocks) to detect the completion of migration. Having NULL value 2260 * of last_seen_block can conditionally cause below loop to run forever. 2261 */ 2262 if (!rs->last_seen_block) { 2263 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2264 rs->last_page = 0; 2265 } 2266 2267 if (ram_page_hint_valid(rs)) { 2268 ram_page_hint_collect(rs, &next_block, &next_page); 2269 } else { 2270 next_block = rs->last_seen_block; 2271 next_page = rs->last_page; 2272 } 2273 2274 pss_init(pss, next_block, next_page); 2275 2276 while (true){ 2277 if (!get_queued_page(rs, pss)) { 2278 /* priority queue empty, so just search for something dirty */ 2279 int res = find_dirty_block(rs, pss); 2280 if (res != PAGE_DIRTY_FOUND) { 2281 if (res == PAGE_ALL_CLEAN) { 2282 break; 2283 } else if (res == PAGE_TRY_AGAIN) { 2284 continue; 2285 } else if (res < 0) { 2286 pages = res; 2287 break; 2288 } 2289 } 2290 } 2291 pages = ram_save_host_page(rs, pss); 2292 if (pages) { 2293 break; 2294 } 2295 } 2296 2297 rs->last_seen_block = pss->block; 2298 rs->last_page = pss->page; 2299 2300 return pages; 2301 } 2302 2303 static uint64_t ram_bytes_total_with_ignored(void) 2304 { 2305 RAMBlock *block; 2306 uint64_t total = 0; 2307 2308 RCU_READ_LOCK_GUARD(); 2309 2310 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2311 total += block->used_length; 2312 } 2313 return total; 2314 } 2315 2316 uint64_t ram_bytes_total(void) 2317 { 2318 RAMBlock *block; 2319 uint64_t total = 0; 2320 2321 RCU_READ_LOCK_GUARD(); 2322 2323 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2324 total += block->used_length; 2325 } 2326 return total; 2327 } 2328 2329 static void xbzrle_load_setup(void) 2330 { 2331 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2332 } 2333 2334 static void xbzrle_load_cleanup(void) 2335 { 2336 g_free(XBZRLE.decoded_buf); 2337 XBZRLE.decoded_buf = NULL; 2338 } 2339 2340 static void ram_state_cleanup(RAMState **rsp) 2341 { 2342 if (*rsp) { 2343 migration_page_queue_free(*rsp); 2344 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2345 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2346 g_free(*rsp); 2347 *rsp = NULL; 2348 } 2349 } 2350 2351 static void xbzrle_cleanup(void) 2352 { 2353 XBZRLE_cache_lock(); 2354 if (XBZRLE.cache) { 2355 cache_fini(XBZRLE.cache); 2356 g_free(XBZRLE.encoded_buf); 2357 g_free(XBZRLE.current_buf); 2358 g_free(XBZRLE.zero_target_page); 2359 XBZRLE.cache = NULL; 2360 XBZRLE.encoded_buf = NULL; 2361 XBZRLE.current_buf = NULL; 2362 XBZRLE.zero_target_page = NULL; 2363 } 2364 XBZRLE_cache_unlock(); 2365 } 2366 2367 static void ram_bitmaps_destroy(void) 2368 { 2369 RAMBlock *block; 2370 2371 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2372 g_free(block->clear_bmap); 2373 block->clear_bmap = NULL; 2374 g_free(block->bmap); 2375 block->bmap = NULL; 2376 g_free(block->file_bmap); 2377 block->file_bmap = NULL; 2378 } 2379 } 2380 2381 static void ram_save_cleanup(void *opaque) 2382 { 2383 RAMState **rsp = opaque; 2384 2385 /* We don't use dirty log with background snapshots */ 2386 if 
(!migrate_background_snapshot()) { 2387 /* caller have hold BQL or is in a bh, so there is 2388 * no writing race against the migration bitmap 2389 */ 2390 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2391 /* 2392 * do not stop dirty log without starting it, since 2393 * memory_global_dirty_log_stop will assert that 2394 * memory_global_dirty_log_start/stop used in pairs 2395 */ 2396 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2397 } 2398 } 2399 2400 ram_bitmaps_destroy(); 2401 2402 xbzrle_cleanup(); 2403 multifd_ram_save_cleanup(); 2404 ram_state_cleanup(rsp); 2405 } 2406 2407 static void ram_page_hint_reset(PageLocationHint *hint) 2408 { 2409 hint->location.block = NULL; 2410 hint->location.offset = 0; 2411 hint->valid = false; 2412 } 2413 2414 static void ram_state_reset(RAMState *rs) 2415 { 2416 int i; 2417 2418 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2419 rs->pss[i].last_sent_block = NULL; 2420 } 2421 2422 rs->last_seen_block = NULL; 2423 rs->last_page = 0; 2424 rs->last_version = ram_list.version; 2425 rs->xbzrle_started = false; 2426 2427 ram_page_hint_reset(&rs->page_hint); 2428 } 2429 2430 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2431 2432 /* **** functions for postcopy ***** */ 2433 2434 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2435 { 2436 struct RAMBlock *block; 2437 2438 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2439 unsigned long *bitmap = block->bmap; 2440 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2441 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2442 2443 while (run_start < range) { 2444 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2445 ram_discard_range(block->idstr, 2446 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2447 ((ram_addr_t)(run_end - run_start)) 2448 << TARGET_PAGE_BITS); 2449 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2450 } 2451 } 2452 } 2453 2454 /** 2455 * postcopy_send_discard_bm_ram: discard a RAMBlock 2456 * 2457 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2458 * 2459 * @ms: current migration state 2460 * @block: RAMBlock to discard 2461 */ 2462 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2463 { 2464 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2465 unsigned long current; 2466 unsigned long *bitmap = block->bmap; 2467 2468 for (current = 0; current < end; ) { 2469 unsigned long one = find_next_bit(bitmap, end, current); 2470 unsigned long zero, discard_length; 2471 2472 if (one >= end) { 2473 break; 2474 } 2475 2476 zero = find_next_zero_bit(bitmap, end, one + 1); 2477 2478 if (zero >= end) { 2479 discard_length = end - one; 2480 } else { 2481 discard_length = zero - one; 2482 } 2483 postcopy_discard_send_range(ms, one, discard_length); 2484 current = one + discard_length; 2485 } 2486 } 2487 2488 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2489 2490 /** 2491 * postcopy_each_ram_send_discard: discard all RAMBlocks 2492 * 2493 * Utility for the outgoing postcopy code. 2494 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2495 * passing it bitmap indexes and name. 
2496 * (qemu_ram_foreach_block ends up passing unscaled lengths 2497 * which would mean postcopy code would have to deal with target page) 2498 * 2499 * @ms: current migration state 2500 */ 2501 static void postcopy_each_ram_send_discard(MigrationState *ms) 2502 { 2503 struct RAMBlock *block; 2504 2505 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2506 postcopy_discard_send_init(ms, block->idstr); 2507 2508 /* 2509 * Deal with TPS != HPS and huge pages. It discard any partially sent 2510 * host-page size chunks, mark any partially dirty host-page size 2511 * chunks as all dirty. In this case the host-page is the host-page 2512 * for the particular RAMBlock, i.e. it might be a huge page. 2513 */ 2514 postcopy_chunk_hostpages_pass(ms, block); 2515 2516 /* 2517 * Postcopy sends chunks of bitmap over the wire, but it 2518 * just needs indexes at this point, avoids it having 2519 * target page specific code. 2520 */ 2521 postcopy_send_discard_bm_ram(ms, block); 2522 postcopy_discard_send_finish(ms); 2523 } 2524 } 2525 2526 /** 2527 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2528 * 2529 * Helper for postcopy_chunk_hostpages; it's called twice to 2530 * canonicalize the two bitmaps, that are similar, but one is 2531 * inverted. 2532 * 2533 * Postcopy requires that all target pages in a hostpage are dirty or 2534 * clean, not a mix. This function canonicalizes the bitmaps. 2535 * 2536 * @ms: current migration state 2537 * @block: block that contains the page we want to canonicalize 2538 */ 2539 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2540 { 2541 RAMState *rs = ram_state; 2542 unsigned long *bitmap = block->bmap; 2543 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2544 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2545 unsigned long run_start; 2546 2547 if (block->page_size == TARGET_PAGE_SIZE) { 2548 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2549 return; 2550 } 2551 2552 /* Find a dirty page */ 2553 run_start = find_next_bit(bitmap, pages, 0); 2554 2555 while (run_start < pages) { 2556 2557 /* 2558 * If the start of this run of pages is in the middle of a host 2559 * page, then we need to fixup this host page. 2560 */ 2561 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2562 /* Find the end of this run */ 2563 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2564 /* 2565 * If the end isn't at the start of a host page, then the 2566 * run doesn't finish at the end of a host page 2567 * and we need to discard. 2568 */ 2569 } 2570 2571 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2572 unsigned long page; 2573 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2574 host_ratio); 2575 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2576 2577 /* Clean up the bitmap */ 2578 for (page = fixup_start_addr; 2579 page < fixup_start_addr + host_ratio; page++) { 2580 /* 2581 * Remark them as dirty, updating the count for any pages 2582 * that weren't previously dirty. 
2583 */ 2584 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2585 } 2586 } 2587 2588 /* Find the next dirty page for the next iteration */ 2589 run_start = find_next_bit(bitmap, pages, run_start); 2590 } 2591 } 2592 2593 /** 2594 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2595 * 2596 * Transmit the set of pages to be discarded after precopy to the target; 2597 * these are pages that: 2598 * a) Have been previously transmitted but are now dirty again 2599 * b) Pages that have never been transmitted; this ensures that 2600 * any pages on the destination that have been mapped by background 2601 * tasks get discarded (transparent huge pages are the specific concern) 2602 * Hopefully this is pretty sparse 2603 * 2604 * @ms: current migration state 2605 */ 2606 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2607 { 2608 RAMState *rs = ram_state; 2609 2610 RCU_READ_LOCK_GUARD(); 2611 2612 /* This should be our last sync, the src is now paused */ 2613 migration_bitmap_sync(rs, false); 2614 2615 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2616 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2617 rs->last_seen_block = NULL; 2618 rs->last_page = 0; 2619 2620 postcopy_each_ram_send_discard(ms); 2621 2622 trace_ram_postcopy_send_discard_bitmap(); 2623 } 2624 2625 /** 2626 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2627 * 2628 * Returns zero on success 2629 * 2630 * @rbname: name of the RAMBlock of the request. NULL means the 2631 * same as the last one. 2632 * @start: start offset within the RAMBlock 2633 * @length: length (in bytes) to discard 2634 */ 2635 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2636 { 2637 trace_ram_discard_range(rbname, start, length); 2638 2639 RCU_READ_LOCK_GUARD(); 2640 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2641 2642 if (!rb) { 2643 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2644 return -1; 2645 } 2646 2647 /* 2648 * On the source VM, we don't need to update the received bitmap since 2649 * we don't even have one. 2650 */ 2651 if (rb->receivedmap) { 2652 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2653 length >> qemu_target_page_bits()); 2654 } 2655 2656 return ram_block_discard_range(rb, start, length); 2657 } 2658 2659 /* 2660 * For every allocation, we will try not to crash the VM if the 2661 * allocation fails.
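* That is why the g_try_*() allocators are used below: they return NULL on failure instead of aborting, so the error can be reported via errp.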
2662 */ 2663 static bool xbzrle_init(Error **errp) 2664 { 2665 if (!migrate_xbzrle()) { 2666 return true; 2667 } 2668 2669 XBZRLE_cache_lock(); 2670 2671 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2672 if (!XBZRLE.zero_target_page) { 2673 error_setg(errp, "%s: Error allocating zero page", __func__); 2674 goto err_out; 2675 } 2676 2677 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2678 TARGET_PAGE_SIZE, errp); 2679 if (!XBZRLE.cache) { 2680 goto free_zero_page; 2681 } 2682 2683 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2684 if (!XBZRLE.encoded_buf) { 2685 error_setg(errp, "%s: Error allocating encoded_buf", __func__); 2686 goto free_cache; 2687 } 2688 2689 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2690 if (!XBZRLE.current_buf) { 2691 error_setg(errp, "%s: Error allocating current_buf", __func__); 2692 goto free_encoded_buf; 2693 } 2694 2695 /* We are all good */ 2696 XBZRLE_cache_unlock(); 2697 return true; 2698 2699 free_encoded_buf: 2700 g_free(XBZRLE.encoded_buf); 2701 XBZRLE.encoded_buf = NULL; 2702 free_cache: 2703 cache_fini(XBZRLE.cache); 2704 XBZRLE.cache = NULL; 2705 free_zero_page: 2706 g_free(XBZRLE.zero_target_page); 2707 XBZRLE.zero_target_page = NULL; 2708 err_out: 2709 XBZRLE_cache_unlock(); 2710 return false; 2711 } 2712 2713 static bool ram_state_init(RAMState **rsp, Error **errp) 2714 { 2715 *rsp = g_try_new0(RAMState, 1); 2716 2717 if (!*rsp) { 2718 error_setg(errp, "%s: Init ramstate fail", __func__); 2719 return false; 2720 } 2721 2722 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2723 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2724 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2725 (*rsp)->ram_bytes_total = ram_bytes_total(); 2726 2727 /* 2728 * Count the total number of pages used by ram blocks not including any 2729 * gaps due to alignment or unplugs. 2730 * This must match with the initial values of dirty bitmap. 2731 */ 2732 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2733 ram_state_reset(*rsp); 2734 2735 return true; 2736 } 2737 2738 static void ram_list_init_bitmaps(void) 2739 { 2740 MigrationState *ms = migrate_get_current(); 2741 RAMBlock *block; 2742 unsigned long pages; 2743 uint8_t shift; 2744 2745 /* Skip setting bitmap if there is no RAM */ 2746 if (ram_bytes_total()) { 2747 shift = ms->clear_bitmap_shift; 2748 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2749 error_report("clear_bitmap_shift (%u) too big, using " 2750 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2751 shift = CLEAR_BITMAP_SHIFT_MAX; 2752 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2753 error_report("clear_bitmap_shift (%u) too small, using " 2754 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2755 shift = CLEAR_BITMAP_SHIFT_MIN; 2756 } 2757 2758 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2759 pages = block->max_length >> TARGET_PAGE_BITS; 2760 /* 2761 * The initial dirty bitmap for migration must be set with all 2762 * ones to make sure we'll migrate every guest RAM page to 2763 * destination. 2764 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2765 * new migration after a failed migration, ram_list. 2766 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2767 * guest memory. 
2768 */ 2769 block->bmap = bitmap_new(pages); 2770 bitmap_set(block->bmap, 0, pages); 2771 if (migrate_mapped_ram()) { 2772 block->file_bmap = bitmap_new(pages); 2773 } 2774 block->clear_bmap_shift = shift; 2775 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2776 } 2777 } 2778 } 2779 2780 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2781 { 2782 unsigned long pages; 2783 RAMBlock *rb; 2784 2785 RCU_READ_LOCK_GUARD(); 2786 2787 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2788 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2789 rs->migration_dirty_pages -= pages; 2790 } 2791 } 2792 2793 static bool ram_init_bitmaps(RAMState *rs, Error **errp) 2794 { 2795 bool ret = true; 2796 2797 qemu_mutex_lock_ramlist(); 2798 2799 WITH_RCU_READ_LOCK_GUARD() { 2800 ram_list_init_bitmaps(); 2801 /* We don't use dirty log with background snapshots */ 2802 if (!migrate_background_snapshot()) { 2803 ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp); 2804 if (!ret) { 2805 goto out_unlock; 2806 } 2807 migration_bitmap_sync_precopy(false); 2808 } 2809 } 2810 out_unlock: 2811 qemu_mutex_unlock_ramlist(); 2812 2813 if (!ret) { 2814 ram_bitmaps_destroy(); 2815 return false; 2816 } 2817 2818 /* 2819 * After an eventual first bitmap sync, fixup the initial bitmap 2820 * containing all 1s to exclude any discarded pages from migration. 2821 */ 2822 migration_bitmap_clear_discarded_pages(rs); 2823 return true; 2824 } 2825 2826 static int ram_init_all(RAMState **rsp, Error **errp) 2827 { 2828 if (!ram_state_init(rsp, errp)) { 2829 return -1; 2830 } 2831 2832 if (!xbzrle_init(errp)) { 2833 ram_state_cleanup(rsp); 2834 return -1; 2835 } 2836 2837 if (!ram_init_bitmaps(*rsp, errp)) { 2838 return -1; 2839 } 2840 2841 return 0; 2842 } 2843 2844 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2845 { 2846 RAMBlock *block; 2847 uint64_t pages = 0; 2848 2849 /* 2850 * Postcopy is not using xbzrle/compression, so no need for that. 2851 * Also, since source are already halted, we don't need to care 2852 * about dirty page logging as well. 2853 */ 2854 2855 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2856 pages += bitmap_count_one(block->bmap, 2857 block->used_length >> TARGET_PAGE_BITS); 2858 } 2859 2860 /* This may not be aligned with current bitmaps. Recalculate. */ 2861 rs->migration_dirty_pages = pages; 2862 2863 ram_state_reset(rs); 2864 2865 /* Update RAMState cache of output QEMUFile */ 2866 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2867 2868 trace_ram_state_resume_prepare(pages); 2869 } 2870 2871 /* 2872 * This function clears bits of the free pages reported by the caller from the 2873 * migration dirty bitmap. @addr is the host address corresponding to the 2874 * start of the continuous guest free pages, and @len is the total bytes of 2875 * those pages. 2876 */ 2877 void qemu_guest_free_page_hint(void *addr, size_t len) 2878 { 2879 RAMBlock *block; 2880 ram_addr_t offset; 2881 size_t used_len, start, npages; 2882 2883 /* This function is currently expected to be used during live migration */ 2884 if (!migration_is_running()) { 2885 return; 2886 } 2887 2888 for (; len > 0; len -= used_len, addr += used_len) { 2889 block = qemu_ram_block_from_host(addr, false, &offset); 2890 if (unlikely(!block || offset >= block->used_length)) { 2891 /* 2892 * The implementation might not support RAMBlock resize during 2893 * live migration, but it could happen in theory with future 2894 * updates. So we add a check here to capture that case. 
2895 */ 2896 error_report_once("%s unexpected error", __func__); 2897 return; 2898 } 2899 2900 if (len <= block->used_length - offset) { 2901 used_len = len; 2902 } else { 2903 used_len = block->used_length - offset; 2904 } 2905 2906 start = offset >> TARGET_PAGE_BITS; 2907 npages = used_len >> TARGET_PAGE_BITS; 2908 2909 qemu_mutex_lock(&ram_state->bitmap_mutex); 2910 /* 2911 * The skipped free pages are equivalent to being sent from clear_bmap's 2912 * perspective, so clear the bits from the memory region bitmap which 2913 * are initially set. Otherwise those skipped pages will be sent in 2914 * the next round after syncing from the memory region bitmap. 2915 */ 2916 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2917 ram_state->migration_dirty_pages -= 2918 bitmap_count_one_with_offset(block->bmap, start, npages); 2919 bitmap_clear(block->bmap, start, npages); 2920 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2921 } 2922 } 2923 2924 #define MAPPED_RAM_HDR_VERSION 1 2925 struct MappedRamHeader { 2926 uint32_t version; 2927 /* 2928 * The target's page size, so we know how many pages are in the 2929 * bitmap. 2930 */ 2931 uint64_t page_size; 2932 /* 2933 * The offset in the migration file where the pages bitmap is 2934 * stored. 2935 */ 2936 uint64_t bitmap_offset; 2937 /* 2938 * The offset in the migration file where the actual pages (data) 2939 * are stored. 2940 */ 2941 uint64_t pages_offset; 2942 } QEMU_PACKED; 2943 typedef struct MappedRamHeader MappedRamHeader; 2944 2945 static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) 2946 { 2947 g_autofree MappedRamHeader *header = NULL; 2948 size_t header_size, bitmap_size; 2949 long num_pages; 2950 2951 header = g_new0(MappedRamHeader, 1); 2952 header_size = sizeof(MappedRamHeader); 2953 2954 num_pages = block->used_length >> TARGET_PAGE_BITS; 2955 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 2956 2957 /* 2958 * Save the file offsets of where the bitmap and the pages should 2959 * go as they are written at the end of migration and during the 2960 * iterative phase, respectively.
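* The resulting per-RAMBlock layout in the file is: header, then the page bitmap at bitmap_offset, then padding up to pages_offset (aligned to MAPPED_RAM_FILE_OFFSET_ALIGNMENT), then the page data itself.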
2961 */ 2962 block->bitmap_offset = qemu_get_offset(file) + header_size; 2963 block->pages_offset = ROUND_UP(block->bitmap_offset + 2964 bitmap_size, 2965 MAPPED_RAM_FILE_OFFSET_ALIGNMENT); 2966 2967 header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); 2968 header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); 2969 header->bitmap_offset = cpu_to_be64(block->bitmap_offset); 2970 header->pages_offset = cpu_to_be64(block->pages_offset); 2971 2972 qemu_put_buffer(file, (uint8_t *) header, header_size); 2973 2974 /* prepare offset for next ramblock */ 2975 qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); 2976 } 2977 2978 static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, 2979 Error **errp) 2980 { 2981 size_t ret, header_size = sizeof(MappedRamHeader); 2982 2983 ret = qemu_get_buffer(file, (uint8_t *)header, header_size); 2984 if (ret != header_size) { 2985 error_setg(errp, "Could not read whole mapped-ram migration header " 2986 "(expected %zd, got %zd bytes)", header_size, ret); 2987 return false; 2988 } 2989 2990 /* migration stream is big-endian */ 2991 header->version = be32_to_cpu(header->version); 2992 2993 if (header->version > MAPPED_RAM_HDR_VERSION) { 2994 error_setg(errp, "Migration mapped-ram capability version not " 2995 "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, 2996 header->version); 2997 return false; 2998 } 2999 3000 header->page_size = be64_to_cpu(header->page_size); 3001 header->bitmap_offset = be64_to_cpu(header->bitmap_offset); 3002 header->pages_offset = be64_to_cpu(header->pages_offset); 3003 3004 return true; 3005 } 3006 3007 /* 3008 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 3009 * long-running RCU critical section. When rcu-reclaims in the code 3010 * start to become numerous it will be necessary to reduce the 3011 * granularity of these critical sections. 3012 */ 3013 3014 /** 3015 * ram_save_setup: Setup RAM for migration 3016 * 3017 * Returns zero to indicate success and negative for error 3018 * 3019 * @f: QEMUFile where to send the data 3020 * @opaque: RAMState pointer 3021 * @errp: pointer to Error*, to store an error if it happens. 3022 */ 3023 static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) 3024 { 3025 RAMState **rsp = opaque; 3026 RAMBlock *block; 3027 int ret, max_hg_page_size; 3028 3029 /* migration has already setup the bitmap, reuse it. */ 3030 if (!migration_in_colo_state()) { 3031 if (ram_init_all(rsp, errp) != 0) { 3032 return -1; 3033 } 3034 } 3035 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 3036 3037 /* 3038 * ??? Mirrors the previous value of qemu_host_page_size, 3039 * but is this really what was intended for the migration? 
3040 */ 3041 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 3042 3043 WITH_RCU_READ_LOCK_GUARD() { 3044 qemu_put_be64(f, ram_bytes_total_with_ignored() 3045 | RAM_SAVE_FLAG_MEM_SIZE); 3046 3047 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3048 qemu_put_byte(f, strlen(block->idstr)); 3049 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3050 qemu_put_be64(f, block->used_length); 3051 if (migrate_postcopy_ram() && 3052 block->page_size != max_hg_page_size) { 3053 qemu_put_be64(f, block->page_size); 3054 } 3055 if (migrate_ignore_shared()) { 3056 qemu_put_be64(f, block->mr->addr); 3057 } 3058 3059 if (migrate_mapped_ram()) { 3060 mapped_ram_setup_ramblock(f, block); 3061 } 3062 } 3063 } 3064 3065 ret = rdma_registration_start(f, RAM_CONTROL_SETUP); 3066 if (ret < 0) { 3067 error_setg(errp, "%s: failed to start RDMA registration", __func__); 3068 qemu_file_set_error(f, ret); 3069 return ret; 3070 } 3071 3072 ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); 3073 if (ret < 0) { 3074 error_setg(errp, "%s: failed to stop RDMA registration", __func__); 3075 qemu_file_set_error(f, ret); 3076 return ret; 3077 } 3078 3079 if (migrate_multifd()) { 3080 multifd_ram_save_setup(); 3081 } 3082 3083 /* 3084 * This operation is unfortunate.. 3085 * 3086 * For legacy QEMUs using per-section sync 3087 * ======================================= 3088 * 3089 * This must exist because the EOS below requires the SYNC messages 3090 * per-channel to work. 3091 * 3092 * For modern QEMUs using per-round sync 3093 * ===================================== 3094 * 3095 * Logically such sync is not needed, and recv threads should not run 3096 * until setup ready (using things like channels_ready on src). Then 3097 * we should be all fine. 3098 * 3099 * However even if we add channels_ready to recv side in new QEMUs, old 3100 * QEMU won't have them so this sync will still be needed to make sure 3101 * multifd recv threads won't start processing guest pages early before 3102 * ram_load_setup() is properly done. 3103 * 3104 * Let's stick with this. Fortunately the overhead is low to sync 3105 * during setup because the VM is running, so at least it's not 3106 * accounted as part of downtime. 3107 */ 3108 bql_unlock(); 3109 ret = multifd_ram_flush_and_sync(f); 3110 bql_lock(); 3111 if (ret < 0) { 3112 error_setg(errp, "%s: multifd synchronization failed", __func__); 3113 return ret; 3114 } 3115 3116 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3117 ret = qemu_fflush(f); 3118 if (ret < 0) { 3119 error_setg_errno(errp, -ret, "%s failed", __func__); 3120 } 3121 return ret; 3122 } 3123 3124 static void ram_save_file_bmap(QEMUFile *f) 3125 { 3126 RAMBlock *block; 3127 3128 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3129 long num_pages = block->used_length >> TARGET_PAGE_BITS; 3130 long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3131 3132 qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, 3133 block->bitmap_offset); 3134 ram_transferred_add(bitmap_size); 3135 3136 /* 3137 * Free the bitmap here to catch any synchronization issues 3138 * with multifd channels. No channels should be sending pages 3139 * after we've written the bitmap to file. 
3140 */ 3141 g_free(block->file_bmap); 3142 block->file_bmap = NULL; 3143 } 3144 } 3145 3146 void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) 3147 { 3148 if (set) { 3149 set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3150 } else { 3151 clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3152 } 3153 } 3154 3155 /** 3156 * ram_save_iterate: iterative stage for migration 3157 * 3158 * Returns zero to indicate success and negative for error 3159 * 3160 * @f: QEMUFile where to send the data 3161 * @opaque: RAMState pointer 3162 */ 3163 static int ram_save_iterate(QEMUFile *f, void *opaque) 3164 { 3165 RAMState **temp = opaque; 3166 RAMState *rs = *temp; 3167 int ret = 0; 3168 int i; 3169 int64_t t0; 3170 int done = 0; 3171 3172 /* 3173 * We'll hold this lock for a little while, but it's okay for two reasons. 3174 * Firstly, the only other thread that may take it is the one calling 3175 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3176 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3177 * guarantees that we'll at least release it on a regular basis. 3178 */ 3179 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 3180 WITH_RCU_READ_LOCK_GUARD() { 3181 if (ram_list.version != rs->last_version) { 3182 ram_state_reset(rs); 3183 } 3184 3185 /* Read version before ram_list.blocks */ 3186 smp_rmb(); 3187 3188 ret = rdma_registration_start(f, RAM_CONTROL_ROUND); 3189 if (ret < 0) { 3190 qemu_file_set_error(f, ret); 3191 goto out; 3192 } 3193 3194 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3195 i = 0; 3196 while ((ret = migration_rate_exceeded(f)) == 0 || 3197 postcopy_has_request(rs)) { 3198 int pages; 3199 3200 if (qemu_file_get_error(f)) { 3201 break; 3202 } 3203 3204 pages = ram_find_and_save_block(rs); 3205 /* no more pages to send */ 3206 if (pages == 0) { 3207 done = 1; 3208 break; 3209 } 3210 3211 if (pages < 0) { 3212 qemu_file_set_error(f, pages); 3213 break; 3214 } 3215 3216 rs->target_page_count += pages; 3217 3218 /* 3219 * We want to check in the 1st loop, just in case it was the 1st 3220 * time and we had to sync the dirty bitmap. 3221 * qemu_clock_get_ns() is a bit expensive, so we only check every 3222 * few iterations. 3223 */ 3224 if ((i & 63) == 0) { 3225 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3226 1000000; 3227 if (t1 > MAX_WAIT) { 3228 trace_ram_save_iterate_big_wait(t1, i); 3229 break; 3230 } 3231 } 3232 i++; 3233 } 3234 } 3235 } 3236 3237 /* 3238 * Must occur before EOS (or any QEMUFile operation) 3239 * because of RDMA protocol.
3240 */ 3241 ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); 3242 if (ret < 0) { 3243 qemu_file_set_error(f, ret); 3244 } 3245 3246 out: 3247 if (ret >= 0 && migration_is_running()) { 3248 if (multifd_ram_sync_per_section()) { 3249 ret = multifd_ram_flush_and_sync(f); 3250 if (ret < 0) { 3251 return ret; 3252 } 3253 } 3254 3255 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3256 ram_transferred_add(8); 3257 ret = qemu_fflush(f); 3258 } 3259 if (ret < 0) { 3260 return ret; 3261 } 3262 3263 return done; 3264 } 3265 3266 /** 3267 * ram_save_complete: function called to send the remaining amount of ram 3268 * 3269 * Returns zero to indicate success or negative on error 3270 * 3271 * Called with the BQL 3272 * 3273 * @f: QEMUFile where to send the data 3274 * @opaque: RAMState pointer 3275 */ 3276 static int ram_save_complete(QEMUFile *f, void *opaque) 3277 { 3278 RAMState **temp = opaque; 3279 RAMState *rs = *temp; 3280 int ret = 0; 3281 3282 rs->last_stage = !migration_in_colo_state(); 3283 3284 WITH_RCU_READ_LOCK_GUARD() { 3285 if (!migration_in_postcopy()) { 3286 migration_bitmap_sync_precopy(true); 3287 } 3288 3289 ret = rdma_registration_start(f, RAM_CONTROL_FINISH); 3290 if (ret < 0) { 3291 qemu_file_set_error(f, ret); 3292 return ret; 3293 } 3294 3295 /* try transferring iterative blocks of memory */ 3296 3297 /* flush all remaining blocks regardless of rate limiting */ 3298 qemu_mutex_lock(&rs->bitmap_mutex); 3299 while (true) { 3300 int pages; 3301 3302 pages = ram_find_and_save_block(rs); 3303 /* no more blocks to sent */ 3304 if (pages == 0) { 3305 break; 3306 } 3307 if (pages < 0) { 3308 qemu_mutex_unlock(&rs->bitmap_mutex); 3309 return pages; 3310 } 3311 } 3312 qemu_mutex_unlock(&rs->bitmap_mutex); 3313 3314 ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); 3315 if (ret < 0) { 3316 qemu_file_set_error(f, ret); 3317 return ret; 3318 } 3319 } 3320 3321 if (multifd_ram_sync_per_section()) { 3322 /* 3323 * Only the old dest QEMU will need this sync, because each EOS 3324 * will require one SYNC message on each channel. 
3325 */ 3326 ret = multifd_ram_flush_and_sync(f); 3327 if (ret < 0) { 3328 return ret; 3329 } 3330 } 3331 3332 if (migrate_mapped_ram()) { 3333 ram_save_file_bmap(f); 3334 3335 if (qemu_file_get_error(f)) { 3336 Error *local_err = NULL; 3337 int err = qemu_file_get_error_obj(f, &local_err); 3338 3339 error_reportf_err(local_err, "Failed to write bitmap to file: "); 3340 return -err; 3341 } 3342 } 3343 3344 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3345 return qemu_fflush(f); 3346 } 3347 3348 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3349 uint64_t *can_postcopy) 3350 { 3351 RAMState **temp = opaque; 3352 RAMState *rs = *temp; 3353 3354 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3355 3356 if (migrate_postcopy_ram()) { 3357 /* We can do postcopy, and all the data is postcopiable */ 3358 *can_postcopy += remaining_size; 3359 } else { 3360 *must_precopy += remaining_size; 3361 } 3362 } 3363 3364 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3365 uint64_t *can_postcopy) 3366 { 3367 RAMState **temp = opaque; 3368 RAMState *rs = *temp; 3369 uint64_t remaining_size; 3370 3371 if (!migration_in_postcopy()) { 3372 bql_lock(); 3373 WITH_RCU_READ_LOCK_GUARD() { 3374 migration_bitmap_sync_precopy(false); 3375 } 3376 bql_unlock(); 3377 } 3378 3379 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3380 3381 if (migrate_postcopy_ram()) { 3382 /* We can do postcopy, and all the data is postcopiable */ 3383 *can_postcopy += remaining_size; 3384 } else { 3385 *must_precopy += remaining_size; 3386 } 3387 } 3388 3389 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3390 { 3391 unsigned int xh_len; 3392 int xh_flags; 3393 uint8_t *loaded_data; 3394 3395 /* extract RLE header */ 3396 xh_flags = qemu_get_byte(f); 3397 xh_len = qemu_get_be16(f); 3398 3399 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3400 error_report("Failed to load XBZRLE page - wrong compression!"); 3401 return -1; 3402 } 3403 3404 if (xh_len > TARGET_PAGE_SIZE) { 3405 error_report("Failed to load XBZRLE page - len overflow!"); 3406 return -1; 3407 } 3408 loaded_data = XBZRLE.decoded_buf; 3409 /* load data and decode */ 3410 /* it can change loaded_data to point to an internal buffer */ 3411 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3412 3413 /* decode RLE */ 3414 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3415 TARGET_PAGE_SIZE) == -1) { 3416 error_report("Failed to load XBZRLE page - decode error!"); 3417 return -1; 3418 } 3419 3420 return 0; 3421 } 3422 3423 /** 3424 * ram_block_from_stream: read a RAMBlock id from the migration stream 3425 * 3426 * Must be called from within a rcu critical section. 3427 * 3428 * Returns a pointer from within the RCU-protected ram_list. 
3429 * 3430 * @mis: the migration incoming state pointer 3431 * @f: QEMUFile where to read the data from 3432 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3433 * @channel: the channel we're using 3434 */ 3435 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3436 QEMUFile *f, int flags, 3437 int channel) 3438 { 3439 RAMBlock *block = mis->last_recv_block[channel]; 3440 char id[256]; 3441 uint8_t len; 3442 3443 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3444 if (!block) { 3445 error_report("Ack, bad migration stream!"); 3446 return NULL; 3447 } 3448 return block; 3449 } 3450 3451 len = qemu_get_byte(f); 3452 qemu_get_buffer(f, (uint8_t *)id, len); 3453 id[len] = 0; 3454 3455 block = qemu_ram_block_by_name(id); 3456 if (!block) { 3457 error_report("Can't find block %s", id); 3458 return NULL; 3459 } 3460 3461 if (migrate_ram_is_ignored(block)) { 3462 error_report("block %s should not be migrated !", id); 3463 return NULL; 3464 } 3465 3466 mis->last_recv_block[channel] = block; 3467 3468 return block; 3469 } 3470 3471 static inline void *host_from_ram_block_offset(RAMBlock *block, 3472 ram_addr_t offset) 3473 { 3474 if (!offset_in_ramblock(block, offset)) { 3475 return NULL; 3476 } 3477 3478 return block->host + offset; 3479 } 3480 3481 static void *host_page_from_ram_block_offset(RAMBlock *block, 3482 ram_addr_t offset) 3483 { 3484 /* Note: Explicitly no check against offset_in_ramblock(). */ 3485 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3486 block->page_size); 3487 } 3488 3489 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3490 ram_addr_t offset) 3491 { 3492 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3493 } 3494 3495 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3496 { 3497 qemu_mutex_lock(&ram_state->bitmap_mutex); 3498 for (int i = 0; i < pages; i++) { 3499 ram_addr_t offset = normal[i]; 3500 ram_state->migration_dirty_pages += !test_and_set_bit( 3501 offset >> TARGET_PAGE_BITS, 3502 block->bmap); 3503 } 3504 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3505 } 3506 3507 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3508 ram_addr_t offset, bool record_bitmap) 3509 { 3510 if (!offset_in_ramblock(block, offset)) { 3511 return NULL; 3512 } 3513 if (!block->colo_cache) { 3514 error_report("%s: colo_cache is NULL in block :%s", 3515 __func__, block->idstr); 3516 return NULL; 3517 } 3518 3519 /* 3520 * During colo checkpoint, we need bitmap of these migrated pages. 3521 * It help us to decide which pages in ram cache should be flushed 3522 * into VM's RAM later. 3523 */ 3524 if (record_bitmap) { 3525 colo_record_bitmap(block, &offset, 1); 3526 } 3527 return block->colo_cache + offset; 3528 } 3529 3530 /** 3531 * ram_handle_zero: handle the zero page case 3532 * 3533 * If a page (or a whole RDMA chunk) has been 3534 * determined to be zero, then zap it. 3535 * 3536 * @host: host address for the zero page 3537 * @ch: what the page is filled from. 
We only support zero 3538 * @size: size of the zero page 3539 */ 3540 void ram_handle_zero(void *host, uint64_t size) 3541 { 3542 if (!buffer_is_zero(host, size)) { 3543 memset(host, 0, size); 3544 } 3545 } 3546 3547 static void colo_init_ram_state(void) 3548 { 3549 Error *local_err = NULL; 3550 3551 if (!ram_state_init(&ram_state, &local_err)) { 3552 error_report_err(local_err); 3553 } 3554 } 3555 3556 /* 3557 * COLO cache: this is for the secondary VM; we cache the whole 3558 * memory of the secondary VM. The global lock needs to be held 3559 * to call this helper. 3560 */ 3561 int colo_init_ram_cache(void) 3562 { 3563 RAMBlock *block; 3564 3565 WITH_RCU_READ_LOCK_GUARD() { 3566 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3567 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3568 NULL, false, false); 3569 if (!block->colo_cache) { 3570 error_report("%s: Can't alloc memory for COLO cache of block %s," 3571 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3572 block->used_length); 3573 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3574 if (block->colo_cache) { 3575 qemu_anon_ram_free(block->colo_cache, block->used_length); 3576 block->colo_cache = NULL; 3577 } 3578 } 3579 return -errno; 3580 } 3581 if (!machine_dump_guest_core(current_machine)) { 3582 qemu_madvise(block->colo_cache, block->used_length, 3583 QEMU_MADV_DONTDUMP); 3584 } 3585 } 3586 } 3587 3588 /* 3589 * Record the dirty pages sent by the PVM; we use this dirty bitmap 3590 * to decide which pages in the cache should be flushed into the SVM's RAM. Here 3591 * we use the same name 'ram_bitmap' as for migration. 3592 */ 3593 if (ram_bytes_total()) { 3594 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3595 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3596 block->bmap = bitmap_new(pages); 3597 } 3598 } 3599 3600 colo_init_ram_state(); 3601 return 0; 3602 } 3603 3604 /* TODO: duplicated with ram_init_bitmaps */ 3605 void colo_incoming_start_dirty_log(void) 3606 { 3607 RAMBlock *block = NULL; 3608 Error *local_err = NULL; 3609 3610 /* For memory_global_dirty_log_start below. */ 3611 bql_lock(); 3612 qemu_mutex_lock_ramlist(); 3613 3614 memory_global_dirty_log_sync(false); 3615 WITH_RCU_READ_LOCK_GUARD() { 3616 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3617 ramblock_sync_dirty_bitmap(ram_state, block); 3618 /* Discard this dirty bitmap record */ 3619 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3620 } 3621 if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, 3622 &local_err)) { 3623 error_report_err(local_err); 3624 } 3625 } 3626 ram_state->migration_dirty_pages = 0; 3627 qemu_mutex_unlock_ramlist(); 3628 bql_unlock(); 3629 } 3630 3631 /* The global lock needs to be held to call this helper */ 3632 void colo_release_ram_cache(void) 3633 { 3634 RAMBlock *block; 3635 3636 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3637 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3638 g_free(block->bmap); 3639 block->bmap = NULL; 3640 } 3641 3642 WITH_RCU_READ_LOCK_GUARD() { 3643 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3644 if (block->colo_cache) { 3645 qemu_anon_ram_free(block->colo_cache, block->used_length); 3646 block->colo_cache = NULL; 3647 } 3648 } 3649 } 3650 ram_state_cleanup(&ram_state); 3651 } 3652 3653 /** 3654 * ram_load_setup: Setup RAM for migration incoming side 3655 * 3656 * Returns zero to indicate success and negative for error 3657 * 3658 * @f: QEMUFile where to receive the data 3659 * @opaque: RAMState pointer 3660 * @errp: pointer to Error*, to store an error if it happens.
3661 */ 3662 static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp) 3663 { 3664 xbzrle_load_setup(); 3665 ramblock_recv_map_init(); 3666 3667 return 0; 3668 } 3669 3670 static int ram_load_cleanup(void *opaque) 3671 { 3672 RAMBlock *rb; 3673 3674 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3675 qemu_ram_block_writeback(rb); 3676 } 3677 3678 xbzrle_load_cleanup(); 3679 3680 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3681 g_free(rb->receivedmap); 3682 rb->receivedmap = NULL; 3683 } 3684 3685 return 0; 3686 } 3687 3688 /** 3689 * ram_postcopy_incoming_init: allocate postcopy data structures 3690 * 3691 * Returns 0 for success and negative if there was one error 3692 * 3693 * @mis: current migration incoming state 3694 * 3695 * Allocate data structures etc needed by incoming migration with 3696 * postcopy-ram. postcopy-ram's similarly names 3697 * postcopy_ram_incoming_init does the work. 3698 */ 3699 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3700 { 3701 return postcopy_ram_incoming_init(mis); 3702 } 3703 3704 /** 3705 * ram_load_postcopy: load a page in postcopy case 3706 * 3707 * Returns 0 for success or -errno in case of error 3708 * 3709 * Called in postcopy mode by ram_load(). 3710 * rcu_read_lock is taken prior to this being called. 3711 * 3712 * @f: QEMUFile where to send the data 3713 * @channel: the channel to use for loading 3714 */ 3715 int ram_load_postcopy(QEMUFile *f, int channel) 3716 { 3717 int flags = 0, ret = 0; 3718 bool place_needed = false; 3719 bool matches_target_page_size = false; 3720 MigrationIncomingState *mis = migration_incoming_get_current(); 3721 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3722 3723 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3724 ram_addr_t addr; 3725 void *page_buffer = NULL; 3726 void *place_source = NULL; 3727 RAMBlock *block = NULL; 3728 uint8_t ch; 3729 3730 addr = qemu_get_be64(f); 3731 3732 /* 3733 * If qemu file error, we should stop here, and then "addr" 3734 * may be invalid 3735 */ 3736 ret = qemu_file_get_error(f); 3737 if (ret) { 3738 break; 3739 } 3740 3741 flags = addr & ~TARGET_PAGE_MASK; 3742 addr &= TARGET_PAGE_MASK; 3743 3744 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3745 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { 3746 block = ram_block_from_stream(mis, f, flags, channel); 3747 if (!block) { 3748 ret = -EINVAL; 3749 break; 3750 } 3751 3752 /* 3753 * Relying on used_length is racy and can result in false positives. 3754 * We might place pages beyond used_length in case RAM was shrunk 3755 * while in postcopy, which is fine - trying to place via 3756 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3757 */ 3758 if (!block->host || addr >= block->postcopy_length) { 3759 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3760 ret = -EINVAL; 3761 break; 3762 } 3763 tmp_page->target_pages++; 3764 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3765 /* 3766 * Postcopy requires that we place whole host pages atomically; 3767 * these may be huge pages for RAMBlocks that are backed by 3768 * hugetlbfs. 3769 * To make it atomic, the data is read into a temporary page 3770 * that's moved into place later. 3771 * The migration protocol uses, possibly smaller, target-pages 3772 * however the source ensures it always sends all the components 3773 * of a host page in one chunk. 
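* The actual placement is done further below via postcopy_place_page() or postcopy_place_page_zero() once the last target page of the host page has been received.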
3774 */ 3775 page_buffer = tmp_page->tmp_huge_page + 3776 host_page_offset_from_ram_block_offset(block, addr); 3777 /* If all TP are zero then we can optimise the place */ 3778 if (tmp_page->target_pages == 1) { 3779 tmp_page->host_addr = 3780 host_page_from_ram_block_offset(block, addr); 3781 } else if (tmp_page->host_addr != 3782 host_page_from_ram_block_offset(block, addr)) { 3783 /* not the 1st TP within the HP */ 3784 error_report("Non-same host page detected on channel %d: " 3785 "Target host page %p, received host page %p " 3786 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3787 channel, tmp_page->host_addr, 3788 host_page_from_ram_block_offset(block, addr), 3789 block->idstr, addr, tmp_page->target_pages); 3790 ret = -EINVAL; 3791 break; 3792 } 3793 3794 /* 3795 * If it's the last part of a host page then we place the host 3796 * page 3797 */ 3798 if (tmp_page->target_pages == 3799 (block->page_size / TARGET_PAGE_SIZE)) { 3800 place_needed = true; 3801 } 3802 place_source = tmp_page->tmp_huge_page; 3803 } 3804 3805 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3806 case RAM_SAVE_FLAG_ZERO: 3807 ch = qemu_get_byte(f); 3808 if (ch != 0) { 3809 error_report("Found a zero page with value %d", ch); 3810 ret = -EINVAL; 3811 break; 3812 } 3813 /* 3814 * Can skip to set page_buffer when 3815 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3816 */ 3817 if (!matches_target_page_size) { 3818 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3819 } 3820 break; 3821 3822 case RAM_SAVE_FLAG_PAGE: 3823 tmp_page->all_zero = false; 3824 if (!matches_target_page_size) { 3825 /* For huge pages, we always use temporary buffer */ 3826 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3827 } else { 3828 /* 3829 * For small pages that matches target page size, we 3830 * avoid the qemu_file copy. Instead we directly use 3831 * the buffer of QEMUFile to place the page. Note: we 3832 * cannot do any QEMUFile operation before using that 3833 * buffer to make sure the buffer is valid when 3834 * placing the page. 3835 */ 3836 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3837 TARGET_PAGE_SIZE); 3838 } 3839 break; 3840 case RAM_SAVE_FLAG_EOS: 3841 break; 3842 default: 3843 error_report("Unknown combination of migration flags: 0x%x" 3844 " (postcopy mode)", flags); 3845 ret = -EINVAL; 3846 break; 3847 } 3848 3849 /* Detect for any possible file errors */ 3850 if (!ret && qemu_file_get_error(f)) { 3851 ret = qemu_file_get_error(f); 3852 } 3853 3854 if (!ret && place_needed) { 3855 if (tmp_page->all_zero) { 3856 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3857 } else { 3858 ret = postcopy_place_page(mis, tmp_page->host_addr, 3859 place_source, block); 3860 } 3861 place_needed = false; 3862 postcopy_temp_page_reset(tmp_page); 3863 } 3864 } 3865 3866 return ret; 3867 } 3868 3869 static bool postcopy_is_running(void) 3870 { 3871 PostcopyState ps = postcopy_state_get(); 3872 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3873 } 3874 3875 /* 3876 * Flush content of RAM cache into SVM's memory. 3877 * Only flush the pages that be dirtied by PVM or SVM or both. 
3878 */ 3879 void colo_flush_ram_cache(void) 3880 { 3881 RAMBlock *block = NULL; 3882 void *dst_host; 3883 void *src_host; 3884 unsigned long offset = 0; 3885 3886 memory_global_dirty_log_sync(false); 3887 qemu_mutex_lock(&ram_state->bitmap_mutex); 3888 WITH_RCU_READ_LOCK_GUARD() { 3889 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3890 ramblock_sync_dirty_bitmap(ram_state, block); 3891 } 3892 } 3893 3894 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages); 3895 WITH_RCU_READ_LOCK_GUARD() { 3896 block = QLIST_FIRST_RCU(&ram_list.blocks); 3897 3898 while (block) { 3899 unsigned long num = 0; 3900 3901 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num); 3902 if (!offset_in_ramblock(block, 3903 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) { 3904 offset = 0; 3905 num = 0; 3906 block = QLIST_NEXT_RCU(block, next); 3907 } else { 3908 unsigned long i = 0; 3909 3910 for (i = 0; i < num; i++) { 3911 migration_bitmap_clear_dirty(ram_state, block, offset + i); 3912 } 3913 dst_host = block->host 3914 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3915 src_host = block->colo_cache 3916 + (((ram_addr_t)offset) << TARGET_PAGE_BITS); 3917 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num); 3918 offset += num; 3919 } 3920 } 3921 } 3922 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3923 trace_colo_flush_ram_cache_end(); 3924 } 3925 3926 static size_t ram_load_multifd_pages(void *host_addr, size_t size, 3927 uint64_t offset) 3928 { 3929 MultiFDRecvData *data = multifd_get_recv_data(); 3930 3931 data->opaque = host_addr; 3932 data->file_offset = offset; 3933 data->size = size; 3934 3935 if (!multifd_recv()) { 3936 return 0; 3937 } 3938 3939 return size; 3940 } 3941 3942 static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, 3943 long num_pages, unsigned long *bitmap, 3944 Error **errp) 3945 { 3946 ERRP_GUARD(); 3947 unsigned long set_bit_idx, clear_bit_idx; 3948 ram_addr_t offset; 3949 void *host; 3950 size_t read, unread, size; 3951 3952 for (set_bit_idx = find_first_bit(bitmap, num_pages); 3953 set_bit_idx < num_pages; 3954 set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) { 3955 3956 clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1); 3957 3958 unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx); 3959 offset = set_bit_idx << TARGET_PAGE_BITS; 3960 3961 while (unread > 0) { 3962 host = host_from_ram_block_offset(block, offset); 3963 if (!host) { 3964 error_setg(errp, "page outside of ramblock %s range", 3965 block->idstr); 3966 return false; 3967 } 3968 3969 size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE); 3970 3971 if (migrate_multifd()) { 3972 read = ram_load_multifd_pages(host, size, 3973 block->pages_offset + offset); 3974 } else { 3975 read = qemu_get_buffer_at(f, host, size, 3976 block->pages_offset + offset); 3977 } 3978 3979 if (!read) { 3980 goto err; 3981 } 3982 offset += read; 3983 unread -= read; 3984 } 3985 } 3986 3987 return true; 3988 3989 err: 3990 qemu_file_get_error_obj(f, errp); 3991 error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT 3992 "from file offset %" PRIx64 ": ", block->idstr, offset, 3993 block->pages_offset + offset); 3994 return false; 3995 } 3996 3997 static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block, 3998 ram_addr_t length, Error **errp) 3999 { 4000 g_autofree unsigned long *bitmap = NULL; 4001 MappedRamHeader header; 4002 size_t bitmap_size; 4003 long num_pages; 4004 4005 if (!mapped_ram_read_header(f, &header, errp)) { 4006 return; 4007 } 4008 4009 block->pages_offset = 
header.pages_offset; 4010 4011 /* 4012 * Check the alignment of the file region that contains pages. We 4013 * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that 4014 * value to change in the future. Do only a sanity check with page 4015 * size alignment. 4016 */ 4017 if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) { 4018 error_setg(errp, 4019 "Error reading ramblock %s pages, region has bad alignment", 4020 block->idstr); 4021 return; 4022 } 4023 4024 num_pages = length / header.page_size; 4025 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 4026 4027 bitmap = g_malloc0(bitmap_size); 4028 if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size, 4029 header.bitmap_offset) != bitmap_size) { 4030 error_setg(errp, "Error reading dirty bitmap"); 4031 return; 4032 } 4033 4034 if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) { 4035 return; 4036 } 4037 4038 /* Skip pages array */ 4039 qemu_set_offset(f, block->pages_offset + length, SEEK_SET); 4040 } 4041 4042 static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length) 4043 { 4044 int ret = 0; 4045 /* ADVISE is earlier, it shows the source has the postcopy capability on */ 4046 bool postcopy_advised = migration_incoming_postcopy_advised(); 4047 int max_hg_page_size; 4048 Error *local_err = NULL; 4049 4050 assert(block); 4051 4052 if (migrate_mapped_ram()) { 4053 parse_ramblock_mapped_ram(f, block, length, &local_err); 4054 if (local_err) { 4055 error_report_err(local_err); 4056 return -EINVAL; 4057 } 4058 return 0; 4059 } 4060 4061 if (!qemu_ram_is_migratable(block)) { 4062 error_report("block %s should not be migrated !", block->idstr); 4063 return -EINVAL; 4064 } 4065 4066 if (length != block->used_length) { 4067 ret = qemu_ram_resize(block, length, &local_err); 4068 if (local_err) { 4069 error_report_err(local_err); 4070 return ret; 4071 } 4072 } 4073 4074 /* 4075 * ??? Mirrors the previous value of qemu_host_page_size, 4076 * but is this really what was intended for the migration? 
4077 */ 4078 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 4079 4080 /* For postcopy we need to check hugepage sizes match */ 4081 if (postcopy_advised && migrate_postcopy_ram() && 4082 block->page_size != max_hg_page_size) { 4083 uint64_t remote_page_size = qemu_get_be64(f); 4084 if (remote_page_size != block->page_size) { 4085 error_report("Mismatched RAM page size %s " 4086 "(local) %zd != %" PRId64, block->idstr, 4087 block->page_size, remote_page_size); 4088 return -EINVAL; 4089 } 4090 } 4091 if (migrate_ignore_shared()) { 4092 hwaddr addr = qemu_get_be64(f); 4093 if (migrate_ram_is_ignored(block) && 4094 block->mr->addr != addr) { 4095 error_report("Mismatched GPAs for block %s " 4096 "%" PRId64 "!= %" PRId64, block->idstr, 4097 (uint64_t)addr, (uint64_t)block->mr->addr); 4098 return -EINVAL; 4099 } 4100 } 4101 ret = rdma_block_notification_handle(f, block->idstr); 4102 if (ret < 0) { 4103 qemu_file_set_error(f, ret); 4104 } 4105 4106 return ret; 4107 } 4108 4109 static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes) 4110 { 4111 int ret = 0; 4112 4113 /* Synchronize RAM block list */ 4114 while (!ret && total_ram_bytes) { 4115 RAMBlock *block; 4116 char id[256]; 4117 ram_addr_t length; 4118 int len = qemu_get_byte(f); 4119 4120 qemu_get_buffer(f, (uint8_t *)id, len); 4121 id[len] = 0; 4122 length = qemu_get_be64(f); 4123 4124 block = qemu_ram_block_by_name(id); 4125 if (block) { 4126 ret = parse_ramblock(f, block, length); 4127 } else { 4128 error_report("Unknown ramblock \"%s\", cannot accept " 4129 "migration", id); 4130 ret = -EINVAL; 4131 } 4132 total_ram_bytes -= length; 4133 } 4134 4135 return ret; 4136 } 4137 4138 /** 4139 * ram_load_precopy: load pages in precopy case 4140 * 4141 * Returns 0 for success or -errno in case of error 4142 * 4143 * Called in precopy mode by ram_load(). 4144 * rcu_read_lock is taken prior to this being called. 
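 *
 * The stream is a sequence of (page address | flags) records; the loop
 * below keeps reading until it sees RAM_SAVE_FLAG_EOS or a stream error.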
4145 * 4146 * @f: QEMUFile where to send the data 4147 */ 4148 static int ram_load_precopy(QEMUFile *f) 4149 { 4150 MigrationIncomingState *mis = migration_incoming_get_current(); 4151 int flags = 0, ret = 0, invalid_flags = 0, i = 0; 4152 4153 if (migrate_mapped_ram()) { 4154 invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH | 4155 RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE | 4156 RAM_SAVE_FLAG_ZERO); 4157 } 4158 4159 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 4160 ram_addr_t addr; 4161 void *host = NULL, *host_bak = NULL; 4162 uint8_t ch; 4163 4164 /* 4165 * Yield periodically to let main loop run, but an iteration of 4166 * the main loop is expensive, so do it each some iterations 4167 */ 4168 if ((i & 32767) == 0 && qemu_in_coroutine()) { 4169 aio_co_schedule(qemu_get_current_aio_context(), 4170 qemu_coroutine_self()); 4171 qemu_coroutine_yield(); 4172 } 4173 i++; 4174 4175 addr = qemu_get_be64(f); 4176 ret = qemu_file_get_error(f); 4177 if (ret) { 4178 error_report("Getting RAM address failed"); 4179 break; 4180 } 4181 4182 flags = addr & ~TARGET_PAGE_MASK; 4183 addr &= TARGET_PAGE_MASK; 4184 4185 if (flags & invalid_flags) { 4186 error_report("Unexpected RAM flags: %d", flags & invalid_flags); 4187 4188 ret = -EINVAL; 4189 break; 4190 } 4191 4192 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE | 4193 RAM_SAVE_FLAG_XBZRLE)) { 4194 RAMBlock *block = ram_block_from_stream(mis, f, flags, 4195 RAM_CHANNEL_PRECOPY); 4196 4197 host = host_from_ram_block_offset(block, addr); 4198 /* 4199 * After going into COLO stage, we should not load the page 4200 * into SVM's memory directly, we put them into colo_cache firstly. 4201 * NOTE: We need to keep a copy of SVM's ram in colo_cache. 4202 * Previously, we copied all these memory in preparing stage of COLO 4203 * while we need to stop VM, which is a time-consuming process. 4204 * Here we optimize it by a trick, back-up every page while in 4205 * migration process while COLO is enabled, though it affects the 4206 * speed of the migration, but it obviously reduce the downtime of 4207 * back-up all SVM'S memory in COLO preparing stage. 4208 */ 4209 if (migration_incoming_colo_enabled()) { 4210 if (migration_incoming_in_colo_state()) { 4211 /* In COLO stage, put all pages into cache temporarily */ 4212 host = colo_cache_from_block_offset(block, addr, true); 4213 } else { 4214 /* 4215 * In migration stage but before COLO stage, 4216 * Put all pages into both cache and SVM's memory. 4217 */ 4218 host_bak = colo_cache_from_block_offset(block, addr, false); 4219 } 4220 } 4221 if (!host) { 4222 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 4223 ret = -EINVAL; 4224 break; 4225 } 4226 if (!migration_incoming_in_colo_state()) { 4227 ramblock_recv_bitmap_set(block, host); 4228 } 4229 4230 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host); 4231 } 4232 4233 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 4234 case RAM_SAVE_FLAG_MEM_SIZE: 4235 ret = parse_ramblocks(f, addr); 4236 /* 4237 * For mapped-ram migration (to a file) using multifd, we sync 4238 * once and for all here to make sure all tasks we queued to 4239 * multifd threads are completed, so that all the ramblocks 4240 * (including all the guest memory pages within) are fully 4241 * loaded after this sync returns. 
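 * (RAM_SAVE_FLAG_MULTIFD_FLUSH is rejected for mapped-ram and the sync
 * on RAM_SAVE_FLAG_EOS is skipped for it, so this is the only point at
 * which the multifd receive threads are drained.)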
4242 */ 4243 if (migrate_mapped_ram()) { 4244 multifd_recv_sync_main(); 4245 } 4246 break; 4247 4248 case RAM_SAVE_FLAG_ZERO: 4249 ch = qemu_get_byte(f); 4250 if (ch != 0) { 4251 error_report("Found a zero page with value %d", ch); 4252 ret = -EINVAL; 4253 break; 4254 } 4255 ram_handle_zero(host, TARGET_PAGE_SIZE); 4256 break; 4257 4258 case RAM_SAVE_FLAG_PAGE: 4259 qemu_get_buffer(f, host, TARGET_PAGE_SIZE); 4260 break; 4261 4262 case RAM_SAVE_FLAG_XBZRLE: 4263 if (load_xbzrle(f, addr, host) < 0) { 4264 error_report("Failed to decompress XBZRLE page at " 4265 RAM_ADDR_FMT, addr); 4266 ret = -EINVAL; 4267 break; 4268 } 4269 break; 4270 case RAM_SAVE_FLAG_MULTIFD_FLUSH: 4271 multifd_recv_sync_main(); 4272 break; 4273 case RAM_SAVE_FLAG_EOS: 4274 /* normal exit */ 4275 if (migrate_multifd() && 4276 migrate_multifd_flush_after_each_section() && 4277 /* 4278 * Mapped-ram migration flushes once and for all after 4279 * parsing ramblocks. Always ignore EOS for it. 4280 */ 4281 !migrate_mapped_ram()) { 4282 multifd_recv_sync_main(); 4283 } 4284 break; 4285 case RAM_SAVE_FLAG_HOOK: 4286 ret = rdma_registration_handle(f); 4287 if (ret < 0) { 4288 qemu_file_set_error(f, ret); 4289 } 4290 break; 4291 default: 4292 error_report("Unknown combination of migration flags: 0x%x", flags); 4293 ret = -EINVAL; 4294 } 4295 if (!ret) { 4296 ret = qemu_file_get_error(f); 4297 } 4298 if (!ret && host_bak) { 4299 memcpy(host_bak, host, TARGET_PAGE_SIZE); 4300 } 4301 } 4302 4303 return ret; 4304 } 4305 4306 static int ram_load(QEMUFile *f, void *opaque, int version_id) 4307 { 4308 int ret = 0; 4309 static uint64_t seq_iter; 4310 /* 4311 * If system is running in postcopy mode, page inserts to host memory must 4312 * be atomic 4313 */ 4314 bool postcopy_running = postcopy_is_running(); 4315 4316 seq_iter++; 4317 4318 if (version_id != 4) { 4319 return -EINVAL; 4320 } 4321 4322 /* 4323 * This RCU critical section can be very long running. 4324 * When RCU reclaims in the code start to become numerous, 4325 * it will be necessary to reduce the granularity of this 4326 * critical section. 4327 */ 4328 trace_ram_load_start(); 4329 WITH_RCU_READ_LOCK_GUARD() { 4330 if (postcopy_running) { 4331 /* 4332 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of 4333 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to 4334 * service fast page faults. 4335 */ 4336 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY); 4337 } else { 4338 ret = ram_load_precopy(f); 4339 } 4340 } 4341 trace_ram_load_complete(ret, seq_iter); 4342 4343 return ret; 4344 } 4345 4346 static bool ram_has_postcopy(void *opaque) 4347 { 4348 RAMBlock *rb; 4349 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 4350 if (ramblock_is_pmem(rb)) { 4351 info_report("Block: %s, host: %p is a nvdimm memory, postcopy" 4352 "is not supported now!", rb->idstr, rb->host); 4353 return false; 4354 } 4355 } 4356 4357 return migrate_postcopy_ram(); 4358 } 4359 4360 /* Sync all the dirty bitmap with destination VM. 
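 * A recv-bitmap request is sent for every ramblock on the outgoing
 * stream; we then block on the return path until each block has been
 * answered (the replies are handled by ram_dirty_bitmap_reload()).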
*/ 4361 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs) 4362 { 4363 RAMBlock *block; 4364 QEMUFile *file = s->to_dst_file; 4365 4366 trace_ram_dirty_bitmap_sync_start(); 4367 4368 qatomic_set(&rs->postcopy_bmap_sync_requested, 0); 4369 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 4370 qemu_savevm_send_recv_bitmap(file, block->idstr); 4371 trace_ram_dirty_bitmap_request(block->idstr); 4372 qatomic_inc(&rs->postcopy_bmap_sync_requested); 4373 } 4374 4375 trace_ram_dirty_bitmap_sync_wait(); 4376 4377 /* Wait until all the ramblocks' dirty bitmap synced */ 4378 while (qatomic_read(&rs->postcopy_bmap_sync_requested)) { 4379 if (migration_rp_wait(s)) { 4380 return -1; 4381 } 4382 } 4383 4384 trace_ram_dirty_bitmap_sync_complete(); 4385 4386 return 0; 4387 } 4388 4389 /* 4390 * Read the received bitmap, revert it as the initial dirty bitmap. 4391 * This is only used when the postcopy migration is paused but wants 4392 * to resume from a middle point. 4393 * 4394 * Returns true if succeeded, false for errors. 4395 */ 4396 bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp) 4397 { 4398 /* from_dst_file is always valid because we're within rp_thread */ 4399 QEMUFile *file = s->rp_state.from_dst_file; 4400 g_autofree unsigned long *le_bitmap = NULL; 4401 unsigned long nbits = block->used_length >> TARGET_PAGE_BITS; 4402 uint64_t local_size = DIV_ROUND_UP(nbits, 8); 4403 uint64_t size, end_mark; 4404 RAMState *rs = ram_state; 4405 4406 trace_ram_dirty_bitmap_reload_begin(block->idstr); 4407 4408 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) { 4409 error_setg(errp, "Reload bitmap in incorrect state %s", 4410 MigrationStatus_str(s->state)); 4411 return false; 4412 } 4413 4414 /* 4415 * Note: see comments in ramblock_recv_bitmap_send() on why we 4416 * need the endianness conversion, and the paddings. 4417 */ 4418 local_size = ROUND_UP(local_size, 8); 4419 4420 /* Add paddings */ 4421 le_bitmap = bitmap_new(nbits + BITS_PER_LONG); 4422 4423 size = qemu_get_be64(file); 4424 4425 /* The size of the bitmap should match with our ramblock */ 4426 if (size != local_size) { 4427 error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64 4428 " != 0x%"PRIx64")", block->idstr, size, local_size); 4429 return false; 4430 } 4431 4432 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size); 4433 end_mark = qemu_get_be64(file); 4434 4435 if (qemu_file_get_error(file) || size != local_size) { 4436 error_setg(errp, "read bitmap failed for ramblock '%s': " 4437 "(size 0x%"PRIx64", got: 0x%"PRIx64")", 4438 block->idstr, local_size, size); 4439 return false; 4440 } 4441 4442 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) { 4443 error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64, 4444 block->idstr, end_mark); 4445 return false; 4446 } 4447 4448 /* 4449 * Endianness conversion. We are during postcopy (though paused). 4450 * The dirty bitmap won't change. We can directly modify it. 4451 */ 4452 bitmap_from_le(block->bmap, le_bitmap, nbits); 4453 4454 /* 4455 * What we received is "received bitmap". Revert it as the initial 4456 * dirty bitmap for this ramblock. 4457 */ 4458 bitmap_complement(block->bmap, block->bmap, nbits); 4459 4460 /* Clear dirty bits of discarded ranges that we don't want to migrate. */ 4461 ramblock_dirty_bitmap_clear_discarded_pages(block); 4462 4463 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). 
*/ 4464 trace_ram_dirty_bitmap_reload_complete(block->idstr); 4465 4466 qatomic_dec(&rs->postcopy_bmap_sync_requested); 4467 4468 /* 4469 * We succeeded to sync bitmap for current ramblock. Always kick the 4470 * migration thread to check whether all requested bitmaps are 4471 * reloaded. NOTE: it's racy to only kick when requested==0, because 4472 * we don't know whether the migration thread may still be increasing 4473 * it. 4474 */ 4475 migration_rp_kick(s); 4476 4477 return true; 4478 } 4479 4480 static int ram_resume_prepare(MigrationState *s, void *opaque) 4481 { 4482 RAMState *rs = *(RAMState **)opaque; 4483 int ret; 4484 4485 ret = ram_dirty_bitmap_sync_all(s, rs); 4486 if (ret) { 4487 return ret; 4488 } 4489 4490 ram_state_resume_prepare(rs, s->to_dst_file); 4491 4492 return 0; 4493 } 4494 4495 static bool ram_save_postcopy_prepare(QEMUFile *f, void *opaque, Error **errp) 4496 { 4497 int ret; 4498 4499 if (migrate_multifd()) { 4500 /* 4501 * When multifd is enabled, source QEMU needs to make sure all the 4502 * pages queued before postcopy starts have been flushed. 4503 * 4504 * The load of these pages must happen before switching to postcopy. 4505 * It's because loading of guest pages (so far) in multifd recv 4506 * threads is still non-atomic, so the load cannot happen with vCPUs 4507 * running on the destination side. 4508 * 4509 * This flush and sync will guarantee that those pages are loaded 4510 * _before_ postcopy starts on the destination. The rationale is, 4511 * this happens before VM stops (and before source QEMU sends all 4512 * the rest of the postcopy messages). So when the destination QEMU 4513 * receives the postcopy messages, it must have received the sync 4514 * message on the main channel (either RAM_SAVE_FLAG_MULTIFD_FLUSH, 4515 * or RAM_SAVE_FLAG_EOS), and such message would guarantee that 4516 * all previous guest pages queued in the multifd channels are 4517 * completely loaded. 
4518 */ 4519 ret = multifd_ram_flush_and_sync(f); 4520 if (ret < 0) { 4521 error_setg(errp, "%s: multifd flush and sync failed", __func__); 4522 return false; 4523 } 4524 } 4525 4526 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 4527 4528 return true; 4529 } 4530 4531 void postcopy_preempt_shutdown_file(MigrationState *s) 4532 { 4533 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS); 4534 qemu_fflush(s->postcopy_qemufile_src); 4535 } 4536 4537 static SaveVMHandlers savevm_ram_handlers = { 4538 .save_setup = ram_save_setup, 4539 .save_live_iterate = ram_save_iterate, 4540 .save_live_complete_postcopy = ram_save_complete, 4541 .save_live_complete_precopy = ram_save_complete, 4542 .has_postcopy = ram_has_postcopy, 4543 .state_pending_exact = ram_state_pending_exact, 4544 .state_pending_estimate = ram_state_pending_estimate, 4545 .load_state = ram_load, 4546 .save_cleanup = ram_save_cleanup, 4547 .load_setup = ram_load_setup, 4548 .load_cleanup = ram_load_cleanup, 4549 .resume_prepare = ram_resume_prepare, 4550 .save_postcopy_prepare = ram_save_postcopy_prepare, 4551 }; 4552 4553 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host, 4554 size_t old_size, size_t new_size) 4555 { 4556 PostcopyState ps = postcopy_state_get(); 4557 ram_addr_t offset; 4558 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset); 4559 Error *err = NULL; 4560 4561 if (!rb) { 4562 error_report("RAM block not found"); 4563 return; 4564 } 4565 4566 if (migrate_ram_is_ignored(rb)) { 4567 return; 4568 } 4569 4570 if (migration_is_running()) { 4571 /* 4572 * Precopy code on the source cannot deal with the size of RAM blocks 4573 * changing at random points in time - especially after sending the 4574 * RAM block sizes in the migration stream, they must no longer change. 4575 * Abort and indicate a proper reason. 4576 */ 4577 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr); 4578 migrate_set_error(migrate_get_current(), err); 4579 error_free(err); 4580 4581 migration_cancel(); 4582 } 4583 4584 switch (ps) { 4585 case POSTCOPY_INCOMING_ADVISE: 4586 /* 4587 * Update what ram_postcopy_incoming_init()->init_range() does at the 4588 * time postcopy was advised. Syncing RAM blocks with the source will 4589 * result in RAM resizes. 4590 */ 4591 if (old_size < new_size) { 4592 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) { 4593 error_report("RAM block '%s' discard of resized RAM failed", 4594 rb->idstr); 4595 } 4596 } 4597 rb->postcopy_length = new_size; 4598 break; 4599 case POSTCOPY_INCOMING_NONE: 4600 case POSTCOPY_INCOMING_RUNNING: 4601 case POSTCOPY_INCOMING_END: 4602 /* 4603 * Once our guest is running, postcopy does no longer care about 4604 * resizes. When growing, the new memory was not available on the 4605 * source, no handler needed. 4606 */ 4607 break; 4608 default: 4609 error_report("RAM block '%s' resized during postcopy state: %d", 4610 rb->idstr, ps); 4611 exit(-1); 4612 } 4613 } 4614 4615 static RAMBlockNotifier ram_mig_ram_notifier = { 4616 .ram_block_resized = ram_mig_ram_block_resized, 4617 }; 4618 4619 void ram_mig_init(void) 4620 { 4621 qemu_mutex_init(&XBZRLE.lock); 4622 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state); 4623 ram_block_notifier_add(&ram_mig_ram_notifier); 4624 } 4625
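/*
 * Illustrative sketch (not part of the build): how the mapped-ram load
 * path above (read_ramblock_mapped_ram()) walks a per-block dirty bitmap
 * in runs of consecutive set bits.  The helper name and the callback are
 * hypothetical and exist only for this example; the bitmap primitives come
 * from "qemu/bitops.h", which this file already includes.  The real loader
 * additionally splits each run into MAPPED_RAM_LOAD_BUF_SIZE-sized reads
 * and can hand them to the multifd receive threads.
 */
#if 0
typedef void (*DirtyRunFn)(unsigned long first_page, unsigned long npages,
                           void *opaque);

static void walk_dirty_runs(const unsigned long *bitmap,
                            unsigned long num_pages,
                            DirtyRunFn fn, void *opaque)
{
    unsigned long set_bit_idx, clear_bit_idx;

    for (set_bit_idx = find_first_bit(bitmap, num_pages);
         set_bit_idx < num_pages;
         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
        /* The run ends at the first clear bit after its first set bit */
        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);

        /* [set_bit_idx, clear_bit_idx) is one contiguous run of dirty pages */
        fn(set_bit_idx, clear_bit_idx - set_bit_idx, opaque);
    }
}
#endif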