1 /* 2 * QEMU System Emulator 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2011-2015 Red Hat Inc 6 * 7 * Authors: 8 * Juan Quintela <quintela@redhat.com> 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include "qemu/cutils.h" 31 #include "qemu/bitops.h" 32 #include "qemu/bitmap.h" 33 #include "qemu/madvise.h" 34 #include "qemu/main-loop.h" 35 #include "xbzrle.h" 36 #include "ram.h" 37 #include "migration.h" 38 #include "migration-stats.h" 39 #include "migration/register.h" 40 #include "migration/misc.h" 41 #include "qemu-file.h" 42 #include "postcopy-ram.h" 43 #include "page_cache.h" 44 #include "qemu/error-report.h" 45 #include "qapi/error.h" 46 #include "qapi/qapi-types-migration.h" 47 #include "qapi/qapi-events-migration.h" 48 #include "qapi/qapi-commands-migration.h" 49 #include "qapi/qmp/qerror.h" 50 #include "trace.h" 51 #include "exec/ram_addr.h" 52 #include "exec/target_page.h" 53 #include "qemu/rcu_queue.h" 54 #include "migration/colo.h" 55 #include "system/cpu-throttle.h" 56 #include "savevm.h" 57 #include "qemu/iov.h" 58 #include "multifd.h" 59 #include "system/runstate.h" 60 #include "rdma.h" 61 #include "options.h" 62 #include "system/dirtylimit.h" 63 #include "system/kvm.h" 64 65 #include "hw/boards.h" /* for machine_dump_guest_core() */ 66 67 #if defined(__linux__) 68 #include "qemu/userfaultfd.h" 69 #endif /* defined(__linux__) */ 70 71 /***********************************************************/ 72 /* ram save/restore */ 73 74 /* 75 * mapped-ram migration supports O_DIRECT, so we need to make sure the 76 * userspace buffer, the IO operation size and the file offset are 77 * aligned according to the underlying device's block size. The first 78 * two are already aligned to page size, but we need to add padding to 79 * the file to align the offset. We cannot read the block size 80 * dynamically because the migration file can be moved between 81 * different systems, so use 1M to cover most block sizes and to keep 82 * the file offset aligned at page size as well. 83 */ 84 #define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000 85 86 /* 87 * When doing mapped-ram migration, this is the amount we read from 88 * the pages region in the migration file at a time. 
89 */ 90 #define MAPPED_RAM_LOAD_BUF_SIZE 0x100000 91 92 XBZRLECacheStats xbzrle_counters; 93 94 /* used by the search for pages to send */ 95 struct PageSearchStatus { 96 /* The migration channel used for a specific host page */ 97 QEMUFile *pss_channel; 98 /* Last block from where we have sent data */ 99 RAMBlock *last_sent_block; 100 /* Current block being searched */ 101 RAMBlock *block; 102 /* Current page to search from */ 103 unsigned long page; 104 /* Set once we wrap around */ 105 bool complete_round; 106 /* Whether we're sending a host page */ 107 bool host_page_sending; 108 /* The start/end of current host page. Invalid if host_page_sending==false */ 109 unsigned long host_page_start; 110 unsigned long host_page_end; 111 }; 112 typedef struct PageSearchStatus PageSearchStatus; 113 114 /* struct contains XBZRLE cache and a static page 115 used by the compression */ 116 static struct { 117 /* buffer used for XBZRLE encoding */ 118 uint8_t *encoded_buf; 119 /* buffer for storing page content */ 120 uint8_t *current_buf; 121 /* Cache for XBZRLE, Protected by lock. */ 122 PageCache *cache; 123 QemuMutex lock; 124 /* it will store a page full of zeros */ 125 uint8_t *zero_target_page; 126 /* buffer used for XBZRLE decoding */ 127 uint8_t *decoded_buf; 128 } XBZRLE; 129 130 static void XBZRLE_cache_lock(void) 131 { 132 if (migrate_xbzrle()) { 133 qemu_mutex_lock(&XBZRLE.lock); 134 } 135 } 136 137 static void XBZRLE_cache_unlock(void) 138 { 139 if (migrate_xbzrle()) { 140 qemu_mutex_unlock(&XBZRLE.lock); 141 } 142 } 143 144 /** 145 * xbzrle_cache_resize: resize the xbzrle cache 146 * 147 * This function is called from migrate_params_apply in main 148 * thread, possibly while a migration is in progress. A running 149 * migration may be using the cache and might finish during this call, 150 * hence changes to the cache are protected by XBZRLE.lock(). 
151 * 152 * Returns 0 for success or -1 for error 153 * 154 * @new_size: new cache size 155 * @errp: set *errp if the check failed, with reason 156 */ 157 int xbzrle_cache_resize(uint64_t new_size, Error **errp) 158 { 159 PageCache *new_cache; 160 int64_t ret = 0; 161 162 /* Check for truncation */ 163 if (new_size != (size_t)new_size) { 164 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", 165 "exceeding address space"); 166 return -1; 167 } 168 169 if (new_size == migrate_xbzrle_cache_size()) { 170 /* nothing to do */ 171 return 0; 172 } 173 174 XBZRLE_cache_lock(); 175 176 if (XBZRLE.cache != NULL) { 177 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp); 178 if (!new_cache) { 179 ret = -1; 180 goto out; 181 } 182 183 cache_fini(XBZRLE.cache); 184 XBZRLE.cache = new_cache; 185 } 186 out: 187 XBZRLE_cache_unlock(); 188 return ret; 189 } 190 191 static bool postcopy_preempt_active(void) 192 { 193 return migrate_postcopy_preempt() && migration_in_postcopy(); 194 } 195 196 bool migrate_ram_is_ignored(RAMBlock *block) 197 { 198 MigMode mode = migrate_mode(); 199 return !qemu_ram_is_migratable(block) || 200 mode == MIG_MODE_CPR_TRANSFER || 201 (migrate_ignore_shared() && qemu_ram_is_shared(block) 202 && qemu_ram_is_named_file(block)); 203 } 204 205 #undef RAMBLOCK_FOREACH 206 207 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque) 208 { 209 RAMBlock *block; 210 int ret = 0; 211 212 RCU_READ_LOCK_GUARD(); 213 214 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 215 ret = func(block, opaque); 216 if (ret) { 217 break; 218 } 219 } 220 return ret; 221 } 222 223 static void ramblock_recv_map_init(void) 224 { 225 RAMBlock *rb; 226 227 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 228 assert(!rb->receivedmap); 229 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits()); 230 } 231 } 232 233 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr) 234 { 235 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb), 236 rb->receivedmap); 237 } 238 239 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset) 240 { 241 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 242 } 243 244 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr) 245 { 246 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap); 247 } 248 249 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, 250 size_t nr) 251 { 252 bitmap_set_atomic(rb->receivedmap, 253 ramblock_recv_bitmap_offset(host_addr, rb), 254 nr); 255 } 256 257 void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) 258 { 259 set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); 260 } 261 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) 262 263 /* 264 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes). 265 * 266 * Returns >0 if success with sent bytes, or <0 if error. 267 */ 268 int64_t ramblock_recv_bitmap_send(QEMUFile *file, 269 const char *block_name) 270 { 271 RAMBlock *block = qemu_ram_block_by_name(block_name); 272 unsigned long *le_bitmap, nbits; 273 uint64_t size; 274 275 if (!block) { 276 error_report("%s: invalid block name: %s", __func__, block_name); 277 return -1; 278 } 279 280 nbits = block->postcopy_length >> TARGET_PAGE_BITS; 281 282 /* 283 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit 284 * machines we may need 4 more bytes for padding (see below 285 * comment). So extend it a bit before hand. 
 */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when the source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    g_free(le_bitmap);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    int ret = qemu_fflush(file);
    if (ret) {
        return ret;
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round).
*/ 368 bool xbzrle_started; 369 /* Are we on the last stage of migration */ 370 bool last_stage; 371 372 /* total handled target pages at the beginning of period */ 373 uint64_t target_page_count_prev; 374 /* total handled target pages since start */ 375 uint64_t target_page_count; 376 /* number of dirty bits in the bitmap */ 377 uint64_t migration_dirty_pages; 378 /* 379 * Protects: 380 * - dirty/clear bitmap 381 * - migration_dirty_pages 382 * - pss structures 383 */ 384 QemuMutex bitmap_mutex; 385 /* The RAMBlock used in the last src_page_requests */ 386 RAMBlock *last_req_rb; 387 /* Queue of outstanding page requests from the destination */ 388 QemuMutex src_page_req_mutex; 389 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests; 390 391 /* 392 * This is only used when postcopy is in recovery phase, to communicate 393 * between the migration thread and the return path thread on dirty 394 * bitmap synchronizations. This field is unused in other stages of 395 * RAM migration. 396 */ 397 unsigned int postcopy_bmap_sync_requested; 398 }; 399 typedef struct RAMState RAMState; 400 401 static RAMState *ram_state; 402 403 static NotifierWithReturnList precopy_notifier_list; 404 405 /* Whether postcopy has queued requests? */ 406 static bool postcopy_has_request(RAMState *rs) 407 { 408 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests); 409 } 410 411 void precopy_infrastructure_init(void) 412 { 413 notifier_with_return_list_init(&precopy_notifier_list); 414 } 415 416 void precopy_add_notifier(NotifierWithReturn *n) 417 { 418 notifier_with_return_list_add(&precopy_notifier_list, n); 419 } 420 421 void precopy_remove_notifier(NotifierWithReturn *n) 422 { 423 notifier_with_return_remove(n); 424 } 425 426 int precopy_notify(PrecopyNotifyReason reason, Error **errp) 427 { 428 PrecopyNotifyData pnd; 429 pnd.reason = reason; 430 431 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp); 432 } 433 434 uint64_t ram_bytes_remaining(void) 435 { 436 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 437 0; 438 } 439 440 void ram_transferred_add(uint64_t bytes) 441 { 442 if (runstate_is_running()) { 443 stat64_add(&mig_stats.precopy_bytes, bytes); 444 } else if (migration_in_postcopy()) { 445 stat64_add(&mig_stats.postcopy_bytes, bytes); 446 } else { 447 stat64_add(&mig_stats.downtime_bytes, bytes); 448 } 449 } 450 451 struct MigrationOps { 452 int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss); 453 }; 454 typedef struct MigrationOps MigrationOps; 455 456 MigrationOps *migration_ops; 457 458 static int ram_save_host_page_urgent(PageSearchStatus *pss); 459 460 /* NOTE: page is the PFN not real ram_addr_t. */ 461 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page) 462 { 463 pss->block = rb; 464 pss->page = page; 465 pss->complete_round = false; 466 } 467 468 /* 469 * Check whether two PSSs are actively sending the same page. Return true 470 * if it is, false otherwise. 
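 * Used by the postcopy preempt path (see ram_save_host_page_urgent()) so the
 * precopy and preempt channels never send the same host page at the same time.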
471 */ 472 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2) 473 { 474 return pss1->host_page_sending && pss2->host_page_sending && 475 (pss1->host_page_start == pss2->host_page_start); 476 } 477 478 /** 479 * save_page_header: write page header to wire 480 * 481 * If this is the 1st block, it also writes the block identification 482 * 483 * Returns the number of bytes written 484 * 485 * @pss: current PSS channel status 486 * @block: block that contains the page we want to send 487 * @offset: offset inside the block for the page 488 * in the lower bits, it contains flags 489 */ 490 static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f, 491 RAMBlock *block, ram_addr_t offset) 492 { 493 size_t size, len; 494 bool same_block = (block == pss->last_sent_block); 495 496 if (same_block) { 497 offset |= RAM_SAVE_FLAG_CONTINUE; 498 } 499 qemu_put_be64(f, offset); 500 size = 8; 501 502 if (!same_block) { 503 len = strlen(block->idstr); 504 qemu_put_byte(f, len); 505 qemu_put_buffer(f, (uint8_t *)block->idstr, len); 506 size += 1 + len; 507 pss->last_sent_block = block; 508 } 509 return size; 510 } 511 512 /** 513 * mig_throttle_guest_down: throttle down the guest 514 * 515 * Reduce amount of guest cpu execution to hopefully slow down memory 516 * writes. If guest dirty memory rate is reduced below the rate at 517 * which we can transfer pages to the destination then we should be 518 * able to complete migration. Some workloads dirty memory way too 519 * fast and will not effectively converge, even with auto-converge. 520 */ 521 static void mig_throttle_guest_down(uint64_t bytes_dirty_period, 522 uint64_t bytes_dirty_threshold) 523 { 524 uint64_t pct_initial = migrate_cpu_throttle_initial(); 525 uint64_t pct_increment = migrate_cpu_throttle_increment(); 526 bool pct_tailslow = migrate_cpu_throttle_tailslow(); 527 int pct_max = migrate_max_cpu_throttle(); 528 529 uint64_t throttle_now = cpu_throttle_get_percentage(); 530 uint64_t cpu_now, cpu_ideal, throttle_inc; 531 532 /* We have not started throttling yet. Let's start it. */ 533 if (!cpu_throttle_active()) { 534 cpu_throttle_set(pct_initial); 535 } else { 536 /* Throttling already on, just increase the rate */ 537 if (!pct_tailslow) { 538 throttle_inc = pct_increment; 539 } else { 540 /* Compute the ideal CPU percentage used by Guest, which may 541 * make the dirty rate match the dirty rate threshold. */ 542 cpu_now = 100 - throttle_now; 543 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 / 544 bytes_dirty_period); 545 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment); 546 } 547 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max)); 548 } 549 } 550 551 void mig_throttle_counter_reset(void) 552 { 553 RAMState *rs = ram_state; 554 555 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); 556 rs->num_dirty_pages_period = 0; 557 rs->bytes_xfer_prev = migration_transferred_bytes(); 558 } 559 560 /** 561 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache 562 * 563 * @current_addr: address for the zero page 564 * 565 * Update the xbzrle cache to reflect a page that's been sent as all 0. 566 * The important thing is that a stale (not-yet-0'd) page be replaced 567 * by the new data. 568 * As a bonus, if the page wasn't in the cache it gets added so that 569 * when a small write is made into the 0'd page it gets XBZRLE sent. 
570 */ 571 static void xbzrle_cache_zero_page(ram_addr_t current_addr) 572 { 573 /* We don't care if this fails to allocate a new cache page 574 * as long as it updated an old one */ 575 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page, 576 stat64_get(&mig_stats.dirty_sync_count)); 577 } 578 579 #define ENCODING_FLAG_XBZRLE 0x1 580 581 /** 582 * save_xbzrle_page: compress and send current page 583 * 584 * Returns: 1 means that we wrote the page 585 * 0 means that page is identical to the one already sent 586 * -1 means that xbzrle would be longer than normal 587 * 588 * @rs: current RAM state 589 * @pss: current PSS channel 590 * @current_data: pointer to the address of the page contents 591 * @current_addr: addr of the page 592 * @block: block that contains the page we want to send 593 * @offset: offset inside the block for the page 594 */ 595 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss, 596 uint8_t **current_data, ram_addr_t current_addr, 597 RAMBlock *block, ram_addr_t offset) 598 { 599 int encoded_len = 0, bytes_xbzrle; 600 uint8_t *prev_cached_page; 601 QEMUFile *file = pss->pss_channel; 602 uint64_t generation = stat64_get(&mig_stats.dirty_sync_count); 603 604 if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) { 605 xbzrle_counters.cache_miss++; 606 if (!rs->last_stage) { 607 if (cache_insert(XBZRLE.cache, current_addr, *current_data, 608 generation) == -1) { 609 return -1; 610 } else { 611 /* update *current_data when the page has been 612 inserted into cache */ 613 *current_data = get_cached_data(XBZRLE.cache, current_addr); 614 } 615 } 616 return -1; 617 } 618 619 /* 620 * Reaching here means the page has hit the xbzrle cache, no matter what 621 * encoding result it is (normal encoding, overflow or skipping the page), 622 * count the page as encoded. This is used to calculate the encoding rate. 623 * 624 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB, 625 * 2nd page turns out to be skipped (i.e. no new bytes written to the 626 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the 627 * skipped page included. In this way, the encoding rate can tell if the 628 * guest page is good for xbzrle encoding. 629 */ 630 xbzrle_counters.pages++; 631 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); 632 633 /* save current buffer into memory */ 634 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); 635 636 /* XBZRLE encoding (if there is no overflow) */ 637 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, 638 TARGET_PAGE_SIZE, XBZRLE.encoded_buf, 639 TARGET_PAGE_SIZE); 640 641 /* 642 * Update the cache contents, so that it corresponds to the data 643 * sent, in all cases except where we skip the page. 644 */ 645 if (!rs->last_stage && encoded_len != 0) { 646 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); 647 /* 648 * In the case where we couldn't compress, ensure that the caller 649 * sends the data from the cache, since the guest might have 650 * changed the RAM since we copied it. 
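         * (An encoded_len of -1 means the encoding overflowed; the caller then
         * falls back to sending the full page from this cached copy.)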
 */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * The xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found. Note that when pss->host_page_sending==true it means we're in
 * the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (migrate_ram_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
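     * The bitmap is cleared one clear_bmap chunk (1 << clear_bmap_shift
     * pages) at a time.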
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (migrate_ram_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed. This _must_ be called before we
     * send any page in the chunk because we need to make sure we can
     * capture further page content changes when we sync the dirty log
     * the next time. So as long as we are going to send any page in
     * the chunk, we clear the remote dirty bitmap for all of it.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
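 * E.g. a VM with only 4K pages yields 0x1000, while one RAMBlock backed by
 * 2M hugepages additionally ORs in 0x200000, giving 0x201000.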
914 */ 915 uint64_t ram_pagesize_summary(void) 916 { 917 RAMBlock *block; 918 uint64_t summary = 0; 919 920 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 921 summary |= block->page_size; 922 } 923 924 return summary; 925 } 926 927 uint64_t ram_get_total_transferred_pages(void) 928 { 929 return stat64_get(&mig_stats.normal_pages) + 930 stat64_get(&mig_stats.zero_pages) + 931 xbzrle_counters.pages; 932 } 933 934 static void migration_update_rates(RAMState *rs, int64_t end_time) 935 { 936 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev; 937 938 /* calculate period counters */ 939 stat64_set(&mig_stats.dirty_pages_rate, 940 rs->num_dirty_pages_period * 1000 / 941 (end_time - rs->time_last_bitmap_sync)); 942 943 if (!page_count) { 944 return; 945 } 946 947 if (migrate_xbzrle()) { 948 double encoded_size, unencoded_size; 949 950 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss - 951 rs->xbzrle_cache_miss_prev) / page_count; 952 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss; 953 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) * 954 TARGET_PAGE_SIZE; 955 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev; 956 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) { 957 xbzrle_counters.encoding_rate = 0; 958 } else { 959 xbzrle_counters.encoding_rate = unencoded_size / encoded_size; 960 } 961 rs->xbzrle_pages_prev = xbzrle_counters.pages; 962 rs->xbzrle_bytes_prev = xbzrle_counters.bytes; 963 } 964 } 965 966 /* 967 * Enable dirty-limit to throttle down the guest 968 */ 969 static void migration_dirty_limit_guest(void) 970 { 971 /* 972 * dirty page rate quota for all vCPUs fetched from 973 * migration parameter 'vcpu_dirty_limit' 974 */ 975 static int64_t quota_dirtyrate; 976 MigrationState *s = migrate_get_current(); 977 978 /* 979 * If dirty limit already enabled and migration parameter 980 * vcpu-dirty-limit untouched. 981 */ 982 if (dirtylimit_in_service() && 983 quota_dirtyrate == s->parameters.vcpu_dirty_limit) { 984 return; 985 } 986 987 quota_dirtyrate = s->parameters.vcpu_dirty_limit; 988 989 /* 990 * Set all vCPU a quota dirtyrate, note that the second 991 * parameter will be ignored if setting all vCPU for the vm 992 */ 993 qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL); 994 trace_migration_dirty_limit_guest(quota_dirtyrate); 995 } 996 997 static void migration_trigger_throttle(RAMState *rs) 998 { 999 uint64_t threshold = migrate_throttle_trigger_threshold(); 1000 uint64_t bytes_xfer_period = 1001 migration_transferred_bytes() - rs->bytes_xfer_prev; 1002 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE; 1003 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100; 1004 1005 /* 1006 * The following detection logic can be refined later. For now: 1007 * Check to see if the ratio between dirtied bytes and the approx. 1008 * amount of bytes that just got transferred since the last time 1009 * we were in this routine reaches the threshold. If that happens 1010 * twice, start or increase throttling. 
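     * E.g. with a 50% threshold, throttling starts (or increases) once the
     * guest has dirtied more bytes than half of what was transferred in the
     * same period, for the second time.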
     */
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
        (++rs->dirty_rate_high_cnt >= 2)) {
        rs->dirty_rate_high_cnt = 0;
        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        } else if (migrate_dirty_limit()) {
            migration_dirty_limit_guest();
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
        WITH_RCU_READ_LOCK_GUARD() {
            RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                ramblock_sync_dirty_bitmap(rs, block);
            }
            stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
        }
    }

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = migration_transferred_bytes();
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

void migration_bitmap_sync_precopy(bool last_stage)
{
    Error *local_err = NULL;
    assert(ram_state);

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(ram_state, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
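 * (callers only check whether the result is non-zero, i.e. whether the
 * page was handled as a zero page)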
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
                          ram_addr_t offset)
{
    uint8_t *p = pss->block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
        return 0;
    }

    if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        return 0;
    }

    stat64_add(&mig_stats.zero_pages, 1);

    if (migrate_mapped_ram()) {
        /* zero pages are not transferred with mapped-ram */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
        return 1;
    }

    len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
    qemu_put_byte(file, 0);
    len += 1;
    ram_release_page(pss->block->idstr, offset);
    ram_transferred_add(len);

    /*
     * Must let xbzrle know, otherwise a previous (now 0'd) cached
     * page would be stale.
     */
    if (rs->xbzrle_started) {
        XBZRLE_cache_lock();
        xbzrle_cache_zero_page(pss->block->offset + offset);
        XBZRLE_cache_unlock();
    }

    return len;
}

/*
 * @pages: the number of pages written by the control path,
 *       < 0 - error
 *       > 0 - number of pages written
 *
 * Returns true if the page has been saved, otherwise false.
 */
static bool control_save_page(PageSearchStatus *pss,
                              ram_addr_t offset, int *pages)
{
    int ret;

    ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
                                 TARGET_PAGE_SIZE);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        *pages = 1;
        return true;
    }
    *pages = ret;
    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    if (migrate_mapped_ram()) {
        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                             offset | RAM_SAVE_FLAG_PAGE));
        if (async) {
            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
        }
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
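 *                (i.e. save_xbzrle_page() returned 0 because the encoded
 *                delta was empty)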
1222 * 1223 * @rs: current RAM state 1224 * @block: block that contains the page we want to send 1225 * @offset: offset inside the block for the page 1226 */ 1227 static int ram_save_page(RAMState *rs, PageSearchStatus *pss) 1228 { 1229 int pages = -1; 1230 uint8_t *p; 1231 bool send_async = true; 1232 RAMBlock *block = pss->block; 1233 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1234 ram_addr_t current_addr = block->offset + offset; 1235 1236 p = block->host + offset; 1237 trace_ram_save_page(block->idstr, (uint64_t)offset, p); 1238 1239 XBZRLE_cache_lock(); 1240 if (rs->xbzrle_started && !migration_in_postcopy()) { 1241 pages = save_xbzrle_page(rs, pss, &p, current_addr, 1242 block, offset); 1243 if (!rs->last_stage) { 1244 /* Can't send this cached data async, since the cache page 1245 * might get updated before it gets to the wire 1246 */ 1247 send_async = false; 1248 } 1249 } 1250 1251 /* XBZRLE overflow or normal page */ 1252 if (pages == -1) { 1253 pages = save_normal_page(pss, block, offset, p, send_async); 1254 } 1255 1256 XBZRLE_cache_unlock(); 1257 1258 return pages; 1259 } 1260 1261 static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) 1262 { 1263 if (!multifd_queue_page(block, offset)) { 1264 return -1; 1265 } 1266 1267 return 1; 1268 } 1269 1270 1271 #define PAGE_ALL_CLEAN 0 1272 #define PAGE_TRY_AGAIN 1 1273 #define PAGE_DIRTY_FOUND 2 1274 /** 1275 * find_dirty_block: find the next dirty page and update any state 1276 * associated with the search process. 1277 * 1278 * Returns: 1279 * <0: An error happened 1280 * PAGE_ALL_CLEAN: no dirty page found, give up 1281 * PAGE_TRY_AGAIN: no dirty page found, retry for next block 1282 * PAGE_DIRTY_FOUND: dirty page found 1283 * 1284 * @rs: current RAM state 1285 * @pss: data about the state of the current dirty page scan 1286 * @again: set to false if the search has scanned the whole of RAM 1287 */ 1288 static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) 1289 { 1290 /* Update pss->page for the next dirty bit in ramblock */ 1291 pss_find_next_dirty(pss); 1292 1293 if (pss->complete_round && pss->block == rs->last_seen_block && 1294 pss->page >= rs->last_page) { 1295 /* 1296 * We've been once around the RAM and haven't found anything. 1297 * Give up. 1298 */ 1299 return PAGE_ALL_CLEAN; 1300 } 1301 if (!offset_in_ramblock(pss->block, 1302 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) { 1303 /* Didn't find anything in this RAM Block */ 1304 pss->page = 0; 1305 pss->block = QLIST_NEXT_RCU(pss->block, next); 1306 if (!pss->block) { 1307 if (multifd_ram_sync_per_round()) { 1308 QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; 1309 int ret = multifd_ram_flush_and_sync(f); 1310 if (ret < 0) { 1311 return ret; 1312 } 1313 } 1314 1315 /* Hit the end of the list */ 1316 pss->block = QLIST_FIRST_RCU(&ram_list.blocks); 1317 /* Flag that we've looped */ 1318 pss->complete_round = true; 1319 /* After the first round, enable XBZRLE. 
*/ 1320 if (migrate_xbzrle()) { 1321 rs->xbzrle_started = true; 1322 } 1323 } 1324 /* Didn't find anything this time, but try again on the new block */ 1325 return PAGE_TRY_AGAIN; 1326 } else { 1327 /* We've found something */ 1328 return PAGE_DIRTY_FOUND; 1329 } 1330 } 1331 1332 /** 1333 * unqueue_page: gets a page of the queue 1334 * 1335 * Helper for 'get_queued_page' - gets a page off the queue 1336 * 1337 * Returns the block of the page (or NULL if none available) 1338 * 1339 * @rs: current RAM state 1340 * @offset: used to return the offset within the RAMBlock 1341 */ 1342 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset) 1343 { 1344 struct RAMSrcPageRequest *entry; 1345 RAMBlock *block = NULL; 1346 1347 if (!postcopy_has_request(rs)) { 1348 return NULL; 1349 } 1350 1351 QEMU_LOCK_GUARD(&rs->src_page_req_mutex); 1352 1353 /* 1354 * This should _never_ change even after we take the lock, because no one 1355 * should be taking anything off the request list other than us. 1356 */ 1357 assert(postcopy_has_request(rs)); 1358 1359 entry = QSIMPLEQ_FIRST(&rs->src_page_requests); 1360 block = entry->rb; 1361 *offset = entry->offset; 1362 1363 if (entry->len > TARGET_PAGE_SIZE) { 1364 entry->len -= TARGET_PAGE_SIZE; 1365 entry->offset += TARGET_PAGE_SIZE; 1366 } else { 1367 memory_region_unref(block->mr); 1368 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req); 1369 g_free(entry); 1370 migration_consume_urgent_request(); 1371 } 1372 1373 return block; 1374 } 1375 1376 #if defined(__linux__) 1377 /** 1378 * poll_fault_page: try to get next UFFD write fault page and, if pending fault 1379 * is found, return RAM block pointer and page offset 1380 * 1381 * Returns pointer to the RAMBlock containing faulting page, 1382 * NULL if no write faults are pending 1383 * 1384 * @rs: current RAM state 1385 * @offset: page offset from the beginning of the block 1386 */ 1387 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1388 { 1389 struct uffd_msg uffd_msg; 1390 void *page_address; 1391 RAMBlock *block; 1392 int res; 1393 1394 if (!migrate_background_snapshot()) { 1395 return NULL; 1396 } 1397 1398 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1); 1399 if (res <= 0) { 1400 return NULL; 1401 } 1402 1403 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address; 1404 block = qemu_ram_block_from_host(page_address, false, offset); 1405 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0); 1406 return block; 1407 } 1408 1409 /** 1410 * ram_save_release_protection: release UFFD write protection after 1411 * a range of pages has been saved 1412 * 1413 * @rs: current RAM state 1414 * @pss: page-search-status structure 1415 * @start_page: index of the first page in the range relative to pss->block 1416 * 1417 * Returns 0 on success, negative value in case of an error 1418 */ 1419 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1420 unsigned long start_page) 1421 { 1422 int res = 0; 1423 1424 /* Check if page is from UFFD-managed region. */ 1425 if (pss->block->flags & RAM_UF_WRITEPROTECT) { 1426 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS); 1427 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS; 1428 1429 /* Flush async buffers before un-protect. */ 1430 qemu_fflush(pss->pss_channel); 1431 /* Un-protect memory range. 
*/ 1432 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length, 1433 false, false); 1434 } 1435 1436 return res; 1437 } 1438 1439 /* ram_write_tracking_available: check if kernel supports required UFFD features 1440 * 1441 * Returns true if supports, false otherwise 1442 */ 1443 bool ram_write_tracking_available(void) 1444 { 1445 uint64_t uffd_features; 1446 int res; 1447 1448 res = uffd_query_features(&uffd_features); 1449 return (res == 0 && 1450 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0); 1451 } 1452 1453 /* ram_write_tracking_compatible: check if guest configuration is 1454 * compatible with 'write-tracking' 1455 * 1456 * Returns true if compatible, false otherwise 1457 */ 1458 bool ram_write_tracking_compatible(void) 1459 { 1460 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT); 1461 int uffd_fd; 1462 RAMBlock *block; 1463 bool ret = false; 1464 1465 /* Open UFFD file descriptor */ 1466 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false); 1467 if (uffd_fd < 0) { 1468 return false; 1469 } 1470 1471 RCU_READ_LOCK_GUARD(); 1472 1473 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1474 uint64_t uffd_ioctls; 1475 1476 /* Nothing to do with read-only and MMIO-writable regions */ 1477 if (block->mr->readonly || block->mr->rom_device) { 1478 continue; 1479 } 1480 /* Try to register block memory via UFFD-IO to track writes */ 1481 if (uffd_register_memory(uffd_fd, block->host, block->max_length, 1482 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) { 1483 goto out; 1484 } 1485 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) { 1486 goto out; 1487 } 1488 } 1489 ret = true; 1490 1491 out: 1492 uffd_close_fd(uffd_fd); 1493 return ret; 1494 } 1495 1496 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset, 1497 ram_addr_t size) 1498 { 1499 const ram_addr_t end = offset + size; 1500 1501 /* 1502 * We read one byte of each page; this will preallocate page tables if 1503 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory 1504 * where no page was populated yet. This might require adaption when 1505 * supporting other mappings, like shmem. 1506 */ 1507 for (; offset < end; offset += block->page_size) { 1508 char tmp = *((char *)block->host + offset); 1509 1510 /* Don't optimize the read out */ 1511 asm volatile("" : "+r" (tmp)); 1512 } 1513 } 1514 1515 static inline int populate_read_section(MemoryRegionSection *section, 1516 void *opaque) 1517 { 1518 const hwaddr size = int128_get64(section->size); 1519 hwaddr offset = section->offset_within_region; 1520 RAMBlock *block = section->mr->ram_block; 1521 1522 populate_read_range(block, offset, size); 1523 return 0; 1524 } 1525 1526 /* 1527 * ram_block_populate_read: preallocate page tables and populate pages in the 1528 * RAM block by reading a byte of each page. 1529 * 1530 * Since it's solely used for userfault_fd WP feature, here we just 1531 * hardcode page size to qemu_real_host_page_size. 1532 * 1533 * @block: RAM block to populate 1534 */ 1535 static void ram_block_populate_read(RAMBlock *rb) 1536 { 1537 /* 1538 * Skip populating all pages that fall into a discarded range as managed by 1539 * a RamDiscardManager responsible for the mapped memory region of the 1540 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock 1541 * must not get populated automatically. 
We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_clear_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP,
                                 NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply
UFFD write protection to the block memory range */ 1658 if (ram_block_uffd_protect(block, uffd_fd)) { 1659 goto fail; 1660 } 1661 1662 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size, 1663 block->host, block->max_length); 1664 } 1665 1666 return 0; 1667 1668 fail: 1669 error_report("ram_write_tracking_start() failed: restoring initial memory state"); 1670 1671 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1672 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1673 continue; 1674 } 1675 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1676 /* Cleanup flags and remove reference */ 1677 block->flags &= ~RAM_UF_WRITEPROTECT; 1678 memory_region_unref(block->mr); 1679 } 1680 1681 uffd_close_fd(uffd_fd); 1682 rs->uffdio_fd = -1; 1683 return -1; 1684 } 1685 1686 /** 1687 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection 1688 */ 1689 void ram_write_tracking_stop(void) 1690 { 1691 RAMState *rs = ram_state; 1692 RAMBlock *block; 1693 1694 RCU_READ_LOCK_GUARD(); 1695 1696 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 1697 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) { 1698 continue; 1699 } 1700 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length); 1701 1702 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size, 1703 block->host, block->max_length); 1704 1705 /* Cleanup flags and remove reference */ 1706 block->flags &= ~RAM_UF_WRITEPROTECT; 1707 memory_region_unref(block->mr); 1708 } 1709 1710 /* Finally close UFFD file descriptor */ 1711 uffd_close_fd(rs->uffdio_fd); 1712 rs->uffdio_fd = -1; 1713 } 1714 1715 #else 1716 /* No target OS support, stubs just fail or ignore */ 1717 1718 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset) 1719 { 1720 (void) rs; 1721 (void) offset; 1722 1723 return NULL; 1724 } 1725 1726 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss, 1727 unsigned long start_page) 1728 { 1729 (void) rs; 1730 (void) pss; 1731 (void) start_page; 1732 1733 return 0; 1734 } 1735 1736 bool ram_write_tracking_available(void) 1737 { 1738 return false; 1739 } 1740 1741 bool ram_write_tracking_compatible(void) 1742 { 1743 g_assert_not_reached(); 1744 } 1745 1746 int ram_write_tracking_start(void) 1747 { 1748 g_assert_not_reached(); 1749 } 1750 1751 void ram_write_tracking_stop(void) 1752 { 1753 g_assert_not_reached(); 1754 } 1755 #endif /* defined(__linux__) */ 1756 1757 /** 1758 * get_queued_page: unqueue a page from the postcopy requests 1759 * 1760 * Skips pages that are already sent (!dirty) 1761 * 1762 * Returns true if a queued page is found 1763 * 1764 * @rs: current RAM state 1765 * @pss: data about the state of the current dirty page scan 1766 */ 1767 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss) 1768 { 1769 RAMBlock *block; 1770 ram_addr_t offset; 1771 bool dirty = false; 1772 1773 do { 1774 block = unqueue_page(rs, &offset); 1775 /* 1776 * We're sending this page, and since it's postcopy nothing else 1777 * will dirty it, and we must make sure it doesn't get sent again 1778 * even if this queue request was received after the background 1779 * search already sent it. 
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when vCPUs can get blocked by write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * it's really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left. In case any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
                         Error **errp)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    stat64_add(&mig_stats.postcopy_requests, 1);
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, "
                   "start=" RAM_ADDR_FMT " len="
                   RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                   start, len, ramblock->used_length);
        return -1;
    }

    /*
     * With postcopy preempt, we send back the page directly in the
     * rp-return thread.
1900 */ 1901 if (postcopy_preempt_active()) { 1902 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1903 size_t page_size = qemu_ram_pagesize(ramblock); 1904 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1905 int ret = 0; 1906 1907 qemu_mutex_lock(&rs->bitmap_mutex); 1908 1909 pss_init(pss, ramblock, page_start); 1910 /* 1911 * Always use the preempt channel, and make sure it's there. It's 1912 * safe to access without lock, because when rp-thread is running 1913 * we should be the only one who operates on the qemufile 1914 */ 1915 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1916 assert(pss->pss_channel); 1917 1918 /* 1919 * It must be either one or multiple of host page size. Just 1920 * assert; if something wrong we're mostly split brain anyway. 1921 */ 1922 assert(len % page_size == 0); 1923 while (len) { 1924 if (ram_save_host_page_urgent(pss)) { 1925 error_setg(errp, "ram_save_host_page_urgent() failed: " 1926 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 1927 ramblock->idstr, start); 1928 ret = -1; 1929 break; 1930 } 1931 /* 1932 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 1933 * will automatically be moved and point to the next host page 1934 * we're going to send, so no need to update here. 1935 * 1936 * Normally QEMU never sends >1 host page in requests, so 1937 * logically we don't even need that as the loop should only 1938 * run once, but just to be consistent. 1939 */ 1940 len -= page_size; 1941 }; 1942 qemu_mutex_unlock(&rs->bitmap_mutex); 1943 1944 return ret; 1945 } 1946 1947 struct RAMSrcPageRequest *new_entry = 1948 g_new0(struct RAMSrcPageRequest, 1); 1949 new_entry->rb = ramblock; 1950 new_entry->offset = start; 1951 new_entry->len = len; 1952 1953 memory_region_ref(ramblock->mr); 1954 qemu_mutex_lock(&rs->src_page_req_mutex); 1955 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1956 migration_make_urgent_request(); 1957 qemu_mutex_unlock(&rs->src_page_req_mutex); 1958 1959 return 0; 1960 } 1961 1962 /** 1963 * ram_save_target_page_legacy: save one target page 1964 * 1965 * Returns the number of pages written 1966 * 1967 * @rs: current RAM state 1968 * @pss: data about the page we want to send 1969 */ 1970 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 1971 { 1972 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1973 int res; 1974 1975 if (control_save_page(pss, offset, &res)) { 1976 return res; 1977 } 1978 1979 if (save_zero_page(rs, pss, offset)) { 1980 return 1; 1981 } 1982 1983 return ram_save_page(rs, pss); 1984 } 1985 1986 /** 1987 * ram_save_target_page_multifd: send one target page to multifd workers 1988 * 1989 * Returns 1 if the page was queued, -1 otherwise. 1990 * 1991 * @rs: current RAM state 1992 * @pss: data about the page we want to send 1993 */ 1994 static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) 1995 { 1996 RAMBlock *block = pss->block; 1997 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1998 1999 /* 2000 * While using multifd live migration, we still need to handle zero 2001 * page checking on the migration main thread. 
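     *
     * A hedged note: this applies to the ZERO_PAGE_DETECTION_LEGACY
     * setting checked below, where the main thread runs save_zero_page()
     * and only pages not detected as zero are handed to the multifd
     * threads. With multifd-side zero-page detection the check is
     * expected to happen in the multifd send threads instead, so this
     * branch is skipped.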
2002 */ 2003 if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { 2004 if (save_zero_page(rs, pss, offset)) { 2005 return 1; 2006 } 2007 } 2008 2009 return ram_save_multifd_page(block, offset); 2010 } 2011 2012 /* Should be called before sending a host page */ 2013 static void pss_host_page_prepare(PageSearchStatus *pss) 2014 { 2015 /* How many guest pages are there in one host page? */ 2016 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2017 2018 pss->host_page_sending = true; 2019 if (guest_pfns <= 1) { 2020 /* 2021 * This covers both when guest psize == host psize, or when guest 2022 * has larger psize than the host (guest_pfns==0). 2023 * 2024 * For the latter, we always send one whole guest page per 2025 * iteration of the host page (example: an Alpha VM on x86 host 2026 * will have guest psize 8K while host psize 4K). 2027 */ 2028 pss->host_page_start = pss->page; 2029 pss->host_page_end = pss->page + 1; 2030 } else { 2031 /* 2032 * The host page spans over multiple guest pages, we send them 2033 * within the same host page iteration. 2034 */ 2035 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns); 2036 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns); 2037 } 2038 } 2039 2040 /* 2041 * Whether the page pointed by PSS is within the host page being sent. 2042 * Must be called after a previous pss_host_page_prepare(). 2043 */ 2044 static bool pss_within_range(PageSearchStatus *pss) 2045 { 2046 ram_addr_t ram_addr; 2047 2048 assert(pss->host_page_sending); 2049 2050 /* Over host-page boundary? */ 2051 if (pss->page >= pss->host_page_end) { 2052 return false; 2053 } 2054 2055 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 2056 2057 return offset_in_ramblock(pss->block, ram_addr); 2058 } 2059 2060 static void pss_host_page_finish(PageSearchStatus *pss) 2061 { 2062 pss->host_page_sending = false; 2063 /* This is not needed, but just to reset it */ 2064 pss->host_page_start = pss->host_page_end = 0; 2065 } 2066 2067 /* 2068 * Send an urgent host page specified by `pss'. Need to be called with 2069 * bitmap_mutex held. 2070 * 2071 * Returns 0 if save host page succeeded, false otherwise. 2072 */ 2073 static int ram_save_host_page_urgent(PageSearchStatus *pss) 2074 { 2075 bool page_dirty, sent = false; 2076 RAMState *rs = ram_state; 2077 int ret = 0; 2078 2079 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page); 2080 pss_host_page_prepare(pss); 2081 2082 /* 2083 * If precopy is sending the same page, let it be done in precopy, or 2084 * we could send the same page in two channels and none of them will 2085 * receive the whole page. 2086 */ 2087 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) { 2088 trace_postcopy_preempt_hit(pss->block->idstr, 2089 pss->page << TARGET_PAGE_BITS); 2090 return 0; 2091 } 2092 2093 do { 2094 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2095 2096 if (page_dirty) { 2097 /* Be strict to return code; it must be 1, or what else? 
*/ 2098 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2099 error_report_once("%s: ram_save_target_page failed", __func__); 2100 ret = -1; 2101 goto out; 2102 } 2103 sent = true; 2104 } 2105 pss_find_next_dirty(pss); 2106 } while (pss_within_range(pss)); 2107 out: 2108 pss_host_page_finish(pss); 2109 /* For urgent requests, flush immediately if sent */ 2110 if (sent) { 2111 qemu_fflush(pss->pss_channel); 2112 } 2113 return ret; 2114 } 2115 2116 /** 2117 * ram_save_host_page: save a whole host page 2118 * 2119 * Starting at *offset send pages up to the end of the current host 2120 * page. It's valid for the initial offset to point into the middle of 2121 * a host page in which case the remainder of the hostpage is sent. 2122 * Only dirty target pages are sent. Note that the host page size may 2123 * be a huge page for this block. 2124 * 2125 * The saving stops at the boundary of the used_length of the block 2126 * if the RAMBlock isn't a multiple of the host page size. 2127 * 2128 * The caller must be with ram_state.bitmap_mutex held to call this 2129 * function. Note that this function can temporarily release the lock, but 2130 * when the function is returned it'll make sure the lock is still held. 2131 * 2132 * Returns the number of pages written or negative on error 2133 * 2134 * @rs: current RAM state 2135 * @pss: data about the page we want to send 2136 */ 2137 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2138 { 2139 bool page_dirty, preempt_active = postcopy_preempt_active(); 2140 int tmppages, pages = 0; 2141 size_t pagesize_bits = 2142 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2143 unsigned long start_page = pss->page; 2144 int res; 2145 2146 if (migrate_ram_is_ignored(pss->block)) { 2147 error_report("block %s should not be migrated !", pss->block->idstr); 2148 return 0; 2149 } 2150 2151 /* Update host page boundary information */ 2152 pss_host_page_prepare(pss); 2153 2154 do { 2155 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2156 2157 /* Check the pages is dirty and if it is send it */ 2158 if (page_dirty) { 2159 /* 2160 * Properly yield the lock only in postcopy preempt mode 2161 * because both migration thread and rp-return thread can 2162 * operate on the bitmaps. 2163 */ 2164 if (preempt_active) { 2165 qemu_mutex_unlock(&rs->bitmap_mutex); 2166 } 2167 tmppages = migration_ops->ram_save_target_page(rs, pss); 2168 if (tmppages >= 0) { 2169 pages += tmppages; 2170 /* 2171 * Allow rate limiting to happen in the middle of huge pages if 2172 * something is sent in the current iteration. 2173 */ 2174 if (pagesize_bits > 1 && tmppages > 0) { 2175 migration_rate_limit(); 2176 } 2177 } 2178 if (preempt_active) { 2179 qemu_mutex_lock(&rs->bitmap_mutex); 2180 } 2181 } else { 2182 tmppages = 0; 2183 } 2184 2185 if (tmppages < 0) { 2186 pss_host_page_finish(pss); 2187 return tmppages; 2188 } 2189 2190 pss_find_next_dirty(pss); 2191 } while (pss_within_range(pss)); 2192 2193 pss_host_page_finish(pss); 2194 2195 res = ram_save_release_protection(rs, pss, start_page); 2196 return (res < 0 ? res : pages); 2197 } 2198 2199 /** 2200 * ram_find_and_save_block: finds a dirty page and sends it to f 2201 * 2202 * Called within an RCU critical section. 2203 * 2204 * Returns the number of pages written where zero means no dirty pages, 2205 * or negative on error 2206 * 2207 * @rs: current RAM state 2208 * 2209 * On systems where host-page-size > target-page-size it will send all the 2210 * pages in a host page that are dirty. 
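 *
 * Worked example (illustrative numbers): with 2MB hugetlbfs host pages
 * and 4KB target pages, one host page spans 512 target pages, so a
 * single ram_save_host_page() call may transmit up to 512 dirty target
 * pages before the search moves on.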
2211 */ 2212 static int ram_find_and_save_block(RAMState *rs) 2213 { 2214 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2215 int pages = 0; 2216 2217 /* No dirty page as there is zero RAM */ 2218 if (!rs->ram_bytes_total) { 2219 return pages; 2220 } 2221 2222 /* 2223 * Always keep last_seen_block/last_page valid during this procedure, 2224 * because find_dirty_block() relies on these values (e.g., we compare 2225 * last_seen_block with pss.block to see whether we searched all the 2226 * ramblocks) to detect the completion of migration. Having NULL value 2227 * of last_seen_block can conditionally cause below loop to run forever. 2228 */ 2229 if (!rs->last_seen_block) { 2230 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2231 rs->last_page = 0; 2232 } 2233 2234 pss_init(pss, rs->last_seen_block, rs->last_page); 2235 2236 while (true){ 2237 if (!get_queued_page(rs, pss)) { 2238 /* priority queue empty, so just search for something dirty */ 2239 int res = find_dirty_block(rs, pss); 2240 if (res != PAGE_DIRTY_FOUND) { 2241 if (res == PAGE_ALL_CLEAN) { 2242 break; 2243 } else if (res == PAGE_TRY_AGAIN) { 2244 continue; 2245 } else if (res < 0) { 2246 pages = res; 2247 break; 2248 } 2249 } 2250 } 2251 pages = ram_save_host_page(rs, pss); 2252 if (pages) { 2253 break; 2254 } 2255 } 2256 2257 rs->last_seen_block = pss->block; 2258 rs->last_page = pss->page; 2259 2260 return pages; 2261 } 2262 2263 static uint64_t ram_bytes_total_with_ignored(void) 2264 { 2265 RAMBlock *block; 2266 uint64_t total = 0; 2267 2268 RCU_READ_LOCK_GUARD(); 2269 2270 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2271 total += block->used_length; 2272 } 2273 return total; 2274 } 2275 2276 uint64_t ram_bytes_total(void) 2277 { 2278 RAMBlock *block; 2279 uint64_t total = 0; 2280 2281 RCU_READ_LOCK_GUARD(); 2282 2283 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2284 total += block->used_length; 2285 } 2286 return total; 2287 } 2288 2289 static void xbzrle_load_setup(void) 2290 { 2291 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2292 } 2293 2294 static void xbzrle_load_cleanup(void) 2295 { 2296 g_free(XBZRLE.decoded_buf); 2297 XBZRLE.decoded_buf = NULL; 2298 } 2299 2300 static void ram_state_cleanup(RAMState **rsp) 2301 { 2302 if (*rsp) { 2303 migration_page_queue_free(*rsp); 2304 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2305 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2306 g_free(*rsp); 2307 *rsp = NULL; 2308 } 2309 } 2310 2311 static void xbzrle_cleanup(void) 2312 { 2313 XBZRLE_cache_lock(); 2314 if (XBZRLE.cache) { 2315 cache_fini(XBZRLE.cache); 2316 g_free(XBZRLE.encoded_buf); 2317 g_free(XBZRLE.current_buf); 2318 g_free(XBZRLE.zero_target_page); 2319 XBZRLE.cache = NULL; 2320 XBZRLE.encoded_buf = NULL; 2321 XBZRLE.current_buf = NULL; 2322 XBZRLE.zero_target_page = NULL; 2323 } 2324 XBZRLE_cache_unlock(); 2325 } 2326 2327 static void ram_bitmaps_destroy(void) 2328 { 2329 RAMBlock *block; 2330 2331 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2332 g_free(block->clear_bmap); 2333 block->clear_bmap = NULL; 2334 g_free(block->bmap); 2335 block->bmap = NULL; 2336 g_free(block->file_bmap); 2337 block->file_bmap = NULL; 2338 } 2339 } 2340 2341 static void ram_save_cleanup(void *opaque) 2342 { 2343 RAMState **rsp = opaque; 2344 2345 /* We don't use dirty log with background snapshots */ 2346 if (!migrate_background_snapshot()) { 2347 /* caller have hold BQL or is in a bh, so there is 2348 * no writing race against the migration bitmap 2349 */ 2350 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2351 /* 2352 * do not 
stop dirty log without starting it, since 2353 * memory_global_dirty_log_stop will assert that 2354 * memory_global_dirty_log_start/stop used in pairs 2355 */ 2356 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2357 } 2358 } 2359 2360 ram_bitmaps_destroy(); 2361 2362 xbzrle_cleanup(); 2363 multifd_ram_save_cleanup(); 2364 ram_state_cleanup(rsp); 2365 g_free(migration_ops); 2366 migration_ops = NULL; 2367 } 2368 2369 static void ram_state_reset(RAMState *rs) 2370 { 2371 int i; 2372 2373 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2374 rs->pss[i].last_sent_block = NULL; 2375 } 2376 2377 rs->last_seen_block = NULL; 2378 rs->last_page = 0; 2379 rs->last_version = ram_list.version; 2380 rs->xbzrle_started = false; 2381 } 2382 2383 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2384 2385 /* **** functions for postcopy ***** */ 2386 2387 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2388 { 2389 struct RAMBlock *block; 2390 2391 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2392 unsigned long *bitmap = block->bmap; 2393 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2394 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2395 2396 while (run_start < range) { 2397 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2398 ram_discard_range(block->idstr, 2399 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2400 ((ram_addr_t)(run_end - run_start)) 2401 << TARGET_PAGE_BITS); 2402 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2403 } 2404 } 2405 } 2406 2407 /** 2408 * postcopy_send_discard_bm_ram: discard a RAMBlock 2409 * 2410 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2411 * 2412 * @ms: current migration state 2413 * @block: RAMBlock to discard 2414 */ 2415 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2416 { 2417 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2418 unsigned long current; 2419 unsigned long *bitmap = block->bmap; 2420 2421 for (current = 0; current < end; ) { 2422 unsigned long one = find_next_bit(bitmap, end, current); 2423 unsigned long zero, discard_length; 2424 2425 if (one >= end) { 2426 break; 2427 } 2428 2429 zero = find_next_zero_bit(bitmap, end, one + 1); 2430 2431 if (zero >= end) { 2432 discard_length = end - one; 2433 } else { 2434 discard_length = zero - one; 2435 } 2436 postcopy_discard_send_range(ms, one, discard_length); 2437 current = one + discard_length; 2438 } 2439 } 2440 2441 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2442 2443 /** 2444 * postcopy_each_ram_send_discard: discard all RAMBlocks 2445 * 2446 * Utility for the outgoing postcopy code. 2447 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2448 * passing it bitmap indexes and name. 2449 * (qemu_ram_foreach_block ends up passing unscaled lengths 2450 * which would mean postcopy code would have to deal with target page) 2451 * 2452 * @ms: current migration state 2453 */ 2454 static void postcopy_each_ram_send_discard(MigrationState *ms) 2455 { 2456 struct RAMBlock *block; 2457 2458 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2459 postcopy_discard_send_init(ms, block->idstr); 2460 2461 /* 2462 * Deal with TPS != HPS and huge pages. It discard any partially sent 2463 * host-page size chunks, mark any partially dirty host-page size 2464 * chunks as all dirty. In this case the host-page is the host-page 2465 * for the particular RAMBlock, i.e. it might be a huge page. 
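         *
         * Example (illustrative): with a 2MB host page and 4KB target
         * pages, host_ratio is 512. If only some of those 512 target
         * pages are dirty, the pass below re-marks the whole 512-page
         * run as dirty so the destination discards and later
         * re-receives the complete host page.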
2466 */ 2467 postcopy_chunk_hostpages_pass(ms, block); 2468 2469 /* 2470 * Postcopy sends chunks of bitmap over the wire, but it 2471 * just needs indexes at this point, avoids it having 2472 * target page specific code. 2473 */ 2474 postcopy_send_discard_bm_ram(ms, block); 2475 postcopy_discard_send_finish(ms); 2476 } 2477 } 2478 2479 /** 2480 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2481 * 2482 * Helper for postcopy_chunk_hostpages; it's called twice to 2483 * canonicalize the two bitmaps, that are similar, but one is 2484 * inverted. 2485 * 2486 * Postcopy requires that all target pages in a hostpage are dirty or 2487 * clean, not a mix. This function canonicalizes the bitmaps. 2488 * 2489 * @ms: current migration state 2490 * @block: block that contains the page we want to canonicalize 2491 */ 2492 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2493 { 2494 RAMState *rs = ram_state; 2495 unsigned long *bitmap = block->bmap; 2496 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2497 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2498 unsigned long run_start; 2499 2500 if (block->page_size == TARGET_PAGE_SIZE) { 2501 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2502 return; 2503 } 2504 2505 /* Find a dirty page */ 2506 run_start = find_next_bit(bitmap, pages, 0); 2507 2508 while (run_start < pages) { 2509 2510 /* 2511 * If the start of this run of pages is in the middle of a host 2512 * page, then we need to fixup this host page. 2513 */ 2514 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2515 /* Find the end of this run */ 2516 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2517 /* 2518 * If the end isn't at the start of a host page, then the 2519 * run doesn't finish at the end of a host page 2520 * and we need to discard. 2521 */ 2522 } 2523 2524 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2525 unsigned long page; 2526 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2527 host_ratio); 2528 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2529 2530 /* Clean up the bitmap */ 2531 for (page = fixup_start_addr; 2532 page < fixup_start_addr + host_ratio; page++) { 2533 /* 2534 * Remark them as dirty, updating the count for any pages 2535 * that weren't previously dirty. 
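                 *
                 * test_and_set_bit() returns the previous value of the
                 * bit, so the counter below is only bumped for pages
                 * that were not already dirty.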
2536 */ 2537 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2538 } 2539 } 2540 2541 /* Find the next dirty page for the next iteration */ 2542 run_start = find_next_bit(bitmap, pages, run_start); 2543 } 2544 } 2545 2546 /** 2547 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2548 * 2549 * Transmit the set of pages to be discarded after precopy to the target 2550 * these are pages that: 2551 * a) Have been previously transmitted but are now dirty again 2552 * b) Pages that have never been transmitted, this ensures that 2553 * any pages on the destination that have been mapped by background 2554 * tasks get discarded (transparent huge pages is the specific concern) 2555 * Hopefully this is pretty sparse 2556 * 2557 * @ms: current migration state 2558 */ 2559 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2560 { 2561 RAMState *rs = ram_state; 2562 2563 RCU_READ_LOCK_GUARD(); 2564 2565 /* This should be our last sync, the src is now paused */ 2566 migration_bitmap_sync(rs, false); 2567 2568 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2569 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2570 rs->last_seen_block = NULL; 2571 rs->last_page = 0; 2572 2573 postcopy_each_ram_send_discard(ms); 2574 2575 trace_ram_postcopy_send_discard_bitmap(); 2576 } 2577 2578 /** 2579 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2580 * 2581 * Returns zero on success 2582 * 2583 * @rbname: name of the RAMBlock of the request. NULL means the 2584 * same that last one. 2585 * @start: RAMBlock starting page 2586 * @length: RAMBlock size 2587 */ 2588 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2589 { 2590 trace_ram_discard_range(rbname, start, length); 2591 2592 RCU_READ_LOCK_GUARD(); 2593 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2594 2595 if (!rb) { 2596 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2597 return -1; 2598 } 2599 2600 /* 2601 * On source VM, we don't need to update the received bitmap since 2602 * we don't even have one. 2603 */ 2604 if (rb->receivedmap) { 2605 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2606 length >> qemu_target_page_bits()); 2607 } 2608 2609 return ram_block_discard_range(rb, start, length); 2610 } 2611 2612 /* 2613 * For every allocation, we will try not to crash the VM if the 2614 * allocation failed. 
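 *
 * Hence the g_try_* allocators below, which return NULL on failure
 * instead of aborting; on any failure we unwind whatever was already
 * allocated and report the problem through errp.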
2615 */ 2616 static bool xbzrle_init(Error **errp) 2617 { 2618 if (!migrate_xbzrle()) { 2619 return true; 2620 } 2621 2622 XBZRLE_cache_lock(); 2623 2624 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2625 if (!XBZRLE.zero_target_page) { 2626 error_setg(errp, "%s: Error allocating zero page", __func__); 2627 goto err_out; 2628 } 2629 2630 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2631 TARGET_PAGE_SIZE, errp); 2632 if (!XBZRLE.cache) { 2633 goto free_zero_page; 2634 } 2635 2636 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2637 if (!XBZRLE.encoded_buf) { 2638 error_setg(errp, "%s: Error allocating encoded_buf", __func__); 2639 goto free_cache; 2640 } 2641 2642 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2643 if (!XBZRLE.current_buf) { 2644 error_setg(errp, "%s: Error allocating current_buf", __func__); 2645 goto free_encoded_buf; 2646 } 2647 2648 /* We are all good */ 2649 XBZRLE_cache_unlock(); 2650 return true; 2651 2652 free_encoded_buf: 2653 g_free(XBZRLE.encoded_buf); 2654 XBZRLE.encoded_buf = NULL; 2655 free_cache: 2656 cache_fini(XBZRLE.cache); 2657 XBZRLE.cache = NULL; 2658 free_zero_page: 2659 g_free(XBZRLE.zero_target_page); 2660 XBZRLE.zero_target_page = NULL; 2661 err_out: 2662 XBZRLE_cache_unlock(); 2663 return false; 2664 } 2665 2666 static bool ram_state_init(RAMState **rsp, Error **errp) 2667 { 2668 *rsp = g_try_new0(RAMState, 1); 2669 2670 if (!*rsp) { 2671 error_setg(errp, "%s: Init ramstate fail", __func__); 2672 return false; 2673 } 2674 2675 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2676 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2677 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2678 (*rsp)->ram_bytes_total = ram_bytes_total(); 2679 2680 /* 2681 * Count the total number of pages used by ram blocks not including any 2682 * gaps due to alignment or unplugs. 2683 * This must match with the initial values of dirty bitmap. 2684 */ 2685 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2686 ram_state_reset(*rsp); 2687 2688 return true; 2689 } 2690 2691 static void ram_list_init_bitmaps(void) 2692 { 2693 MigrationState *ms = migrate_get_current(); 2694 RAMBlock *block; 2695 unsigned long pages; 2696 uint8_t shift; 2697 2698 /* Skip setting bitmap if there is no RAM */ 2699 if (ram_bytes_total()) { 2700 shift = ms->clear_bitmap_shift; 2701 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2702 error_report("clear_bitmap_shift (%u) too big, using " 2703 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2704 shift = CLEAR_BITMAP_SHIFT_MAX; 2705 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2706 error_report("clear_bitmap_shift (%u) too small, using " 2707 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2708 shift = CLEAR_BITMAP_SHIFT_MIN; 2709 } 2710 2711 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2712 pages = block->max_length >> TARGET_PAGE_BITS; 2713 /* 2714 * The initial dirty bitmap for migration must be set with all 2715 * ones to make sure we'll migrate every guest RAM page to 2716 * destination. 2717 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2718 * new migration after a failed migration, ram_list. 2719 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2720 * guest memory. 
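             *
             * Rough sizing example (illustrative): a 4GB RAMBlock with
             * 4KB target pages needs a bmap of 1M bits (about 128KB),
             * while clear_bmap keeps only one bit per
             * (1 << clear_bmap_shift) target pages, so it is a small
             * fraction of that.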
2721 */ 2722 block->bmap = bitmap_new(pages); 2723 bitmap_set(block->bmap, 0, pages); 2724 if (migrate_mapped_ram()) { 2725 block->file_bmap = bitmap_new(pages); 2726 } 2727 block->clear_bmap_shift = shift; 2728 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2729 } 2730 } 2731 } 2732 2733 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2734 { 2735 unsigned long pages; 2736 RAMBlock *rb; 2737 2738 RCU_READ_LOCK_GUARD(); 2739 2740 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2741 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2742 rs->migration_dirty_pages -= pages; 2743 } 2744 } 2745 2746 static bool ram_init_bitmaps(RAMState *rs, Error **errp) 2747 { 2748 bool ret = true; 2749 2750 qemu_mutex_lock_ramlist(); 2751 2752 WITH_RCU_READ_LOCK_GUARD() { 2753 ram_list_init_bitmaps(); 2754 /* We don't use dirty log with background snapshots */ 2755 if (!migrate_background_snapshot()) { 2756 ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp); 2757 if (!ret) { 2758 goto out_unlock; 2759 } 2760 migration_bitmap_sync_precopy(false); 2761 } 2762 } 2763 out_unlock: 2764 qemu_mutex_unlock_ramlist(); 2765 2766 if (!ret) { 2767 ram_bitmaps_destroy(); 2768 return false; 2769 } 2770 2771 /* 2772 * After an eventual first bitmap sync, fixup the initial bitmap 2773 * containing all 1s to exclude any discarded pages from migration. 2774 */ 2775 migration_bitmap_clear_discarded_pages(rs); 2776 return true; 2777 } 2778 2779 static int ram_init_all(RAMState **rsp, Error **errp) 2780 { 2781 if (!ram_state_init(rsp, errp)) { 2782 return -1; 2783 } 2784 2785 if (!xbzrle_init(errp)) { 2786 ram_state_cleanup(rsp); 2787 return -1; 2788 } 2789 2790 if (!ram_init_bitmaps(*rsp, errp)) { 2791 return -1; 2792 } 2793 2794 return 0; 2795 } 2796 2797 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2798 { 2799 RAMBlock *block; 2800 uint64_t pages = 0; 2801 2802 /* 2803 * Postcopy is not using xbzrle/compression, so no need for that. 2804 * Also, since source are already halted, we don't need to care 2805 * about dirty page logging as well. 2806 */ 2807 2808 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2809 pages += bitmap_count_one(block->bmap, 2810 block->used_length >> TARGET_PAGE_BITS); 2811 } 2812 2813 /* This may not be aligned with current bitmaps. Recalculate. */ 2814 rs->migration_dirty_pages = pages; 2815 2816 ram_state_reset(rs); 2817 2818 /* Update RAMState cache of output QEMUFile */ 2819 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2820 2821 trace_ram_state_resume_prepare(pages); 2822 } 2823 2824 /* 2825 * This function clears bits of the free pages reported by the caller from the 2826 * migration dirty bitmap. @addr is the host address corresponding to the 2827 * start of the continuous guest free pages, and @len is the total bytes of 2828 * those pages. 2829 */ 2830 void qemu_guest_free_page_hint(void *addr, size_t len) 2831 { 2832 RAMBlock *block; 2833 ram_addr_t offset; 2834 size_t used_len, start, npages; 2835 2836 /* This function is currently expected to be used during live migration */ 2837 if (!migration_is_running()) { 2838 return; 2839 } 2840 2841 for (; len > 0; len -= used_len, addr += used_len) { 2842 block = qemu_ram_block_from_host(addr, false, &offset); 2843 if (unlikely(!block || offset >= block->used_length)) { 2844 /* 2845 * The implementation might not support RAMBlock resize during 2846 * live migration, but it could happen in theory with future 2847 * updates. So we add a check here to capture that case. 
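             *
             * Note that used_len below clamps each iteration to the end
             * of the current RAMBlock, so a hint that spans more than
             * one block is simply processed block by block.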
2848 */ 2849 error_report_once("%s unexpected error", __func__); 2850 return; 2851 } 2852 2853 if (len <= block->used_length - offset) { 2854 used_len = len; 2855 } else { 2856 used_len = block->used_length - offset; 2857 } 2858 2859 start = offset >> TARGET_PAGE_BITS; 2860 npages = used_len >> TARGET_PAGE_BITS; 2861 2862 qemu_mutex_lock(&ram_state->bitmap_mutex); 2863 /* 2864 * The skipped free pages are equavalent to be sent from clear_bmap's 2865 * perspective, so clear the bits from the memory region bitmap which 2866 * are initially set. Otherwise those skipped pages will be sent in 2867 * the next round after syncing from the memory region bitmap. 2868 */ 2869 migration_clear_memory_region_dirty_bitmap_range(block, start, npages); 2870 ram_state->migration_dirty_pages -= 2871 bitmap_count_one_with_offset(block->bmap, start, npages); 2872 bitmap_clear(block->bmap, start, npages); 2873 qemu_mutex_unlock(&ram_state->bitmap_mutex); 2874 } 2875 } 2876 2877 #define MAPPED_RAM_HDR_VERSION 1 2878 struct MappedRamHeader { 2879 uint32_t version; 2880 /* 2881 * The target's page size, so we know how many pages are in the 2882 * bitmap. 2883 */ 2884 uint64_t page_size; 2885 /* 2886 * The offset in the migration file where the pages bitmap is 2887 * stored. 2888 */ 2889 uint64_t bitmap_offset; 2890 /* 2891 * The offset in the migration file where the actual pages (data) 2892 * are stored. 2893 */ 2894 uint64_t pages_offset; 2895 } QEMU_PACKED; 2896 typedef struct MappedRamHeader MappedRamHeader; 2897 2898 static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block) 2899 { 2900 g_autofree MappedRamHeader *header = NULL; 2901 size_t header_size, bitmap_size; 2902 long num_pages; 2903 2904 header = g_new0(MappedRamHeader, 1); 2905 header_size = sizeof(MappedRamHeader); 2906 2907 num_pages = block->used_length >> TARGET_PAGE_BITS; 2908 bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 2909 2910 /* 2911 * Save the file offsets of where the bitmap and the pages should 2912 * go as they are written at the end of migration and during the 2913 * iterative phase, respectively. 
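     *
     * Layout sketch with made-up offsets: if this header starts at file
     * offset 0x1000, the bitmap lands right after it at
     * 0x1000 + sizeof(MappedRamHeader), and the pages area begins at the
     * next MAPPED_RAM_FILE_OFFSET_ALIGNMENT (1MB) boundary after the
     * bitmap, e.g. 0x100000 for a block of a few gigabytes or less.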
2914 */ 2915 block->bitmap_offset = qemu_get_offset(file) + header_size; 2916 block->pages_offset = ROUND_UP(block->bitmap_offset + 2917 bitmap_size, 2918 MAPPED_RAM_FILE_OFFSET_ALIGNMENT); 2919 2920 header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); 2921 header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); 2922 header->bitmap_offset = cpu_to_be64(block->bitmap_offset); 2923 header->pages_offset = cpu_to_be64(block->pages_offset); 2924 2925 qemu_put_buffer(file, (uint8_t *) header, header_size); 2926 2927 /* prepare offset for next ramblock */ 2928 qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); 2929 } 2930 2931 static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, 2932 Error **errp) 2933 { 2934 size_t ret, header_size = sizeof(MappedRamHeader); 2935 2936 ret = qemu_get_buffer(file, (uint8_t *)header, header_size); 2937 if (ret != header_size) { 2938 error_setg(errp, "Could not read whole mapped-ram migration header " 2939 "(expected %zd, got %zd bytes)", header_size, ret); 2940 return false; 2941 } 2942 2943 /* migration stream is big-endian */ 2944 header->version = be32_to_cpu(header->version); 2945 2946 if (header->version > MAPPED_RAM_HDR_VERSION) { 2947 error_setg(errp, "Migration mapped-ram capability version not " 2948 "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, 2949 header->version); 2950 return false; 2951 } 2952 2953 header->page_size = be64_to_cpu(header->page_size); 2954 header->bitmap_offset = be64_to_cpu(header->bitmap_offset); 2955 header->pages_offset = be64_to_cpu(header->pages_offset); 2956 2957 return true; 2958 } 2959 2960 /* 2961 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2962 * long-running RCU critical section. When rcu-reclaims in the code 2963 * start to become numerous it will be necessary to reduce the 2964 * granularity of these critical sections. 2965 */ 2966 2967 /** 2968 * ram_save_setup: Setup RAM for migration 2969 * 2970 * Returns zero to indicate success and negative for error 2971 * 2972 * @f: QEMUFile where to send the data 2973 * @opaque: RAMState pointer 2974 * @errp: pointer to Error*, to store an error if it happens. 2975 */ 2976 static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) 2977 { 2978 RAMState **rsp = opaque; 2979 RAMBlock *block; 2980 int ret, max_hg_page_size; 2981 2982 /* migration has already setup the bitmap, reuse it. */ 2983 if (!migration_in_colo_state()) { 2984 if (ram_init_all(rsp, errp) != 0) { 2985 return -1; 2986 } 2987 } 2988 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 2989 2990 /* 2991 * ??? Mirrors the previous value of qemu_host_page_size, 2992 * but is this really what was intended for the migration? 
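     *
     * Concretely (illustrative): on a 64KB-page aarch64 host with a 4KB
     * target page size this evaluates to 64KB, so with postcopy enabled
     * any RAMBlock whose page_size differs from that value has its page
     * size put on the wire below.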
2993 */ 2994 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 2995 2996 WITH_RCU_READ_LOCK_GUARD() { 2997 qemu_put_be64(f, ram_bytes_total_with_ignored() 2998 | RAM_SAVE_FLAG_MEM_SIZE); 2999 3000 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3001 qemu_put_byte(f, strlen(block->idstr)); 3002 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3003 qemu_put_be64(f, block->used_length); 3004 if (migrate_postcopy_ram() && 3005 block->page_size != max_hg_page_size) { 3006 qemu_put_be64(f, block->page_size); 3007 } 3008 if (migrate_ignore_shared()) { 3009 qemu_put_be64(f, block->mr->addr); 3010 } 3011 3012 if (migrate_mapped_ram()) { 3013 mapped_ram_setup_ramblock(f, block); 3014 } 3015 } 3016 } 3017 3018 ret = rdma_registration_start(f, RAM_CONTROL_SETUP); 3019 if (ret < 0) { 3020 error_setg(errp, "%s: failed to start RDMA registration", __func__); 3021 qemu_file_set_error(f, ret); 3022 return ret; 3023 } 3024 3025 ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); 3026 if (ret < 0) { 3027 error_setg(errp, "%s: failed to stop RDMA registration", __func__); 3028 qemu_file_set_error(f, ret); 3029 return ret; 3030 } 3031 3032 migration_ops = g_malloc0(sizeof(MigrationOps)); 3033 3034 if (migrate_multifd()) { 3035 multifd_ram_save_setup(); 3036 migration_ops->ram_save_target_page = ram_save_target_page_multifd; 3037 } else { 3038 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3039 } 3040 3041 /* 3042 * This operation is unfortunate.. 3043 * 3044 * For legacy QEMUs using per-section sync 3045 * ======================================= 3046 * 3047 * This must exist because the EOS below requires the SYNC messages 3048 * per-channel to work. 3049 * 3050 * For modern QEMUs using per-round sync 3051 * ===================================== 3052 * 3053 * Logically such sync is not needed, and recv threads should not run 3054 * until setup ready (using things like channels_ready on src). Then 3055 * we should be all fine. 3056 * 3057 * However even if we add channels_ready to recv side in new QEMUs, old 3058 * QEMU won't have them so this sync will still be needed to make sure 3059 * multifd recv threads won't start processing guest pages early before 3060 * ram_load_setup() is properly done. 3061 * 3062 * Let's stick with this. Fortunately the overhead is low to sync 3063 * during setup because the VM is running, so at least it's not 3064 * accounted as part of downtime. 3065 */ 3066 bql_unlock(); 3067 ret = multifd_ram_flush_and_sync(f); 3068 bql_lock(); 3069 if (ret < 0) { 3070 error_setg(errp, "%s: multifd synchronization failed", __func__); 3071 return ret; 3072 } 3073 3074 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3075 ret = qemu_fflush(f); 3076 if (ret < 0) { 3077 error_setg_errno(errp, -ret, "%s failed", __func__); 3078 } 3079 return ret; 3080 } 3081 3082 static void ram_save_file_bmap(QEMUFile *f) 3083 { 3084 RAMBlock *block; 3085 3086 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3087 long num_pages = block->used_length >> TARGET_PAGE_BITS; 3088 long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3089 3090 qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, 3091 block->bitmap_offset); 3092 ram_transferred_add(bitmap_size); 3093 3094 /* 3095 * Free the bitmap here to catch any synchronization issues 3096 * with multifd channels. No channels should be sending pages 3097 * after we've written the bitmap to file. 
3098 */ 3099 g_free(block->file_bmap); 3100 block->file_bmap = NULL; 3101 } 3102 } 3103 3104 void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) 3105 { 3106 if (set) { 3107 set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3108 } else { 3109 clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3110 } 3111 } 3112 3113 /** 3114 * ram_save_iterate: iterative stage for migration 3115 * 3116 * Returns zero to indicate success and negative for error 3117 * 3118 * @f: QEMUFile where to send the data 3119 * @opaque: RAMState pointer 3120 */ 3121 static int ram_save_iterate(QEMUFile *f, void *opaque) 3122 { 3123 RAMState **temp = opaque; 3124 RAMState *rs = *temp; 3125 int ret = 0; 3126 int i; 3127 int64_t t0; 3128 int done = 0; 3129 3130 /* 3131 * We'll take this lock a little bit long, but it's okay for two reasons. 3132 * Firstly, the only possible other thread to take it is who calls 3133 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3134 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3135 * guarantees that we'll at least released it in a regular basis. 3136 */ 3137 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 3138 WITH_RCU_READ_LOCK_GUARD() { 3139 if (ram_list.version != rs->last_version) { 3140 ram_state_reset(rs); 3141 } 3142 3143 /* Read version before ram_list.blocks */ 3144 smp_rmb(); 3145 3146 ret = rdma_registration_start(f, RAM_CONTROL_ROUND); 3147 if (ret < 0) { 3148 qemu_file_set_error(f, ret); 3149 goto out; 3150 } 3151 3152 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3153 i = 0; 3154 while ((ret = migration_rate_exceeded(f)) == 0 || 3155 postcopy_has_request(rs)) { 3156 int pages; 3157 3158 if (qemu_file_get_error(f)) { 3159 break; 3160 } 3161 3162 pages = ram_find_and_save_block(rs); 3163 /* no more pages to sent */ 3164 if (pages == 0) { 3165 done = 1; 3166 break; 3167 } 3168 3169 if (pages < 0) { 3170 qemu_file_set_error(f, pages); 3171 break; 3172 } 3173 3174 rs->target_page_count += pages; 3175 3176 /* 3177 * we want to check in the 1st loop, just in case it was the 1st 3178 * time and we had to sync the dirty bitmap. 3179 * qemu_clock_get_ns() is a bit expensive, so we only check each 3180 * some iterations 3181 */ 3182 if ((i & 63) == 0) { 3183 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3184 1000000; 3185 if (t1 > MAX_WAIT) { 3186 trace_ram_save_iterate_big_wait(t1, i); 3187 break; 3188 } 3189 } 3190 i++; 3191 } 3192 } 3193 } 3194 3195 /* 3196 * Must occur before EOS (or any QEMUFile operation) 3197 * because of RDMA protocol. 
3198 */ 3199 ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); 3200 if (ret < 0) { 3201 qemu_file_set_error(f, ret); 3202 } 3203 3204 out: 3205 if (ret >= 0 && migration_is_running()) { 3206 if (multifd_ram_sync_per_section()) { 3207 ret = multifd_ram_flush_and_sync(f); 3208 if (ret < 0) { 3209 return ret; 3210 } 3211 } 3212 3213 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3214 ram_transferred_add(8); 3215 ret = qemu_fflush(f); 3216 } 3217 if (ret < 0) { 3218 return ret; 3219 } 3220 3221 return done; 3222 } 3223 3224 /** 3225 * ram_save_complete: function called to send the remaining amount of ram 3226 * 3227 * Returns zero to indicate success or negative on error 3228 * 3229 * Called with the BQL 3230 * 3231 * @f: QEMUFile where to send the data 3232 * @opaque: RAMState pointer 3233 */ 3234 static int ram_save_complete(QEMUFile *f, void *opaque) 3235 { 3236 RAMState **temp = opaque; 3237 RAMState *rs = *temp; 3238 int ret = 0; 3239 3240 rs->last_stage = !migration_in_colo_state(); 3241 3242 WITH_RCU_READ_LOCK_GUARD() { 3243 if (!migration_in_postcopy()) { 3244 migration_bitmap_sync_precopy(true); 3245 } 3246 3247 ret = rdma_registration_start(f, RAM_CONTROL_FINISH); 3248 if (ret < 0) { 3249 qemu_file_set_error(f, ret); 3250 return ret; 3251 } 3252 3253 /* try transferring iterative blocks of memory */ 3254 3255 /* flush all remaining blocks regardless of rate limiting */ 3256 qemu_mutex_lock(&rs->bitmap_mutex); 3257 while (true) { 3258 int pages; 3259 3260 pages = ram_find_and_save_block(rs); 3261 /* no more blocks to sent */ 3262 if (pages == 0) { 3263 break; 3264 } 3265 if (pages < 0) { 3266 qemu_mutex_unlock(&rs->bitmap_mutex); 3267 return pages; 3268 } 3269 } 3270 qemu_mutex_unlock(&rs->bitmap_mutex); 3271 3272 ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); 3273 if (ret < 0) { 3274 qemu_file_set_error(f, ret); 3275 return ret; 3276 } 3277 } 3278 3279 if (multifd_ram_sync_per_section()) { 3280 /* 3281 * Only the old dest QEMU will need this sync, because each EOS 3282 * will require one SYNC message on each channel. 
3283 */ 3284 ret = multifd_ram_flush_and_sync(f); 3285 if (ret < 0) { 3286 return ret; 3287 } 3288 } 3289 3290 if (migrate_mapped_ram()) { 3291 ram_save_file_bmap(f); 3292 3293 if (qemu_file_get_error(f)) { 3294 Error *local_err = NULL; 3295 int err = qemu_file_get_error_obj(f, &local_err); 3296 3297 error_reportf_err(local_err, "Failed to write bitmap to file: "); 3298 return -err; 3299 } 3300 } 3301 3302 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3303 return qemu_fflush(f); 3304 } 3305 3306 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3307 uint64_t *can_postcopy) 3308 { 3309 RAMState **temp = opaque; 3310 RAMState *rs = *temp; 3311 3312 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3313 3314 if (migrate_postcopy_ram()) { 3315 /* We can do postcopy, and all the data is postcopiable */ 3316 *can_postcopy += remaining_size; 3317 } else { 3318 *must_precopy += remaining_size; 3319 } 3320 } 3321 3322 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3323 uint64_t *can_postcopy) 3324 { 3325 RAMState **temp = opaque; 3326 RAMState *rs = *temp; 3327 uint64_t remaining_size; 3328 3329 if (!migration_in_postcopy()) { 3330 bql_lock(); 3331 WITH_RCU_READ_LOCK_GUARD() { 3332 migration_bitmap_sync_precopy(false); 3333 } 3334 bql_unlock(); 3335 } 3336 3337 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3338 3339 if (migrate_postcopy_ram()) { 3340 /* We can do postcopy, and all the data is postcopiable */ 3341 *can_postcopy += remaining_size; 3342 } else { 3343 *must_precopy += remaining_size; 3344 } 3345 } 3346 3347 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3348 { 3349 unsigned int xh_len; 3350 int xh_flags; 3351 uint8_t *loaded_data; 3352 3353 /* extract RLE header */ 3354 xh_flags = qemu_get_byte(f); 3355 xh_len = qemu_get_be16(f); 3356 3357 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3358 error_report("Failed to load XBZRLE page - wrong compression!"); 3359 return -1; 3360 } 3361 3362 if (xh_len > TARGET_PAGE_SIZE) { 3363 error_report("Failed to load XBZRLE page - len overflow!"); 3364 return -1; 3365 } 3366 loaded_data = XBZRLE.decoded_buf; 3367 /* load data and decode */ 3368 /* it can change loaded_data to point to an internal buffer */ 3369 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3370 3371 /* decode RLE */ 3372 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3373 TARGET_PAGE_SIZE) == -1) { 3374 error_report("Failed to load XBZRLE page - decode error!"); 3375 return -1; 3376 } 3377 3378 return 0; 3379 } 3380 3381 /** 3382 * ram_block_from_stream: read a RAMBlock id from the migration stream 3383 * 3384 * Must be called from within a rcu critical section. 3385 * 3386 * Returns a pointer from within the RCU-protected ram_list. 
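 *
 * Stream encoding handled here: when RAM_SAVE_FLAG_CONTINUE is set the
 * block name is omitted and the last block received on this channel is
 * reused; otherwise a one-byte length is followed by the RAMBlock idstr.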
3387 * 3388 * @mis: the migration incoming state pointer 3389 * @f: QEMUFile where to read the data from 3390 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3391 * @channel: the channel we're using 3392 */ 3393 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3394 QEMUFile *f, int flags, 3395 int channel) 3396 { 3397 RAMBlock *block = mis->last_recv_block[channel]; 3398 char id[256]; 3399 uint8_t len; 3400 3401 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3402 if (!block) { 3403 error_report("Ack, bad migration stream!"); 3404 return NULL; 3405 } 3406 return block; 3407 } 3408 3409 len = qemu_get_byte(f); 3410 qemu_get_buffer(f, (uint8_t *)id, len); 3411 id[len] = 0; 3412 3413 block = qemu_ram_block_by_name(id); 3414 if (!block) { 3415 error_report("Can't find block %s", id); 3416 return NULL; 3417 } 3418 3419 if (migrate_ram_is_ignored(block)) { 3420 error_report("block %s should not be migrated !", id); 3421 return NULL; 3422 } 3423 3424 mis->last_recv_block[channel] = block; 3425 3426 return block; 3427 } 3428 3429 static inline void *host_from_ram_block_offset(RAMBlock *block, 3430 ram_addr_t offset) 3431 { 3432 if (!offset_in_ramblock(block, offset)) { 3433 return NULL; 3434 } 3435 3436 return block->host + offset; 3437 } 3438 3439 static void *host_page_from_ram_block_offset(RAMBlock *block, 3440 ram_addr_t offset) 3441 { 3442 /* Note: Explicitly no check against offset_in_ramblock(). */ 3443 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3444 block->page_size); 3445 } 3446 3447 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3448 ram_addr_t offset) 3449 { 3450 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3451 } 3452 3453 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3454 { 3455 qemu_mutex_lock(&ram_state->bitmap_mutex); 3456 for (int i = 0; i < pages; i++) { 3457 ram_addr_t offset = normal[i]; 3458 ram_state->migration_dirty_pages += !test_and_set_bit( 3459 offset >> TARGET_PAGE_BITS, 3460 block->bmap); 3461 } 3462 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3463 } 3464 3465 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3466 ram_addr_t offset, bool record_bitmap) 3467 { 3468 if (!offset_in_ramblock(block, offset)) { 3469 return NULL; 3470 } 3471 if (!block->colo_cache) { 3472 error_report("%s: colo_cache is NULL in block :%s", 3473 __func__, block->idstr); 3474 return NULL; 3475 } 3476 3477 /* 3478 * During colo checkpoint, we need bitmap of these migrated pages. 3479 * It help us to decide which pages in ram cache should be flushed 3480 * into VM's RAM later. 3481 */ 3482 if (record_bitmap) { 3483 colo_record_bitmap(block, &offset, 1); 3484 } 3485 return block->colo_cache + offset; 3486 } 3487 3488 /** 3489 * ram_handle_zero: handle the zero page case 3490 * 3491 * If a page (or a whole RDMA chunk) has been 3492 * determined to be zero, then zap it. 3493 * 3494 * @host: host address for the zero page 3495 * @ch: what the page is filled from. 
We only support zero
 * @size: size of the zero page
 */
void ram_handle_zero(void *host, uint64_t size)
{
    if (!buffer_is_zero(host, size)) {
        memset(host, 0, size);
    }
}

static void colo_init_ram_state(void)
{
    Error *local_err = NULL;

    if (!ram_state_init(&ram_state, &local_err)) {
        error_report_err(local_err);
    }
}

/*
 * colo cache: this is for the secondary VM, we cache the whole
 * memory of the secondary VM. The global lock must be held to
 * call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
            if (!machine_dump_guest_core(current_machine)) {
                qemu_madvise(block->colo_cache, block->used_length,
                             QEMU_MADV_DONTDUMP);
            }
        }
    }

    /*
     * Record the dirty pages that were sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    Error *local_err = NULL;

    /* For memory_global_dirty_log_start below. */
    bql_lock();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync(false);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION,
                                           &local_err)) {
            error_report_err(local_err);
        }
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    bql_unlock();
}

/* The global lock must be held to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 * @errp: pointer to Error*, to store an error if it happens.
3619 */ 3620 static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp) 3621 { 3622 xbzrle_load_setup(); 3623 ramblock_recv_map_init(); 3624 3625 return 0; 3626 } 3627 3628 static int ram_load_cleanup(void *opaque) 3629 { 3630 RAMBlock *rb; 3631 3632 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3633 qemu_ram_block_writeback(rb); 3634 } 3635 3636 xbzrle_load_cleanup(); 3637 3638 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3639 g_free(rb->receivedmap); 3640 rb->receivedmap = NULL; 3641 } 3642 3643 return 0; 3644 } 3645 3646 /** 3647 * ram_postcopy_incoming_init: allocate postcopy data structures 3648 * 3649 * Returns 0 for success and negative if there was one error 3650 * 3651 * @mis: current migration incoming state 3652 * 3653 * Allocate data structures etc needed by incoming migration with 3654 * postcopy-ram. postcopy-ram's similarly names 3655 * postcopy_ram_incoming_init does the work. 3656 */ 3657 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3658 { 3659 return postcopy_ram_incoming_init(mis); 3660 } 3661 3662 /** 3663 * ram_load_postcopy: load a page in postcopy case 3664 * 3665 * Returns 0 for success or -errno in case of error 3666 * 3667 * Called in postcopy mode by ram_load(). 3668 * rcu_read_lock is taken prior to this being called. 3669 * 3670 * @f: QEMUFile where to send the data 3671 * @channel: the channel to use for loading 3672 */ 3673 int ram_load_postcopy(QEMUFile *f, int channel) 3674 { 3675 int flags = 0, ret = 0; 3676 bool place_needed = false; 3677 bool matches_target_page_size = false; 3678 MigrationIncomingState *mis = migration_incoming_get_current(); 3679 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3680 3681 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3682 ram_addr_t addr; 3683 void *page_buffer = NULL; 3684 void *place_source = NULL; 3685 RAMBlock *block = NULL; 3686 uint8_t ch; 3687 3688 addr = qemu_get_be64(f); 3689 3690 /* 3691 * If qemu file error, we should stop here, and then "addr" 3692 * may be invalid 3693 */ 3694 ret = qemu_file_get_error(f); 3695 if (ret) { 3696 break; 3697 } 3698 3699 flags = addr & ~TARGET_PAGE_MASK; 3700 addr &= TARGET_PAGE_MASK; 3701 3702 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3703 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { 3704 block = ram_block_from_stream(mis, f, flags, channel); 3705 if (!block) { 3706 ret = -EINVAL; 3707 break; 3708 } 3709 3710 /* 3711 * Relying on used_length is racy and can result in false positives. 3712 * We might place pages beyond used_length in case RAM was shrunk 3713 * while in postcopy, which is fine - trying to place via 3714 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3715 */ 3716 if (!block->host || addr >= block->postcopy_length) { 3717 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3718 ret = -EINVAL; 3719 break; 3720 } 3721 tmp_page->target_pages++; 3722 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3723 /* 3724 * Postcopy requires that we place whole host pages atomically; 3725 * these may be huge pages for RAMBlocks that are backed by 3726 * hugetlbfs. 3727 * To make it atomic, the data is read into a temporary page 3728 * that's moved into place later. 3729 * The migration protocol uses, possibly smaller, target-pages 3730 * however the source ensures it always sends all the components 3731 * of a host page in one chunk. 
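             *
             * Example (illustrative): for a 2MB hugetlbfs host page and
             * 4KB target pages, 512 target pages are accumulated in
             * tmp_huge_page; once target_pages reaches
             * block->page_size / TARGET_PAGE_SIZE, the whole host page
             * is placed in one go further down.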
3732 */ 3733 page_buffer = tmp_page->tmp_huge_page + 3734 host_page_offset_from_ram_block_offset(block, addr); 3735 /* If all TP are zero then we can optimise the place */ 3736 if (tmp_page->target_pages == 1) { 3737 tmp_page->host_addr = 3738 host_page_from_ram_block_offset(block, addr); 3739 } else if (tmp_page->host_addr != 3740 host_page_from_ram_block_offset(block, addr)) { 3741 /* not the 1st TP within the HP */ 3742 error_report("Non-same host page detected on channel %d: " 3743 "Target host page %p, received host page %p " 3744 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3745 channel, tmp_page->host_addr, 3746 host_page_from_ram_block_offset(block, addr), 3747 block->idstr, addr, tmp_page->target_pages); 3748 ret = -EINVAL; 3749 break; 3750 } 3751 3752 /* 3753 * If it's the last part of a host page then we place the host 3754 * page 3755 */ 3756 if (tmp_page->target_pages == 3757 (block->page_size / TARGET_PAGE_SIZE)) { 3758 place_needed = true; 3759 } 3760 place_source = tmp_page->tmp_huge_page; 3761 } 3762 3763 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3764 case RAM_SAVE_FLAG_ZERO: 3765 ch = qemu_get_byte(f); 3766 if (ch != 0) { 3767 error_report("Found a zero page with value %d", ch); 3768 ret = -EINVAL; 3769 break; 3770 } 3771 /* 3772 * Can skip to set page_buffer when 3773 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3774 */ 3775 if (!matches_target_page_size) { 3776 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3777 } 3778 break; 3779 3780 case RAM_SAVE_FLAG_PAGE: 3781 tmp_page->all_zero = false; 3782 if (!matches_target_page_size) { 3783 /* For huge pages, we always use temporary buffer */ 3784 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3785 } else { 3786 /* 3787 * For small pages that matches target page size, we 3788 * avoid the qemu_file copy. Instead we directly use 3789 * the buffer of QEMUFile to place the page. Note: we 3790 * cannot do any QEMUFile operation before using that 3791 * buffer to make sure the buffer is valid when 3792 * placing the page. 3793 */ 3794 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3795 TARGET_PAGE_SIZE); 3796 } 3797 break; 3798 case RAM_SAVE_FLAG_EOS: 3799 break; 3800 default: 3801 error_report("Unknown combination of migration flags: 0x%x" 3802 " (postcopy mode)", flags); 3803 ret = -EINVAL; 3804 break; 3805 } 3806 3807 /* Detect for any possible file errors */ 3808 if (!ret && qemu_file_get_error(f)) { 3809 ret = qemu_file_get_error(f); 3810 } 3811 3812 if (!ret && place_needed) { 3813 if (tmp_page->all_zero) { 3814 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3815 } else { 3816 ret = postcopy_place_page(mis, tmp_page->host_addr, 3817 place_source, block); 3818 } 3819 place_needed = false; 3820 postcopy_temp_page_reset(tmp_page); 3821 } 3822 } 3823 3824 return ret; 3825 } 3826 3827 static bool postcopy_is_running(void) 3828 { 3829 PostcopyState ps = postcopy_state_get(); 3830 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3831 } 3832 3833 /* 3834 * Flush content of RAM cache into SVM's memory. 3835 * Only flush the pages that be dirtied by PVM or SVM or both. 
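 *
 * The loop below walks the dirty bitmap in runs: colo_bitmap_find_dirty()
 * returns the start of a run of dirty pages and its length, the bits are
 * cleared, and the run is copied from block->colo_cache into the SVM's
 * memory with a single memcpy().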
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync(false);
    qemu_mutex_lock(&ram_state->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                           + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    qemu_mutex_unlock(&ram_state->bitmap_mutex);
    trace_colo_flush_ram_cache_end();
}

static size_t ram_load_multifd_pages(void *host_addr, size_t size,
                                     uint64_t offset)
{
    MultiFDRecvData *data = multifd_get_recv_data();

    data->opaque = host_addr;
    data->file_offset = offset;
    data->size = size;

    if (!multifd_recv()) {
        return 0;
    }

    return size;
}

static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
                                     long num_pages, unsigned long *bitmap,
                                     Error **errp)
{
    ERRP_GUARD();
    unsigned long set_bit_idx, clear_bit_idx;
    ram_addr_t offset;
    void *host;
    size_t read, unread, size;

    for (set_bit_idx = find_first_bit(bitmap, num_pages);
         set_bit_idx < num_pages;
         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {

        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);

        unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
        offset = set_bit_idx << TARGET_PAGE_BITS;

        while (unread > 0) {
            host = host_from_ram_block_offset(block, offset);
            if (!host) {
                error_setg(errp, "page outside of ramblock %s range",
                           block->idstr);
                return false;
            }

            size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE);

            if (migrate_multifd()) {
                read = ram_load_multifd_pages(host, size,
                                              block->pages_offset + offset);
            } else {
                read = qemu_get_buffer_at(f, host, size,
                                          block->pages_offset + offset);
            }

            if (!read) {
                goto err;
            }
            offset += read;
            unread -= read;
        }
    }

    return true;

err:
    qemu_file_get_error_obj(f, errp);
    error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT
                  " from file offset %" PRIx64 ": ", block->idstr, offset,
                  block->pages_offset + offset);
    return false;
}

static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
                                      ram_addr_t length, Error **errp)
{
    g_autofree unsigned long *bitmap = NULL;
    MappedRamHeader header;
    size_t bitmap_size;
    long num_pages;

    if (!mapped_ram_read_header(f, &header, errp)) {
        return;
    }

    block->pages_offset = header.pages_offset;
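
    /*
     * The mapped-ram header for this ramblock records the file offsets of
     * its dirty bitmap and of its pages region; both are used below.
     */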

    /*
     * Check the alignment of the file region that contains pages. We
     * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that
     * value to change in the future. Do only a sanity check with page
     * size alignment.
     */
    if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) {
        error_setg(errp,
                   "Error reading ramblock %s pages, region has bad alignment",
                   block->idstr);
        return;
    }

    num_pages = length / header.page_size;
    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

    bitmap = g_malloc0(bitmap_size);
    if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
                           header.bitmap_offset) != bitmap_size) {
        error_setg(errp, "Error reading dirty bitmap");
        return;
    }

    if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) {
        return;
    }

    /* Skip pages array */
    qemu_set_offset(f, block->pages_offset + length, SEEK_SET);

    return;
}

static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
{
    int ret = 0;
    /* ADVISE was sent earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    int max_hg_page_size;
    Error *local_err = NULL;

    assert(block);

    if (migrate_mapped_ram()) {
        parse_ramblock_mapped_ram(f, block, length, &local_err);
        if (local_err) {
            error_report_err(local_err);
            return -EINVAL;
        }
        return 0;
    }

    if (!qemu_ram_is_migratable(block)) {
        error_report("block %s should not be migrated!", block->idstr);
        return -EINVAL;
    }

    if (length != block->used_length) {
        ret = qemu_ram_resize(block, length, &local_err);
        if (local_err) {
            error_report_err(local_err);
            return ret;
        }
    }

    /*
     * ??? Mirrors the previous value of qemu_host_page_size,
     * but is this really what was intended for the migration?
     */
    max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);

    /* For postcopy we need to check hugepage sizes match */
    if (postcopy_advised && migrate_postcopy_ram() &&
        block->page_size != max_hg_page_size) {
        uint64_t remote_page_size = qemu_get_be64(f);
        if (remote_page_size != block->page_size) {
            error_report("Mismatched RAM page size %s "
                         "(local) %zd != %" PRId64, block->idstr,
                         block->page_size, remote_page_size);
            return -EINVAL;
        }
    }
    if (migrate_ignore_shared()) {
        hwaddr addr = qemu_get_be64(f);
        if (migrate_ram_is_ignored(block) &&
            block->mr->addr != addr) {
            error_report("Mismatched GPAs for block %s "
                         "%" PRId64 " != %" PRId64, block->idstr,
                         (uint64_t)addr, (uint64_t)block->mr->addr);
            return -EINVAL;
        }
    }
    ret = rdma_block_notification_handle(f, block->idstr);
    if (ret < 0) {
        qemu_file_set_error(f, ret);
    }

    return ret;
}

static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes)
{
    int ret = 0;

    /* Synchronize RAM block list */
    while (!ret && total_ram_bytes) {
        RAMBlock *block;
        char id[256];
        ram_addr_t length;
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)id, len);
        id[len] = 0;
        length = qemu_get_be64(f);

        block = qemu_ram_block_by_name(id);
        if (block) {
            ret = parse_ramblock(f, block, length);
        } else {
            error_report("Unknown ramblock \"%s\", cannot accept "
                         "migration", id);
            ret = -EINVAL;
        }
        total_ram_bytes -= length;
    }

    return ret;
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
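 *
 * Also used to load a mapped-ram migration file; in that case the
 * stream-only page flags are rejected via invalid_flags below.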
 *
 * @f: QEMUFile to read the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, i = 0;

    if (migrate_mapped_ram()) {
        invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH |
                          RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE |
                          RAM_SAVE_FLAG_ZERO);
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so do it only once in a while.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            error_report("Getting RAM address failed");
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            error_report("Unexpected RAM flags: %d", flags & invalid_flags);

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After entering the COLO stage, we should not load pages
             * into the SVM's memory directly; we put them into colo_cache
             * first.
             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage, during which the VM had to be stopped, which is
             * time-consuming. Here we optimize it by backing up every page
             * during migration while COLO is enabled. This slows the
             * migration down somewhat, but it clearly reduces the downtime
             * of backing up all of the SVM's memory in the COLO preparation
             * stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both the cache and the SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            ret = parse_ramblocks(f, addr);
            /*
             * For mapped-ram migration (to a file) using multifd, we sync
             * once and for all here to make sure all tasks we queued to
             * multifd threads are completed, so that all the ramblocks
             * (including all the guest memory pages within) are fully
             * loaded after this sync returns.
             */
            if (migrate_mapped_ram()) {
                multifd_recv_sync_main();
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            if (ch != 0) {
                error_report("Found a zero page with value %d", ch);
                ret = -EINVAL;
                break;
            }
            ram_handle_zero(host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd() &&
                migrate_multifd_flush_after_each_section() &&
                /*
                 * Mapped-ram migration flushes once and for all after
                 * parsing ramblocks. Always ignore EOS for it.
                 */
                !migrate_mapped_ram()) {
                multifd_recv_sync_main();
            }
            break;
        case RAM_SAVE_FLAG_HOOK:
            ret = rdma_registration_handle(f);
            if (ret < 0) {
                qemu_file_set_error(f, ret);
            }
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x", flags);
            ret = -EINVAL;
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in this code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    trace_ram_load_start();
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/*
 * Sync all the dirty bitmaps with the destination VM.
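 * Called when preparing to resume a paused postcopy migration: ask the
 * destination to send back its received bitmap for every ramblock, then
 * wait until every reply has been handled by ram_dirty_bitmap_reload().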
 */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;

    trace_ram_dirty_bitmap_sync_start();

    qatomic_set(&rs->postcopy_bmap_sync_requested, 0);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        qatomic_inc(&rs->postcopy_bmap_sync_requested);
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
        if (migration_rp_wait(s)) {
            return -1;
        }
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

/*
 * Read the received bitmap and invert it to use as the initial dirty
 * bitmap.  This is only used when the postcopy migration is paused but
 * wants to resume from a middle point.
 *
 * Returns true on success, false on error.
 */
bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp)
{
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    g_autofree unsigned long *le_bitmap = NULL;
    unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;
    RAMState *rs = ram_state;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_setg(errp, "Reload bitmap in incorrect state %s",
                   MigrationStatus_str(s->state));
        return false;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64
                   " != 0x%"PRIx64")", block->idstr, size, local_size);
        return false;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    if (qemu_file_get_error(file) || size != local_size) {
        error_setg(errp, "read bitmap failed for ramblock '%s': "
                   "(size 0x%"PRIx64", got: 0x%"PRIx64")",
                   block->idstr, local_size, size);
        return false;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64,
                   block->idstr, end_mark);
        return false;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Invert it to form the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /*
     * We'll recalculate migration_dirty_pages in ram_state_resume_prepare().
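     * After the complement above, the bitmap marks exactly the pages the
     * destination has not yet received, i.e. the pages that still need to
     * be sent once the migration resumes.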
     */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    qatomic_dec(&rs->postcopy_bmap_sync_requested);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. Always
     * kick the migration thread to check whether all requested bitmaps are
     * reloaded.  NOTE: it's racy to only kick when requested==0, because
     * we don't know whether the migration thread may still be increasing
     * it.
     */
    migration_rp_kick(s);

    return true;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (!rb) {
        error_report("RAM block not found");
        return;
    }

    if (migrate_ram_is_ignored(rb)) {
        return;
    }

    if (migration_is_running()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handling is needed.
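         * (If a migration was still running, the resize already triggered
         * migration_cancel() above.)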
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}