/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/madvise.h"
#include "qemu/main-loop.h"
#include "xbzrle.h"
#include "ram.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "migration/misc.h"
#include "qemu-file.h"
#include "postcopy-ram.h"
#include "page_cache.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qerror.h"
#include "trace.h"
#include "exec/ram_addr.h"
#include "exec/target_page.h"
#include "qemu/rcu_queue.h"
#include "migration/colo.h"
#include "system/cpu-throttle.h"
#include "savevm.h"
#include "qemu/iov.h"
#include "multifd.h"
#include "system/runstate.h"
#include "rdma.h"
#include "options.h"
#include "system/dirtylimit.h"
#include "system/kvm.h"

#include "hw/boards.h" /* for machine_dump_guest_core() */

#if defined(__linux__)
#include "qemu/userfaultfd.h"
#endif /* defined(__linux__) */

/***********************************************************/
/* ram save/restore */

/*
 * mapped-ram migration supports O_DIRECT, so we need to make sure the
 * userspace buffer, the IO operation size and the file offset are
 * aligned according to the underlying device's block size. The first
 * two are already aligned to page size, but we need to add padding to
 * the file to align the offset. We cannot read the block size
 * dynamically because the migration file can be moved between
 * different systems, so use 1M to cover most block sizes and to keep
 * the file offset aligned at page size as well.
 */
#define MAPPED_RAM_FILE_OFFSET_ALIGNMENT 0x100000

/*
 * When doing mapped-ram migration, this is the amount we read from
 * the pages region in the migration file at a time.
 */
#define MAPPED_RAM_LOAD_BUF_SIZE 0x100000
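
/*
 * Example of the padding this implies: a stream header ending at file
 * offset 0x1234 would have the pages region start at
 * ROUND_UP(0x1234, MAPPED_RAM_FILE_OFFSET_ALIGNMENT) == 0x100000, which
 * keeps every O_DIRECT access both block- and page-aligned.
 */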

XBZRLECacheStats xbzrle_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* The migration channel used for a specific host page */
    QEMUFile *pss_channel;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Current block being searched */
    RAMBlock *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool complete_round;
    /* Whether we're sending a host page */
    bool host_page_sending;
    /* The start/end of current host page.  Invalid if host_page_sending==false */
    unsigned long host_page_start;
    unsigned long host_page_end;
};
typedef struct PageSearchStatus PageSearchStatus;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
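
/*
 * The usual pattern in the code below is to bracket cache updates with
 * these helpers:
 *
 *     XBZRLE_cache_lock();
 *     ... cache_insert() / cache_is_cached() / get_cached_data() ...
 *     XBZRLE_cache_unlock();
 *
 * When XBZRLE is disabled both helpers are no-ops, so callers don't need
 * to check migrate_xbzrle() themselves.
 */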

/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in main
 * thread, possibly while a migration is in progress. A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}

static bool postcopy_preempt_active(void)
{
    return migrate_postcopy_preempt() && migration_in_postcopy();
}

bool migrate_ram_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block)
            && qemu_ram_is_named_file(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset)
{
    set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}
#define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)

/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->postcopy_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required so that it keeps working when the source and destination
     * VMs are not using the same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    g_free(le_bitmap);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    int ret = qemu_fflush(file);
    if (ret) {
        return ret;
    }

    return size + sizeof(size);
}

/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr offset;
    hwaddr len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /*
     * PageSearchStatus structures for the channels when sending pages.
     * Protected by the bitmap_mutex.
     */
    PageSearchStatus pss[RAM_CHANNEL_MAX];
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* total ram size in bytes */
    uint64_t ram_bytes_total;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* How many times we have dirtied too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;
    /* Are we really using XBZRLE (e.g., after the first round). */
    bool xbzrle_started;
    /* Are we on the last stage of migration */
    bool last_stage;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /*
     * Protects:
     * - dirty/clear bitmap
     * - migration_dirty_pages
     * - pss structures
     */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;

    /*
     * This is only used when postcopy is in recovery phase, to communicate
     * between the migration thread and the return path thread on dirty
     * bitmap synchronizations. This field is unused in other stages of
     * RAM migration.
     */
    unsigned int postcopy_bmap_sync_requested;
};
typedef struct RAMState RAMState;
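
/*
 * Global RAM migration state.  It is only valid while a RAM save (or a
 * background snapshot) is set up; helpers such as ram_bytes_remaining()
 * below check for NULL before dereferencing it.
 */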
static RAMState *ram_state;

static NotifierWithReturnList precopy_notifier_list;

/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
}

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd, errp);
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

void ram_transferred_add(uint64_t bytes)
{
    if (runstate_is_running()) {
        stat64_add(&mig_stats.precopy_bytes, bytes);
    } else if (migration_in_postcopy()) {
        stat64_add(&mig_stats.postcopy_bytes, bytes);
    } else {
        stat64_add(&mig_stats.downtime_bytes, bytes);
    }
}

struct MigrationOps {
    int (*ram_save_target_page)(RAMState *rs, PageSearchStatus *pss);
};
typedef struct MigrationOps MigrationOps;

MigrationOps *migration_ops;

static int ram_save_host_page_urgent(PageSearchStatus *pss);

/* NOTE: page is the PFN not real ram_addr_t. */
static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
{
    pss->block = rb;
    pss->page = page;
    pss->complete_round = false;
}

/*
 * Check whether two PSSs are actively sending the same page.  Return true
 * if it is, false otherwise.
 */
static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
{
    return pss1->host_page_sending && pss2->host_page_sending &&
           (pss1->host_page_start == pss2->host_page_start);
}

/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @pss: current PSS channel status
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(PageSearchStatus *pss, QEMUFile *f,
                               RAMBlock *block, ram_addr_t offset)
{
    size_t size, len;
    bool same_block = (block == pss->last_sent_block);

    if (same_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!same_block) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        pss->last_sent_block = block;
    }
    return size;
}

/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    uint64_t pct_initial = migrate_cpu_throttle_initial();
    uint64_t pct_increment = migrate_cpu_throttle_increment();
    bool pct_tailslow = migrate_cpu_throttle_tailslow();
    int pct_max = migrate_max_cpu_throttle();

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                                   bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}
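
/*
 * Worked example for the tailslow path above: with the throttle currently
 * at 40%, the guest gets cpu_now = 60% of the CPU.  If it dirtied twice as
 * many bytes as the threshold (bytes_dirty_period == 2 *
 * bytes_dirty_threshold), then cpu_ideal = 60 * 0.5 = 30, so the throttle
 * is raised by MIN(60 - 30, pct_increment) instead of blindly adding
 * pct_increment on every trigger.
 */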

void mig_throttle_counter_reset(void)
{
    RAMState *rs = ram_state;

    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    rs->num_dirty_pages_period = 0;
    rs->bytes_xfer_prev = migration_transferred_bytes();
}

/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 stat64_get(&mig_stats.dirty_sync_count));
}

#define ENCODING_FLAG_XBZRLE 0x1

/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
                            uint8_t **current_data, ram_addr_t current_addr,
                            RAMBlock *block, ram_addr_t offset)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;
    QEMUFile *file = pss->pss_channel;
    uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);

    if (!cache_is_cached(XBZRLE.cache, current_addr, generation)) {
        xbzrle_counters.cache_miss++;
        if (!rs->last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             generation) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!rs->last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(pss, pss->pss_channel, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(file, encoded_len);
    qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    /*
     * The xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_transferred_add(bytes_xbzrle);

    return 1;
}

/**
 * pss_find_next_dirty: find the next dirty page of current ramblock
 *
 * This function updates pss->page to point to the next dirty page index
 * within the ramblock to migrate, or the end of ramblock when nothing
 * found.  Note that when pss->host_page_sending==true it means we're in
 * the middle of sending a host page, so we won't look for dirty pages
 * outside the host page boundary.
 *
 * @pss: the current page search status
 */
static void pss_find_next_dirty(PageSearchStatus *pss)
{
    RAMBlock *rb = pss->block;
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;

    if (migrate_ram_is_ignored(rb)) {
        /* Points directly to the end, so we know no dirty page */
        pss->page = size;
        return;
    }

    /*
     * If we're sending a host page, only look for dirty pages within the
     * current host page being sent.
     */
    if (pss->host_page_sending) {
        assert(pss->host_page_end);
        size = MIN(size, pss->host_page_end);
    }

    pss->page = find_next_bit(bitmap, size, pss->page);
}

static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
                                                       unsigned long page)
{
    uint8_t shift;
    hwaddr size, start;

    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
        return;
    }

    shift = rb->clear_bmap_shift;
    /*
     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
     * can make things easier sometimes since then start address
     * of the small chunk will always be 64 pages aligned so the
     * bitmap will always be aligned to unsigned long. We should
     * even be able to remove this restriction but I'm simply
     * keeping it.
     */
    assert(shift >= 6);

    size = 1ULL << (TARGET_PAGE_BITS + shift);
    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
    memory_region_clear_dirty_bitmap(rb->mr, start, size);
}

static void
migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
                                                 unsigned long start,
                                                 unsigned long npages)
{
    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);

    /*
     * Clear pages from start to start + npages - 1, so the end boundary is
     * exclusive.
     */
    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
        migration_clear_memory_region_dirty_bitmap(rb, i);
    }
}

/*
 * colo_bitmap_find_dirty: find contiguous dirty pages from start
 *
 * Returns the page offset within memory region of the start of the contiguous
 * dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 * @num: the number of contiguous dirty pages
 */
static inline
unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                     unsigned long start, unsigned long *num)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long first, next;

    *num = 0;

    if (migrate_ram_is_ignored(rb)) {
        return size;
    }

    first = find_next_bit(bitmap, size, start);
    if (first >= size) {
        return first;
    }
    next = find_next_zero_bit(bitmap, size, first + 1);
    assert(next >= first);
    *num = next - first;
    return first;
}

static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    migration_clear_memory_region_dirty_bitmap(rb, page);

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}

static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr offset = section->offset_within_region;
    const hwaddr size = int128_get64(section->size);
    const unsigned long start = offset >> TARGET_PAGE_BITS;
    const unsigned long npages = size >> TARGET_PAGE_BITS;
    RAMBlock *rb = section->mr->ram_block;
    uint64_t *cleared_bits = opaque;

    /*
     * We don't grab ram_state->bitmap_mutex because we expect to run
     * only when starting migration or during postcopy recovery where
     * we don't have concurrent access.
     */
    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
    }
    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
    bitmap_clear(rb->bmap, start, npages);
}

/*
 * Exclude all dirty pages from migration that fall into a discarded range as
 * managed by a RamDiscardManager responsible for the mapped memory region of
 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 *
 * Discarded pages ("logically unplugged") have undefined content and must
 * not get migrated, because even reading these pages for migration might
 * result in undesired behavior.
 *
 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
{
    uint64_t cleared_bits = 0;

    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = int128_make64(qemu_ram_get_used_length(rb)),
        };

        ram_discard_manager_replay_discarded(rdm, &section,
                                             dirty_bitmap_clear_section,
                                             &cleared_bits);
    }
    return cleared_bits;
}

/*
 * Check if a host-page aligned page falls into a discarded range as managed by
 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 *
 * Note: The result is only stable while migrating (precopy/postcopy).
 */
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
{
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = start,
            .size = int128_make64(qemu_ram_pagesize(rb)),
        };

        return !ram_discard_manager_is_populated(rdm, &section);
    }
    return false;
}

/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}

/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}

uint64_t ram_get_total_transferred_pages(void)
{
    return stat64_get(&mig_stats.normal_pages) +
           stat64_get(&mig_stats.zero_pages) +
           xbzrle_counters.pages;
}

static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;

    /* calculate period counters */
    stat64_set(&mig_stats.dirty_pages_rate,
               rs->num_dirty_pages_period * 1000 /
               (end_time - rs->time_last_bitmap_sync));

    if (!page_count) {
        return;
    }

    if (migrate_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }
}

/*
 * Enable dirty-limit to throttle down the guest
 */
static void migration_dirty_limit_guest(void)
{
    /*
     * dirty page rate quota for all vCPUs fetched from
     * migration parameter 'vcpu_dirty_limit'
     */
    static int64_t quota_dirtyrate;
    MigrationState *s = migrate_get_current();

    /*
     * Nothing to do if the dirty limit is already enabled and the
     * migration parameter vcpu-dirty-limit is untouched.
     */
    if (dirtylimit_in_service() &&
        quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
        return;
    }

    quota_dirtyrate = s->parameters.vcpu_dirty_limit;

    /*
     * Set a dirty-rate quota for all vCPUs; note that the second
     * parameter is ignored when setting the limit for the whole VM.
     */
    qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
    trace_migration_dirty_limit_guest(quota_dirtyrate);
}

static void migration_trigger_throttle(RAMState *rs)
{
    uint64_t threshold = migrate_throttle_trigger_threshold();
    uint64_t bytes_xfer_period =
        migration_transferred_bytes() - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /*
     * The following detection logic can be refined later. For now:
     * Check to see if the ratio between dirtied bytes and the approx.
     * amount of bytes that just got transferred since the last time
     * we were in this routine reaches the threshold. If that happens
     * twice, start or increase throttling.
     */
    if ((bytes_dirty_period > bytes_dirty_threshold) &&
        (++rs->dirty_rate_high_cnt >= 2)) {
        rs->dirty_rate_high_cnt = 0;
        if (migrate_auto_converge()) {
            trace_migration_throttle();
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        } else if (migrate_dirty_limit()) {
            migration_dirty_limit_guest();
        }
    }
}

static void migration_bitmap_sync(RAMState *rs, bool last_stage)
{
    RAMBlock *block;
    int64_t end_time;

    stat64_add(&mig_stats.dirty_sync_count, 1);

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync(last_stage);

    WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) {
        WITH_RCU_READ_LOCK_GUARD() {
            RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                ramblock_sync_dirty_bitmap(rs, block);
            }
            stat64_set(&mig_stats.dirty_bytes_last_sync, ram_bytes_remaining());
        }
    }

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = migration_transferred_bytes();
    }
    if (migrate_events()) {
        uint64_t generation = stat64_get(&mig_stats.dirty_sync_count);
        qapi_event_send_migration_pass(generation);
    }
}

void migration_bitmap_sync_precopy(bool last_stage)
{
    Error *local_err = NULL;
    assert(ram_state);

    /*
     * The current notifier usage is just an optimization for migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(ram_state, last_stage);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}

void ram_release_page(const char *rbname, uint64_t offset)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
}

/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @pss: current PSS channel
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
                          ram_addr_t offset)
{
    uint8_t *p = pss->block->host + offset;
    QEMUFile *file = pss->pss_channel;
    int len = 0;

    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) {
        return 0;
    }

    if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) {
        return 0;
    }

    stat64_add(&mig_stats.zero_pages, 1);

    if (migrate_mapped_ram()) {
        /* zero pages are not transferred with mapped-ram */
        clear_bit_atomic(offset >> TARGET_PAGE_BITS, pss->block->file_bmap);
        return 1;
    }

    len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
    qemu_put_byte(file, 0);
    len += 1;
    ram_release_page(pss->block->idstr, offset);
    ram_transferred_add(len);

    /*
     * Must let xbzrle know, otherwise a previous (now 0'd) cached
     * page would be stale.
     */
    if (rs->xbzrle_started) {
        XBZRLE_cache_lock();
        xbzrle_cache_zero_page(pss->block->offset + offset);
        XBZRLE_cache_unlock();
    }

    return len;
}

/*
 * @pages: the number of pages written by the control path,
 *         < 0 - error
 *         > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(PageSearchStatus *pss,
                              ram_addr_t offset, int *pages)
{
    int ret;

    ret = rdma_control_save_page(pss->pss_channel, pss->block->offset, offset,
                                 TARGET_PAGE_SIZE);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        *pages = 1;
        return true;
    }
    *pages = ret;
    return true;
}

/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @pss: current PSS channel
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
                            ram_addr_t offset, uint8_t *buf, bool async)
{
    QEMUFile *file = pss->pss_channel;

    if (migrate_mapped_ram()) {
        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
                           block->pages_offset + offset);
        set_bit(offset >> TARGET_PAGE_BITS, block->file_bmap);
    } else {
        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
                                             offset | RAM_SAVE_FLAG_PAGE));
        if (async) {
            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
                                  migrate_release_ram() &&
                                  migration_in_postcopy());
        } else {
            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
        }
    }
    ram_transferred_add(TARGET_PAGE_SIZE);
    stat64_add(&mig_stats.normal_pages, 1);
    return 1;
}
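
/*
 * For the non mapped-ram case, a normal page as emitted by
 * save_normal_page() via save_page_header() therefore looks like this on
 * the wire:
 *
 *   be64: page offset within the block, ORed with RAM_SAVE_FLAG_* bits
 *         (RAM_SAVE_FLAG_CONTINUE when the block is unchanged)
 *   u8 len + idstr bytes, only present when a new block is started
 *   TARGET_PAGE_SIZE bytes of raw page data
 */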

/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (rs->xbzrle_started && !migration_in_postcopy()) {
        pages = save_xbzrle_page(rs, pss, &p, current_addr,
                                 block, offset);
        if (!rs->last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(pss, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}

static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset)
{
    if (!multifd_queue_page(block, offset)) {
        return -1;
    }

    return 1;
}

#define PAGE_ALL_CLEAN 0
#define PAGE_TRY_AGAIN 1
#define PAGE_DIRTY_FOUND 2
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns:
 *         <0: An error happened
 *         PAGE_ALL_CLEAN: no dirty page found, give up
 *         PAGE_TRY_AGAIN: no dirty page found, retry for next block
 *         PAGE_DIRTY_FOUND: dirty page found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static int find_dirty_block(RAMState *rs, PageSearchStatus *pss)
{
    /* Update pss->page for the next dirty bit in ramblock */
    pss_find_next_dirty(pss);

    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        return PAGE_ALL_CLEAN;
    }
    if (!offset_in_ramblock(pss->block,
                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            if (multifd_ram_sync_per_round()) {
                QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel;
                int ret = multifd_ram_flush_and_sync(f);
                if (ret < 0) {
                    return ret;
                }
            }

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            /* After the first round, enable XBZRLE. */
            if (migrate_xbzrle()) {
                rs->xbzrle_started = true;
            }
        }
        /* Didn't find anything this time, but try again on the new block */
        return PAGE_TRY_AGAIN;
    } else {
        /* We've found something */
        return PAGE_DIRTY_FOUND;
    }
}

/**
 * unqueue_page: gets a page off the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    struct RAMSrcPageRequest *entry;
    RAMBlock *block = NULL;

    if (!postcopy_has_request(rs)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);

    /*
     * This should _never_ change even after we take the lock, because no one
     * should be taking anything off the request list other than us.
     */
    assert(postcopy_has_request(rs));

    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
    block = entry->rb;
    *offset = entry->offset;

    if (entry->len > TARGET_PAGE_SIZE) {
        entry->len -= TARGET_PAGE_SIZE;
        entry->offset += TARGET_PAGE_SIZE;
    } else {
        memory_region_unref(block->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(entry);
        migration_consume_urgent_request();
    }

    return block;
}

#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *block;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    block = qemu_ram_block_from_host(page_address, false, offset);
    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
    return block;
}

/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(pss->pss_channel);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                                     false, false);
    }

    return res;
}

/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}

/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *block;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }
        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
                                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);
    return ret;
}

static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    const ram_addr_t end = offset + size;

    /*
     * We read one byte of each page; this will preallocate page tables if
     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
     * where no page was populated yet. This might require adaptation when
     * supporting other mappings, like shmem.
     */
    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}

static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    hwaddr offset = section->offset_within_region;
    RAMBlock *block = section->mr->ram_block;

    populate_read_range(block, offset, size);
    return 0;
}

/*
 * ram_block_populate_read: preallocate page tables and populate pages in the
 *   RAM block by reading a byte of each page.
 *
 * Since it's solely used for userfault_fd WP feature, here we just
 *   hardcode page size to qemu_real_host_page_size.
 *
 * @block: RAM block to populate
 */
static void ram_block_populate_read(RAMBlock *rb)
{
    /*
     * Skip populating all pages that fall into a discarded range as managed by
     * a RamDiscardManager responsible for the mapped memory region of the
     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
     * must not get populated automatically.  We don't have to track
     * modifications via userfaultfd WP reliably, because these pages will
     * not be part of the migration stream either way -- see
     * ramblock_dirty_bitmap_exclude_discarded_pages().
     *
     * Note: The result is only stable while migrating (precopy/postcopy).
     */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        ram_discard_manager_replay_populated(rdm, &section,
                                             populate_read_section, NULL);
    } else {
        populate_read_range(rb, 0, rb->used_length);
    }
}

/*
 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
 */
void ram_write_tracking_prepare(void)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /*
         * Populate pages of the RAM block before enabling userfault_fd
         * write protection.
         *
         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
         * pages with pte_none() entries in page table.
         */
        ram_block_populate_read(block);
    }
}

static inline int uffd_protect_section(MemoryRegionSection *section,
                                       void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr offset = section->offset_within_region;
    RAMBlock *rb = section->mr->ram_block;
    int uffd_fd = (uintptr_t)opaque;

    return uffd_change_protection(uffd_fd, rb->host + offset, size, true,
                                  false);
}

static int ram_block_uffd_protect(RAMBlock *rb, int uffd_fd)
{
    assert(rb->flags & RAM_UF_WRITEPROTECT);

    /* See ram_block_populate_read() */
    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
        MemoryRegionSection section = {
            .mr = rb->mr,
            .offset_within_region = 0,
            .size = rb->mr->size,
        };

        return ram_discard_manager_replay_populated(rdm, &section,
                                                    uffd_protect_section,
                                                    (void *)(uintptr_t)uffd_fd);
    }
    return uffd_change_protection(uffd_fd, rb->host,
                                  rb->used_length, true, false);
}

/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *block;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (block->mr->readonly || block->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, block->host,
                                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }
        block->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(block->mr);

        /* Apply UFFD write protection to the block memory range */
        if (ram_block_uffd_protect(block, uffd_fd)) {
            goto fail;
        }

        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
                                                block->host, block->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}

/**
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);

        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
                                               block->host, block->max_length);

        /* Cleanup flags and remove reference */
        block->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(block->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}

#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
                                       unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    g_assert_not_reached();
}

int ram_write_tracking_start(void)
{
    g_assert_not_reached();
}

void ram_write_tracking_stop(void)
{
    g_assert_not_reached();
}
#endif /* defined(__linux__) */

/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock *block;
    ram_addr_t offset;
    bool dirty = false;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when vCPUs can get blocked by the write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if it
         * is really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}

/**
 * migration_page_queue_free: drop any remaining pages in the ram
 *   request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any page is left, we drop it.
 *
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}

/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *   same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len,
                         Error **errp)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    stat64_add(&mig_stats.postcopy_requests, 1);
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_setg(errp, "MIG_RP_MSG_REQ_PAGES has no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (!offset_in_ramblock(ramblock, start + len - 1)) {
        error_setg(errp, "MIG_RP_MSG_REQ_PAGES request overrun, "
                   "start=" RAM_ADDR_FMT " len="
                   RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                   start, len, ramblock->used_length);
        return -1;
    }

    /*
     * When postcopy preempt is enabled, we send back the page directly in
     * the rp-return thread.
1898 */ 1899 if (postcopy_preempt_active()) { 1900 ram_addr_t page_start = start >> TARGET_PAGE_BITS; 1901 size_t page_size = qemu_ram_pagesize(ramblock); 1902 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY]; 1903 int ret = 0; 1904 1905 qemu_mutex_lock(&rs->bitmap_mutex); 1906 1907 pss_init(pss, ramblock, page_start); 1908 /* 1909 * Always use the preempt channel, and make sure it's there. It's 1910 * safe to access without lock, because when rp-thread is running 1911 * we should be the only one who operates on the qemufile 1912 */ 1913 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src; 1914 assert(pss->pss_channel); 1915 1916 /* 1917 * It must be either one or multiple of host page size. Just 1918 * assert; if something wrong we're mostly split brain anyway. 1919 */ 1920 assert(len % page_size == 0); 1921 while (len) { 1922 if (ram_save_host_page_urgent(pss)) { 1923 error_setg(errp, "ram_save_host_page_urgent() failed: " 1924 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT, 1925 ramblock->idstr, start); 1926 ret = -1; 1927 break; 1928 } 1929 /* 1930 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page 1931 * will automatically be moved and point to the next host page 1932 * we're going to send, so no need to update here. 1933 * 1934 * Normally QEMU never sends >1 host page in requests, so 1935 * logically we don't even need that as the loop should only 1936 * run once, but just to be consistent. 1937 */ 1938 len -= page_size; 1939 }; 1940 qemu_mutex_unlock(&rs->bitmap_mutex); 1941 1942 return ret; 1943 } 1944 1945 struct RAMSrcPageRequest *new_entry = 1946 g_new0(struct RAMSrcPageRequest, 1); 1947 new_entry->rb = ramblock; 1948 new_entry->offset = start; 1949 new_entry->len = len; 1950 1951 memory_region_ref(ramblock->mr); 1952 qemu_mutex_lock(&rs->src_page_req_mutex); 1953 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req); 1954 migration_make_urgent_request(); 1955 qemu_mutex_unlock(&rs->src_page_req_mutex); 1956 1957 return 0; 1958 } 1959 1960 /** 1961 * ram_save_target_page_legacy: save one target page 1962 * 1963 * Returns the number of pages written 1964 * 1965 * @rs: current RAM state 1966 * @pss: data about the page we want to send 1967 */ 1968 static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) 1969 { 1970 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1971 int res; 1972 1973 if (control_save_page(pss, offset, &res)) { 1974 return res; 1975 } 1976 1977 if (save_zero_page(rs, pss, offset)) { 1978 return 1; 1979 } 1980 1981 return ram_save_page(rs, pss); 1982 } 1983 1984 /** 1985 * ram_save_target_page_multifd: send one target page to multifd workers 1986 * 1987 * Returns 1 if the page was queued, -1 otherwise. 1988 * 1989 * @rs: current RAM state 1990 * @pss: data about the page we want to send 1991 */ 1992 static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) 1993 { 1994 RAMBlock *block = pss->block; 1995 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; 1996 1997 /* 1998 * While using multifd live migration, we still need to handle zero 1999 * page checking on the migration main thread. 
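 *
 * That only applies to ZERO_PAGE_DETECTION_LEGACY (checked below); with
 * the 'multifd' detection mode the zero-page check is expected to
 * happen in the multifd send threads instead, so the main thread just
 * queues the page.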
     */
    if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) {
        if (save_zero_page(rs, pss, offset)) {
            return 1;
        }
    }

    return ram_save_multifd_page(block, offset);
}

/* Should be called before sending a host page */
static void pss_host_page_prepare(PageSearchStatus *pss)
{
    /* How many guest pages are there in one host page? */
    size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;

    pss->host_page_sending = true;
    if (guest_pfns <= 1) {
        /*
         * This covers both the case where guest psize == host psize and
         * the case where the guest has a larger psize than the host
         * (guest_pfns==0).
         *
         * For the latter, we always send one whole guest page per
         * iteration of the host page (example: an Alpha VM on an x86 host
         * will have a guest psize of 8K while the host psize is 4K).
         */
        pss->host_page_start = pss->page;
        pss->host_page_end = pss->page + 1;
    } else {
        /*
         * The host page spans over multiple guest pages; we send them
         * within the same host page iteration.
         */
        pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
        pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
    }
}

/*
 * Whether the page pointed to by PSS is within the host page being sent.
 * Must be called after a previous pss_host_page_prepare().
 */
static bool pss_within_range(PageSearchStatus *pss)
{
    ram_addr_t ram_addr;

    assert(pss->host_page_sending);

    /* Over the host-page boundary? */
    if (pss->page >= pss->host_page_end) {
        return false;
    }

    ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;

    return offset_in_ramblock(pss->block, ram_addr);
}

static void pss_host_page_finish(PageSearchStatus *pss)
{
    pss->host_page_sending = false;
    /* This is not strictly needed, but reset it anyway */
    pss->host_page_start = pss->host_page_end = 0;
}

/*
 * Send an urgent host page specified by `pss'.  Must be called with
 * bitmap_mutex held.
 *
 * Returns zero if saving the host page succeeded, negative otherwise.
 */
static int ram_save_host_page_urgent(PageSearchStatus *pss)
{
    bool page_dirty, sent = false;
    RAMState *rs = ram_state;
    int ret = 0;

    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
    pss_host_page_prepare(pss);

    /*
     * If precopy is sending the same page, let it be done in precopy, or
     * we could send the same page in two channels and neither of them
     * would receive the whole page.
     */
    if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
        trace_postcopy_preempt_hit(pss->block->idstr,
                                   pss->page << TARGET_PAGE_BITS);
        return 0;
    }

    do {
        page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);

        if (page_dirty) {
            /* Be strict about the return code; it must be 1, or what else?
*/ 2096 if (migration_ops->ram_save_target_page(rs, pss) != 1) { 2097 error_report_once("%s: ram_save_target_page failed", __func__); 2098 ret = -1; 2099 goto out; 2100 } 2101 sent = true; 2102 } 2103 pss_find_next_dirty(pss); 2104 } while (pss_within_range(pss)); 2105 out: 2106 pss_host_page_finish(pss); 2107 /* For urgent requests, flush immediately if sent */ 2108 if (sent) { 2109 qemu_fflush(pss->pss_channel); 2110 } 2111 return ret; 2112 } 2113 2114 /** 2115 * ram_save_host_page: save a whole host page 2116 * 2117 * Starting at *offset send pages up to the end of the current host 2118 * page. It's valid for the initial offset to point into the middle of 2119 * a host page in which case the remainder of the hostpage is sent. 2120 * Only dirty target pages are sent. Note that the host page size may 2121 * be a huge page for this block. 2122 * 2123 * The saving stops at the boundary of the used_length of the block 2124 * if the RAMBlock isn't a multiple of the host page size. 2125 * 2126 * The caller must be with ram_state.bitmap_mutex held to call this 2127 * function. Note that this function can temporarily release the lock, but 2128 * when the function is returned it'll make sure the lock is still held. 2129 * 2130 * Returns the number of pages written or negative on error 2131 * 2132 * @rs: current RAM state 2133 * @pss: data about the page we want to send 2134 */ 2135 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss) 2136 { 2137 bool page_dirty, preempt_active = postcopy_preempt_active(); 2138 int tmppages, pages = 0; 2139 size_t pagesize_bits = 2140 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS; 2141 unsigned long start_page = pss->page; 2142 int res; 2143 2144 if (migrate_ram_is_ignored(pss->block)) { 2145 error_report("block %s should not be migrated !", pss->block->idstr); 2146 return 0; 2147 } 2148 2149 /* Update host page boundary information */ 2150 pss_host_page_prepare(pss); 2151 2152 do { 2153 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page); 2154 2155 /* Check the pages is dirty and if it is send it */ 2156 if (page_dirty) { 2157 /* 2158 * Properly yield the lock only in postcopy preempt mode 2159 * because both migration thread and rp-return thread can 2160 * operate on the bitmaps. 2161 */ 2162 if (preempt_active) { 2163 qemu_mutex_unlock(&rs->bitmap_mutex); 2164 } 2165 tmppages = migration_ops->ram_save_target_page(rs, pss); 2166 if (tmppages >= 0) { 2167 pages += tmppages; 2168 /* 2169 * Allow rate limiting to happen in the middle of huge pages if 2170 * something is sent in the current iteration. 2171 */ 2172 if (pagesize_bits > 1 && tmppages > 0) { 2173 migration_rate_limit(); 2174 } 2175 } 2176 if (preempt_active) { 2177 qemu_mutex_lock(&rs->bitmap_mutex); 2178 } 2179 } else { 2180 tmppages = 0; 2181 } 2182 2183 if (tmppages < 0) { 2184 pss_host_page_finish(pss); 2185 return tmppages; 2186 } 2187 2188 pss_find_next_dirty(pss); 2189 } while (pss_within_range(pss)); 2190 2191 pss_host_page_finish(pss); 2192 2193 res = ram_save_release_protection(rs, pss, start_page); 2194 return (res < 0 ? res : pages); 2195 } 2196 2197 /** 2198 * ram_find_and_save_block: finds a dirty page and sends it to f 2199 * 2200 * Called within an RCU critical section. 2201 * 2202 * Returns the number of pages written where zero means no dirty pages, 2203 * or negative on error 2204 * 2205 * @rs: current RAM state 2206 * 2207 * On systems where host-page-size > target-page-size it will send all the 2208 * pages in a host page that are dirty. 
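 *
 * For example (numbers for illustration only): with a RAMBlock backed
 * by 2M huge pages and a 4K target page size, qemu_ram_pagesize() is
 * 2M, so pss_host_page_prepare() computes guest_pfns = 512 and clamps
 * the scan window to [ROUND_DOWN(page, 512), ROUND_UP(page + 1, 512));
 * ram_save_host_page() then walks every dirty target page inside that
 * window before moving on.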
2209 */ 2210 static int ram_find_and_save_block(RAMState *rs) 2211 { 2212 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY]; 2213 int pages = 0; 2214 2215 /* No dirty page as there is zero RAM */ 2216 if (!rs->ram_bytes_total) { 2217 return pages; 2218 } 2219 2220 /* 2221 * Always keep last_seen_block/last_page valid during this procedure, 2222 * because find_dirty_block() relies on these values (e.g., we compare 2223 * last_seen_block with pss.block to see whether we searched all the 2224 * ramblocks) to detect the completion of migration. Having NULL value 2225 * of last_seen_block can conditionally cause below loop to run forever. 2226 */ 2227 if (!rs->last_seen_block) { 2228 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks); 2229 rs->last_page = 0; 2230 } 2231 2232 pss_init(pss, rs->last_seen_block, rs->last_page); 2233 2234 while (true){ 2235 if (!get_queued_page(rs, pss)) { 2236 /* priority queue empty, so just search for something dirty */ 2237 int res = find_dirty_block(rs, pss); 2238 if (res != PAGE_DIRTY_FOUND) { 2239 if (res == PAGE_ALL_CLEAN) { 2240 break; 2241 } else if (res == PAGE_TRY_AGAIN) { 2242 continue; 2243 } else if (res < 0) { 2244 pages = res; 2245 break; 2246 } 2247 } 2248 } 2249 pages = ram_save_host_page(rs, pss); 2250 if (pages) { 2251 break; 2252 } 2253 } 2254 2255 rs->last_seen_block = pss->block; 2256 rs->last_page = pss->page; 2257 2258 return pages; 2259 } 2260 2261 static uint64_t ram_bytes_total_with_ignored(void) 2262 { 2263 RAMBlock *block; 2264 uint64_t total = 0; 2265 2266 RCU_READ_LOCK_GUARD(); 2267 2268 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2269 total += block->used_length; 2270 } 2271 return total; 2272 } 2273 2274 uint64_t ram_bytes_total(void) 2275 { 2276 RAMBlock *block; 2277 uint64_t total = 0; 2278 2279 RCU_READ_LOCK_GUARD(); 2280 2281 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2282 total += block->used_length; 2283 } 2284 return total; 2285 } 2286 2287 static void xbzrle_load_setup(void) 2288 { 2289 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); 2290 } 2291 2292 static void xbzrle_load_cleanup(void) 2293 { 2294 g_free(XBZRLE.decoded_buf); 2295 XBZRLE.decoded_buf = NULL; 2296 } 2297 2298 static void ram_state_cleanup(RAMState **rsp) 2299 { 2300 if (*rsp) { 2301 migration_page_queue_free(*rsp); 2302 qemu_mutex_destroy(&(*rsp)->bitmap_mutex); 2303 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex); 2304 g_free(*rsp); 2305 *rsp = NULL; 2306 } 2307 } 2308 2309 static void xbzrle_cleanup(void) 2310 { 2311 XBZRLE_cache_lock(); 2312 if (XBZRLE.cache) { 2313 cache_fini(XBZRLE.cache); 2314 g_free(XBZRLE.encoded_buf); 2315 g_free(XBZRLE.current_buf); 2316 g_free(XBZRLE.zero_target_page); 2317 XBZRLE.cache = NULL; 2318 XBZRLE.encoded_buf = NULL; 2319 XBZRLE.current_buf = NULL; 2320 XBZRLE.zero_target_page = NULL; 2321 } 2322 XBZRLE_cache_unlock(); 2323 } 2324 2325 static void ram_bitmaps_destroy(void) 2326 { 2327 RAMBlock *block; 2328 2329 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2330 g_free(block->clear_bmap); 2331 block->clear_bmap = NULL; 2332 g_free(block->bmap); 2333 block->bmap = NULL; 2334 g_free(block->file_bmap); 2335 block->file_bmap = NULL; 2336 } 2337 } 2338 2339 static void ram_save_cleanup(void *opaque) 2340 { 2341 RAMState **rsp = opaque; 2342 2343 /* We don't use dirty log with background snapshots */ 2344 if (!migrate_background_snapshot()) { 2345 /* caller have hold BQL or is in a bh, so there is 2346 * no writing race against the migration bitmap 2347 */ 2348 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) { 2349 /* 2350 * do not 
stop dirty log without starting it, since 2351 * memory_global_dirty_log_stop will assert that 2352 * memory_global_dirty_log_start/stop used in pairs 2353 */ 2354 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 2355 } 2356 } 2357 2358 ram_bitmaps_destroy(); 2359 2360 xbzrle_cleanup(); 2361 multifd_ram_save_cleanup(); 2362 ram_state_cleanup(rsp); 2363 g_free(migration_ops); 2364 migration_ops = NULL; 2365 } 2366 2367 static void ram_state_reset(RAMState *rs) 2368 { 2369 int i; 2370 2371 for (i = 0; i < RAM_CHANNEL_MAX; i++) { 2372 rs->pss[i].last_sent_block = NULL; 2373 } 2374 2375 rs->last_seen_block = NULL; 2376 rs->last_page = 0; 2377 rs->last_version = ram_list.version; 2378 rs->xbzrle_started = false; 2379 } 2380 2381 #define MAX_WAIT 50 /* ms, half buffered_file limit */ 2382 2383 /* **** functions for postcopy ***** */ 2384 2385 void ram_postcopy_migrated_memory_release(MigrationState *ms) 2386 { 2387 struct RAMBlock *block; 2388 2389 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2390 unsigned long *bitmap = block->bmap; 2391 unsigned long range = block->used_length >> TARGET_PAGE_BITS; 2392 unsigned long run_start = find_next_zero_bit(bitmap, range, 0); 2393 2394 while (run_start < range) { 2395 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1); 2396 ram_discard_range(block->idstr, 2397 ((ram_addr_t)run_start) << TARGET_PAGE_BITS, 2398 ((ram_addr_t)(run_end - run_start)) 2399 << TARGET_PAGE_BITS); 2400 run_start = find_next_zero_bit(bitmap, range, run_end + 1); 2401 } 2402 } 2403 } 2404 2405 /** 2406 * postcopy_send_discard_bm_ram: discard a RAMBlock 2407 * 2408 * Callback from postcopy_each_ram_send_discard for each RAMBlock 2409 * 2410 * @ms: current migration state 2411 * @block: RAMBlock to discard 2412 */ 2413 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block) 2414 { 2415 unsigned long end = block->used_length >> TARGET_PAGE_BITS; 2416 unsigned long current; 2417 unsigned long *bitmap = block->bmap; 2418 2419 for (current = 0; current < end; ) { 2420 unsigned long one = find_next_bit(bitmap, end, current); 2421 unsigned long zero, discard_length; 2422 2423 if (one >= end) { 2424 break; 2425 } 2426 2427 zero = find_next_zero_bit(bitmap, end, one + 1); 2428 2429 if (zero >= end) { 2430 discard_length = end - one; 2431 } else { 2432 discard_length = zero - one; 2433 } 2434 postcopy_discard_send_range(ms, one, discard_length); 2435 current = one + discard_length; 2436 } 2437 } 2438 2439 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block); 2440 2441 /** 2442 * postcopy_each_ram_send_discard: discard all RAMBlocks 2443 * 2444 * Utility for the outgoing postcopy code. 2445 * Calls postcopy_send_discard_bm_ram for each RAMBlock 2446 * passing it bitmap indexes and name. 2447 * (qemu_ram_foreach_block ends up passing unscaled lengths 2448 * which would mean postcopy code would have to deal with target page) 2449 * 2450 * @ms: current migration state 2451 */ 2452 static void postcopy_each_ram_send_discard(MigrationState *ms) 2453 { 2454 struct RAMBlock *block; 2455 2456 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2457 postcopy_discard_send_init(ms, block->idstr); 2458 2459 /* 2460 * Deal with TPS != HPS and huge pages. It discard any partially sent 2461 * host-page size chunks, mark any partially dirty host-page size 2462 * chunks as all dirty. In this case the host-page is the host-page 2463 * for the particular RAMBlock, i.e. it might be a huge page. 
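 *
 * Worked example (illustrative numbers): with 2M host pages and 4K
 * target pages, host_ratio is 512.  If a dirty run starts at target
 * page 700, postcopy_chunk_hostpages_pass() re-marks pages 512..1023
 * as dirty, so the destination discards and later re-receives that
 * whole huge page instead of keeping a partially-updated one.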
2464 */ 2465 postcopy_chunk_hostpages_pass(ms, block); 2466 2467 /* 2468 * Postcopy sends chunks of bitmap over the wire, but it 2469 * just needs indexes at this point, avoids it having 2470 * target page specific code. 2471 */ 2472 postcopy_send_discard_bm_ram(ms, block); 2473 postcopy_discard_send_finish(ms); 2474 } 2475 } 2476 2477 /** 2478 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages 2479 * 2480 * Helper for postcopy_chunk_hostpages; it's called twice to 2481 * canonicalize the two bitmaps, that are similar, but one is 2482 * inverted. 2483 * 2484 * Postcopy requires that all target pages in a hostpage are dirty or 2485 * clean, not a mix. This function canonicalizes the bitmaps. 2486 * 2487 * @ms: current migration state 2488 * @block: block that contains the page we want to canonicalize 2489 */ 2490 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block) 2491 { 2492 RAMState *rs = ram_state; 2493 unsigned long *bitmap = block->bmap; 2494 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE; 2495 unsigned long pages = block->used_length >> TARGET_PAGE_BITS; 2496 unsigned long run_start; 2497 2498 if (block->page_size == TARGET_PAGE_SIZE) { 2499 /* Easy case - TPS==HPS for a non-huge page RAMBlock */ 2500 return; 2501 } 2502 2503 /* Find a dirty page */ 2504 run_start = find_next_bit(bitmap, pages, 0); 2505 2506 while (run_start < pages) { 2507 2508 /* 2509 * If the start of this run of pages is in the middle of a host 2510 * page, then we need to fixup this host page. 2511 */ 2512 if (QEMU_IS_ALIGNED(run_start, host_ratio)) { 2513 /* Find the end of this run */ 2514 run_start = find_next_zero_bit(bitmap, pages, run_start + 1); 2515 /* 2516 * If the end isn't at the start of a host page, then the 2517 * run doesn't finish at the end of a host page 2518 * and we need to discard. 2519 */ 2520 } 2521 2522 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) { 2523 unsigned long page; 2524 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start, 2525 host_ratio); 2526 run_start = QEMU_ALIGN_UP(run_start, host_ratio); 2527 2528 /* Clean up the bitmap */ 2529 for (page = fixup_start_addr; 2530 page < fixup_start_addr + host_ratio; page++) { 2531 /* 2532 * Remark them as dirty, updating the count for any pages 2533 * that weren't previously dirty. 
2534 */ 2535 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap); 2536 } 2537 } 2538 2539 /* Find the next dirty page for the next iteration */ 2540 run_start = find_next_bit(bitmap, pages, run_start); 2541 } 2542 } 2543 2544 /** 2545 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap 2546 * 2547 * Transmit the set of pages to be discarded after precopy to the target 2548 * these are pages that: 2549 * a) Have been previously transmitted but are now dirty again 2550 * b) Pages that have never been transmitted, this ensures that 2551 * any pages on the destination that have been mapped by background 2552 * tasks get discarded (transparent huge pages is the specific concern) 2553 * Hopefully this is pretty sparse 2554 * 2555 * @ms: current migration state 2556 */ 2557 void ram_postcopy_send_discard_bitmap(MigrationState *ms) 2558 { 2559 RAMState *rs = ram_state; 2560 2561 RCU_READ_LOCK_GUARD(); 2562 2563 /* This should be our last sync, the src is now paused */ 2564 migration_bitmap_sync(rs, false); 2565 2566 /* Easiest way to make sure we don't resume in the middle of a host-page */ 2567 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL; 2568 rs->last_seen_block = NULL; 2569 rs->last_page = 0; 2570 2571 postcopy_each_ram_send_discard(ms); 2572 2573 trace_ram_postcopy_send_discard_bitmap(); 2574 } 2575 2576 /** 2577 * ram_discard_range: discard dirtied pages at the beginning of postcopy 2578 * 2579 * Returns zero on success 2580 * 2581 * @rbname: name of the RAMBlock of the request. NULL means the 2582 * same that last one. 2583 * @start: RAMBlock starting page 2584 * @length: RAMBlock size 2585 */ 2586 int ram_discard_range(const char *rbname, uint64_t start, size_t length) 2587 { 2588 trace_ram_discard_range(rbname, start, length); 2589 2590 RCU_READ_LOCK_GUARD(); 2591 RAMBlock *rb = qemu_ram_block_by_name(rbname); 2592 2593 if (!rb) { 2594 error_report("ram_discard_range: Failed to find block '%s'", rbname); 2595 return -1; 2596 } 2597 2598 /* 2599 * On source VM, we don't need to update the received bitmap since 2600 * we don't even have one. 2601 */ 2602 if (rb->receivedmap) { 2603 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(), 2604 length >> qemu_target_page_bits()); 2605 } 2606 2607 return ram_block_discard_range(rb, start, length); 2608 } 2609 2610 /* 2611 * For every allocation, we will try not to crash the VM if the 2612 * allocation failed. 
2613 */ 2614 static bool xbzrle_init(Error **errp) 2615 { 2616 if (!migrate_xbzrle()) { 2617 return true; 2618 } 2619 2620 XBZRLE_cache_lock(); 2621 2622 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE); 2623 if (!XBZRLE.zero_target_page) { 2624 error_setg(errp, "%s: Error allocating zero page", __func__); 2625 goto err_out; 2626 } 2627 2628 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(), 2629 TARGET_PAGE_SIZE, errp); 2630 if (!XBZRLE.cache) { 2631 goto free_zero_page; 2632 } 2633 2634 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); 2635 if (!XBZRLE.encoded_buf) { 2636 error_setg(errp, "%s: Error allocating encoded_buf", __func__); 2637 goto free_cache; 2638 } 2639 2640 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); 2641 if (!XBZRLE.current_buf) { 2642 error_setg(errp, "%s: Error allocating current_buf", __func__); 2643 goto free_encoded_buf; 2644 } 2645 2646 /* We are all good */ 2647 XBZRLE_cache_unlock(); 2648 return true; 2649 2650 free_encoded_buf: 2651 g_free(XBZRLE.encoded_buf); 2652 XBZRLE.encoded_buf = NULL; 2653 free_cache: 2654 cache_fini(XBZRLE.cache); 2655 XBZRLE.cache = NULL; 2656 free_zero_page: 2657 g_free(XBZRLE.zero_target_page); 2658 XBZRLE.zero_target_page = NULL; 2659 err_out: 2660 XBZRLE_cache_unlock(); 2661 return false; 2662 } 2663 2664 static bool ram_state_init(RAMState **rsp, Error **errp) 2665 { 2666 *rsp = g_try_new0(RAMState, 1); 2667 2668 if (!*rsp) { 2669 error_setg(errp, "%s: Init ramstate fail", __func__); 2670 return false; 2671 } 2672 2673 qemu_mutex_init(&(*rsp)->bitmap_mutex); 2674 qemu_mutex_init(&(*rsp)->src_page_req_mutex); 2675 QSIMPLEQ_INIT(&(*rsp)->src_page_requests); 2676 (*rsp)->ram_bytes_total = ram_bytes_total(); 2677 2678 /* 2679 * Count the total number of pages used by ram blocks not including any 2680 * gaps due to alignment or unplugs. 2681 * This must match with the initial values of dirty bitmap. 2682 */ 2683 (*rsp)->migration_dirty_pages = (*rsp)->ram_bytes_total >> TARGET_PAGE_BITS; 2684 ram_state_reset(*rsp); 2685 2686 return true; 2687 } 2688 2689 static void ram_list_init_bitmaps(void) 2690 { 2691 MigrationState *ms = migrate_get_current(); 2692 RAMBlock *block; 2693 unsigned long pages; 2694 uint8_t shift; 2695 2696 /* Skip setting bitmap if there is no RAM */ 2697 if (ram_bytes_total()) { 2698 shift = ms->clear_bitmap_shift; 2699 if (shift > CLEAR_BITMAP_SHIFT_MAX) { 2700 error_report("clear_bitmap_shift (%u) too big, using " 2701 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX); 2702 shift = CLEAR_BITMAP_SHIFT_MAX; 2703 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) { 2704 error_report("clear_bitmap_shift (%u) too small, using " 2705 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN); 2706 shift = CLEAR_BITMAP_SHIFT_MIN; 2707 } 2708 2709 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2710 pages = block->max_length >> TARGET_PAGE_BITS; 2711 /* 2712 * The initial dirty bitmap for migration must be set with all 2713 * ones to make sure we'll migrate every guest RAM page to 2714 * destination. 2715 * Here we set RAMBlock.bmap all to 1 because when rebegin a 2716 * new migration after a failed migration, ram_list. 2717 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole 2718 * guest memory. 
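 *
 * Sizing sketch (illustrative numbers): for a 4G RAMBlock with 4K
 * target pages, pages is 1M bits, i.e. a 128K bmap (and file_bmap, for
 * mapped-ram).  The clear_bmap is much coarser: with a shift of 18 one
 * bit covers 2^18 target pages (1G of guest memory), so it needs only
 * a handful of bits per block.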
2719 */ 2720 block->bmap = bitmap_new(pages); 2721 bitmap_set(block->bmap, 0, pages); 2722 if (migrate_mapped_ram()) { 2723 block->file_bmap = bitmap_new(pages); 2724 } 2725 block->clear_bmap_shift = shift; 2726 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); 2727 } 2728 } 2729 } 2730 2731 static void migration_bitmap_clear_discarded_pages(RAMState *rs) 2732 { 2733 unsigned long pages; 2734 RAMBlock *rb; 2735 2736 RCU_READ_LOCK_GUARD(); 2737 2738 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 2739 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb); 2740 rs->migration_dirty_pages -= pages; 2741 } 2742 } 2743 2744 static bool ram_init_bitmaps(RAMState *rs, Error **errp) 2745 { 2746 bool ret = true; 2747 2748 qemu_mutex_lock_ramlist(); 2749 2750 WITH_RCU_READ_LOCK_GUARD() { 2751 ram_list_init_bitmaps(); 2752 /* We don't use dirty log with background snapshots */ 2753 if (!migrate_background_snapshot()) { 2754 ret = memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, errp); 2755 if (!ret) { 2756 goto out_unlock; 2757 } 2758 migration_bitmap_sync_precopy(false); 2759 } 2760 } 2761 out_unlock: 2762 qemu_mutex_unlock_ramlist(); 2763 2764 if (!ret) { 2765 ram_bitmaps_destroy(); 2766 return false; 2767 } 2768 2769 /* 2770 * After an eventual first bitmap sync, fixup the initial bitmap 2771 * containing all 1s to exclude any discarded pages from migration. 2772 */ 2773 migration_bitmap_clear_discarded_pages(rs); 2774 return true; 2775 } 2776 2777 static int ram_init_all(RAMState **rsp, Error **errp) 2778 { 2779 if (!ram_state_init(rsp, errp)) { 2780 return -1; 2781 } 2782 2783 if (!xbzrle_init(errp)) { 2784 ram_state_cleanup(rsp); 2785 return -1; 2786 } 2787 2788 if (!ram_init_bitmaps(*rsp, errp)) { 2789 return -1; 2790 } 2791 2792 return 0; 2793 } 2794 2795 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out) 2796 { 2797 RAMBlock *block; 2798 uint64_t pages = 0; 2799 2800 /* 2801 * Postcopy is not using xbzrle/compression, so no need for that. 2802 * Also, since source are already halted, we don't need to care 2803 * about dirty page logging as well. 2804 */ 2805 2806 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 2807 pages += bitmap_count_one(block->bmap, 2808 block->used_length >> TARGET_PAGE_BITS); 2809 } 2810 2811 /* This may not be aligned with current bitmaps. Recalculate. */ 2812 rs->migration_dirty_pages = pages; 2813 2814 ram_state_reset(rs); 2815 2816 /* Update RAMState cache of output QEMUFile */ 2817 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out; 2818 2819 trace_ram_state_resume_prepare(pages); 2820 } 2821 2822 /* 2823 * This function clears bits of the free pages reported by the caller from the 2824 * migration dirty bitmap. @addr is the host address corresponding to the 2825 * start of the continuous guest free pages, and @len is the total bytes of 2826 * those pages. 2827 */ 2828 void qemu_guest_free_page_hint(void *addr, size_t len) 2829 { 2830 RAMBlock *block; 2831 ram_addr_t offset; 2832 size_t used_len, start, npages; 2833 2834 /* This function is currently expected to be used during live migration */ 2835 if (!migration_is_running()) { 2836 return; 2837 } 2838 2839 for (; len > 0; len -= used_len, addr += used_len) { 2840 block = qemu_ram_block_from_host(addr, false, &offset); 2841 if (unlikely(!block || offset >= block->used_length)) { 2842 /* 2843 * The implementation might not support RAMBlock resize during 2844 * live migration, but it could happen in theory with future 2845 * updates. So we add a check here to capture that case. 
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        /*
         * The skipped free pages are equivalent to having been sent, from
         * clear_bmap's perspective, so clear the bits from the memory region
         * bitmap which are initially set. Otherwise those skipped pages will
         * be sent in the next round after syncing from the memory region
         * bitmap.
         */
        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}

#define MAPPED_RAM_HDR_VERSION 1
struct MappedRamHeader {
    uint32_t version;
    /*
     * The target's page size, so we know how many pages are in the
     * bitmap.
     */
    uint64_t page_size;
    /*
     * The offset in the migration file where the pages bitmap is
     * stored.
     */
    uint64_t bitmap_offset;
    /*
     * The offset in the migration file where the actual pages (data)
     * are stored.
     */
    uint64_t pages_offset;
} QEMU_PACKED;
typedef struct MappedRamHeader MappedRamHeader;

static void mapped_ram_setup_ramblock(QEMUFile *file, RAMBlock *block)
{
    g_autofree MappedRamHeader *header = NULL;
    size_t header_size, bitmap_size;
    long num_pages;

    header = g_new0(MappedRamHeader, 1);
    header_size = sizeof(MappedRamHeader);

    num_pages = block->used_length >> TARGET_PAGE_BITS;
    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

    /*
     * Save the file offsets of where the bitmap and the pages should
     * go as they are written at the end of migration and during the
     * iterative phase, respectively.
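     *
     * Resulting per-RAMBlock layout in the migration file (a sketch of
     * what the code below computes, not an extra on-disk structure):
     *
     *   qemu_get_offset(file)        MappedRamHeader (packed, big-endian)
     *   bitmap_offset                header end; dirty bitmap written
     *                                here at the end of migration
     *   pages_offset                 bitmap_offset + bitmap_size rounded
     *                                up to MAPPED_RAM_FILE_OFFSET_ALIGNMENT;
     *                                page data written during iteration
     *   pages_offset + used_length   where the next RAMBlock region starts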
2912 */ 2913 block->bitmap_offset = qemu_get_offset(file) + header_size; 2914 block->pages_offset = ROUND_UP(block->bitmap_offset + 2915 bitmap_size, 2916 MAPPED_RAM_FILE_OFFSET_ALIGNMENT); 2917 2918 header->version = cpu_to_be32(MAPPED_RAM_HDR_VERSION); 2919 header->page_size = cpu_to_be64(TARGET_PAGE_SIZE); 2920 header->bitmap_offset = cpu_to_be64(block->bitmap_offset); 2921 header->pages_offset = cpu_to_be64(block->pages_offset); 2922 2923 qemu_put_buffer(file, (uint8_t *) header, header_size); 2924 2925 /* prepare offset for next ramblock */ 2926 qemu_set_offset(file, block->pages_offset + block->used_length, SEEK_SET); 2927 } 2928 2929 static bool mapped_ram_read_header(QEMUFile *file, MappedRamHeader *header, 2930 Error **errp) 2931 { 2932 size_t ret, header_size = sizeof(MappedRamHeader); 2933 2934 ret = qemu_get_buffer(file, (uint8_t *)header, header_size); 2935 if (ret != header_size) { 2936 error_setg(errp, "Could not read whole mapped-ram migration header " 2937 "(expected %zd, got %zd bytes)", header_size, ret); 2938 return false; 2939 } 2940 2941 /* migration stream is big-endian */ 2942 header->version = be32_to_cpu(header->version); 2943 2944 if (header->version > MAPPED_RAM_HDR_VERSION) { 2945 error_setg(errp, "Migration mapped-ram capability version not " 2946 "supported (expected <= %d, got %d)", MAPPED_RAM_HDR_VERSION, 2947 header->version); 2948 return false; 2949 } 2950 2951 header->page_size = be64_to_cpu(header->page_size); 2952 header->bitmap_offset = be64_to_cpu(header->bitmap_offset); 2953 header->pages_offset = be64_to_cpu(header->pages_offset); 2954 2955 return true; 2956 } 2957 2958 /* 2959 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has 2960 * long-running RCU critical section. When rcu-reclaims in the code 2961 * start to become numerous it will be necessary to reduce the 2962 * granularity of these critical sections. 2963 */ 2964 2965 /** 2966 * ram_save_setup: Setup RAM for migration 2967 * 2968 * Returns zero to indicate success and negative for error 2969 * 2970 * @f: QEMUFile where to send the data 2971 * @opaque: RAMState pointer 2972 * @errp: pointer to Error*, to store an error if it happens. 2973 */ 2974 static int ram_save_setup(QEMUFile *f, void *opaque, Error **errp) 2975 { 2976 RAMState **rsp = opaque; 2977 RAMBlock *block; 2978 int ret, max_hg_page_size; 2979 2980 /* migration has already setup the bitmap, reuse it. */ 2981 if (!migration_in_colo_state()) { 2982 if (ram_init_all(rsp, errp) != 0) { 2983 return -1; 2984 } 2985 } 2986 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f; 2987 2988 /* 2989 * ??? Mirrors the previous value of qemu_host_page_size, 2990 * but is this really what was intended for the migration? 
2991 */ 2992 max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE); 2993 2994 WITH_RCU_READ_LOCK_GUARD() { 2995 qemu_put_be64(f, ram_bytes_total_with_ignored() 2996 | RAM_SAVE_FLAG_MEM_SIZE); 2997 2998 RAMBLOCK_FOREACH_MIGRATABLE(block) { 2999 qemu_put_byte(f, strlen(block->idstr)); 3000 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); 3001 qemu_put_be64(f, block->used_length); 3002 if (migrate_postcopy_ram() && 3003 block->page_size != max_hg_page_size) { 3004 qemu_put_be64(f, block->page_size); 3005 } 3006 if (migrate_ignore_shared()) { 3007 qemu_put_be64(f, block->mr->addr); 3008 } 3009 3010 if (migrate_mapped_ram()) { 3011 mapped_ram_setup_ramblock(f, block); 3012 } 3013 } 3014 } 3015 3016 ret = rdma_registration_start(f, RAM_CONTROL_SETUP); 3017 if (ret < 0) { 3018 error_setg(errp, "%s: failed to start RDMA registration", __func__); 3019 qemu_file_set_error(f, ret); 3020 return ret; 3021 } 3022 3023 ret = rdma_registration_stop(f, RAM_CONTROL_SETUP); 3024 if (ret < 0) { 3025 error_setg(errp, "%s: failed to stop RDMA registration", __func__); 3026 qemu_file_set_error(f, ret); 3027 return ret; 3028 } 3029 3030 migration_ops = g_malloc0(sizeof(MigrationOps)); 3031 3032 if (migrate_multifd()) { 3033 multifd_ram_save_setup(); 3034 migration_ops->ram_save_target_page = ram_save_target_page_multifd; 3035 } else { 3036 migration_ops->ram_save_target_page = ram_save_target_page_legacy; 3037 } 3038 3039 /* 3040 * This operation is unfortunate.. 3041 * 3042 * For legacy QEMUs using per-section sync 3043 * ======================================= 3044 * 3045 * This must exist because the EOS below requires the SYNC messages 3046 * per-channel to work. 3047 * 3048 * For modern QEMUs using per-round sync 3049 * ===================================== 3050 * 3051 * Logically such sync is not needed, and recv threads should not run 3052 * until setup ready (using things like channels_ready on src). Then 3053 * we should be all fine. 3054 * 3055 * However even if we add channels_ready to recv side in new QEMUs, old 3056 * QEMU won't have them so this sync will still be needed to make sure 3057 * multifd recv threads won't start processing guest pages early before 3058 * ram_load_setup() is properly done. 3059 * 3060 * Let's stick with this. Fortunately the overhead is low to sync 3061 * during setup because the VM is running, so at least it's not 3062 * accounted as part of downtime. 3063 */ 3064 bql_unlock(); 3065 ret = multifd_ram_flush_and_sync(f); 3066 bql_lock(); 3067 if (ret < 0) { 3068 error_setg(errp, "%s: multifd synchronization failed", __func__); 3069 return ret; 3070 } 3071 3072 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3073 ret = qemu_fflush(f); 3074 if (ret < 0) { 3075 error_setg_errno(errp, -ret, "%s failed", __func__); 3076 } 3077 return ret; 3078 } 3079 3080 static void ram_save_file_bmap(QEMUFile *f) 3081 { 3082 RAMBlock *block; 3083 3084 RAMBLOCK_FOREACH_MIGRATABLE(block) { 3085 long num_pages = block->used_length >> TARGET_PAGE_BITS; 3086 long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); 3087 3088 qemu_put_buffer_at(f, (uint8_t *)block->file_bmap, bitmap_size, 3089 block->bitmap_offset); 3090 ram_transferred_add(bitmap_size); 3091 3092 /* 3093 * Free the bitmap here to catch any synchronization issues 3094 * with multifd channels. No channels should be sending pages 3095 * after we've written the bitmap to file. 
3096 */ 3097 g_free(block->file_bmap); 3098 block->file_bmap = NULL; 3099 } 3100 } 3101 3102 void ramblock_set_file_bmap_atomic(RAMBlock *block, ram_addr_t offset, bool set) 3103 { 3104 if (set) { 3105 set_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3106 } else { 3107 clear_bit_atomic(offset >> TARGET_PAGE_BITS, block->file_bmap); 3108 } 3109 } 3110 3111 /** 3112 * ram_save_iterate: iterative stage for migration 3113 * 3114 * Returns zero to indicate success and negative for error 3115 * 3116 * @f: QEMUFile where to send the data 3117 * @opaque: RAMState pointer 3118 */ 3119 static int ram_save_iterate(QEMUFile *f, void *opaque) 3120 { 3121 RAMState **temp = opaque; 3122 RAMState *rs = *temp; 3123 int ret = 0; 3124 int i; 3125 int64_t t0; 3126 int done = 0; 3127 3128 /* 3129 * We'll take this lock a little bit long, but it's okay for two reasons. 3130 * Firstly, the only possible other thread to take it is who calls 3131 * qemu_guest_free_page_hint(), which should be rare; secondly, see 3132 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which 3133 * guarantees that we'll at least released it in a regular basis. 3134 */ 3135 WITH_QEMU_LOCK_GUARD(&rs->bitmap_mutex) { 3136 WITH_RCU_READ_LOCK_GUARD() { 3137 if (ram_list.version != rs->last_version) { 3138 ram_state_reset(rs); 3139 } 3140 3141 /* Read version before ram_list.blocks */ 3142 smp_rmb(); 3143 3144 ret = rdma_registration_start(f, RAM_CONTROL_ROUND); 3145 if (ret < 0) { 3146 qemu_file_set_error(f, ret); 3147 goto out; 3148 } 3149 3150 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); 3151 i = 0; 3152 while ((ret = migration_rate_exceeded(f)) == 0 || 3153 postcopy_has_request(rs)) { 3154 int pages; 3155 3156 if (qemu_file_get_error(f)) { 3157 break; 3158 } 3159 3160 pages = ram_find_and_save_block(rs); 3161 /* no more pages to sent */ 3162 if (pages == 0) { 3163 done = 1; 3164 break; 3165 } 3166 3167 if (pages < 0) { 3168 qemu_file_set_error(f, pages); 3169 break; 3170 } 3171 3172 rs->target_page_count += pages; 3173 3174 /* 3175 * we want to check in the 1st loop, just in case it was the 1st 3176 * time and we had to sync the dirty bitmap. 3177 * qemu_clock_get_ns() is a bit expensive, so we only check each 3178 * some iterations 3179 */ 3180 if ((i & 63) == 0) { 3181 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 3182 1000000; 3183 if (t1 > MAX_WAIT) { 3184 trace_ram_save_iterate_big_wait(t1, i); 3185 break; 3186 } 3187 } 3188 i++; 3189 } 3190 } 3191 } 3192 3193 /* 3194 * Must occur before EOS (or any QEMUFile operation) 3195 * because of RDMA protocol. 
3196 */ 3197 ret = rdma_registration_stop(f, RAM_CONTROL_ROUND); 3198 if (ret < 0) { 3199 qemu_file_set_error(f, ret); 3200 } 3201 3202 out: 3203 if (ret >= 0 && migration_is_running()) { 3204 if (multifd_ram_sync_per_section()) { 3205 ret = multifd_ram_flush_and_sync(f); 3206 if (ret < 0) { 3207 return ret; 3208 } 3209 } 3210 3211 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3212 ram_transferred_add(8); 3213 ret = qemu_fflush(f); 3214 } 3215 if (ret < 0) { 3216 return ret; 3217 } 3218 3219 return done; 3220 } 3221 3222 /** 3223 * ram_save_complete: function called to send the remaining amount of ram 3224 * 3225 * Returns zero to indicate success or negative on error 3226 * 3227 * Called with the BQL 3228 * 3229 * @f: QEMUFile where to send the data 3230 * @opaque: RAMState pointer 3231 */ 3232 static int ram_save_complete(QEMUFile *f, void *opaque) 3233 { 3234 RAMState **temp = opaque; 3235 RAMState *rs = *temp; 3236 int ret = 0; 3237 3238 rs->last_stage = !migration_in_colo_state(); 3239 3240 WITH_RCU_READ_LOCK_GUARD() { 3241 if (!migration_in_postcopy()) { 3242 migration_bitmap_sync_precopy(true); 3243 } 3244 3245 ret = rdma_registration_start(f, RAM_CONTROL_FINISH); 3246 if (ret < 0) { 3247 qemu_file_set_error(f, ret); 3248 return ret; 3249 } 3250 3251 /* try transferring iterative blocks of memory */ 3252 3253 /* flush all remaining blocks regardless of rate limiting */ 3254 qemu_mutex_lock(&rs->bitmap_mutex); 3255 while (true) { 3256 int pages; 3257 3258 pages = ram_find_and_save_block(rs); 3259 /* no more blocks to sent */ 3260 if (pages == 0) { 3261 break; 3262 } 3263 if (pages < 0) { 3264 qemu_mutex_unlock(&rs->bitmap_mutex); 3265 return pages; 3266 } 3267 } 3268 qemu_mutex_unlock(&rs->bitmap_mutex); 3269 3270 ret = rdma_registration_stop(f, RAM_CONTROL_FINISH); 3271 if (ret < 0) { 3272 qemu_file_set_error(f, ret); 3273 return ret; 3274 } 3275 } 3276 3277 if (multifd_ram_sync_per_section()) { 3278 /* 3279 * Only the old dest QEMU will need this sync, because each EOS 3280 * will require one SYNC message on each channel. 
3281 */ 3282 ret = multifd_ram_flush_and_sync(f); 3283 if (ret < 0) { 3284 return ret; 3285 } 3286 } 3287 3288 if (migrate_mapped_ram()) { 3289 ram_save_file_bmap(f); 3290 3291 if (qemu_file_get_error(f)) { 3292 Error *local_err = NULL; 3293 int err = qemu_file_get_error_obj(f, &local_err); 3294 3295 error_reportf_err(local_err, "Failed to write bitmap to file: "); 3296 return -err; 3297 } 3298 } 3299 3300 qemu_put_be64(f, RAM_SAVE_FLAG_EOS); 3301 return qemu_fflush(f); 3302 } 3303 3304 static void ram_state_pending_estimate(void *opaque, uint64_t *must_precopy, 3305 uint64_t *can_postcopy) 3306 { 3307 RAMState **temp = opaque; 3308 RAMState *rs = *temp; 3309 3310 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3311 3312 if (migrate_postcopy_ram()) { 3313 /* We can do postcopy, and all the data is postcopiable */ 3314 *can_postcopy += remaining_size; 3315 } else { 3316 *must_precopy += remaining_size; 3317 } 3318 } 3319 3320 static void ram_state_pending_exact(void *opaque, uint64_t *must_precopy, 3321 uint64_t *can_postcopy) 3322 { 3323 RAMState **temp = opaque; 3324 RAMState *rs = *temp; 3325 uint64_t remaining_size; 3326 3327 if (!migration_in_postcopy()) { 3328 bql_lock(); 3329 WITH_RCU_READ_LOCK_GUARD() { 3330 migration_bitmap_sync_precopy(false); 3331 } 3332 bql_unlock(); 3333 } 3334 3335 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE; 3336 3337 if (migrate_postcopy_ram()) { 3338 /* We can do postcopy, and all the data is postcopiable */ 3339 *can_postcopy += remaining_size; 3340 } else { 3341 *must_precopy += remaining_size; 3342 } 3343 } 3344 3345 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) 3346 { 3347 unsigned int xh_len; 3348 int xh_flags; 3349 uint8_t *loaded_data; 3350 3351 /* extract RLE header */ 3352 xh_flags = qemu_get_byte(f); 3353 xh_len = qemu_get_be16(f); 3354 3355 if (xh_flags != ENCODING_FLAG_XBZRLE) { 3356 error_report("Failed to load XBZRLE page - wrong compression!"); 3357 return -1; 3358 } 3359 3360 if (xh_len > TARGET_PAGE_SIZE) { 3361 error_report("Failed to load XBZRLE page - len overflow!"); 3362 return -1; 3363 } 3364 loaded_data = XBZRLE.decoded_buf; 3365 /* load data and decode */ 3366 /* it can change loaded_data to point to an internal buffer */ 3367 qemu_get_buffer_in_place(f, &loaded_data, xh_len); 3368 3369 /* decode RLE */ 3370 if (xbzrle_decode_buffer(loaded_data, xh_len, host, 3371 TARGET_PAGE_SIZE) == -1) { 3372 error_report("Failed to load XBZRLE page - decode error!"); 3373 return -1; 3374 } 3375 3376 return 0; 3377 } 3378 3379 /** 3380 * ram_block_from_stream: read a RAMBlock id from the migration stream 3381 * 3382 * Must be called from within a rcu critical section. 3383 * 3384 * Returns a pointer from within the RCU-protected ram_list. 
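 *
 * A sketch of the encoding this handles: the caller has already read
 * the 64-bit addr|flags word; when RAM_SAVE_FLAG_CONTINUE is set in
 * @flags this function reuses mis->last_recv_block[channel], otherwise
 * it reads a one-byte idstr length followed by the idstr bytes and
 * looks the block up by name.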
3385 * 3386 * @mis: the migration incoming state pointer 3387 * @f: QEMUFile where to read the data from 3388 * @flags: Page flags (mostly to see if it's a continuation of previous block) 3389 * @channel: the channel we're using 3390 */ 3391 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis, 3392 QEMUFile *f, int flags, 3393 int channel) 3394 { 3395 RAMBlock *block = mis->last_recv_block[channel]; 3396 char id[256]; 3397 uint8_t len; 3398 3399 if (flags & RAM_SAVE_FLAG_CONTINUE) { 3400 if (!block) { 3401 error_report("Ack, bad migration stream!"); 3402 return NULL; 3403 } 3404 return block; 3405 } 3406 3407 len = qemu_get_byte(f); 3408 qemu_get_buffer(f, (uint8_t *)id, len); 3409 id[len] = 0; 3410 3411 block = qemu_ram_block_by_name(id); 3412 if (!block) { 3413 error_report("Can't find block %s", id); 3414 return NULL; 3415 } 3416 3417 if (migrate_ram_is_ignored(block)) { 3418 error_report("block %s should not be migrated !", id); 3419 return NULL; 3420 } 3421 3422 mis->last_recv_block[channel] = block; 3423 3424 return block; 3425 } 3426 3427 static inline void *host_from_ram_block_offset(RAMBlock *block, 3428 ram_addr_t offset) 3429 { 3430 if (!offset_in_ramblock(block, offset)) { 3431 return NULL; 3432 } 3433 3434 return block->host + offset; 3435 } 3436 3437 static void *host_page_from_ram_block_offset(RAMBlock *block, 3438 ram_addr_t offset) 3439 { 3440 /* Note: Explicitly no check against offset_in_ramblock(). */ 3441 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset), 3442 block->page_size); 3443 } 3444 3445 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block, 3446 ram_addr_t offset) 3447 { 3448 return ((uintptr_t)block->host + offset) & (block->page_size - 1); 3449 } 3450 3451 void colo_record_bitmap(RAMBlock *block, ram_addr_t *normal, uint32_t pages) 3452 { 3453 qemu_mutex_lock(&ram_state->bitmap_mutex); 3454 for (int i = 0; i < pages; i++) { 3455 ram_addr_t offset = normal[i]; 3456 ram_state->migration_dirty_pages += !test_and_set_bit( 3457 offset >> TARGET_PAGE_BITS, 3458 block->bmap); 3459 } 3460 qemu_mutex_unlock(&ram_state->bitmap_mutex); 3461 } 3462 3463 static inline void *colo_cache_from_block_offset(RAMBlock *block, 3464 ram_addr_t offset, bool record_bitmap) 3465 { 3466 if (!offset_in_ramblock(block, offset)) { 3467 return NULL; 3468 } 3469 if (!block->colo_cache) { 3470 error_report("%s: colo_cache is NULL in block :%s", 3471 __func__, block->idstr); 3472 return NULL; 3473 } 3474 3475 /* 3476 * During colo checkpoint, we need bitmap of these migrated pages. 3477 * It help us to decide which pages in ram cache should be flushed 3478 * into VM's RAM later. 3479 */ 3480 if (record_bitmap) { 3481 colo_record_bitmap(block, &offset, 1); 3482 } 3483 return block->colo_cache + offset; 3484 } 3485 3486 /** 3487 * ram_handle_zero: handle the zero page case 3488 * 3489 * If a page (or a whole RDMA chunk) has been 3490 * determined to be zero, then zap it. 3491 * 3492 * @host: host address for the zero page 3493 * @ch: what the page is filled from. 
We only support zero 3494 * @size: size of the zero page 3495 */ 3496 void ram_handle_zero(void *host, uint64_t size) 3497 { 3498 if (!buffer_is_zero(host, size)) { 3499 memset(host, 0, size); 3500 } 3501 } 3502 3503 static void colo_init_ram_state(void) 3504 { 3505 Error *local_err = NULL; 3506 3507 if (!ram_state_init(&ram_state, &local_err)) { 3508 error_report_err(local_err); 3509 } 3510 } 3511 3512 /* 3513 * colo cache: this is for secondary VM, we cache the whole 3514 * memory of the secondary VM, it is need to hold the global lock 3515 * to call this helper. 3516 */ 3517 int colo_init_ram_cache(void) 3518 { 3519 RAMBlock *block; 3520 3521 WITH_RCU_READ_LOCK_GUARD() { 3522 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3523 block->colo_cache = qemu_anon_ram_alloc(block->used_length, 3524 NULL, false, false); 3525 if (!block->colo_cache) { 3526 error_report("%s: Can't alloc memory for COLO cache of block %s," 3527 "size 0x" RAM_ADDR_FMT, __func__, block->idstr, 3528 block->used_length); 3529 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3530 if (block->colo_cache) { 3531 qemu_anon_ram_free(block->colo_cache, block->used_length); 3532 block->colo_cache = NULL; 3533 } 3534 } 3535 return -errno; 3536 } 3537 if (!machine_dump_guest_core(current_machine)) { 3538 qemu_madvise(block->colo_cache, block->used_length, 3539 QEMU_MADV_DONTDUMP); 3540 } 3541 } 3542 } 3543 3544 /* 3545 * Record the dirty pages that sent by PVM, we use this dirty bitmap together 3546 * with to decide which page in cache should be flushed into SVM's RAM. Here 3547 * we use the same name 'ram_bitmap' as for migration. 3548 */ 3549 if (ram_bytes_total()) { 3550 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3551 unsigned long pages = block->max_length >> TARGET_PAGE_BITS; 3552 block->bmap = bitmap_new(pages); 3553 } 3554 } 3555 3556 colo_init_ram_state(); 3557 return 0; 3558 } 3559 3560 /* TODO: duplicated with ram_init_bitmaps */ 3561 void colo_incoming_start_dirty_log(void) 3562 { 3563 RAMBlock *block = NULL; 3564 Error *local_err = NULL; 3565 3566 /* For memory_global_dirty_log_start below. */ 3567 bql_lock(); 3568 qemu_mutex_lock_ramlist(); 3569 3570 memory_global_dirty_log_sync(false); 3571 WITH_RCU_READ_LOCK_GUARD() { 3572 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3573 ramblock_sync_dirty_bitmap(ram_state, block); 3574 /* Discard this dirty bitmap record */ 3575 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS); 3576 } 3577 if (!memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION, 3578 &local_err)) { 3579 error_report_err(local_err); 3580 } 3581 } 3582 ram_state->migration_dirty_pages = 0; 3583 qemu_mutex_unlock_ramlist(); 3584 bql_unlock(); 3585 } 3586 3587 /* It is need to hold the global lock to call this helper */ 3588 void colo_release_ram_cache(void) 3589 { 3590 RAMBlock *block; 3591 3592 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION); 3593 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3594 g_free(block->bmap); 3595 block->bmap = NULL; 3596 } 3597 3598 WITH_RCU_READ_LOCK_GUARD() { 3599 RAMBLOCK_FOREACH_NOT_IGNORED(block) { 3600 if (block->colo_cache) { 3601 qemu_anon_ram_free(block->colo_cache, block->used_length); 3602 block->colo_cache = NULL; 3603 } 3604 } 3605 } 3606 ram_state_cleanup(&ram_state); 3607 } 3608 3609 /** 3610 * ram_load_setup: Setup RAM for migration incoming side 3611 * 3612 * Returns zero to indicate success and negative for error 3613 * 3614 * @f: QEMUFile where to receive the data 3615 * @opaque: RAMState pointer 3616 * @errp: pointer to Error*, to store an error if it happens. 
3617 */ 3618 static int ram_load_setup(QEMUFile *f, void *opaque, Error **errp) 3619 { 3620 xbzrle_load_setup(); 3621 ramblock_recv_map_init(); 3622 3623 return 0; 3624 } 3625 3626 static int ram_load_cleanup(void *opaque) 3627 { 3628 RAMBlock *rb; 3629 3630 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3631 qemu_ram_block_writeback(rb); 3632 } 3633 3634 xbzrle_load_cleanup(); 3635 3636 RAMBLOCK_FOREACH_NOT_IGNORED(rb) { 3637 g_free(rb->receivedmap); 3638 rb->receivedmap = NULL; 3639 } 3640 3641 return 0; 3642 } 3643 3644 /** 3645 * ram_postcopy_incoming_init: allocate postcopy data structures 3646 * 3647 * Returns 0 for success and negative if there was one error 3648 * 3649 * @mis: current migration incoming state 3650 * 3651 * Allocate data structures etc needed by incoming migration with 3652 * postcopy-ram. postcopy-ram's similarly names 3653 * postcopy_ram_incoming_init does the work. 3654 */ 3655 int ram_postcopy_incoming_init(MigrationIncomingState *mis) 3656 { 3657 return postcopy_ram_incoming_init(mis); 3658 } 3659 3660 /** 3661 * ram_load_postcopy: load a page in postcopy case 3662 * 3663 * Returns 0 for success or -errno in case of error 3664 * 3665 * Called in postcopy mode by ram_load(). 3666 * rcu_read_lock is taken prior to this being called. 3667 * 3668 * @f: QEMUFile where to send the data 3669 * @channel: the channel to use for loading 3670 */ 3671 int ram_load_postcopy(QEMUFile *f, int channel) 3672 { 3673 int flags = 0, ret = 0; 3674 bool place_needed = false; 3675 bool matches_target_page_size = false; 3676 MigrationIncomingState *mis = migration_incoming_get_current(); 3677 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel]; 3678 3679 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { 3680 ram_addr_t addr; 3681 void *page_buffer = NULL; 3682 void *place_source = NULL; 3683 RAMBlock *block = NULL; 3684 uint8_t ch; 3685 3686 addr = qemu_get_be64(f); 3687 3688 /* 3689 * If qemu file error, we should stop here, and then "addr" 3690 * may be invalid 3691 */ 3692 ret = qemu_file_get_error(f); 3693 if (ret) { 3694 break; 3695 } 3696 3697 flags = addr & ~TARGET_PAGE_MASK; 3698 addr &= TARGET_PAGE_MASK; 3699 3700 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags); 3701 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) { 3702 block = ram_block_from_stream(mis, f, flags, channel); 3703 if (!block) { 3704 ret = -EINVAL; 3705 break; 3706 } 3707 3708 /* 3709 * Relying on used_length is racy and can result in false positives. 3710 * We might place pages beyond used_length in case RAM was shrunk 3711 * while in postcopy, which is fine - trying to place via 3712 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault. 3713 */ 3714 if (!block->host || addr >= block->postcopy_length) { 3715 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); 3716 ret = -EINVAL; 3717 break; 3718 } 3719 tmp_page->target_pages++; 3720 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE; 3721 /* 3722 * Postcopy requires that we place whole host pages atomically; 3723 * these may be huge pages for RAMBlocks that are backed by 3724 * hugetlbfs. 3725 * To make it atomic, the data is read into a temporary page 3726 * that's moved into place later. 3727 * The migration protocol uses, possibly smaller, target-pages 3728 * however the source ensures it always sends all the components 3729 * of a host page in one chunk. 
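 *
 * Example with illustrative numbers: for a hugetlbfs block with 2M
 * host pages and 4K target pages, 512 consecutive target pages are
 * accumulated in tmp_page->tmp_huge_page; only when target_pages
 * reaches block->page_size / TARGET_PAGE_SIZE is the whole host page
 * placed in one go via postcopy_place_page(), or
 * postcopy_place_page_zero() if every piece was zero.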
3730 */ 3731 page_buffer = tmp_page->tmp_huge_page + 3732 host_page_offset_from_ram_block_offset(block, addr); 3733 /* If all TP are zero then we can optimise the place */ 3734 if (tmp_page->target_pages == 1) { 3735 tmp_page->host_addr = 3736 host_page_from_ram_block_offset(block, addr); 3737 } else if (tmp_page->host_addr != 3738 host_page_from_ram_block_offset(block, addr)) { 3739 /* not the 1st TP within the HP */ 3740 error_report("Non-same host page detected on channel %d: " 3741 "Target host page %p, received host page %p " 3742 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)", 3743 channel, tmp_page->host_addr, 3744 host_page_from_ram_block_offset(block, addr), 3745 block->idstr, addr, tmp_page->target_pages); 3746 ret = -EINVAL; 3747 break; 3748 } 3749 3750 /* 3751 * If it's the last part of a host page then we place the host 3752 * page 3753 */ 3754 if (tmp_page->target_pages == 3755 (block->page_size / TARGET_PAGE_SIZE)) { 3756 place_needed = true; 3757 } 3758 place_source = tmp_page->tmp_huge_page; 3759 } 3760 3761 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { 3762 case RAM_SAVE_FLAG_ZERO: 3763 ch = qemu_get_byte(f); 3764 if (ch != 0) { 3765 error_report("Found a zero page with value %d", ch); 3766 ret = -EINVAL; 3767 break; 3768 } 3769 /* 3770 * Can skip to set page_buffer when 3771 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE). 3772 */ 3773 if (!matches_target_page_size) { 3774 memset(page_buffer, ch, TARGET_PAGE_SIZE); 3775 } 3776 break; 3777 3778 case RAM_SAVE_FLAG_PAGE: 3779 tmp_page->all_zero = false; 3780 if (!matches_target_page_size) { 3781 /* For huge pages, we always use temporary buffer */ 3782 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); 3783 } else { 3784 /* 3785 * For small pages that matches target page size, we 3786 * avoid the qemu_file copy. Instead we directly use 3787 * the buffer of QEMUFile to place the page. Note: we 3788 * cannot do any QEMUFile operation before using that 3789 * buffer to make sure the buffer is valid when 3790 * placing the page. 3791 */ 3792 qemu_get_buffer_in_place(f, (uint8_t **)&place_source, 3793 TARGET_PAGE_SIZE); 3794 } 3795 break; 3796 case RAM_SAVE_FLAG_EOS: 3797 break; 3798 default: 3799 error_report("Unknown combination of migration flags: 0x%x" 3800 " (postcopy mode)", flags); 3801 ret = -EINVAL; 3802 break; 3803 } 3804 3805 /* Detect for any possible file errors */ 3806 if (!ret && qemu_file_get_error(f)) { 3807 ret = qemu_file_get_error(f); 3808 } 3809 3810 if (!ret && place_needed) { 3811 if (tmp_page->all_zero) { 3812 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block); 3813 } else { 3814 ret = postcopy_place_page(mis, tmp_page->host_addr, 3815 place_source, block); 3816 } 3817 place_needed = false; 3818 postcopy_temp_page_reset(tmp_page); 3819 } 3820 } 3821 3822 return ret; 3823 } 3824 3825 static bool postcopy_is_running(void) 3826 { 3827 PostcopyState ps = postcopy_state_get(); 3828 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END; 3829 } 3830 3831 /* 3832 * Flush content of RAM cache into SVM's memory. 3833 * Only flush the pages that be dirtied by PVM or SVM or both. 
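 *
 * Sketch of the flow below: sync the dirty log into each block's bmap,
 * then walk the bitmap with colo_bitmap_find_dirty() and memcpy each
 * run of dirty target pages from block->colo_cache into block->host,
 * clearing the corresponding dirty bits as we go.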

/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that have been dirtied by the PVM or SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync(false);
    qemu_mutex_lock(&ram_state->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            unsigned long num = 0;

            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
            if (!offset_in_ramblock(block,
                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
                offset = 0;
                num = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                unsigned long i = 0;

                for (i = 0; i < num; i++) {
                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
                }
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
                offset += num;
            }
        }
    }
    qemu_mutex_unlock(&ram_state->bitmap_mutex);
    trace_colo_flush_ram_cache_end();
}

static size_t ram_load_multifd_pages(void *host_addr, size_t size,
                                     uint64_t offset)
{
    MultiFDRecvData *data = multifd_get_recv_data();

    data->opaque = host_addr;
    data->file_offset = offset;
    data->size = size;

    if (!multifd_recv()) {
        return 0;
    }

    return size;
}

static bool read_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
                                     long num_pages, unsigned long *bitmap,
                                     Error **errp)
{
    ERRP_GUARD();
    unsigned long set_bit_idx, clear_bit_idx;
    ram_addr_t offset;
    void *host;
    size_t read, unread, size;

    for (set_bit_idx = find_first_bit(bitmap, num_pages);
         set_bit_idx < num_pages;
         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {

        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);

        unread = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
        offset = set_bit_idx << TARGET_PAGE_BITS;

        while (unread > 0) {
            host = host_from_ram_block_offset(block, offset);
            if (!host) {
                error_setg(errp, "page outside of ramblock %s range",
                           block->idstr);
                return false;
            }

            size = MIN(unread, MAPPED_RAM_LOAD_BUF_SIZE);

            if (migrate_multifd()) {
                read = ram_load_multifd_pages(host, size,
                                              block->pages_offset + offset);
            } else {
                read = qemu_get_buffer_at(f, host, size,
                                          block->pages_offset + offset);
            }

            if (!read) {
                goto err;
            }
            offset += read;
            unread -= read;
        }
    }

    return true;

err:
    qemu_file_get_error_obj(f, errp);
    error_prepend(errp, "(%s) failed to read page " RAM_ADDR_FMT
                  " from file offset %" PRIx64 ": ", block->idstr, offset,
                  block->pages_offset + offset);
    return false;
}
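
/*
 * Illustrative sketch, not part of the original code: the bitmap walk used
 * by read_ramblock_mapped_ram() above.  Each run of consecutive set bits
 * describes one contiguous range of pages present in the migration file,
 * so the loader can issue one (possibly large) read per run instead of one
 * read per page.  The callback type and function name are hypothetical.
 */
typedef void (*example_run_cb)(unsigned long first_page, unsigned long npages);

static G_GNUC_UNUSED void example_for_each_set_run(const unsigned long *bitmap,
                                                   unsigned long num_pages,
                                                   example_run_cb cb)
{
    unsigned long set_idx, clear_idx;

    for (set_idx = find_first_bit(bitmap, num_pages);
         set_idx < num_pages;
         set_idx = find_next_bit(bitmap, num_pages, clear_idx + 1)) {
        /* end of the current run of set bits (exclusive) */
        clear_idx = find_next_zero_bit(bitmap, num_pages, set_idx + 1);
        cb(set_idx, clear_idx - set_idx);
    }
}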

static void parse_ramblock_mapped_ram(QEMUFile *f, RAMBlock *block,
                                      ram_addr_t length, Error **errp)
{
    g_autofree unsigned long *bitmap = NULL;
    MappedRamHeader header;
    size_t bitmap_size;
    long num_pages;

    if (!mapped_ram_read_header(f, &header, errp)) {
        return;
    }

    block->pages_offset = header.pages_offset;

    /*
     * Check the alignment of the file region that contains pages. We
     * don't enforce MAPPED_RAM_FILE_OFFSET_ALIGNMENT to allow that
     * value to change in the future. Do only a sanity check with page
     * size alignment.
     */
    if (!QEMU_IS_ALIGNED(block->pages_offset, TARGET_PAGE_SIZE)) {
        error_setg(errp,
                   "Error reading ramblock %s pages, region has bad alignment",
                   block->idstr);
        return;
    }

    num_pages = length / header.page_size;
    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

    bitmap = g_malloc0(bitmap_size);
    if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
                           header.bitmap_offset) != bitmap_size) {
        error_setg(errp, "Error reading dirty bitmap");
        return;
    }

    if (!read_ramblock_mapped_ram(f, block, num_pages, bitmap, errp)) {
        return;
    }

    /* Skip pages array */
    qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
}
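
/*
 * Illustrative sketch, not part of the original code: how the sizes used by
 * parse_ramblock_mapped_ram() relate to each other.  For a block of "length"
 * bytes the file carries one dirty-bitmap bit per page (the real code uses
 * the page size recorded in the mapped-ram header; TARGET_PAGE_SIZE is used
 * here for simplicity), and the pages area must start at a target-page
 * aligned offset.  The function name is hypothetical.
 */
static G_GNUC_UNUSED bool example_mapped_ram_sizes(ram_addr_t length,
                                                   uint64_t pages_offset,
                                                   long *num_pages,
                                                   size_t *bitmap_size)
{
    *num_pages = length / TARGET_PAGE_SIZE;
    *bitmap_size = BITS_TO_LONGS(*num_pages) * sizeof(unsigned long);

    /*
     * The writer normally pads pages_offset (typically to
     * MAPPED_RAM_FILE_OFFSET_ALIGNMENT); the reader only insists on
     * target page alignment.
     */
    return QEMU_IS_ALIGNED(pages_offset, TARGET_PAGE_SIZE);
}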

static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
{
    int ret = 0;
    /* ADVISE is earlier; it shows the source has the postcopy capability on */
    bool postcopy_advised = migration_incoming_postcopy_advised();
    int max_hg_page_size;
    Error *local_err = NULL;

    assert(block);

    if (migrate_mapped_ram()) {
        parse_ramblock_mapped_ram(f, block, length, &local_err);
        if (local_err) {
            error_report_err(local_err);
            return -EINVAL;
        }
        return 0;
    }

    if (!qemu_ram_is_migratable(block)) {
        error_report("block %s should not be migrated!", block->idstr);
        return -EINVAL;
    }

    if (length != block->used_length) {
        ret = qemu_ram_resize(block, length, &local_err);
        if (local_err) {
            error_report_err(local_err);
            return ret;
        }
    }

    /*
     * ??? Mirrors the previous value of qemu_host_page_size,
     * but is this really what was intended for the migration?
     */
    max_hg_page_size = MAX(qemu_real_host_page_size(), TARGET_PAGE_SIZE);

    /* For postcopy we need to check hugepage sizes match */
    if (postcopy_advised && migrate_postcopy_ram() &&
        block->page_size != max_hg_page_size) {
        uint64_t remote_page_size = qemu_get_be64(f);
        if (remote_page_size != block->page_size) {
            error_report("Mismatched RAM page size %s "
                         "(local) %zd != %" PRId64, block->idstr,
                         block->page_size, remote_page_size);
            return -EINVAL;
        }
    }
    if (migrate_ignore_shared()) {
        hwaddr addr = qemu_get_be64(f);
        if (migrate_ram_is_ignored(block) &&
            block->mr->addr != addr) {
            error_report("Mismatched GPAs for block %s "
                         "%" PRId64 " != %" PRId64, block->idstr,
                         (uint64_t)addr, (uint64_t)block->mr->addr);
            return -EINVAL;
        }
    }
    ret = rdma_block_notification_handle(f, block->idstr);
    if (ret < 0) {
        qemu_file_set_error(f, ret);
    }

    return ret;
}

static int parse_ramblocks(QEMUFile *f, ram_addr_t total_ram_bytes)
{
    int ret = 0;

    /* Synchronize RAM block list */
    while (!ret && total_ram_bytes) {
        RAMBlock *block;
        char id[256];
        ram_addr_t length;
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)id, len);
        id[len] = 0;
        length = qemu_get_be64(f);

        block = qemu_ram_block_by_name(id);
        if (block) {
            ret = parse_ramblock(f, block, length);
        } else {
            error_report("Unknown ramblock \"%s\", cannot accept "
                         "migration", id);
            ret = -EINVAL;
        }
        total_ram_bytes -= length;
    }

    return ret;
}
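
/*
 * Illustrative sketch, not part of the original code: the per-block record
 * that parse_ramblocks() consumes from the RAM_SAVE_FLAG_MEM_SIZE section of
 * the stream -- a one-byte idstr length, the idstr bytes (not NUL terminated
 * on the wire), and a big-endian 64-bit block length.  The writer shown here
 * is a hypothetical mirror of that reader, not the actual save-side code.
 */
static G_GNUC_UNUSED void example_put_ramblock_record(QEMUFile *f,
                                                      const char *idstr,
                                                      uint64_t used_length)
{
    size_t len = strlen(idstr);

    assert(len < 256);                               /* fits the reader's id[256] */
    qemu_put_byte(f, len);                           /* 1 byte: idstr length      */
    qemu_put_buffer(f, (const uint8_t *)idstr, len); /* idstr bytes               */
    qemu_put_be64(f, used_length);                   /* be64: block length        */
}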

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile to receive the data from
 */
static int ram_load_precopy(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    int flags = 0, ret = 0, invalid_flags = 0, i = 0;

    if (migrate_mapped_ram()) {
        invalid_flags |= (RAM_SAVE_FLAG_HOOK | RAM_SAVE_FLAG_MULTIFD_FLUSH |
                          RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_XBZRLE |
                          RAM_SAVE_FLAG_ZERO);
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let the main loop run, but an iteration
         * of the main loop is expensive, so only do it every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            error_report("Getting RAM address failed");
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            error_report("Unexpected RAM flags: %d", flags & invalid_flags);

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(mis, f, flags,
                                                    RAM_CHANNEL_PRECOPY);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly; put it into colo_cache first.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all this memory in the COLO preparation
             * stage while the VM was stopped, which is time-consuming.
             * Here we optimize it by backing up every page during the
             * migration process while COLO is enabled; this slows the
             * migration down somewhat, but it clearly reduces the downtime
             * of backing up all the SVM's memory in the COLO preparation
             * stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                    /*
                     * In migration stage but before COLO stage,
                     * put all pages into both cache and SVM's memory.
                     */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            ret = parse_ramblocks(f, addr);
            /*
             * For mapped-ram migration (to a file) using multifd, we sync
             * once and for all here to make sure all tasks we queued to
             * multifd threads are completed, so that all the ramblocks
             * (including all the guest memory pages within) are fully
             * loaded after this sync returns.
             */
            if (migrate_mapped_ram()) {
                multifd_recv_sync_main();
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            if (ch != 0) {
                error_report("Found a zero page with value %d", ch);
                ret = -EINVAL;
                break;
            }
            ram_handle_zero(host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_MULTIFD_FLUSH:
            multifd_recv_sync_main();
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            if (migrate_multifd() &&
                migrate_multifd_flush_after_each_section() &&
                /*
                 * Mapped-ram migration flushes once and for all after
                 * parsing ramblocks. Always ignore EOS for it.
                 */
                !migrate_mapped_ram()) {
                multifd_recv_sync_main();
            }
            break;
        case RAM_SAVE_FLAG_HOOK:
            ret = rdma_registration_handle(f);
            if (ret < 0) {
                qemu_file_set_error(f, ret);
            }
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x", flags);
            ret = -EINVAL;
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts to host memory
     * must be atomic
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclaims in this code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    trace_ram_load_start();
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            /*
             * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
             * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
             * service fast page faults.
             */
            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}
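
/*
 * Illustrative sketch, not part of the original code: the shape of the
 * EOS-terminated record loop used by ram_load_precopy(), including the
 * periodic coroutine yield that keeps the main loop responsive while a
 * large stream is consumed.  The dispatch callback and function name are
 * hypothetical.
 */
static G_GNUC_UNUSED int example_precopy_loop(QEMUFile *f,
                                              int (*dispatch)(QEMUFile *,
                                                              ram_addr_t, int))
{
    int flags = 0, ret = 0, i = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;

        /* yield to the main loop only once in a while, it is expensive */
        if ((i++ & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }

        addr = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        ret = dispatch(f, addr & TARGET_PAGE_MASK, flags);
    }

    return ret;
}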

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;

    trace_ram_dirty_bitmap_sync_start();

    qatomic_set(&rs->postcopy_bmap_sync_requested, 0);
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        qatomic_inc(&rs->postcopy_bmap_sync_requested);
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps are synced */
    while (qatomic_read(&rs->postcopy_bmap_sync_requested)) {
        if (migration_rp_wait(s)) {
            return -1;
        }
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

/*
 * Read the received bitmap and invert it to use as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 *
 * Returns true on success, false on error.
 */
bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block, Error **errp)
{
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    g_autofree unsigned long *le_bitmap = NULL;
    unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;
    RAMState *rs = ram_state;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_setg(errp, "Reload bitmap in incorrect state %s",
                   MigrationStatus_str(s->state));
        return false;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the padding.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add padding */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match our ramblock */
    if (size != local_size) {
        error_setg(errp, "ramblock '%s' bitmap size mismatch (0x%"PRIx64
                   " != 0x%"PRIx64")", block->idstr, size, local_size);
        return false;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    if (qemu_file_get_error(file) || size != local_size) {
        error_setg(errp, "read bitmap failed for ramblock '%s': "
                   "(size 0x%"PRIx64", got: 0x%"PRIx64")",
                   block->idstr, local_size, size);
        return false;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_setg(errp, "ramblock '%s' end mark incorrect: 0x%"PRIx64,
                   block->idstr, end_mark);
        return false;
    }

    /*
     * Endianness conversion. We are in postcopy (though paused), so the
     * dirty bitmap won't change and we can modify it directly.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Invert it to use as the
     * initial dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    qatomic_dec(&rs->postcopy_bmap_sync_requested);

    /*
     * We have successfully synced the bitmap for the current ramblock.
     * Always kick the migration thread to check whether all requested
     * bitmaps have been reloaded.  NOTE: it is racy to kick only when
     * requested==0, because we don't know whether the migration thread
     * may still be increasing it.
     */
    migration_rp_kick(s);

    return true;
}
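
/*
 * Illustrative sketch, not part of the original code: the wire layout that
 * ram_dirty_bitmap_reload() expects from the destination for one ramblock --
 * a be64 size, the little-endian bitmap padded to a multiple of 8 bytes, and
 * a be64 end mark.  The writer below is a hypothetical mirror of that
 * reader, for one bitmap of "nbits" bits.
 */
static G_GNUC_UNUSED void example_put_recv_bitmap(QEMUFile *f,
                                                  const unsigned long *bitmap,
                                                  unsigned long nbits)
{
    g_autofree unsigned long *le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
    uint64_t size = ROUND_UP(DIV_ROUND_UP(nbits, 8), 8);

    /* convert to little endian so both sides agree on the byte layout */
    bitmap_to_le(le_bitmap, bitmap, nbits);

    qemu_put_be64(f, size);                          /* padded byte size  */
    qemu_put_buffer(f, (uint8_t *)le_bitmap, size);  /* LE bitmap + pad   */
    qemu_put_be64(f, RAMBLOCK_RECV_BITMAP_ENDING);   /* end mark          */
}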

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

void postcopy_preempt_shutdown_file(MigrationState *s)
{
    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
    qemu_fflush(s->postcopy_qemufile_src);
}

static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .state_pending_exact = ram_state_pending_exact,
    .state_pending_estimate = ram_state_pending_estimate,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (!rb) {
        error_report("RAM block not found");
        return;
    }

    if (migrate_ram_is_ignored(rb)) {
        return;
    }

    if (migration_is_running()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}