/*
 * Multifd VFIO migration
 *
 * Copyright (C) 2024,2025 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/vfio/vfio-device.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "qemu/bswap.h"
#include "qemu/error-report.h"
#include "qemu/lockable.h"
#include "qemu/main-loop.h"
#include "qemu/thread.h"
#include "io/channel-buffer.h"
#include "migration/qemu-file.h"
#include "migration-multifd.h"
#include "vfio-migration-internal.h"
#include "trace.h"

#define VFIO_DEVICE_STATE_CONFIG_STATE (1)

#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)

typedef struct VFIODeviceStatePacket {
    uint32_t version;
    uint32_t idx;
    uint32_t flags;
    uint8_t data[0];
} QEMU_PACKED VFIODeviceStatePacket;

/* type safety */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;

typedef struct VFIOStateBuffer {
    bool is_present;
    char *data;
    size_t len;
} VFIOStateBuffer;

typedef struct VFIOMultifd {
    bool load_bufs_thread_running;
    bool load_bufs_thread_want_exit;

    VFIOStateBuffers load_bufs;
    QemuCond load_bufs_buffer_ready_cond;
    QemuCond load_bufs_thread_finished_cond;
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;
    uint32_t load_buf_idx_last;
} VFIOMultifd;

static void vfio_state_buffer_clear(gpointer data)
{
    VFIOStateBuffer *lb = data;

    if (!lb->is_present) {
        return;
    }

    g_clear_pointer(&lb->data, g_free);
    lb->is_present = false;
}

static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
    bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
    g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
}

static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    g_clear_pointer(&bufs->array, g_array_unref);
}

static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}

static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}

static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}

static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}

/* called with load_bufs_mutex locked */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
                                          VFIODeviceStatePacket *packet,
                                          size_t packet_total_size,
                                          Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;

    vfio_state_buffers_assert_init(&multifd->load_bufs);
    if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
        vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
    if (lb->is_present) {
        error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
                   vbasedev->name, packet->idx);
        return false;
    }

    assert(packet->idx >= multifd->load_buf_idx);

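    /*
     * Only the payload that follows the packet header is stored; the
     * header fields have already been validated and byte-swapped by
     * vfio_multifd_load_state_buffer().
     */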
    lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
    lb->len = packet_total_size - sizeof(*packet);
    lb->is_present = true;

    return true;
}

bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    packet->version = be32_to_cpu(packet->version);
    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    packet->idx = be32_to_cpu(packet->idx);
    packet->flags = be32_to_cpu(packet->flags);

    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}

static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}

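/*
 * Return the buffer for load_buf_idx if it has already arrived, or NULL if
 * the load thread has to keep waiting.  Called with load_bufs_mutex held.
 */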
static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
{
    VFIOStateBuffer *lb;
    unsigned int bufs_len;

    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
    if (multifd->load_buf_idx >= bufs_len) {
        assert(multifd->load_buf_idx == bufs_len);
        return NULL;
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs,
                               multifd->load_buf_idx);
    if (!lb->is_present) {
        return NULL;
    }

    return lb;
}

static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        errno_save = errno;
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}

static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
                                            bool *should_quit)
{
    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
}

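/*
 * Producer/consumer overview of the load path implemented above:
 *
 *   multifd recv threads                   load_bufs thread
 *   --------------------                   ----------------
 *   vfio_multifd_load_state_buffer()       vfio_load_bufs_thread()
 *     -> store buffer at packet->idx         -> wait for buffer load_buf_idx
 *     -> signal buffer_ready_cond            -> write it to the device fd
 *
 * Buffers may arrive in any order but are written to the device strictly in
 * index order.  The packet flagged VFIO_DEVICE_STATE_CONFIG_STATE is always
 * the last one and carries device config state rather than device state.
 */
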
/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in the main
 * migration stream.
 *
 * It exits after either:
 * * finishing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}

static VFIOMultifd *vfio_multifd_new(void)
{
    VFIOMultifd *multifd = g_new(VFIOMultifd, 1);

    vfio_state_buffers_init(&multifd->load_bufs);

    qemu_mutex_init(&multifd->load_bufs_mutex);

    multifd->load_buf_idx = 0;
    multifd->load_buf_idx_last = UINT32_MAX;
    qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);

    multifd->load_bufs_thread_running = false;
    multifd->load_bufs_thread_want_exit = false;
    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);

    return multifd;
}

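/*
 * load_buf_idx_last starts out above as UINT32_MAX, an "unknown yet"
 * sentinel; it gets its real value from the idx of the packet flagged
 * VFIO_DEVICE_STATE_CONFIG_STATE.  This is also why
 * vfio_multifd_load_state_buffer() rejects packets with idx == UINT32_MAX.
 */
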
/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it has finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}

static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}

void vfio_multifd_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    g_clear_pointer(&migration->multifd, vfio_multifd_free);
}

bool vfio_multifd_transfer_supported(void)
{
    return multifd_device_state_supported() &&
        migrate_send_switchover_start();
}

bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->multifd_transfer;
}

bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Make a copy of this setting at the start in case it is changed
     * mid-migration.
     */
    if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
        migration->multifd_transfer = vfio_multifd_transfer_supported();
    } else {
        migration->multifd_transfer =
            vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
    }

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing further to check or do */
        return true;
    }

    if (!vfio_multifd_transfer_supported()) {
        error_setg(errp,
                   "%s: Multifd device transfer requested but unsupported in the current config",
                   vbasedev->name);
        return false;
    }

    if (alloc_multifd) {
        assert(!migration->multifd);
        migration->multifd = vfio_multifd_new();
    }

    return true;
}

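/*
 * Expected call pattern for vfio_multifd_setup() (hypothetical callers,
 * for illustration only):
 *
 *     // save setup: just latch the multifd_transfer setting
 *     if (!vfio_multifd_setup(vbasedev, false, errp)) { ... }
 *
 *     // load setup: additionally allocate the VFIOMultifd receive state,
 *     // which is later freed via vfio_multifd_cleanup()
 *     if (!vfio_multifd_setup(vbasedev, true, errp)) { ... }
 */
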
void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}

static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
    packet->idx = cpu_to_be32(idx);
    packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE);
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_migration_add_bytes_transferred(packet_len);

    return true;
}

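/*
 * Wire format recap: every chunk queued on a multifd channel is a
 * VFIODeviceStatePacket header (version, idx, flags, all big-endian)
 * immediately followed by the payload bytes.  For the config packet built
 * above, the payload is the buffered output of
 * vfio_save_device_config_state().
 */
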
/*
 * This thread is spawned by the migration core directly via
 * .save_live_complete_precopy_thread SaveVMHandler.
 *
 * It exits after either:
 * * finishing saving the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by
 *   multifd_device_state_save_thread_should_exit() returning true.
 */
bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp)
{
    VFIODevice *vbasedev = d->handler_opaque;
    VFIOMigration *migration = vbasedev->migration;
    bool ret = false;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    uint32_t idx;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
        return true;
    }

    trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
                                                  d->idstr, d->instance_id);

    /* We reach here with device state STOP or STOP_COPY only */
    if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                 VFIO_DEVICE_STATE_STOP, errp)) {
        goto thread_exit;
    }

    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
    packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);

    for (idx = 0; ; idx++) {
        ssize_t data_size;
        size_t packet_size;

        if (multifd_device_state_save_thread_should_exit()) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        data_size = read(migration->data_fd, &packet->data,
                         migration->data_buffer_size);
        if (data_size < 0) {
            error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, idx, errno);
            goto thread_exit;
        } else if (data_size == 0) {
            break;
        }

        packet->idx = cpu_to_be32(idx);
        packet_size = sizeof(*packet) + data_size;

        if (!multifd_queue_device_state(d->idstr, d->instance_id,
                                        (char *)packet, packet_size)) {
            error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
            goto thread_exit;
        }

        vfio_migration_add_bytes_transferred(packet_size);
    }

    if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
                                                        d->idstr,
                                                        d->instance_id,
                                                        idx, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);

    return ret;
}

int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}