/*
 * Multifd VFIO migration
 *
 * Copyright (C) 2024,2025 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/vfio/vfio-common.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/lockable.h"
#include "qemu/main-loop.h"
#include "qemu/thread.h"
#include "io/channel-buffer.h"
#include "migration/qemu-file.h"
#include "migration-multifd.h"
#include "trace.h"

/* Packet flag: this packet carries the device config state (last in stream) */
#define VFIO_DEVICE_STATE_CONFIG_STATE (1)

#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)

/*
 * Wire format of one device state packet sent over a multifd channel.
 * A variable-length payload immediately follows the fixed header.
 */
typedef struct VFIODeviceStatePacket {
    uint32_t version; /* must equal VFIO_DEVICE_STATE_PACKET_VER_CURRENT */
    uint32_t idx;     /* sequential buffer index; UINT32_MAX is reserved */
    uint32_t flags;   /* VFIO_DEVICE_STATE_CONFIG_STATE marks config packet */
    uint8_t data[0];  /* trailing device state payload */
} QEMU_PACKED VFIODeviceStatePacket;

/* type safety */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;

/* One received (and not yet consumed) device state buffer */
typedef struct VFIOStateBuffer {
    bool is_present; /* true once data for this index has arrived */
    char *data;      /* owned payload; freed by vfio_state_buffer_clear() */
    size_t len;      /* payload length in bytes */
} VFIOStateBuffer;

/* Per-device multifd load state; fields below are guarded by load_bufs_mutex */
typedef struct VFIOMultifd {
    bool load_bufs_thread_running;   /* load thread is alive */
    bool load_bufs_thread_want_exit; /* ask the load thread to abort */

    VFIOStateBuffers load_bufs;             /* received buffers, indexed by idx */
    QemuCond load_bufs_buffer_ready_cond;   /* signalled when a buffer arrives */
    QemuCond load_bufs_thread_finished_cond; /* signalled on thread exit */
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;      /* next buffer index to consume */
    uint32_t load_buf_idx_last; /* index of the final (config) buffer,
                                 * UINT32_MAX until that packet is seen */
} VFIOMultifd;

/* GArray clear func: free a buffer's payload when its slot is dropped */
static void vfio_state_buffer_clear(gpointer data)
{
    VFIOStateBuffer *lb = data;

    if (!lb->is_present) {
        return;
    }

    g_clear_pointer(&lb->data, g_free);
    lb->is_present = false;
}

/* Initialize an empty, zero-filled buffer array with an element clear func */
static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
    bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
    g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
}

/* Drop the array; the clear func releases any still-present payloads */
static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    g_clear_pointer(&bufs->array, g_array_unref);
}

static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}

/* Current number of buffer slots (present or not) */
static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}

/* Grow (or shrink) the slot count; new slots are zeroed (is_present false) */
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}

/* Return the slot for @idx; caller must ensure idx < size */
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}

/*
 * Store a received packet's payload into the slot given by packet->idx,
 * growing the slot array as needed.  Duplicate indices are an error.
 *
 * called with load_bufs_mutex locked
 */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
                                          VFIODeviceStatePacket *packet,
                                          size_t packet_total_size,
                                          Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;

    vfio_state_buffers_assert_init(&multifd->load_bufs);
    if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
        vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
    if (lb->is_present) {
        error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
                   vbasedev->name, packet->idx);
        return false;
    }

    /* A buffer at an already-consumed index would indicate a sender bug */
    assert(packet->idx >= multifd->load_buf_idx);

    /* Copy only the payload that follows the packet header */
    lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
    lb->len = packet_total_size - sizeof(*packet);
    lb->is_present = true;

    return true;
}

/*
 * Entry point for incoming multifd device state data: validate the packet
 * header and queue its payload for the load thread, signalling the thread
 * that a new buffer may be ready.
 *
 * Returns true on success, false (with @errp set) on a malformed packet or
 * duplicate buffer.
 */
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}

/*
 * Load the final buffer in the stream, which carries the device config
 * state: replay it through an in-memory QEMUFile, check the expected
 * config-state marker, then feed it to vfio_load_device_config_state()
 * under BQL.
 *
 * Called from the load thread with load_bufs_mutex held.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    /* Only reached once all preceding buffers have been consumed */
    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    /* Write the raw buffer into the memory channel... */
    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* ...then rewind and read it back as an input migration stream */
    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    /* Config state loading needs BQL; lock order permits taking it here */
    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}

/*
 * Return the next buffer to consume (slot load_buf_idx) if it has already
 * arrived, or NULL if the load thread has to wait for it.
 */
static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
{
    VFIOStateBuffer *lb;
    unsigned int bufs_len;

    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
    if (multifd->load_buf_idx >= bufs_len) {
        /* The array only ever grows one-past the highest received index */
        assert(multifd->load_buf_idx == bufs_len);
        return NULL;
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs,
                               multifd->load_buf_idx);
    if (!lb->is_present) {
        return NULL;
    }

    return lb;
}

/*
 * Write one buffer's payload to the device's migration data_fd, taking
 * ownership of (and freeing) the payload.  The load_bufs_mutex is dropped
 * around each write() so that packet reception is not blocked.
 *
 * Called with load_bufs_mutex held; returns false with @errp set on a
 * write error.
 */
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        errno_save = errno; /* errno may be clobbered before it is reported */
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        /* Handle short writes by advancing through the buffer */
        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}

/* True if the load thread was asked to quit by either abort mechanism */
static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
                                            bool *should_quit)
{
    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
}

/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in main migration
 * stream.
 *
 * It exits after either:
 * * completing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            /* Next-in-order buffer not here yet; wait for its arrival */
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* The last buffer is config state, loaded outside this loop */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}

/* Allocate and initialize the per-device multifd load state */
static VFIOMultifd *vfio_multifd_new(void)
{
    VFIOMultifd *multifd = g_new(VFIOMultifd, 1);

    vfio_state_buffers_init(&multifd->load_bufs);

    qemu_mutex_init(&multifd->load_bufs_mutex);

    multifd->load_buf_idx = 0;
    /* "last" index is unknown until the config state packet arrives */
    multifd->load_buf_idx_last = UINT32_MAX;
    qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);

    multifd->load_bufs_thread_running = false;
    multifd->load_bufs_thread_want_exit = false;
    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);

    return multifd;
}

/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it had finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}

/* Stop the load thread (if running) and release all multifd load state */
static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    qemu_mutex_destroy is the last teardown step before freeing */
    g_free(multifd);
}

/* Tear down the device's multifd state, if any was allocated */
void vfio_multifd_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    g_clear_pointer(&migration->multifd, vfio_multifd_free);
}

/* True if both multifd device state and switchover-start are available */
bool vfio_multifd_transfer_supported(void)
{
    return multifd_device_state_supported() &&
        migrate_send_switchover_start();
}

/* Whether this device's migration uses multifd device state transfer */
bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->multifd_transfer;
}

/*
 * Latch the multifd-transfer setting for this migration run and, on the
 * load side (@alloc_multifd true), allocate the multifd state.
 *
 * Returns false with @errp set if multifd transfer was explicitly
 * requested but is unsupported in the current configuration.
 */
bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Make a copy of this setting at the start in case it is changed
     * mid-migration.
     */
    if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
        migration->multifd_transfer = vfio_multifd_transfer_supported();
    } else {
        migration->multifd_transfer =
            vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
    }

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing further to check or do */
        return true;
    }

    if (!vfio_multifd_transfer_supported()) {
        error_setg(errp,
                   "%s: Multifd device transfer requested but unsupported in the current config",
                   vbasedev->name);
        return false;
    }

    if (alloc_multifd) {
        assert(!migration->multifd);
        migration->multifd = vfio_multifd_new();
    }

    return true;
}

/* Keep the main migration stream well-formed when state goes via multifd */
void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}

/*
 * Serialize the device config state into a memory QEMUFile and queue it
 * as the final packet (flagged VFIO_DEVICE_STATE_CONFIG_STATE, index @idx)
 * on the multifd channels.
 */
static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;
    packet->idx = idx;
    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_mig_add_bytes_transferred(packet_len);

    return true;
}

/*
 * This thread is spawned by the migration core directly via
 * .save_live_complete_precopy_thread SaveVMHandler.
 *
 * It exits after either:
 * * completing saving the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by
 *   multifd_device_state_save_thread_should_exit() returning true.
 */
bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp)
{
    VFIODevice *vbasedev = d->handler_opaque;
    VFIOMigration *migration = vbasedev->migration;
    bool ret = false;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    uint32_t idx;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
        return true;
    }

    trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
                                                  d->idstr, d->instance_id);

    /* We reach here with device state STOP or STOP_COPY only */
    if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                 VFIO_DEVICE_STATE_STOP, errp)) {
        goto thread_exit;
    }

    /* One packet is reused for every chunk read from the device */
    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;

    for (idx = 0; ; idx++) {
        ssize_t data_size;
        size_t packet_size;

        if (multifd_device_state_save_thread_should_exit()) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        data_size = read(migration->data_fd, &packet->data,
                         migration->data_buffer_size);
        if (data_size < 0) {
            error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, idx, errno);
            goto thread_exit;
        } else if (data_size == 0) {
            /* EOF: all device state read; config state follows below */
            break;
        }

        packet->idx = idx;
        packet_size = sizeof(*packet) + data_size;

        if (!multifd_queue_device_state(d->idstr, d->instance_id,
                                        (char *)packet, packet_size)) {
            error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
            goto thread_exit;
        }

        vfio_mig_add_bytes_transferred(packet_size);
    }

    /* @idx is now one past the last data buffer: use it for config state */
    if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
                                                        d->idstr,
                                                        d->instance_id,
                                                        idx, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);

    return ret;
}

/*
 * Called at the switchover point on the destination: mark the load thread
 * as running and ask the migration core to spawn it.
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}