1 /* 2 * Multifd VFIO migration 3 * 4 * Copyright (C) 2024,2025 Oracle and/or its affiliates. 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 * SPDX-License-Identifier: GPL-2.0-or-later 10 */ 11 12 #include "qemu/osdep.h" 13 #include "hw/vfio/vfio-common.h" 14 #include "migration/misc.h" 15 #include "qapi/error.h" 16 #include "qemu/error-report.h" 17 #include "qemu/lockable.h" 18 #include "qemu/main-loop.h" 19 #include "qemu/thread.h" 20 #include "io/channel-buffer.h" 21 #include "migration/qemu-file.h" 22 #include "migration-multifd.h" 23 #include "trace.h" 24 25 #define VFIO_DEVICE_STATE_CONFIG_STATE (1) 26 27 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) 28 29 typedef struct VFIODeviceStatePacket { 30 uint32_t version; 31 uint32_t idx; 32 uint32_t flags; 33 uint8_t data[0]; 34 } QEMU_PACKED VFIODeviceStatePacket; 35 36 /* type safety */ 37 typedef struct VFIOStateBuffers { 38 GArray *array; 39 } VFIOStateBuffers; 40 41 typedef struct VFIOStateBuffer { 42 bool is_present; 43 char *data; 44 size_t len; 45 } VFIOStateBuffer; 46 47 typedef struct VFIOMultifd { 48 bool load_bufs_thread_running; 49 bool load_bufs_thread_want_exit; 50 51 VFIOStateBuffers load_bufs; 52 QemuCond load_bufs_buffer_ready_cond; 53 QemuCond load_bufs_thread_finished_cond; 54 QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ 55 uint32_t load_buf_idx; 56 uint32_t load_buf_idx_last; 57 } VFIOMultifd; 58 59 static void vfio_state_buffer_clear(gpointer data) 60 { 61 VFIOStateBuffer *lb = data; 62 63 if (!lb->is_present) { 64 return; 65 } 66 67 g_clear_pointer(&lb->data, g_free); 68 lb->is_present = false; 69 } 70 71 static void vfio_state_buffers_init(VFIOStateBuffers *bufs) 72 { 73 bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); 74 g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); 75 } 76 77 static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) 78 { 79 g_clear_pointer(&bufs->array, g_array_unref); 80 } 81 82 static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) 83 { 84 assert(bufs->array); 85 } 86 87 static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) 88 { 89 return bufs->array->len; 90 } 91 92 static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, 93 unsigned int size) 94 { 95 g_array_set_size(bufs->array, size); 96 } 97 98 static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, 99 unsigned int idx) 100 { 101 return &g_array_index(bufs->array, VFIOStateBuffer, idx); 102 } 103 104 /* called with load_bufs_mutex locked */ 105 static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, 106 VFIODeviceStatePacket *packet, 107 size_t packet_total_size, 108 Error **errp) 109 { 110 VFIOMigration *migration = vbasedev->migration; 111 VFIOMultifd *multifd = migration->multifd; 112 VFIOStateBuffer *lb; 113 114 vfio_state_buffers_assert_init(&multifd->load_bufs); 115 if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { 116 vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); 117 } 118 119 lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); 120 if (lb->is_present) { 121 error_setg(errp, "%s: state buffer %" PRIu32 " already filled", 122 vbasedev->name, packet->idx); 123 return false; 124 } 125 126 assert(packet->idx >= multifd->load_buf_idx); 127 128 lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); 129 lb->len = packet_total_size - sizeof(*packet); 130 lb->is_present = true; 131 132 return true; 133 } 134 135 bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, 136 Error **errp) 137 { 138 VFIODevice *vbasedev = opaque; 139 VFIOMigration *migration = vbasedev->migration; 140 VFIOMultifd *multifd = migration->multifd; 141 VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; 142 143 if (!vfio_multifd_transfer_enabled(vbasedev)) { 144 error_setg(errp, 145 "%s: got device state packet but not doing multifd transfer", 146 vbasedev->name); 147 return false; 148 } 149 150 assert(multifd); 151 152 if (data_size < sizeof(*packet)) { 153 error_setg(errp, "%s: packet too short at %zu (min is %zu)", 154 vbasedev->name, data_size, sizeof(*packet)); 155 return false; 156 } 157 158 if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { 159 error_setg(errp, "%s: packet has unknown version %" PRIu32, 160 vbasedev->name, packet->version); 161 return false; 162 } 163 164 if (packet->idx == UINT32_MAX) { 165 error_setg(errp, "%s: packet index is invalid", vbasedev->name); 166 return false; 167 } 168 169 trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); 170 171 /* 172 * Holding BQL here would violate the lock order and can cause 173 * a deadlock once we attempt to lock load_bufs_mutex below. 174 */ 175 assert(!bql_locked()); 176 177 WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { 178 /* config state packet should be the last one in the stream */ 179 if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { 180 multifd->load_buf_idx_last = packet->idx; 181 } 182 183 if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, 184 errp)) { 185 return false; 186 } 187 188 qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); 189 } 190 191 return true; 192 } 193 194 static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, 195 Error **errp) 196 { 197 VFIOMigration *migration = vbasedev->migration; 198 VFIOMultifd *multifd = migration->multifd; 199 VFIOStateBuffer *lb; 200 g_autoptr(QIOChannelBuffer) bioc = NULL; 201 g_autoptr(QEMUFile) f_out = NULL, f_in = NULL; 202 uint64_t mig_header; 203 int ret; 204 205 assert(multifd->load_buf_idx == multifd->load_buf_idx_last); 206 lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx); 207 assert(lb->is_present); 208 209 bioc = qio_channel_buffer_new(lb->len); 210 qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load"); 211 212 f_out = qemu_file_new_output(QIO_CHANNEL(bioc)); 213 qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len); 214 215 ret = qemu_fflush(f_out); 216 if (ret) { 217 error_setg(errp, "%s: load config state flush failed: %d", 218 vbasedev->name, ret); 219 return false; 220 } 221 222 qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL); 223 f_in = qemu_file_new_input(QIO_CHANNEL(bioc)); 224 225 mig_header = qemu_get_be64(f_in); 226 if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) { 227 error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64, 228 vbasedev->name, mig_header); 229 return false; 230 } 231 232 bql_lock(); 233 ret = vfio_load_device_config_state(f_in, vbasedev); 234 bql_unlock(); 235 236 if (ret < 0) { 237 error_setg(errp, "%s: vfio_load_device_config_state() failed: %d", 238 vbasedev->name, ret); 239 return false; 240 } 241 242 return true; 243 } 244 245 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) 246 { 247 VFIOStateBuffer *lb; 248 unsigned int bufs_len; 249 250 bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); 251 if (multifd->load_buf_idx >= bufs_len) { 252 assert(multifd->load_buf_idx == bufs_len); 253 return NULL; 254 } 255 256 lb = vfio_state_buffers_at(&multifd->load_bufs, 257 multifd->load_buf_idx); 258 if (!lb->is_present) { 259 return NULL; 260 } 261 262 return lb; 263 } 264 265 static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, 266 VFIOStateBuffer *lb, 267 Error **errp) 268 { 269 VFIOMigration *migration = vbasedev->migration; 270 VFIOMultifd *multifd = migration->multifd; 271 g_autofree char *buf = NULL; 272 char *buf_cur; 273 size_t buf_len; 274 275 if (!lb->len) { 276 return true; 277 } 278 279 trace_vfio_load_state_device_buffer_load_start(vbasedev->name, 280 multifd->load_buf_idx); 281 282 /* lb might become re-allocated when we drop the lock */ 283 buf = g_steal_pointer(&lb->data); 284 buf_cur = buf; 285 buf_len = lb->len; 286 while (buf_len > 0) { 287 ssize_t wr_ret; 288 int errno_save; 289 290 /* 291 * Loading data to the device takes a while, 292 * drop the lock during this process. 293 */ 294 qemu_mutex_unlock(&multifd->load_bufs_mutex); 295 wr_ret = write(migration->data_fd, buf_cur, buf_len); 296 errno_save = errno; 297 qemu_mutex_lock(&multifd->load_bufs_mutex); 298 299 if (wr_ret < 0) { 300 error_setg(errp, 301 "%s: writing state buffer %" PRIu32 " failed: %d", 302 vbasedev->name, multifd->load_buf_idx, errno_save); 303 return false; 304 } 305 306 assert(wr_ret <= buf_len); 307 buf_len -= wr_ret; 308 buf_cur += wr_ret; 309 } 310 311 trace_vfio_load_state_device_buffer_load_end(vbasedev->name, 312 multifd->load_buf_idx); 313 314 return true; 315 } 316 317 static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, 318 bool *should_quit) 319 { 320 return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); 321 } 322 323 /* 324 * This thread is spawned by vfio_multifd_switchover_start() which gets 325 * called upon encountering the switchover point marker in main migration 326 * stream. 327 * 328 * It exits after either: 329 * * completing loading the remaining device state and device config, OR: 330 * * encountering some error while doing the above, OR: 331 * * being forcefully aborted by the migration core by it setting should_quit 332 * or by vfio_load_cleanup_load_bufs_thread() setting 333 * multifd->load_bufs_thread_want_exit. 334 */ 335 static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) 336 { 337 VFIODevice *vbasedev = opaque; 338 VFIOMigration *migration = vbasedev->migration; 339 VFIOMultifd *multifd = migration->multifd; 340 bool ret = false; 341 342 trace_vfio_load_bufs_thread_start(vbasedev->name); 343 344 assert(multifd); 345 QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); 346 347 assert(multifd->load_bufs_thread_running); 348 349 while (true) { 350 VFIOStateBuffer *lb; 351 352 /* 353 * Always check cancellation first after the buffer_ready wait below in 354 * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). 355 */ 356 if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { 357 error_setg(errp, "operation cancelled"); 358 goto thread_exit; 359 } 360 361 assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); 362 363 lb = vfio_load_state_buffer_get(multifd); 364 if (!lb) { 365 trace_vfio_load_state_device_buffer_starved(vbasedev->name, 366 multifd->load_buf_idx); 367 qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, 368 &multifd->load_bufs_mutex); 369 continue; 370 } 371 372 if (multifd->load_buf_idx == multifd->load_buf_idx_last) { 373 break; 374 } 375 376 if (multifd->load_buf_idx == 0) { 377 trace_vfio_load_state_device_buffer_start(vbasedev->name); 378 } 379 380 if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { 381 goto thread_exit; 382 } 383 384 if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { 385 trace_vfio_load_state_device_buffer_end(vbasedev->name); 386 } 387 388 multifd->load_buf_idx++; 389 } 390 391 if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { 392 goto thread_exit; 393 } 394 395 ret = true; 396 397 thread_exit: 398 /* 399 * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that 400 * this thread is exiting. 401 */ 402 multifd->load_bufs_thread_running = false; 403 qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); 404 405 trace_vfio_load_bufs_thread_end(vbasedev->name); 406 407 return ret; 408 } 409 410 static VFIOMultifd *vfio_multifd_new(void) 411 { 412 VFIOMultifd *multifd = g_new(VFIOMultifd, 1); 413 414 vfio_state_buffers_init(&multifd->load_bufs); 415 416 qemu_mutex_init(&multifd->load_bufs_mutex); 417 418 multifd->load_buf_idx = 0; 419 multifd->load_buf_idx_last = UINT32_MAX; 420 qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); 421 422 multifd->load_bufs_thread_running = false; 423 multifd->load_bufs_thread_want_exit = false; 424 qemu_cond_init(&multifd->load_bufs_thread_finished_cond); 425 426 return multifd; 427 } 428 429 /* 430 * Terminates vfio_load_bufs_thread by setting 431 * multifd->load_bufs_thread_want_exit and signalling all the conditions 432 * the thread could be blocked on. 433 * 434 * Waits for the thread to signal that it had finished. 435 */ 436 static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) 437 { 438 /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ 439 bql_unlock(); 440 WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { 441 while (multifd->load_bufs_thread_running) { 442 multifd->load_bufs_thread_want_exit = true; 443 444 qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); 445 qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, 446 &multifd->load_bufs_mutex); 447 } 448 } 449 bql_lock(); 450 } 451 452 static void vfio_multifd_free(VFIOMultifd *multifd) 453 { 454 vfio_load_cleanup_load_bufs_thread(multifd); 455 456 qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); 457 vfio_state_buffers_destroy(&multifd->load_bufs); 458 qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); 459 qemu_mutex_destroy(&multifd->load_bufs_mutex); 460 461 g_free(multifd); 462 } 463 464 void vfio_multifd_cleanup(VFIODevice *vbasedev) 465 { 466 VFIOMigration *migration = vbasedev->migration; 467 468 g_clear_pointer(&migration->multifd, vfio_multifd_free); 469 } 470 471 bool vfio_multifd_transfer_supported(void) 472 { 473 return multifd_device_state_supported() && 474 migrate_send_switchover_start(); 475 } 476 477 bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) 478 { 479 VFIOMigration *migration = vbasedev->migration; 480 481 return migration->multifd_transfer; 482 } 483 484 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) 485 { 486 VFIOMigration *migration = vbasedev->migration; 487 488 if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) { 489 migration->multifd_transfer = vfio_multifd_transfer_supported(); 490 } else { 491 migration->multifd_transfer = 492 vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON; 493 } 494 495 if (!vfio_multifd_transfer_enabled(vbasedev)) { 496 /* Nothing further to check or do */ 497 return true; 498 } 499 500 if (!vfio_multifd_transfer_supported()) { 501 error_setg(errp, 502 "%s: Multifd device transfer requested but unsupported in the current config", 503 vbasedev->name); 504 return false; 505 } 506 507 if (alloc_multifd) { 508 assert(!migration->multifd); 509 migration->multifd = vfio_multifd_new(); 510 } 511 512 return true; 513 } 514 515 void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f) 516 { 517 assert(vfio_multifd_transfer_enabled(vbasedev)); 518 519 /* 520 * Emit dummy NOP data on the main migration channel since the actual 521 * device state transfer is done via multifd channels. 522 */ 523 qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); 524 } 525 526 static bool 527 vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev, 528 char *idstr, 529 uint32_t instance_id, 530 uint32_t idx, 531 Error **errp) 532 { 533 g_autoptr(QIOChannelBuffer) bioc = NULL; 534 g_autoptr(QEMUFile) f = NULL; 535 int ret; 536 g_autofree VFIODeviceStatePacket *packet = NULL; 537 size_t packet_len; 538 539 bioc = qio_channel_buffer_new(0); 540 qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save"); 541 542 f = qemu_file_new_output(QIO_CHANNEL(bioc)); 543 544 if (vfio_save_device_config_state(f, vbasedev, errp)) { 545 return false; 546 } 547 548 ret = qemu_fflush(f); 549 if (ret) { 550 error_setg(errp, "%s: save config state flush failed: %d", 551 vbasedev->name, ret); 552 return false; 553 } 554 555 packet_len = sizeof(*packet) + bioc->usage; 556 packet = g_malloc0(packet_len); 557 packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; 558 packet->idx = idx; 559 packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE; 560 memcpy(&packet->data, bioc->data, bioc->usage); 561 562 if (!multifd_queue_device_state(idstr, instance_id, 563 (char *)packet, packet_len)) { 564 error_setg(errp, "%s: multifd config data queuing failed", 565 vbasedev->name); 566 return false; 567 } 568 569 vfio_mig_add_bytes_transferred(packet_len); 570 571 return true; 572 } 573 574 /* 575 * This thread is spawned by the migration core directly via 576 * .save_live_complete_precopy_thread SaveVMHandler. 577 * 578 * It exits after either: 579 * * completing saving the remaining device state and device config, OR: 580 * * encountering some error while doing the above, OR: 581 * * being forcefully aborted by the migration core by 582 * multifd_device_state_save_thread_should_exit() returning true. 583 */ 584 bool 585 vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d, 586 Error **errp) 587 { 588 VFIODevice *vbasedev = d->handler_opaque; 589 VFIOMigration *migration = vbasedev->migration; 590 bool ret = false; 591 g_autofree VFIODeviceStatePacket *packet = NULL; 592 uint32_t idx; 593 594 if (!vfio_multifd_transfer_enabled(vbasedev)) { 595 /* Nothing to do, vfio_save_complete_precopy() does the transfer. */ 596 return true; 597 } 598 599 trace_vfio_save_complete_precopy_thread_start(vbasedev->name, 600 d->idstr, d->instance_id); 601 602 /* We reach here with device state STOP or STOP_COPY only */ 603 if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY, 604 VFIO_DEVICE_STATE_STOP, errp)) { 605 goto thread_exit; 606 } 607 608 packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size); 609 packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT; 610 611 for (idx = 0; ; idx++) { 612 ssize_t data_size; 613 size_t packet_size; 614 615 if (multifd_device_state_save_thread_should_exit()) { 616 error_setg(errp, "operation cancelled"); 617 goto thread_exit; 618 } 619 620 data_size = read(migration->data_fd, &packet->data, 621 migration->data_buffer_size); 622 if (data_size < 0) { 623 error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d", 624 vbasedev->name, idx, errno); 625 goto thread_exit; 626 } else if (data_size == 0) { 627 break; 628 } 629 630 packet->idx = idx; 631 packet_size = sizeof(*packet) + data_size; 632 633 if (!multifd_queue_device_state(d->idstr, d->instance_id, 634 (char *)packet, packet_size)) { 635 error_setg(errp, "%s: multifd data queuing failed", vbasedev->name); 636 goto thread_exit; 637 } 638 639 vfio_mig_add_bytes_transferred(packet_size); 640 } 641 642 if (!vfio_save_complete_precopy_thread_config_state(vbasedev, 643 d->idstr, 644 d->instance_id, 645 idx, errp)) { 646 goto thread_exit; 647 } 648 649 ret = true; 650 651 thread_exit: 652 trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret); 653 654 return ret; 655 } 656 657 int vfio_multifd_switchover_start(VFIODevice *vbasedev) 658 { 659 VFIOMigration *migration = vbasedev->migration; 660 VFIOMultifd *multifd = migration->multifd; 661 662 assert(multifd); 663 664 /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ 665 bql_unlock(); 666 WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { 667 assert(!multifd->load_bufs_thread_running); 668 multifd->load_bufs_thread_running = true; 669 } 670 bql_lock(); 671 672 qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); 673 674 return 0; 675 } 676