/*
 * Multifd VFIO migration
 *
 * Copyright (C) 2024,2025 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#include "qemu/osdep.h"
#include "hw/vfio/vfio-common.h"
#include "migration/misc.h"
#include "qapi/error.h"
#include "qemu/bswap.h"
#include "qemu/error-report.h"
#include "qemu/lockable.h"
#include "qemu/main-loop.h"
#include "qemu/thread.h"
#include "io/channel-buffer.h"
#include "migration/qemu-file.h"
#include "migration-multifd.h"
#include "trace.h"

/* Packet flag: this packet carries the device config state (last packet) */
#define VFIO_DEVICE_STATE_CONFIG_STATE (1)

#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)

/*
 * Wire format of one device state packet sent over a multifd channel.
 * All fields are big-endian on the wire; "data" is the raw device state
 * bytes (or the config state stream when FLAG_CONFIG_STATE is set).
 */
typedef struct VFIODeviceStatePacket {
    uint32_t version;
    uint32_t idx;   /* sequence number of this buffer within the stream */
    uint32_t flags; /* VFIO_DEVICE_STATE_CONFIG_STATE or 0 */
    uint8_t data[0];
} QEMU_PACKED VFIODeviceStatePacket;

/* type safety */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;

/*
 * One received-but-not-yet-loaded state buffer slot.
 * "is_present" distinguishes a filled slot from a hole left by
 * out-of-order packet arrival.
 */
typedef struct VFIOStateBuffer {
    bool is_present;
    char *data;
    size_t len;
} VFIOStateBuffer;

/*
 * Per-device state for multifd load: the buffer array filled by the
 * multifd receive path and drained by the load_bufs thread.
 */
typedef struct VFIOMultifd {
    bool load_bufs_thread_running;
    bool load_bufs_thread_want_exit;

    VFIOStateBuffers load_bufs;
    QemuCond load_bufs_buffer_ready_cond;
    QemuCond load_bufs_thread_finished_cond;
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;      /* next buffer index to load into the device */
    uint32_t load_buf_idx_last; /* index of the config state (final) buffer */
} VFIOMultifd;

/* GArray clear func: free a slot's data and mark it empty. */
static void vfio_state_buffer_clear(gpointer data)
{
    VFIOStateBuffer *lb = data;

    if (!lb->is_present) {
        return;
    }

    g_clear_pointer(&lb->data, g_free);
    lb->is_present = false;
}

static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
    bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
    g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
}

static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    /* unref runs vfio_state_buffer_clear() on every slot via the clear func */
    g_clear_pointer(&bufs->array, g_array_unref);
}

static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}

static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}

static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    /* g_array_new() above passed clear_=TRUE, so new slots come zeroed */
    g_array_set_size(bufs->array, size);
}

static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}

/*
 * Copy an incoming packet's payload into the buffer slot given by
 * packet->idx, growing the array if needed.
 *
 * Fails if that slot was already filled (duplicate index on the wire).
 *
 * called with load_bufs_mutex locked
 */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
                                          VFIODeviceStatePacket *packet,
                                          size_t packet_total_size,
                                          Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;

    vfio_state_buffers_assert_init(&multifd->load_bufs);
    if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
        vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
    if (lb->is_present) {
        error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
                   vbasedev->name, packet->idx);
        return false;
    }

    /* a buffer at or before load_buf_idx was already consumed and freed */
    assert(packet->idx >= multifd->load_buf_idx);

    lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
    lb->len = packet_total_size - sizeof(*packet);
    lb->is_present = true;

    return true;
}

/*
 * Entry point for a device state packet received on a multifd channel:
 * validate it, byte-swap its header fields in place, and queue its payload
 * for the load_bufs thread, signalling that thread a buffer is ready.
 *
 * NOTE(review): presumably wired up as the device-state load-buffer
 * callback for this device — confirm against migration-multifd.h.
 *
 * Returns true on success, false with *errp set on failure.
 */
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    packet->version = be32_to_cpu(packet->version);
    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    packet->idx = be32_to_cpu(packet->idx);
    packet->flags = be32_to_cpu(packet->flags);

    /* UINT32_MAX is the "no last buffer seen yet" sentinel for idx_last */
    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}

/*
 * Load the device config state from the final buffer: replay its bytes
 * through an in-memory QEMUFile, check the stream starts with
 * VFIO_MIG_FLAG_DEV_CONFIG_STATE, then hand it to
 * vfio_load_device_config_state() under BQL.
 *
 * Called by the load_bufs thread, which holds load_bufs_mutex; taking BQL
 * inside here follows the documented load_bufs_mutex -> BQL lock order.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    /* only called once all preceding device state buffers were loaded */
    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* rewind and re-read the buffered bytes as an input stream */
    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}

/*
 * Return the next buffer to load (slot load_buf_idx), or NULL if it has
 * not arrived yet (array too short or slot still empty).
 *
 * NOTE(review): appears to require load_bufs_mutex held by the caller
 * (the load_bufs thread) — confirm.
 */
static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
{
    VFIOStateBuffer *lb;
    unsigned int bufs_len;

    bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
    if (multifd->load_buf_idx >= bufs_len) {
        /* the array grows strictly as packets arrive, never past idx + 1 */
        assert(multifd->load_buf_idx == bufs_len);
        return NULL;
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs,
                               multifd->load_buf_idx);
    if (!lb->is_present) {
        return NULL;
    }

    return lb;
}

/*
 * Write one state buffer into the device via migration->data_fd,
 * dropping load_bufs_mutex around each write() so packet reception
 * is not blocked while the (slow) device load proceeds.
 *
 * Takes ownership of lb->data (steals and frees it via g_autofree).
 * Returns true on success, false with *errp set on write failure.
 */
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        errno_save = errno; /* save before any call can clobber errno */
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}

/*
 * True if the load thread should stop: either the cleanup path asked it to
 * exit or the migration core set *should_quit.
 */
static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
                                            bool *should_quit)
{
    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
}

/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in main migration
 * stream.
 *
 * It exits after either:
 * * completing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            /* next buffer hasn't arrived yet — wait for the receive path */
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* the last buffer is the config state, loaded after this loop */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}

/* Allocate and initialize the per-device multifd load state. */
static VFIOMultifd *vfio_multifd_new(void)
{
    VFIOMultifd *multifd = g_new(VFIOMultifd, 1);

    vfio_state_buffers_init(&multifd->load_bufs);

    qemu_mutex_init(&multifd->load_bufs_mutex);

    multifd->load_buf_idx = 0;
    /* UINT32_MAX = config state (last) packet not yet received */
    multifd->load_buf_idx_last = UINT32_MAX;
    qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);

    multifd->load_bufs_thread_running = false;
    multifd->load_bufs_thread_want_exit = false;
    qemu_cond_init(&multifd->load_bufs_thread_finished_cond);

    return multifd;
}

/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it had finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}

/* Stop the load thread (if running) and free all multifd load state. */
static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}

/* Tear down this device's multifd state; safe to call when never set up. */
void vfio_multifd_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    g_clear_pointer(&migration->multifd, vfio_multifd_free);
}

/*
 * True if the current migration configuration supports multifd device
 * state transfer (multifd device state + switchover-start notification).
 */
bool vfio_multifd_transfer_supported(void)
{
    return multifd_device_state_supported() &&
        migrate_send_switchover_start();
}

/* True if this device decided (in vfio_multifd_setup()) to use multifd. */
bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->multifd_transfer;
}

/*
 * Resolve the device's multifd-transfer property (AUTO => supported?) into
 * migration->multifd_transfer and, when alloc_multifd is set, allocate the
 * VFIOMultifd state.
 *
 * NOTE(review): alloc_multifd is presumably true only on the load side —
 * confirm against the callers.
 *
 * Returns true on success, false with *errp set when multifd was explicitly
 * requested but is unsupported in the current config.
 */
bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;

    /*
     * Make a copy of this setting at the start in case it is changed
     * mid-migration.
     */
    if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
        migration->multifd_transfer = vfio_multifd_transfer_supported();
    } else {
        migration->multifd_transfer =
            vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
    }

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing further to check or do */
        return true;
    }

    if (!vfio_multifd_transfer_supported()) {
        error_setg(errp,
                   "%s: Multifd device transfer requested but unsupported in the current config",
                   vbasedev->name);
        return false;
    }

    if (alloc_multifd) {
        assert(!migration->multifd);
        migration->multifd = vfio_multifd_new();
    }

    return true;
}

void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}

/*
 * Serialize the device config state into an in-memory QEMUFile, wrap it in
 * a packet flagged VFIO_DEVICE_STATE_CONFIG_STATE with sequence number
 * "idx", and queue it on the multifd channels as the stream's final packet.
 */
static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
    packet->idx = cpu_to_be32(idx);
    packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE);
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_mig_add_bytes_transferred(packet_len);

    return true;
}

/*
 * This thread is spawned by the migration core directly via
 * .save_live_complete_precopy_thread SaveVMHandler.
 *
 * It exits after either:
 * * completing saving the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by
 *   multifd_device_state_save_thread_should_exit() returning true.
 */
bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp)
{
    VFIODevice *vbasedev = d->handler_opaque;
    VFIOMigration *migration = vbasedev->migration;
    bool ret = false;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    uint32_t idx;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
        return true;
    }

    trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
                                                  d->idstr, d->instance_id);

    /* We reach here with device state STOP or STOP_COPY only */
    if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                 VFIO_DEVICE_STATE_STOP, errp)) {
        goto thread_exit;
    }

    /* one reusable packet buffer; only the payload and idx change per chunk */
    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
    packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);

    for (idx = 0; ; idx++) {
        ssize_t data_size;
        size_t packet_size;

        if (multifd_device_state_save_thread_should_exit()) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        data_size = read(migration->data_fd, &packet->data,
                         migration->data_buffer_size);
        if (data_size < 0) {
            error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, idx, errno);
            goto thread_exit;
        } else if (data_size == 0) {
            /* EOF: device state fully read; send config state next */
            break;
        }

        packet->idx = cpu_to_be32(idx);
        packet_size = sizeof(*packet) + data_size;

        if (!multifd_queue_device_state(d->idstr, d->instance_id,
                                        (char *)packet, packet_size)) {
            error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
            goto thread_exit;
        }

        vfio_mig_add_bytes_transferred(packet_size);
    }

    if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
                                                        d->idstr,
                                                        d->instance_id,
                                                        idx, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);

    return ret;
}

/*
 * Called at the switchover point in the main migration stream:
 * mark the load thread as running (under load_bufs_mutex, respecting the
 * load_bufs_mutex -> BQL lock order) and spawn vfio_load_bufs_thread.
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}