/*
 * Migration support for VFIO devices
 *
 * Copyright NVIDIA, Inc. 2020
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "system/runstate.h"
#include "hw/vfio/vfio-common.h"
#include "migration/misc.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
#include "migration/qemu-file.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "migration-multifd.h"
#include "qapi/error.h"
#include "qapi/qapi-events-vfio.h"
#include "exec/ramlist.h"
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"

/*
 * This is an arbitrary size based on migration of mlx5 devices, where typically
 * total device migration size is on the order of 100s of MB. Testing with
 * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

static unsigned long bytes_transferred;

static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return "RUNNING_P2P";
    case VFIO_DEVICE_STATE_PRE_COPY:
        return "PRE_COPY";
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return "PRE_COPY_P2P";
    default:
        return "UNKNOWN STATE";
    }
}

static QapiVfioMigrationState
mig_state_to_qapi_state(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_STOP:
        return QAPI_VFIO_MIGRATION_STATE_STOP;
    case VFIO_DEVICE_STATE_RUNNING:
        return QAPI_VFIO_MIGRATION_STATE_RUNNING;
    case VFIO_DEVICE_STATE_STOP_COPY:
        return QAPI_VFIO_MIGRATION_STATE_STOP_COPY;
    case VFIO_DEVICE_STATE_RESUMING:
        return QAPI_VFIO_MIGRATION_STATE_RESUMING;
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return QAPI_VFIO_MIGRATION_STATE_RUNNING_P2P;
    case VFIO_DEVICE_STATE_PRE_COPY:
        return QAPI_VFIO_MIGRATION_STATE_PRE_COPY;
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return QAPI_VFIO_MIGRATION_STATE_PRE_COPY_P2P;
    default:
        g_assert_not_reached();
    }
}

static void vfio_migration_send_event(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    DeviceState *dev = vbasedev->dev;
    g_autofree char *qom_path = NULL;
    Object *obj;

    if (!vbasedev->migration_events) {
        return;
    }

    g_assert(vbasedev->ops->vfio_get_object);
    obj = vbasedev->ops->vfio_get_object(vbasedev);
    g_assert(obj);
    qom_path = object_get_canonical_path(obj);

    qapi_event_send_vfio_migration(
        dev->id, qom_path, mig_state_to_qapi_state(migration->device_state));
}

static void vfio_migration_set_device_state(VFIODevice *vbasedev,
                                            enum vfio_device_mig_state state)
{
    VFIOMigration *migration = vbasedev->migration;

    trace_vfio_migration_set_device_state(vbasedev->name,
                                          mig_state_to_str(state));

    migration->device_state = state;
    vfio_migration_send_event(vbasedev);
}

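/*
 * Several helpers below issue the VFIO_DEVICE_FEATURE ioctl. The kernel uAPI
 * takes a struct vfio_device_feature header immediately followed by a
 * feature-specific payload in feature->data[], so the helpers carve both out
 * of a single uint64_t array to keep the payload naturally aligned:
 *
 *     uint64_t buf[DIV_ROUND_UP(sizeof(header) + sizeof(payload),
 *                               sizeof(uint64_t))];
 *
 * For state changes the payload is struct vfio_device_feature_mig_state,
 * which carries the requested device_state and, for states that produce or
 * consume migration data, a data_fd returned by the kernel.
 */
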
int vfio_migration_set_state(VFIODevice *vbasedev,
                             enum vfio_device_mig_state new_state,
                             enum vfio_device_mig_state recover_state,
                             Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;
    g_autofree char *error_prefix =
        g_strdup_printf("%s: Failed setting device state to %s.",
                        vbasedev->name, mig_state_to_str(new_state));

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state),
                                   mig_state_to_str(recover_state));

    if (new_state == migration->device_state) {
        return 0;
    }

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_setg_errno(errp, errno,
                             "%s Recover state is ERROR. Resetting device",
                             error_prefix);

            goto reset_device;
        }

        error_setg_errno(errp, errno,
                         "%s Setting device in recover state %s",
                         error_prefix, mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            /*
             * If setting the device in recover state fails, report
             * the error here and propagate the first error.
             */
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                vbasedev->name, strerror(errno));

            goto reset_device;
        }

        vfio_migration_set_device_state(vbasedev, recover_state);

        return ret;
    }

    vfio_migration_set_device_state(vbasedev, new_state);
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_setg(errp, "%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    vfio_migration_set_device_state(vbasedev, VFIO_DEVICE_STATE_RUNNING);

    return ret;
}

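/*
 * vfio_migration_set_state() callers typically pass the state the device is
 * currently in as recover_state, so a failed transition leaves the device
 * where it was. For example, vfio_save_setup() below transitions
 * RUNNING -> PRE_COPY with RUNNING as the recover state, while
 * vfio_load_setup() recovers to whatever migration->device_state holds.
 */
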
/*
 * Some device state transitions require resetting the device if they fail.
 * This function sets the device in new_state and resets the device if that
 * fails. Reset is done by using ERROR as the recover state.
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state,
                                  Error **errp)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR, errp);
}

static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        ret = vbasedev->ops->vfio_save_config(vbasedev, f, errp);
        if (ret) {
            return ret;
        }
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to save state");
    }
    return ret;
}

int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    trace_vfio_load_device_config_state_start(vbasedev->name);

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state_end(vbasedev->name);
    return qemu_file_get_error(f);
}

static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}

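/*
 * On the wire, a block of device data produced by vfio_save_block() looks
 * like:
 *
 *     VFIO_MIG_FLAG_DEV_DATA_STATE  (be64)
 *     data_size                     (be64)
 *     data_size bytes read from the device's data_fd
 *
 * vfio_load_state() consumes the same framing on the destination and feeds
 * the payload back into the device through vfio_load_buffer().
 */
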
/* Returns the size of saved data on success and -errno on error */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more information,
         * please refer to the Linux kernel VFIO uAPI.
         */
        if (errno == ENOMSG) {
            if (!migration->event_precopy_empty_hit) {
                trace_vfio_save_block_precopy_empty_hit(migration->vbasedev->name);
                migration->event_precopy_empty_hit = true;
            }
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    /* Non-empty read: re-arm the trace event */
    migration->event_precopy_empty_hit = false;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    vfio_mig_add_bytes_transferred(data_size);

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    return qemu_file_get_error(f) ?: data_size;
}

static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    if (!data_size) {
        /*
         * Pre-copy emptied all the device state for now, update estimated sizes
         * accordingly.
         */
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t init_size = MIN(migration->precopy_init_size, data_size);

        migration->precopy_init_size -= init_size;
        data_size -= init_size;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         data_size);
}

static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}

/* ---------------------------------------------------------------------- */

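/*
 * The functions below implement the SaveVMHandlers callbacks registered in
 * savevm_vfio_handlers at the bottom of this file. They are invoked by the
 * generic migration code on the source (save_*) and destination (load_*)
 * sides respectively.
 */
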
static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshot doesn't use postcopy or background snapshot, so allow snapshot
     * even if they are on.
     */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}

static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
    int ret;

    if (!vfio_multifd_setup(vbasedev, false, errp)) {
        return -EINVAL;
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_setg(errp, "%s: Failed to allocate migration data buffer",
                   vbasedev->name);
        return -ENOMEM;
    }

    migration->event_save_iterate_started = false;
    migration->event_precopy_empty_hit = false;

    if (vfio_precopy_supported(vbasedev)) {
        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING, errp);
            if (ret) {
                return ret;
            }

            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            error_setg(errp, "%s: Invalid device state %d", vbasedev->name,
                       migration->device_state);
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "%s: save setup failed", vbasedev->name);
    }

    return ret;
}

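/*
 * Note that vfio_save_setup() frames its section as DEV_SETUP_STATE
 * immediately followed by END_OF_STATE, with no payload in between;
 * vfio_load_state() checks for exactly that pairing.
 */
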
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    Error *local_err = NULL;
    int ret;

    /* Currently a NOP, done for symmetry with load_cleanup() */
    vfio_multifd_cleanup(vbasedev);

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_STOP,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}

static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}

/*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        vfio_query_precopy_size(migration);
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}

static bool vfio_is_active_iterate(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_device_state_is_precopy(vbasedev);
}

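/*
 * With the switchover-ack capability, the destination acknowledges the
 * switchover only after the device's initial pre-copy data has arrived.
 * vfio_save_iterate() signals that point by sending
 * VFIO_MIG_FLAG_DEV_INIT_DATA_SENT once precopy_init_size drops to zero;
 * vfio_load_state() then calls qemu_loadvm_approve_switchover().
 */
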
/*
 * Note about migration rate limiting: VFIO migration buffer size is currently
 * limited to 1MB, so there is no need to check if migration rate exceeded (as
 * in the worst case it will exceed by 1MB). However, if the buffer size is
 * later changed to a bigger value, migration rate should be enforced here.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    if (!migration->event_save_iterate_started) {
        trace_vfio_save_iterate_start(vbasedev->name);
        migration->event_save_iterate_started = true;
    }

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}

static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;
    Error *local_err = NULL;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        vfio_multifd_emit_dummy_eos(vbasedev, f);
        return 0;
    }

    trace_vfio_save_complete_precopy_start(vbasedev->name);

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP, &local_err);
    if (ret) {
        error_report_err(local_err);
        return ret;
    }

    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    Error *local_err = NULL;
    int ret;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        vfio_multifd_emit_dummy_eos(vbasedev, f);
        return;
    }

    ret = vfio_save_device_config_state(f, opaque, &local_err);
    if (ret) {
        error_prepend(&local_err,
                      "vfio: Failed to save device config space of %s - ",
                      vbasedev->name);
        qemu_file_set_error_obj(f, ret, local_err);
    }
}

static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    if (!vfio_multifd_setup(vbasedev, true, errp)) {
        return -EINVAL;
    }

    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                   migration->device_state, errp);
    if (ret) {
        return ret;
    }

    return 0;
}

static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_multifd_cleanup(vbasedev);

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}

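/*
 * vfio_load_state() below parses the stream produced by the save side: a
 * sequence of be64 flag words, each optionally followed by a payload, until
 * VFIO_MIG_FLAG_END_OF_STATE terminates the section.
 */
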
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            if (vfio_multifd_transfer_enabled(vbasedev)) {
                error_report("%s: got DEV_CONFIG_STATE in main migration "
                             "channel but doing multifd transfer",
                             vbasedev->name);
                return -EINVAL;
            }

            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

static bool vfio_switchover_ack_needed(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_precopy_supported(vbasedev);
}

static int vfio_switchover_start(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        return vfio_multifd_switchover_start(vbasedev);
    }

    return 0;
}

static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
    /*
     * Multifd support
     */
    .load_state_buffer = vfio_multifd_load_state_buffer,
    .switchover_start = vfio_switchover_start,
    .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread,
};

/* ---------------------------------------------------------------------- */

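/*
 * For devices advertising VFIO_MIGRATION_P2P, the "prepare" callback below
 * runs before the main VM state change handlers, moving every device into a
 * *_P2P state first. Per the kernel uAPI, the P2P states quiesce the device's
 * outgoing peer-to-peer DMA, so all devices stop issuing P2P traffic before
 * any of them is fully stopped.
 */
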
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    Error *local_err = NULL;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}

static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    Error *local_err = NULL;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state =
            (vfio_device_state_is_precopy(vbasedev) &&
             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
                VFIO_DEVICE_STATE_STOP_COPY :
                VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;
    Error *local_err = NULL;
    int ret;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        /*
         * MigrationNotifyFunc may not return an error code and an Error
         * object for MIG_EVENT_PRECOPY_FAILED. Hence, report the error
         * locally and ignore the errp argument.
         */
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_RUNNING,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }
    return 0;
}

static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}

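/*
 * vfio_migration_init() wires a device into the migration framework: it
 * queries the kernel's migration capabilities (STOP_COPY is mandatory),
 * probes for DMA dirty-page logging, registers the SaveVMHandlers above
 * under a per-device "vfio" id, and installs the VM state change handler
 * and migration state notifier.
 */
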
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}

static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}

static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
}

/* ---------------------------------------------------------------------- */

int64_t vfio_mig_bytes_transferred(void)
{
    return MIN(qatomic_read(&bytes_transferred), INT64_MAX);
}

void vfio_reset_bytes_transferred(void)
{
    qatomic_set(&bytes_transferred, 0);
}

void vfio_mig_add_bytes_transferred(unsigned long val)
{
    qatomic_add(&bytes_transferred, val);
}

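/*
 * bytes_transferred accumulates device data across all VFIO devices and is
 * reset by vfio_reset_bytes_transferred() when a new migration starts.
 * Updates go through qatomic_*() since vfio_mig_add_bytes_transferred() may
 * be called from multifd worker threads as well as the migration thread.
 */
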
/*
 * Return true when either migration initialized or blocker registered.
 * Currently this only returns false when adding the blocker fails, which
 * will de-register the VFIO device.
 */
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    Error *err = NULL;
    int ret;

    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        error_setg(&err, "%s: Migration is disabled for VFIO device",
                   vbasedev->name);
        return !vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        if (ret == -ENOTTY) {
            error_setg(&err, "%s: VFIO migration is not supported in kernel",
                       vbasedev->name);
        } else {
            error_setg(&err,
                       "%s: Migration couldn't be initialized for VFIO device, "
                       "err: %d (%s)",
                       vbasedev->name, ret, strerror(-ret));
        }

        return !vfio_block_migration(vbasedev, err, errp);
    }

    if ((!vbasedev->dirty_pages_supported ||
         vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
        !vbasedev->iommu_dirty_tracking) {
        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
            error_setg(&err,
                       "%s: VFIO device doesn't support device and "
                       "IOMMU dirty tracking", vbasedev->name);
            goto add_blocker;
        }

        warn_report("%s: VFIO device doesn't support device and "
                    "IOMMU dirty tracking", vbasedev->name);
    }

    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        goto out_deinit;
    }

    if (vfio_viommu_preset(vbasedev)) {
        error_setg(&err, "%s: Migration is currently not supported "
                   "with vIOMMU enabled", vbasedev->name);
        goto add_blocker;
    }

    trace_vfio_migration_realize(vbasedev->name);
    return true;

add_blocker:
    ret = vfio_block_migration(vbasedev, err, errp);
out_deinit:
    if (ret) {
        vfio_migration_deinit(vbasedev);
    }
    return !ret;
}

void vfio_migration_exit(VFIODevice *vbasedev)
{
    if (vbasedev->migration) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}