/*
 * Migration support for VFIO devices
 *
 * Copyright NVIDIA, Inc. 2020
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include <linux/vfio.h>
#include <sys/ioctl.h>

#include "system/runstate.h"
#include "hw/vfio/vfio-device.h"
#include "hw/vfio/vfio-migration.h"
#include "migration/misc.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
#include "migration/qemu-file.h"
#include "migration/register.h"
#include "migration/blocker.h"
#include "migration-multifd.h"
#include "qapi/error.h"
#include "qapi/qapi-events-vfio.h"
#include "exec/ramlist.h"
#include "pci.h"
#include "trace.h"
#include "hw/hw.h"
#include "vfio-migration-internal.h"

/*
 * This is an arbitrary size based on migration of mlx5 devices, where typically
 * total device migration size is on the order of 100s of MB. Testing with
 * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

static unsigned long bytes_transferred;

static const char *mig_state_to_str(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_ERROR:
        return "ERROR";
    case VFIO_DEVICE_STATE_STOP:
        return "STOP";
    case VFIO_DEVICE_STATE_RUNNING:
        return "RUNNING";
    case VFIO_DEVICE_STATE_STOP_COPY:
        return "STOP_COPY";
    case VFIO_DEVICE_STATE_RESUMING:
        return "RESUMING";
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return "RUNNING_P2P";
    case VFIO_DEVICE_STATE_PRE_COPY:
        return "PRE_COPY";
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return "PRE_COPY_P2P";
    default:
        return "UNKNOWN STATE";
    }
}

static QapiVfioMigrationState
mig_state_to_qapi_state(enum vfio_device_mig_state state)
{
    switch (state) {
    case VFIO_DEVICE_STATE_STOP:
        return QAPI_VFIO_MIGRATION_STATE_STOP;
    case VFIO_DEVICE_STATE_RUNNING:
        return QAPI_VFIO_MIGRATION_STATE_RUNNING;
    case VFIO_DEVICE_STATE_STOP_COPY:
        return QAPI_VFIO_MIGRATION_STATE_STOP_COPY;
    case VFIO_DEVICE_STATE_RESUMING:
        return QAPI_VFIO_MIGRATION_STATE_RESUMING;
    case VFIO_DEVICE_STATE_RUNNING_P2P:
        return QAPI_VFIO_MIGRATION_STATE_RUNNING_P2P;
    case VFIO_DEVICE_STATE_PRE_COPY:
        return QAPI_VFIO_MIGRATION_STATE_PRE_COPY;
    case VFIO_DEVICE_STATE_PRE_COPY_P2P:
        return QAPI_VFIO_MIGRATION_STATE_PRE_COPY_P2P;
    default:
        g_assert_not_reached();
    }
}

static void vfio_migration_send_event(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    DeviceState *dev = vbasedev->dev;
    g_autofree char *qom_path = NULL;
    Object *obj;

    if (!vbasedev->migration_events) {
        return;
    }

    g_assert(vbasedev->ops->vfio_get_object);
    obj = vbasedev->ops->vfio_get_object(vbasedev);
    g_assert(obj);
    qom_path = object_get_canonical_path(obj);

    qapi_event_send_vfio_migration(
        dev->id, qom_path, mig_state_to_qapi_state(migration->device_state));
}

static void vfio_migration_set_device_state(VFIODevice *vbasedev,
                                            enum vfio_device_mig_state state)
{
    VFIOMigration *migration = vbasedev->migration;

    trace_vfio_migration_set_device_state(vbasedev->name,
                                          mig_state_to_str(state));

    migration->device_state = state;
    vfio_migration_send_event(vbasedev);
}
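/*
 * Change the device state via the VFIO_DEVICE_FEATURE ioctl. The uint64_t
 * buf[] below provides uint64_t-aligned storage for the vfio_device_feature
 * header followed by its flexible-array payload. On failure the device is
 * moved to recover_state; if that also fails, or if recover_state is ERROR,
 * the device is reset and left in RUNNING. Typical use (mirroring
 * vfio_save_complete_precopy() below):
 *
 *     vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
 *                              VFIO_DEVICE_STATE_STOP, &local_err);
 */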
int vfio_migration_set_state(VFIODevice *vbasedev,
                             enum vfio_device_mig_state new_state,
                             enum vfio_device_mig_state recover_state,
                             Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;
    g_autofree char *error_prefix =
        g_strdup_printf("%s: Failed setting device state to %s.",
                        vbasedev->name, mig_state_to_str(new_state));

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state),
                                   mig_state_to_str(recover_state));

    if (new_state == migration->device_state) {
        return 0;
    }

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_setg_errno(errp, errno,
                             "%s Recover state is ERROR. Resetting device",
                             error_prefix);

            goto reset_device;
        }

        error_setg_errno(errp, errno,
                         "%s Setting device in recover state %s",
                         error_prefix, mig_state_to_str(recover_state));

        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            /*
             * If setting the device in recover state fails, report
             * the error here and propagate the first error.
             */
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                vbasedev->name, strerror(errno));

            goto reset_device;
        }

        vfio_migration_set_device_state(vbasedev, recover_state);

        return ret;
    }

    vfio_migration_set_device_state(vbasedev, new_state);
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_setg(errp, "%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        migration->data_fd = mig_state->data_fd;
    }

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    vfio_migration_set_device_state(vbasedev, VFIO_DEVICE_STATE_RUNNING);

    return ret;
}
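/*
 * Rough sketch of the state arcs this file drives; the authoritative FSM is
 * documented with the Linux VFIO uAPI in <linux/vfio.h>:
 *
 *   source (pre-copy):  RUNNING -> PRE_COPY -> STOP_COPY -> STOP
 *   source (stop-copy): RUNNING -> STOP -> STOP_COPY -> STOP
 *   destination:        RUNNING -> RESUMING -> RUNNING
 *
 * With VFIO_MIGRATION_P2P, the RUNNING_P2P/PRE_COPY_P2P states are entered
 * first when stopping, so the device quiesces its outgoing P2P DMA while
 * other devices in the VM may still be running.
 */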
/*
 * Some device state transitions require resetting the device if they fail.
 * This function sets the device in new_state and resets the device if that
 * fails. Reset is done by using ERROR as the recover state.
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state,
                                  Error **errp)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR, errp);
}

static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}

int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        ret = vbasedev->ops->vfio_save_config(vbasedev, f, errp);
        if (ret) {
            return ret;
        }
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to save state");
    }
    return ret;
}

int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    trace_vfio_load_device_config_state_start(vbasedev->name);

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state_end(vbasedev->name);
    return qemu_file_get_error(f);
}

static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}

static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}

static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}
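/*
 * Device data is framed on the wire as a sequence of tagged records:
 *
 *   VFIO_MIG_FLAG_DEV_DATA_STATE | be64 data_size | data_size bytes
 *
 * Each iteration ends with VFIO_MIG_FLAG_END_OF_STATE or, when switchover
 * ack is in use, with VFIO_MIG_FLAG_DEV_INIT_DATA_SENT once the initial
 * precopy data has been fully sent. vfio_load_state() parses this framing
 * on the destination.
 */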
/* Returns the size of saved data on success and -errno on error */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more information,
         * please refer to the Linux kernel VFIO uAPI.
         */
        if (errno == ENOMSG) {
            if (!migration->event_precopy_empty_hit) {
                trace_vfio_save_block_precopy_empty_hit(migration->vbasedev->name);
                migration->event_precopy_empty_hit = true;
            }
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    /* Non-empty read: re-arm the trace event */
    migration->event_precopy_empty_hit = false;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    vfio_migration_add_bytes_transferred(data_size);

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    return qemu_file_get_error(f) ?: data_size;
}

static void vfio_update_estimated_pending_data(VFIOMigration *migration,
                                               uint64_t data_size)
{
    if (!data_size) {
        /*
         * Pre-copy emptied all the device state for now, update estimated sizes
         * accordingly.
         */
        migration->precopy_init_size = 0;
        migration->precopy_dirty_size = 0;

        return;
    }

    if (migration->precopy_init_size) {
        uint64_t init_size = MIN(migration->precopy_init_size, data_size);

        migration->precopy_init_size -= init_size;
        data_size -= init_size;
    }

    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
                                         data_size);
}

static bool vfio_precopy_supported(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
}

/* ---------------------------------------------------------------------- */
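/*
 * Save-side SaveVMHandlers. For a live migration, .save_prepare and
 * .save_setup run while the VM is still running, .save_live_iterate streams
 * PRE_COPY data, and .save_live_complete_precopy drains the remaining
 * STOP_COPY data after the VM has been stopped. .save_cleanup runs last,
 * once migration completes or fails.
 */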
static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshots use neither postcopy nor background snapshot, so allow
     * snapshots even if those capabilities are enabled.
     */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}

static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
    int ret;

    if (!vfio_multifd_setup(vbasedev, false, errp)) {
        return -EINVAL;
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_setg(errp, "%s: Failed to allocate migration data buffer",
                   vbasedev->name);
        return -ENOMEM;
    }

    migration->event_save_iterate_started = false;
    migration->event_precopy_empty_hit = false;

    if (vfio_precopy_supported(vbasedev)) {
        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING, errp);
            if (ret) {
                return ret;
            }

            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            error_setg(errp, "%s: Invalid device state %d", vbasedev->name,
                       migration->device_state);
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "%s: save setup failed", vbasedev->name);
    }

    return ret;
}
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    Error *local_err = NULL;
    int ret;

    /* Currently a NOP, done for symmetry with load_cleanup() */
    vfio_multifd_cleanup(vbasedev);

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_STOP,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }

    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}

static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}

/*
 * Migration size of VFIO devices can be as little as a few KBs or as big as
 * many GBs. This value should be big enough to cover the worst case.
 */
#define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)

static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        vfio_query_precopy_size(migration);
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}

static bool vfio_is_active_iterate(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_device_state_is_precopy(vbasedev);
}
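/*
 * With the switchover-ack capability, the source stops the VM only after the
 * destination acknowledges that all initial precopy data has been loaded.
 * vfio_save_iterate() signals that point by replacing the usual END_OF_STATE
 * marker with VFIO_MIG_FLAG_DEV_INIT_DATA_SENT once precopy_init_size drops
 * to zero; the destination then calls qemu_loadvm_approve_switchover() when
 * it sees that flag.
 */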
/*
 * Note about migration rate limiting: VFIO migration buffer size is currently
 * limited to 1MB, so there is no need to check whether the migration rate was
 * exceeded (in the worst case it is exceeded by 1MB). However, if the buffer
 * size is later changed to a bigger value, migration rate should be enforced
 * here.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    if (!migration->event_save_iterate_started) {
        trace_vfio_save_iterate_start(vbasedev->name);
        migration->event_save_iterate_started = true;
    }

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    /* Nonzero return means this device currently has nothing more to send */
    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}

static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;
    Error *local_err = NULL;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        vfio_multifd_emit_dummy_eos(vbasedev, f);
        return 0;
    }

    trace_vfio_save_complete_precopy_start(vbasedev->name);

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP, &local_err);
    if (ret) {
        error_report_err(local_err);
        return ret;
    }

    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}

static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    Error *local_err = NULL;
    int ret;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        vfio_multifd_emit_dummy_eos(vbasedev, f);
        return;
    }

    ret = vfio_save_device_config_state(f, opaque, &local_err);
    if (ret) {
        error_prepend(&local_err,
                      "vfio: Failed to save device config space of %s - ",
                      vbasedev->name);
        qemu_file_set_error_obj(f, ret, local_err);
    }
}

static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    if (!vfio_multifd_setup(vbasedev, true, errp)) {
        return -EINVAL;
    }

    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
                                   migration->device_state, errp);
    if (ret) {
        return ret;
    }

    return 0;
}

static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_multifd_cleanup(vbasedev);

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}
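/*
 * Incoming stream parser. Tags arrive in the order the source emitted them:
 * SETUP_STATE from .save_setup, DATA_STATE records from the iterate and
 * complete phases, and finally DEV_CONFIG_STATE (unless device config travels
 * over multifd channels instead, in which case it must not appear here).
 */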
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            if (vfio_multifd_transfer_enabled(vbasedev)) {
                error_report("%s: got DEV_CONFIG_STATE in main migration "
                             "channel but doing multifd transfer",
                             vbasedev->name);
                return -EINVAL;
            }

            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}

static bool vfio_switchover_ack_needed(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    return vfio_precopy_supported(vbasedev);
}

static int vfio_switchover_start(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        return vfio_multifd_switchover_start(vbasedev);
    }

    return 0;
}

static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
    /*
     * Multifd support
     */
    .load_state_buffer = vfio_multifd_load_state_buffer,
    .switchover_start = vfio_switchover_start,
    .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread,
};

/* ---------------------------------------------------------------------- */
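/*
 * VM state change handling. When VFIO_MIGRATION_P2P is supported, the
 * "prepare" callback first moves the device to an intermediate *_P2P state,
 * in which it no longer initiates P2P DMA while other devices in the VM are
 * being stopped; the main callback then completes the transition once all
 * devices are quiesced.
 */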
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    Error *local_err = NULL;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}

static void vfio_vmstate_change(void *opaque, bool running, RunState state)
{
    VFIODevice *vbasedev = opaque;
    enum vfio_device_mig_state new_state;
    Error *local_err = NULL;
    int ret;

    if (running) {
        new_state = VFIO_DEVICE_STATE_RUNNING;
    } else {
        new_state =
            (vfio_device_state_is_precopy(vbasedev) &&
             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
                VFIO_DEVICE_STATE_STOP_COPY :
                VFIO_DEVICE_STATE_STOP;
    }

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
                              mig_state_to_str(new_state));
}

static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;
    Error *local_err = NULL;
    int ret;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        /*
         * MigrationNotifyFunc may not return an error code and an Error
         * object for MIG_EVENT_PRECOPY_FAILED. Hence, report the error
         * locally and ignore the errp argument.
         */
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_RUNNING,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }
    return 0;
}

static void vfio_migration_free(VFIODevice *vbasedev)
{
    g_free(vbasedev->migration);
    vbasedev->migration = NULL;
}

static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}

static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}
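/*
 * Probe kernel migration support, allocate the VFIOMigration state, register
 * the SaveVMHandlers under a "<vmstate id>/vfio" instance id, and hook up the
 * VM state change handlers and the migration state notifier.
 */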
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}

static Error *multiple_devices_migration_blocker;

/*
 * Multiple devices migration is allowed only if all devices support P2P
 * migration. Single device migration is allowed regardless of P2P migration
 * support.
 */
static bool vfio_multiple_devices_migration_is_supported(void)
{
    VFIODevice *vbasedev;
    unsigned int device_num = 0;
    bool all_support_p2p = true;

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration) {
            device_num++;

            if (!(vbasedev->migration->mig_flags & VFIO_MIGRATION_P2P)) {
                all_support_p2p = false;
            }
        }
    }

    return all_support_p2p || device_num <= 1;
}

static int vfio_block_multiple_devices_migration(VFIODevice *vbasedev,
                                                 Error **errp)
{
    if (vfio_multiple_devices_migration_is_supported()) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp, "Multiple VFIO devices migration is supported only if "
                         "all of them support P2P migration");
        return -EINVAL;
    }

    if (multiple_devices_migration_blocker) {
        return 0;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Multiple VFIO devices migration is supported only if all of "
               "them support P2P migration");
    return migrate_add_blocker_normal(&multiple_devices_migration_blocker,
                                      errp);
}

static void vfio_unblock_multiple_devices_migration(void)
{
    if (!multiple_devices_migration_blocker ||
        !vfio_multiple_devices_migration_is_supported()) {
        return;
    }

    migrate_del_blocker(&multiple_devices_migration_blocker);
}

static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}
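/*
 * When migration support is missing or cannot be initialized, register a
 * per-device migration blocker instead of failing realize, unless the user
 * explicitly enabled migration (enable_migration == ON), in which case the
 * error is fatal.
 */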
static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
{
    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_propagate(errp, err);
        return -EINVAL;
    }

    vbasedev->migration_blocker = error_copy(err);
    error_free(err);

    return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
}

/* ---------------------------------------------------------------------- */

int64_t vfio_migration_bytes_transferred(void)
{
    return MIN(qatomic_read(&bytes_transferred), INT64_MAX);
}

void vfio_migration_reset_bytes_transferred(void)
{
    qatomic_set(&bytes_transferred, 0);
}

void vfio_migration_add_bytes_transferred(unsigned long val)
{
    qatomic_add(&bytes_transferred, val);
}

bool vfio_migration_active(void)
{
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_device_list)) {
        return false;
    }

    QLIST_FOREACH(vbasedev, &vfio_device_list, global_next) {
        if (vbasedev->migration_blocker) {
            return false;
        }
    }
    return true;
}

static bool vfio_viommu_preset(VFIODevice *vbasedev)
{
    return vbasedev->bcontainer->space->as != &address_space_memory;
}

/*
 * Return true when either migration was initialized or a blocker was
 * registered. Currently we only return false when adding the blocker fails,
 * which will de-register the vfio device.
 */
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
    Error *err = NULL;
    int ret;

    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
        error_setg(&err, "%s: Migration is disabled for VFIO device",
                   vbasedev->name);
        return !vfio_block_migration(vbasedev, err, errp);
    }

    ret = vfio_migration_init(vbasedev);
    if (ret) {
        if (ret == -ENOTTY) {
            error_setg(&err, "%s: VFIO migration is not supported in kernel",
                       vbasedev->name);
        } else {
            error_setg(&err,
                       "%s: Migration couldn't be initialized for VFIO device, "
                       "err: %d (%s)",
                       vbasedev->name, ret, strerror(-ret));
        }

        return !vfio_block_migration(vbasedev, err, errp);
    }

    if ((!vbasedev->dirty_pages_supported ||
         vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
        !vbasedev->iommu_dirty_tracking) {
        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
            error_setg(&err,
                       "%s: VFIO device doesn't support device and "
                       "IOMMU dirty tracking", vbasedev->name);
            goto add_blocker;
        }

        warn_report("%s: VFIO device doesn't support device and "
                    "IOMMU dirty tracking", vbasedev->name);
    }

    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
    if (ret) {
        goto out_deinit;
    }

    if (vfio_viommu_preset(vbasedev)) {
        error_setg(&err, "%s: Migration is currently not supported "
                         "with vIOMMU enabled", vbasedev->name);
        goto add_blocker;
    }

    trace_vfio_migration_realize(vbasedev->name);
    return true;

add_blocker:
    ret = vfio_block_migration(vbasedev, err, errp);
out_deinit:
    if (ret) {
        vfio_migration_deinit(vbasedev);
    }
    return !ret;
}

void vfio_migration_exit(VFIODevice *vbasedev)
{
    if (vbasedev->migration) {
        vfio_migration_deinit(vbasedev);
    }

    migrate_del_blocker(&vbasedev->migration_blocker);
}
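/*
 * These helpers report the device state last set by QEMU and cached in
 * VFIOMigration; they do not query the kernel.
 */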
bool vfio_device_state_is_running(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
           migration->device_state == VFIO_DEVICE_STATE_RUNNING_P2P;
}

bool vfio_device_state_is_precopy(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ||
           migration->device_state == VFIO_DEVICE_STATE_PRE_COPY_P2P;
}