xref: /qemu/hw/vfio/migration.c (revision b103cc6e74ac92f070a0e004bd84334e845c20b5)
1 /*
2  * Migration support for VFIO devices
3  *
4  * Copyright NVIDIA, Inc. 2020
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2. See
7  * the COPYING file in the top-level directory.
8  */
9 
10 #include "qemu/osdep.h"
11 #include "qemu/main-loop.h"
12 #include "qemu/cutils.h"
13 #include "qemu/units.h"
14 #include "qemu/error-report.h"
15 #include <linux/vfio.h>
16 #include <sys/ioctl.h>
17 
18 #include "system/runstate.h"
19 #include "hw/vfio/vfio-common.h"
20 #include "migration/misc.h"
21 #include "migration/savevm.h"
22 #include "migration/vmstate.h"
23 #include "migration/qemu-file.h"
24 #include "migration/register.h"
25 #include "migration/blocker.h"
26 #include "migration-multifd.h"
27 #include "qapi/error.h"
28 #include "qapi/qapi-events-vfio.h"
29 #include "exec/ramlist.h"
30 #include "pci.h"
31 #include "trace.h"
32 #include "hw/hw.h"
33 
34 /*
35  * This is an arbitrary size based on migration of mlx5 devices, where typically
36  * total device migration size is on the order of 100s of MB. Testing with
37  * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
38  */
39 #define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)
40 
41 static unsigned long bytes_transferred;
42 
43 static const char *mig_state_to_str(enum vfio_device_mig_state state)
44 {
45     switch (state) {
46     case VFIO_DEVICE_STATE_ERROR:
47         return "ERROR";
48     case VFIO_DEVICE_STATE_STOP:
49         return "STOP";
50     case VFIO_DEVICE_STATE_RUNNING:
51         return "RUNNING";
52     case VFIO_DEVICE_STATE_STOP_COPY:
53         return "STOP_COPY";
54     case VFIO_DEVICE_STATE_RESUMING:
55         return "RESUMING";
56     case VFIO_DEVICE_STATE_RUNNING_P2P:
57         return "RUNNING_P2P";
58     case VFIO_DEVICE_STATE_PRE_COPY:
59         return "PRE_COPY";
60     case VFIO_DEVICE_STATE_PRE_COPY_P2P:
61         return "PRE_COPY_P2P";
62     default:
63         return "UNKNOWN STATE";
64     }
65 }
66 
67 static QapiVfioMigrationState
68 mig_state_to_qapi_state(enum vfio_device_mig_state state)
69 {
70     switch (state) {
71     case VFIO_DEVICE_STATE_STOP:
72         return QAPI_VFIO_MIGRATION_STATE_STOP;
73     case VFIO_DEVICE_STATE_RUNNING:
74         return QAPI_VFIO_MIGRATION_STATE_RUNNING;
75     case VFIO_DEVICE_STATE_STOP_COPY:
76         return QAPI_VFIO_MIGRATION_STATE_STOP_COPY;
77     case VFIO_DEVICE_STATE_RESUMING:
78         return QAPI_VFIO_MIGRATION_STATE_RESUMING;
79     case VFIO_DEVICE_STATE_RUNNING_P2P:
80         return QAPI_VFIO_MIGRATION_STATE_RUNNING_P2P;
81     case VFIO_DEVICE_STATE_PRE_COPY:
82         return QAPI_VFIO_MIGRATION_STATE_PRE_COPY;
83     case VFIO_DEVICE_STATE_PRE_COPY_P2P:
84         return QAPI_VFIO_MIGRATION_STATE_PRE_COPY_P2P;
85     default:
86         g_assert_not_reached();
87     }
88 }
89 
/*
 * Emit a VFIO_MIGRATION QAPI event carrying the device's current
 * migration state.  No-op unless the user enabled the device's
 * migration-events property.
 */
static void vfio_migration_send_event(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    DeviceState *dev = vbasedev->dev;
    g_autofree char *qom_path = NULL;
    Object *obj;

    if (!vbasedev->migration_events) {
        return;
    }

    /* The QOM path uniquely identifies the device in the event payload */
    g_assert(vbasedev->ops->vfio_get_object);
    obj = vbasedev->ops->vfio_get_object(vbasedev);
    g_assert(obj);
    qom_path = object_get_canonical_path(obj);

    qapi_event_send_vfio_migration(
        dev->id, qom_path, mig_state_to_qapi_state(migration->device_state));
}
109 
/*
 * Record the new device migration state in QEMU's bookkeeping and notify
 * management via a QAPI event.  Does not touch the kernel; callers must
 * have already completed the state transition via the VFIO ioctl.
 */
static void vfio_migration_set_device_state(VFIODevice *vbasedev,
                                            enum vfio_device_mig_state state)
{
    VFIOMigration *migration = vbasedev->migration;

    trace_vfio_migration_set_device_state(vbasedev->name,
                                          mig_state_to_str(state));

    migration->device_state = state;
    vfio_migration_send_event(vbasedev);
}
121 
/*
 * Transition the device to new_state using the VFIO_DEVICE_FEATURE
 * MIG_DEVICE_STATE ioctl.  If that fails, try to put the device into
 * recover_state instead; if recovery also fails, or recover_state is
 * ERROR, reset the device (leaving it in RUNNING).
 *
 * Returns 0 on success (including when the device is already in
 * new_state), or the negative errno of the first failing ioctl, with
 * errp set accordingly.
 */
int vfio_migration_set_state(VFIODevice *vbasedev,
                             enum vfio_device_mig_state new_state,
                             enum vfio_device_mig_state recover_state,
                             Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    /* uint64_t-aligned backing storage for the feature header + payload */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_state),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_state *mig_state =
        (struct vfio_device_feature_mig_state *)feature->data;
    int ret;
    g_autofree char *error_prefix =
        g_strdup_printf("%s: Failed setting device state to %s.",
                        vbasedev->name, mig_state_to_str(new_state));

    trace_vfio_migration_set_state(vbasedev->name, mig_state_to_str(new_state),
                                   mig_state_to_str(recover_state));

    /* Already there - nothing to do */
    if (new_state == migration->device_state) {
        return 0;
    }

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_SET | VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
    mig_state->device_state = new_state;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        /* Try to set the device in some good state */
        ret = -errno;

        if (recover_state == VFIO_DEVICE_STATE_ERROR) {
            error_setg_errno(errp, errno,
                             "%s Recover state is ERROR. Resetting device",
                             error_prefix);

            goto reset_device;
        }

        error_setg_errno(errp, errno,
                         "%s Setting device in recover state %s",
                         error_prefix, mig_state_to_str(recover_state));

        /* Reuse the same feature buffer for the recovery attempt */
        mig_state->device_state = recover_state;
        if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
            ret = -errno;
            /*
             * If setting the device in recover state fails, report
             * the error here and propagate the first error.
             */
            error_report(
                "%s: Failed setting device in recover state, err: %s. Resetting device",
                         vbasedev->name, strerror(errno));

            goto reset_device;
        }

        vfio_migration_set_device_state(vbasedev, recover_state);

        /* Recovery succeeded but the requested transition did not */
        return ret;
    }

    vfio_migration_set_device_state(vbasedev, new_state);
    if (mig_state->data_fd != -1) {
        if (migration->data_fd != -1) {
            /*
             * This can happen if the device is asynchronously reset and
             * terminates a data transfer.
             */
            error_setg(errp, "%s: data_fd out of sync", vbasedev->name);
            close(mig_state->data_fd);

            return -EBADF;
        }

        /* Adopt the data stream fd the kernel returned for this transition */
        migration->data_fd = mig_state->data_fd;
    }

    return 0;

reset_device:
    if (ioctl(vbasedev->fd, VFIO_DEVICE_RESET)) {
        /* Cannot even reset - this is fatal for the VM */
        hw_error("%s: Failed resetting device, err: %s", vbasedev->name,
                 strerror(errno));
    }

    /* A successful reset leaves the device in RUNNING */
    vfio_migration_set_device_state(vbasedev, VFIO_DEVICE_STATE_RUNNING);

    return ret;
}
213 
214 /*
215  * Some device state transitions require resetting the device if they fail.
216  * This function sets the device in new_state and resets the device if that
217  * fails. Reset is done by using ERROR as the recover state.
218  */
/*
 * Convenience wrapper around vfio_migration_set_state() that uses ERROR
 * as the recover state, i.e. a failed transition resets the device.
 * Returns 0 on success, negative errno on failure (errp set).
 */
static int
vfio_migration_set_state_or_reset(VFIODevice *vbasedev,
                                  enum vfio_device_mig_state new_state,
                                  Error **errp)
{
    return vfio_migration_set_state(vbasedev, new_state,
                                    VFIO_DEVICE_STATE_ERROR, errp);
}
227 
/*
 * Copy data_size bytes of device state from the migration stream into
 * the kernel's resuming data_fd.  Returns what qemu_file_get_to_fd()
 * returns: 0 on success, negative on error.
 */
static int vfio_load_buffer(QEMUFile *f, VFIODevice *vbasedev,
                            uint64_t data_size)
{
    VFIOMigration *migration = vbasedev->migration;
    int ret;

    ret = qemu_file_get_to_fd(f, migration->data_fd, data_size);
    trace_vfio_load_state_device_data(vbasedev->name, data_size, ret);

    return ret;
}
239 
/*
 * Serialize the device's config state to the migration stream, framed
 * between DEV_CONFIG_STATE and END_OF_STATE markers.  The actual config
 * contents come from the device's vfio_save_config callback, if any.
 * Returns 0 on success, negative on error (errp set).
 */
int vfio_save_device_config_state(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    int ret;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_CONFIG_STATE);

    if (vbasedev->ops && vbasedev->ops->vfio_save_config) {
        ret = vbasedev->ops->vfio_save_config(vbasedev, f, errp);
        if (ret) {
            return ret;
        }
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    trace_vfio_save_device_config_state(vbasedev->name);

    /* Surface any buffered QEMUFile write error */
    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "Failed to save state");
    }
    return ret;
}
264 
/*
 * Counterpart of vfio_save_device_config_state(): restore the device's
 * config state from the stream and verify the END_OF_STATE trailer.
 * Returns 0 on success, negative on error.
 */
int vfio_load_device_config_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    uint64_t data;

    trace_vfio_load_device_config_state_start(vbasedev->name);

    if (vbasedev->ops && vbasedev->ops->vfio_load_config) {
        int ret;

        ret = vbasedev->ops->vfio_load_config(vbasedev, f);
        if (ret) {
            error_report("%s: Failed to load device config space",
                         vbasedev->name);
            return ret;
        }
    }

    /* The config section must be terminated by END_OF_STATE */
    data = qemu_get_be64(f);
    if (data != VFIO_MIG_FLAG_END_OF_STATE) {
        error_report("%s: Failed loading device config space, "
                     "end flag incorrect 0x%"PRIx64, vbasedev->name, data);
        return -EINVAL;
    }

    trace_vfio_load_device_config_state_end(vbasedev->name);
    return qemu_file_get_error(f);
}
293 
/*
 * Close the kernel migration data fd and mark it invalid so a later
 * vfio_migration_set_state() can install a fresh one.
 */
static void vfio_migration_cleanup(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    close(migration->data_fd);
    migration->data_fd = -1;
}
301 
/*
 * Query the device's estimated STOP_COPY data size via the
 * VFIO_DEVICE_FEATURE MIG_DATA_SIZE ioctl.  On success, stores the
 * value in *stop_copy_size and returns 0; on failure returns -errno
 * and leaves *stop_copy_size untouched.
 */
static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
                                     uint64_t *stop_copy_size)
{
    /* uint64_t-aligned backing storage for the feature header + payload */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_mig_data_size),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_mig_data_size *mig_data_size =
        (struct vfio_device_feature_mig_data_size *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags =
        VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIG_DATA_SIZE;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *stop_copy_size = mig_data_size->stop_copy_length;

    return 0;
}
324 
/*
 * Refresh migration->precopy_init_size/precopy_dirty_size from the
 * kernel via VFIO_MIG_GET_PRECOPY_INFO on the data fd.  Both are reset
 * to 0 first so a failed ioctl (-errno returned) leaves them zeroed.
 */
static int vfio_query_precopy_size(VFIOMigration *migration)
{
    struct vfio_precopy_info precopy = {
        .argsz = sizeof(precopy),
    };

    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;

    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
        return -errno;
    }

    migration->precopy_init_size = precopy.initial_bytes;
    migration->precopy_dirty_size = precopy.dirty_bytes;

    return 0;
}
343 
/*
 * Read one buffer's worth of device state from the kernel data fd and
 * frame it onto the migration stream (DEV_DATA_STATE tag, size, payload).
 * Returns the size of saved data on success (0 means no data currently
 * available) and -errno on error.
 */
static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
{
    ssize_t data_size;

    data_size = read(migration->data_fd, migration->data_buffer,
                     migration->data_buffer_size);
    if (data_size < 0) {
        /*
         * Pre-copy emptied all the device state for now. For more information,
         * please refer to the Linux kernel VFIO uAPI.
         */
        if (errno == ENOMSG) {
            /* Only trace the first empty hit until data flows again */
            if (!migration->event_precopy_empty_hit) {
                trace_vfio_save_block_precopy_empty_hit(migration->vbasedev->name);
                migration->event_precopy_empty_hit = true;
            }
            return 0;
        }

        return -errno;
    }
    if (data_size == 0) {
        return 0;
    }

    /* Non-empty read: re-arm the trace event */
    migration->event_precopy_empty_hit = false;

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
    qemu_put_be64(f, data_size);
    qemu_put_buffer(f, migration->data_buffer, data_size);
    vfio_mig_add_bytes_transferred(data_size);

    trace_vfio_save_block(migration->vbasedev->name, data_size);

    /* GNU ?: - propagate a QEMUFile error, otherwise report the chunk size */
    return qemu_file_get_error(f) ?: data_size;
}
382 
383 static void vfio_update_estimated_pending_data(VFIOMigration *migration,
384                                                uint64_t data_size)
385 {
386     if (!data_size) {
387         /*
388          * Pre-copy emptied all the device state for now, update estimated sizes
389          * accordingly.
390          */
391         migration->precopy_init_size = 0;
392         migration->precopy_dirty_size = 0;
393 
394         return;
395     }
396 
397     if (migration->precopy_init_size) {
398         uint64_t init_size = MIN(migration->precopy_init_size, data_size);
399 
400         migration->precopy_init_size -= init_size;
401         data_size -= init_size;
402     }
403 
404     migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
405                                          data_size);
406 }
407 
408 static bool vfio_precopy_supported(VFIODevice *vbasedev)
409 {
410     VFIOMigration *migration = vbasedev->migration;
411 
412     return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
413 }
414 
415 /* ---------------------------------------------------------------------- */
416 
/*
 * SaveVMHandlers.save_prepare: reject migration configurations VFIO
 * cannot support (postcopy RAM, background snapshot), except for
 * snapshots which never use them.  Returns 0 or -EOPNOTSUPP.
 */
static int vfio_save_prepare(void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;

    /*
     * Snapshot doesn't use postcopy nor background snapshot, so allow snapshot
     * even if they are on.
     */
    if (runstate_check(RUN_STATE_SAVE_VM)) {
        return 0;
    }

    if (migrate_postcopy_ram()) {
        error_setg(
            errp, "%s: VFIO migration is not supported with postcopy migration",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    if (migrate_background_snapshot()) {
        error_setg(
            errp,
            "%s: VFIO migration is not supported with background snapshot",
            vbasedev->name);
        return -EOPNOTSUPP;
    }

    return 0;
}
446 
/*
 * SaveVMHandlers.save_setup: allocate the transfer buffer, optionally
 * enter PRE_COPY, and emit the SETUP section markers.  Returns 0 on
 * success, negative on error (errp set).
 */
static int vfio_save_setup(QEMUFile *f, void *opaque, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE;
    int ret;

    if (!vfio_multifd_setup(vbasedev, false, errp)) {
        return -EINVAL;
    }

    qemu_put_be64(f, VFIO_MIG_FLAG_DEV_SETUP_STATE);

    /*
     * Return value deliberately ignored: on failure the default buffer
     * size above is used.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    migration->data_buffer_size = MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE,
                                      stop_copy_size);
    migration->data_buffer = g_try_malloc0(migration->data_buffer_size);
    if (!migration->data_buffer) {
        error_setg(errp, "%s: Failed to allocate migration data buffer",
                   vbasedev->name);
        return -ENOMEM;
    }

    /* Re-arm the one-shot trace events for this migration attempt */
    migration->event_save_iterate_started = false;
    migration->event_precopy_empty_hit = false;

    if (vfio_precopy_supported(vbasedev)) {
        switch (migration->device_state) {
        case VFIO_DEVICE_STATE_RUNNING:
            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
                                           VFIO_DEVICE_STATE_RUNNING, errp);
            if (ret) {
                return ret;
            }

            /* Failure just leaves the estimates zeroed - not fatal */
            vfio_query_precopy_size(migration);

            break;
        case VFIO_DEVICE_STATE_STOP:
            /* vfio_save_complete_precopy() will go to STOP_COPY */
            break;
        default:
            error_setg(errp, "%s: Invalid device state %d", vbasedev->name,
                       migration->device_state);
            return -EINVAL;
        }
    }

    trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "%s: save setup failed", vbasedev->name);
    }

    return ret;
}
506 
/*
 * SaveVMHandlers.save_cleanup: release save-side resources and move the
 * device out of STOP_COPY.  Runs after migration completes or aborts.
 */
static void vfio_save_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    Error *local_err = NULL;
    int ret;

    /* Currently a NOP, done for symmetry with load_cleanup() */
    vfio_multifd_cleanup(vbasedev);

    /*
     * Changing device state from STOP_COPY to STOP can take time. Do it here,
     * after migration has completed, so it won't increase downtime.
     */
    if (migration->device_state == VFIO_DEVICE_STATE_STOP_COPY) {
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_STOP,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }

    /* Drop the transfer buffer and reset per-migration bookkeeping */
    g_free(migration->data_buffer);
    migration->data_buffer = NULL;
    migration->precopy_init_size = 0;
    migration->precopy_dirty_size = 0;
    migration->initial_data_sent = false;
    vfio_migration_cleanup(vbasedev);
    trace_vfio_save_cleanup(vbasedev->name);
}
538 
/*
 * SaveVMHandlers.state_pending_estimate: cheap estimate of remaining
 * pre-copy data, based on cached values (no ioctl).  Contributes
 * nothing when the device is not in a pre-copy state.
 */
static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
                                        uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;

    if (!vfio_device_state_is_precopy(vbasedev)) {
        return;
    }

    *must_precopy +=
        migration->precopy_init_size + migration->precopy_dirty_size;

    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
                                      *can_postcopy,
                                      migration->precopy_init_size,
                                      migration->precopy_dirty_size);
}
557 
558 /*
559  * Migration size of VFIO devices can be as little as a few KBs or as big as
560  * many GBs. This value should be big enough to cover the worst case.
561  */
562 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
563 
/*
 * SaveVMHandlers.state_pending_exact: query the kernel for the precise
 * remaining data sizes.  Adds the STOP_COPY size to *must_precopy and
 * refreshes the pre-copy estimates when applicable.
 */
static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                     uint64_t *can_postcopy)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;

    /*
     * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
     * reported so downtime limit won't be violated.
     */
    vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
    *must_precopy += stop_copy_size;

    if (vfio_device_state_is_precopy(vbasedev)) {
        /* Failure leaves the cached estimates zeroed - acceptable here */
        vfio_query_precopy_size(migration);
    }

    trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
                                   stop_copy_size, migration->precopy_init_size,
                                   migration->precopy_dirty_size);
}
586 
587 static bool vfio_is_active_iterate(void *opaque)
588 {
589     VFIODevice *vbasedev = opaque;
590 
591     return vfio_device_state_is_precopy(vbasedev);
592 }
593 
594 /*
595  * Note about migration rate limiting: VFIO migration buffer size is currently
596  * limited to 1MB, so there is no need to check if migration rate exceeded (as
597  * in the worst case it will exceed by 1MB). However, if the buffer size is
598  * later changed to a bigger value, migration rate should be enforced here.
599  */
/*
 * SaveVMHandlers.save_live_iterate: send one block of pre-copy data and
 * update the pending estimates.  With switchover-ack enabled, announce
 * once that all initial data has been sent.  Returns negative on error,
 * otherwise non-zero when no pre-copy data appears to remain.
 */
static int vfio_save_iterate(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    ssize_t data_size;

    /* One-shot trace marking the start of the iterative phase */
    if (!migration->event_save_iterate_started) {
        trace_vfio_save_iterate_start(vbasedev->name);
        migration->event_save_iterate_started = true;
    }

    data_size = vfio_save_block(f, migration);
    if (data_size < 0) {
        return data_size;
    }

    vfio_update_estimated_pending_data(migration, data_size);

    if (migrate_switchover_ack() && !migration->precopy_init_size &&
        !migration->initial_data_sent) {
        /* Tell the destination it may approve switchover from now on */
        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
        migration->initial_data_sent = true;
    } else {
        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    }

    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
                            migration->precopy_dirty_size);

    return !migration->precopy_init_size && !migration->precopy_dirty_size;
}
631 
/*
 * SaveVMHandlers.save_live_complete_precopy: enter STOP_COPY and drain
 * all remaining device state onto the stream.  With multifd transfer the
 * data goes over other channels, so only a dummy EOS is emitted here.
 * Returns 0 on success, negative on error.
 */
static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    ssize_t data_size;
    int ret;
    Error *local_err = NULL;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        vfio_multifd_emit_dummy_eos(vbasedev, f);
        return 0;
    }

    trace_vfio_save_complete_precopy_start(vbasedev->name);

    /* We reach here with device state STOP or STOP_COPY only */
    ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                   VFIO_DEVICE_STATE_STOP, &local_err);
    if (ret) {
        error_report_err(local_err);
        return ret;
    }

    /* Drain until the kernel reports no more data */
    do {
        data_size = vfio_save_block(f, vbasedev->migration);
        if (data_size < 0) {
            return data_size;
        }
    } while (data_size);

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
    ret = qemu_file_get_error(f);

    trace_vfio_save_complete_precopy(vbasedev->name, ret);

    return ret;
}
668 
/*
 * SaveVMHandlers.save_state: save the device config state on the main
 * channel (or a dummy EOS with multifd).  This hook cannot return an
 * error, so failures are stashed in the QEMUFile instead.
 */
static void vfio_save_state(QEMUFile *f, void *opaque)
{
    VFIODevice *vbasedev = opaque;
    Error *local_err = NULL;
    int ret;

    if (vfio_multifd_transfer_enabled(vbasedev)) {
        vfio_multifd_emit_dummy_eos(vbasedev, f);
        return;
    }

    ret = vfio_save_device_config_state(f, opaque, &local_err);
    if (ret) {
        error_prepend(&local_err,
                      "vfio: Failed to save device config space of %s - ",
                      vbasedev->name);
        /* Ownership of local_err passes to the QEMUFile */
        qemu_file_set_error_obj(f, ret, local_err);
    }
}
688 
689 static int vfio_load_setup(QEMUFile *f, void *opaque, Error **errp)
690 {
691     VFIODevice *vbasedev = opaque;
692     VFIOMigration *migration = vbasedev->migration;
693     int ret;
694 
695     if (!vfio_multifd_setup(vbasedev, true, errp)) {
696         return -EINVAL;
697     }
698 
699     ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RESUMING,
700                                    migration->device_state, errp);
701     if (ret) {
702         return ret;
703     }
704 
705     return 0;
706 }
707 
/*
 * SaveVMHandlers.load_cleanup: tear down multifd reception and close the
 * kernel data fd.  Always returns 0.
 */
static int vfio_load_cleanup(void *opaque)
{
    VFIODevice *vbasedev = opaque;

    vfio_multifd_cleanup(vbasedev);

    vfio_migration_cleanup(vbasedev);
    trace_vfio_load_cleanup(vbasedev->name);

    return 0;
}
719 
/*
 * SaveVMHandlers.load_state: consume tagged sections from the main
 * migration channel until END_OF_STATE.  Handles config state, setup
 * state, device data chunks, and the switchover-ack INIT_DATA_SENT
 * marker.  Returns 0 on success, negative on error.
 */
static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
{
    VFIODevice *vbasedev = opaque;
    int ret = 0;
    uint64_t data;

    data = qemu_get_be64(f);
    while (data != VFIO_MIG_FLAG_END_OF_STATE) {

        trace_vfio_load_state(vbasedev->name, data);

        switch (data) {
        case VFIO_MIG_FLAG_DEV_CONFIG_STATE:
        {
            /* Config state arrives on other channels when multifd is on */
            if (vfio_multifd_transfer_enabled(vbasedev)) {
                error_report("%s: got DEV_CONFIG_STATE in main migration "
                             "channel but doing multifd transfer",
                             vbasedev->name);
                return -EINVAL;
            }

            /* Config state is always the final section - return directly */
            return vfio_load_device_config_state(f, opaque);
        }
        case VFIO_MIG_FLAG_DEV_SETUP_STATE:
        {
            /* The setup section carries no payload, just an EOS trailer */
            data = qemu_get_be64(f);
            if (data == VFIO_MIG_FLAG_END_OF_STATE) {
                return ret;
            } else {
                error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64,
                             vbasedev->name, data);
                return -EINVAL;
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_DATA_STATE:
        {
            uint64_t data_size = qemu_get_be64(f);

            /* Forward the chunk to the kernel's resuming data fd */
            if (data_size) {
                ret = vfio_load_buffer(f, vbasedev, data_size);
                if (ret < 0) {
                    return ret;
                }
            }
            break;
        }
        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
        {
            if (!vfio_precopy_supported(vbasedev) ||
                !migrate_switchover_ack()) {
                error_report("%s: Received INIT_DATA_SENT but switchover ack "
                             "is not used", vbasedev->name);
                return -EINVAL;
            }

            /* All initial data received - let migration switch over */
            ret = qemu_loadvm_approve_switchover();
            if (ret) {
                error_report(
                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
                    vbasedev->name, ret, strerror(-ret));
            }

            return ret;
        }
        default:
            error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
            return -EINVAL;
        }

        data = qemu_get_be64(f);
        ret = qemu_file_get_error(f);
        if (ret) {
            return ret;
        }
    }
    return ret;
}
798 
799 static bool vfio_switchover_ack_needed(void *opaque)
800 {
801     VFIODevice *vbasedev = opaque;
802 
803     return vfio_precopy_supported(vbasedev);
804 }
805 
806 static int vfio_switchover_start(void *opaque)
807 {
808     VFIODevice *vbasedev = opaque;
809 
810     if (vfio_multifd_transfer_enabled(vbasedev)) {
811         return vfio_multifd_switchover_start(vbasedev);
812     }
813 
814     return 0;
815 }
816 
/* Migration callbacks registered for each VFIO device */
static const SaveVMHandlers savevm_vfio_handlers = {
    .save_prepare = vfio_save_prepare,
    .save_setup = vfio_save_setup,
    .save_cleanup = vfio_save_cleanup,
    .state_pending_estimate = vfio_state_pending_estimate,
    .state_pending_exact = vfio_state_pending_exact,
    .is_active_iterate = vfio_is_active_iterate,
    .save_live_iterate = vfio_save_iterate,
    .save_live_complete_precopy = vfio_save_complete_precopy,
    .save_state = vfio_save_state,
    .load_setup = vfio_load_setup,
    .load_cleanup = vfio_load_cleanup,
    .load_state = vfio_load_state,
    .switchover_ack_needed = vfio_switchover_ack_needed,
    /*
     * Multifd support
     */
    .load_state_buffer = vfio_multifd_load_state_buffer,
    .switchover_start = vfio_switchover_start,
    .save_live_complete_precopy_thread = vfio_multifd_save_complete_precopy_thread,
};
838 
839 /* ---------------------------------------------------------------------- */
840 
/*
 * VM change state "prepare" callback: move the device into the matching
 * P2P state (PRE_COPY_P2P or RUNNING_P2P) before other devices are
 * stopped, so peer-to-peer DMA quiesces first.
 */
static void vfio_vmstate_change_prepare(void *opaque, bool running,
                                        RunState state)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    enum vfio_device_mig_state new_state;
    Error *local_err = NULL;
    int ret;

    new_state = migration->device_state == VFIO_DEVICE_STATE_PRE_COPY ?
                    VFIO_DEVICE_STATE_PRE_COPY_P2P :
                    VFIO_DEVICE_STATE_RUNNING_P2P;

    ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
    if (ret) {
        /*
         * Migration should be aborted in this case, but vm_state_notify()
         * currently does not support reporting failures.
         */
        migration_file_set_error(ret, local_err);
    }

    trace_vfio_vmstate_change_prepare(vbasedev->name, running,
                                      RunState_str(state),
                                      mig_state_to_str(new_state));
}
867 
868 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
869 {
870     VFIODevice *vbasedev = opaque;
871     enum vfio_device_mig_state new_state;
872     Error *local_err = NULL;
873     int ret;
874 
875     if (running) {
876         new_state = VFIO_DEVICE_STATE_RUNNING;
877     } else {
878         new_state =
879             (vfio_device_state_is_precopy(vbasedev) &&
880              (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
881                 VFIO_DEVICE_STATE_STOP_COPY :
882                 VFIO_DEVICE_STATE_STOP;
883     }
884 
885     ret = vfio_migration_set_state_or_reset(vbasedev, new_state, &local_err);
886     if (ret) {
887         /*
888          * Migration should be aborted in this case, but vm_state_notify()
889          * currently does not support reporting failures.
890          */
891         migration_file_set_error(ret, local_err);
892     }
893 
894     trace_vfio_vmstate_change(vbasedev->name, running, RunState_str(state),
895                               mig_state_to_str(new_state));
896 }
897 
/*
 * Migration state notifier: on a failed pre-copy migration, put the
 * device back into RUNNING (resetting it if that fails).  Always
 * returns 0; errors are reported locally, not through errp.
 */
static int vfio_migration_state_notifier(NotifierWithReturn *notifier,
                                         MigrationEvent *e, Error **errp)
{
    VFIOMigration *migration = container_of(notifier, VFIOMigration,
                                            migration_state);
    VFIODevice *vbasedev = migration->vbasedev;
    Error *local_err = NULL;
    int ret;

    trace_vfio_migration_state_notifier(vbasedev->name, e->type);

    if (e->type == MIG_EVENT_PRECOPY_FAILED) {
        /*
         * MigrationNotifyFunc may not return an error code and an Error
         * object for MIG_EVENT_PRECOPY_FAILED. Hence, report the error
         * locally and ignore the errp argument.
         */
        ret = vfio_migration_set_state_or_reset(vbasedev,
                                                VFIO_DEVICE_STATE_RUNNING,
                                                &local_err);
        if (ret) {
            error_report_err(local_err);
        }
    }
    return 0;
}
924 
925 static void vfio_migration_free(VFIODevice *vbasedev)
926 {
927     g_free(vbasedev->migration);
928     vbasedev->migration = NULL;
929 }
930 
/*
 * Query the device's migration capability flags via the
 * VFIO_DEVICE_FEATURE ioctl with VFIO_DEVICE_FEATURE_MIGRATION.
 *
 * On success, stores the kernel-reported VFIO_MIGRATION_* bits in
 * @mig_flags and returns 0.  Returns -errno if the ioctl fails.
 */
static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
{
    /*
     * The ioctl payload is a struct vfio_device_feature header followed
     * immediately by a struct vfio_device_feature_migration in feature->data.
     * Declaring the backing buffer as uint64_t elements keeps it suitably
     * aligned for both structures.
     */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                                  sizeof(struct vfio_device_feature_migration),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_migration *mig =
        (struct vfio_device_feature_migration *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
    if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
        return -errno;
    }

    *mig_flags = mig->flags;

    return 0;
}
950 
951 static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
952 {
953     uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
954                               sizeof(uint64_t))] = {};
955     struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
956 
957     feature->argsz = sizeof(buf);
958     feature->flags = VFIO_DEVICE_FEATURE_PROBE |
959                      VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
960 
961     return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
962 }
963 
/*
 * Set up migration support for a VFIO device.
 *
 * Queries the kernel's migration capabilities, allocates and initializes
 * the VFIOMigration state, registers the savevm handlers, and installs
 * the VM state-change and migration-state notifiers.
 *
 * Returns 0 on success; -EINVAL if the device has no backing object,
 * -EOPNOTSUPP if the kernel lacks basic STOP_COPY support, or a negative
 * errno value from the feature query ioctl (e.g. -ENOTTY when the kernel
 * has no VFIO migration support at all).
 */
static int vfio_migration_init(VFIODevice *vbasedev)
{
    int ret;
    Object *obj;
    VFIOMigration *migration;
    char id[256] = "";
    g_autofree char *path = NULL, *oid = NULL;
    uint64_t mig_flags = 0;
    VMChangeStateHandler *prepare_cb;

    if (!vbasedev->ops->vfio_get_object) {
        return -EINVAL;
    }

    obj = vbasedev->ops->vfio_get_object(vbasedev);
    if (!obj) {
        return -EINVAL;
    }

    ret = vfio_migration_query_flags(vbasedev, &mig_flags);
    if (ret) {
        return ret;
    }

    /* Basic migration functionality must be supported */
    if (!(mig_flags & VFIO_MIGRATION_STOP_COPY)) {
        return -EOPNOTSUPP;
    }

    /* Device starts in RUNNING with no open data transfer fd. */
    vbasedev->migration = g_new0(VFIOMigration, 1);
    migration = vbasedev->migration;
    migration->vbasedev = vbasedev;
    migration->device_state = VFIO_DEVICE_STATE_RUNNING;
    migration->data_fd = -1;
    migration->mig_flags = mig_flags;

    vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);

    /*
     * Derive the savevm section id from the device's vmstate id when one
     * exists ("<id>/vfio"), otherwise fall back to plain "vfio".
     */
    oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
    if (oid) {
        path = g_strdup_printf("%s/vfio", oid);
    } else {
        path = g_strdup("vfio");
    }
    strpadcpy(id, sizeof(id), path, '\0');

    register_savevm_live(id, VMSTATE_INSTANCE_ID_ANY, 1, &savevm_vfio_handlers,
                         vbasedev);

    /* Only devices advertising P2P support get a prepare callback. */
    prepare_cb = migration->mig_flags & VFIO_MIGRATION_P2P ?
                     vfio_vmstate_change_prepare :
                     NULL;
    migration->vm_state = qdev_add_vm_change_state_handler_full(
        vbasedev->dev, vfio_vmstate_change, prepare_cb, vbasedev);
    migration_add_notifier(&migration->migration_state,
                           vfio_migration_state_notifier);

    return 0;
}
1023 
/*
 * Tear down everything set up by vfio_migration_init(), in reverse order
 * of registration, then drop the multiple-devices migration blocker.
 * Caller must ensure vbasedev->migration is non-NULL.
 */
static void vfio_migration_deinit(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;

    migration_remove_notifier(&migration->migration_state);
    qemu_del_vm_change_state_handler(migration->vm_state);
    unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
    vfio_migration_free(vbasedev);
    vfio_unblock_multiple_devices_migration();
}
1034 
1035 static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
1036 {
1037     if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
1038         error_propagate(errp, err);
1039         return -EINVAL;
1040     }
1041 
1042     vbasedev->migration_blocker = error_copy(err);
1043     error_free(err);
1044 
1045     return migrate_add_blocker_normal(&vbasedev->migration_blocker, errp);
1046 }
1047 
1048 /* ---------------------------------------------------------------------- */
1049 
1050 int64_t vfio_mig_bytes_transferred(void)
1051 {
1052     return MIN(qatomic_read(&bytes_transferred), INT64_MAX);
1053 }
1054 
/* Atomically reset the global VFIO migrated-bytes counter to zero. */
void vfio_reset_bytes_transferred(void)
{
    qatomic_set(&bytes_transferred, 0);
}
1059 
/* Atomically add @val to the global VFIO migrated-bytes counter. */
void vfio_mig_add_bytes_transferred(unsigned long val)
{
    qatomic_add(&bytes_transferred, val);
}
1064 
1065 /*
1066  * Return true when either migration initialized or blocker registered.
1067  * Currently only return false when adding blocker fails which will
1068  * de-register vfio device.
1069  */
1070 bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
1071 {
1072     Error *err = NULL;
1073     int ret;
1074 
1075     if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
1076         error_setg(&err, "%s: Migration is disabled for VFIO device",
1077                    vbasedev->name);
1078         return !vfio_block_migration(vbasedev, err, errp);
1079     }
1080 
1081     ret = vfio_migration_init(vbasedev);
1082     if (ret) {
1083         if (ret == -ENOTTY) {
1084             error_setg(&err, "%s: VFIO migration is not supported in kernel",
1085                        vbasedev->name);
1086         } else {
1087             error_setg(&err,
1088                        "%s: Migration couldn't be initialized for VFIO device, "
1089                        "err: %d (%s)",
1090                        vbasedev->name, ret, strerror(-ret));
1091         }
1092 
1093         return !vfio_block_migration(vbasedev, err, errp);
1094     }
1095 
1096     if ((!vbasedev->dirty_pages_supported ||
1097          vbasedev->device_dirty_page_tracking == ON_OFF_AUTO_OFF) &&
1098         !vbasedev->iommu_dirty_tracking) {
1099         if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
1100             error_setg(&err,
1101                        "%s: VFIO device doesn't support device and "
1102                        "IOMMU dirty tracking", vbasedev->name);
1103             goto add_blocker;
1104         }
1105 
1106         warn_report("%s: VFIO device doesn't support device and "
1107                     "IOMMU dirty tracking", vbasedev->name);
1108     }
1109 
1110     ret = vfio_block_multiple_devices_migration(vbasedev, errp);
1111     if (ret) {
1112         goto out_deinit;
1113     }
1114 
1115     if (vfio_viommu_preset(vbasedev)) {
1116         error_setg(&err, "%s: Migration is currently not supported "
1117                    "with vIOMMU enabled", vbasedev->name);
1118         goto add_blocker;
1119     }
1120 
1121     trace_vfio_migration_realize(vbasedev->name);
1122     return true;
1123 
1124 add_blocker:
1125     ret = vfio_block_migration(vbasedev, err, errp);
1126 out_deinit:
1127     if (ret) {
1128         vfio_migration_deinit(vbasedev);
1129     }
1130     return !ret;
1131 }
1132 
1133 void vfio_migration_exit(VFIODevice *vbasedev)
1134 {
1135     if (vbasedev->migration) {
1136         vfio_migration_deinit(vbasedev);
1137     }
1138 
1139     migrate_del_blocker(&vbasedev->migration_blocker);
1140 }
1141