xref: /qemu/hw/vfio/migration-multifd.c (revision 623af41dd331d1a57a41bc3374e3d134adb33f4c)
1 /*
2  * Multifd VFIO migration
3  *
4  * Copyright (C) 2024,2025 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "hw/vfio/vfio-common.h"
14 #include "migration/misc.h"
15 #include "qapi/error.h"
16 #include "qemu/error-report.h"
17 #include "qemu/lockable.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/thread.h"
20 #include "io/channel-buffer.h"
21 #include "migration/qemu-file.h"
22 #include "migration-multifd.h"
23 #include "trace.h"
24 
25 #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
26 
27 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
28 
29 typedef struct VFIODeviceStatePacket {
30     uint32_t version;
31     uint32_t idx;
32     uint32_t flags;
33     uint8_t data[0];
34 } QEMU_PACKED VFIODeviceStatePacket;
35 
/*
 * Thin wrapper around a GArray of VFIOStateBuffer elements so the compiler
 * can distinguish it from unrelated GArray users (type safety).
 */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;
40 
/* A single received device state buffer, stored until loaded into the device */
typedef struct VFIOStateBuffer {
    bool is_present; /* true once data for this index has been received */
    char *data;      /* heap copy of the packet payload; g_free()d on clear */
    size_t len;      /* payload length in bytes */
} VFIOStateBuffer;
46 
/* Per-device multifd migration state (currently: the load/receive side) */
typedef struct VFIOMultifd {
    /* both flags below are protected by load_bufs_mutex */
    bool load_bufs_thread_running;
    bool load_bufs_thread_want_exit;

    /* buffers received over multifd, awaiting load into the device */
    VFIOStateBuffers load_bufs;
    /* signalled whenever a new buffer is inserted into load_bufs */
    QemuCond load_bufs_buffer_ready_cond;
    /* signalled by the load_bufs thread just before it exits */
    QemuCond load_bufs_thread_finished_cond;
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    /* next buffer index to load into the device */
    uint32_t load_buf_idx;
    /* index of the final (config state) buffer; UINT32_MAX until known */
    uint32_t load_buf_idx_last;
} VFIOMultifd;
58 
59 static void vfio_state_buffer_clear(gpointer data)
60 {
61     VFIOStateBuffer *lb = data;
62 
63     if (!lb->is_present) {
64         return;
65     }
66 
67     g_clear_pointer(&lb->data, g_free);
68     lb->is_present = false;
69 }
70 
71 static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
72 {
73     bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
74     g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
75 }
76 
77 static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
78 {
79     g_clear_pointer(&bufs->array, g_array_unref);
80 }
81 
82 static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
83 {
84     assert(bufs->array);
85 }
86 
87 static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
88 {
89     return bufs->array->len;
90 }
91 
/*
 * Resize the state buffer array to @size slots; new slots are zeroed,
 * removed slots are released via the clear func.
 */
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}
97 
/* Pointer to slot @idx; the caller must ensure idx is within the array size */
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
103 
/*
 * Stash the payload of an incoming device state packet into the load_bufs
 * slot given by packet->idx, growing the array as needed.
 *
 * Returns true on success; if the slot is already filled, sets @errp and
 * returns false.
 *
 * Called with load_bufs_mutex locked.
 */
static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
                                          VFIODeviceStatePacket *packet,
                                          size_t packet_total_size,
                                          Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;

    vfio_state_buffers_assert_init(&multifd->load_bufs);
    if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
        /* the caller rejected idx == UINT32_MAX, so idx + 1 cannot wrap */
        vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
    }

    lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
    if (lb->is_present) {
        error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
                   vbasedev->name, packet->idx);
        return false;
    }

    /*
     * Invariant: load_buf_idx only advances past slots that were present,
     * and consumed slots keep is_present set, so an index below
     * load_buf_idx would have been caught by the duplicate check above.
     */
    assert(packet->idx >= multifd->load_buf_idx);

    /* copy only the payload that follows the fixed packet header */
    lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
    lb->len = packet_total_size - sizeof(*packet);
    lb->is_present = true;

    return true;
}
134 
/*
 * Incoming multifd packet handler for this device: validates a received
 * VFIODeviceStatePacket header, stashes its payload into load_bufs and
 * wakes the load_bufs thread.
 *
 * Must be called without the BQL held (asserted below) to preserve the
 * load_bufs_mutex -> BQL lock order.
 *
 * Returns true on success; on failure sets @errp and returns false.
 */
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    /* the fixed header must be complete before its fields can be trusted */
    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    /* UINT32_MAX is reserved; also keeps idx + 1 in the insert path from wrapping */
    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        /* wake the load_bufs thread in case it was waiting for this buffer */
        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}
193 
/*
 * Load the device config state - the final buffer in the stream - into
 * the device.
 *
 * The raw buffer bytes are replayed through a QEMUFile backed by an
 * in-memory channel so vfio_load_device_config_state() can parse them the
 * same way it parses a main migration channel stream.
 *
 * Called from the load_bufs thread with load_bufs_mutex held; the BQL is
 * taken only around the actual config load, preserving the
 * load_bufs_mutex -> BQL lock order.
 *
 * Returns true on success; on failure sets @errp and returns false.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    /* only reached once every preceding buffer has been loaded */
    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    /* fill the in-memory channel with the received config state bytes */
    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* rewind the channel and re-open it for reading */
    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    /* Lock order: load_bufs_mutex (held by our caller) -> BQL */
    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}
244 
245 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
246 {
247     VFIOStateBuffer *lb;
248     unsigned int bufs_len;
249 
250     bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
251     if (multifd->load_buf_idx >= bufs_len) {
252         assert(multifd->load_buf_idx == bufs_len);
253         return NULL;
254     }
255 
256     lb = vfio_state_buffers_at(&multifd->load_bufs,
257                                multifd->load_buf_idx);
258     if (!lb->is_present) {
259         return NULL;
260     }
261 
262     return lb;
263 }
264 
/*
 * Write one state buffer into the device via migration->data_fd, freeing
 * the buffer's payload in the process.
 *
 * Called from the load_bufs thread with load_bufs_mutex held; the lock is
 * temporarily dropped around each write() since loading data into the
 * device can take a while.
 *
 * Returns true on success; on a write error sets @errp and returns false.
 */
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    /* an empty buffer needs no device write */
    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        /* preserve errno across the mutex re-lock below */
        errno_save = errno;
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        /* handle short writes by advancing within the buffer */
        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}
316 
317 static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
318                                             bool *should_quit)
319 {
320     return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
321 }
322 
/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in main migration
 * stream.
 *
 * It exits after either:
 * * completing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 *
 * Returns true on success, false on error or cancellation (@errp set).
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    /* held for the rest of the thread, released while cond-waiting below */
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            /* the next buffer hasn't arrived yet - wait for it */
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* the last buffer holds the config state, loaded outside this loop */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        /* may temporarily drop load_bufs_mutex while writing to the device */
        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}
409 
410 static VFIOMultifd *vfio_multifd_new(void)
411 {
412     VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
413 
414     vfio_state_buffers_init(&multifd->load_bufs);
415 
416     qemu_mutex_init(&multifd->load_bufs_mutex);
417 
418     multifd->load_buf_idx = 0;
419     multifd->load_buf_idx_last = UINT32_MAX;
420     qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
421 
422     multifd->load_bufs_thread_running = false;
423     multifd->load_bufs_thread_want_exit = false;
424     qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
425 
426     return multifd;
427 }
428 
/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it had finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            /* wake the thread in case it is waiting for a buffer to arrive */
            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            /* the thread signals this cond right before exiting */
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}
451 
452 static void vfio_multifd_free(VFIOMultifd *multifd)
453 {
454     vfio_load_cleanup_load_bufs_thread(multifd);
455 
456     qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
457     vfio_state_buffers_destroy(&multifd->load_bufs);
458     qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
459     qemu_mutex_destroy(&multifd->load_bufs_mutex);
460 
461     g_free(multifd);
462 }
463 
464 void vfio_multifd_cleanup(VFIODevice *vbasedev)
465 {
466     VFIOMigration *migration = vbasedev->migration;
467 
468     g_clear_pointer(&migration->multifd, vfio_multifd_free);
469 }
470 
471 bool vfio_multifd_transfer_supported(void)
472 {
473     return multifd_device_state_supported() &&
474         migrate_send_switchover_start();
475 }
476 
477 bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
478 {
479     VFIOMigration *migration = vbasedev->migration;
480 
481     return migration->multifd_transfer;
482 }
483 
484 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
485 {
486     VFIOMigration *migration = vbasedev->migration;
487 
488     if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
489         migration->multifd_transfer = vfio_multifd_transfer_supported();
490     } else {
491         migration->multifd_transfer =
492             vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
493     }
494 
495     if (!vfio_multifd_transfer_enabled(vbasedev)) {
496         /* Nothing further to check or do */
497         return true;
498     }
499 
500     if (!vfio_multifd_transfer_supported()) {
501         error_setg(errp,
502                    "%s: Multifd device transfer requested but unsupported in the current config",
503                    vbasedev->name);
504         return false;
505     }
506 
507     if (alloc_multifd) {
508         assert(!migration->multifd);
509         migration->multifd = vfio_multifd_new();
510     }
511 
512     return true;
513 }
514 
/*
 * Write a VFIO_MIG_FLAG_END_OF_STATE marker for this device to the main
 * migration channel @f. Only valid with multifd transfer enabled
 * (asserted below).
 */
void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}
525 
/*
 * Serialize the device config state and queue it as the final multifd
 * packet (flagged VFIO_DEVICE_STATE_CONFIG_STATE) with buffer index @idx.
 *
 * The config state is first written into an in-memory channel, then
 * wrapped in a VFIODeviceStatePacket for the multifd queue.
 *
 * Returns true on success; on failure sets @errp and returns false.
 */
static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    /* a non-zero return means failure, with @errp already set */
    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* bioc->usage is the number of bytes the config state serialized to */
    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;
    packet->idx = idx;
    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_mig_add_bytes_transferred(packet_len);

    return true;
}
573 
/*
 * This thread is spawned by the migration core directly via
 * .save_live_complete_precopy_thread SaveVMHandler.
 *
 * It exits after either:
 * * completing saving the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by
 *   multifd_device_state_save_thread_should_exit() returning true.
 *
 * Returns true on success, false on error or cancellation (@errp set).
 */
bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp)
{
    VFIODevice *vbasedev = d->handler_opaque;
    VFIOMigration *migration = vbasedev->migration;
    bool ret = false;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    uint32_t idx;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
        return true;
    }

    trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
                                                  d->idstr, d->instance_id);

    /* We reach here with device state STOP or STOP_COPY only */
    if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                 VFIO_DEVICE_STATE_STOP, errp)) {
        goto thread_exit;
    }

    /* one packet buffer is reused; only idx and the payload change per loop */
    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;

    for (idx = 0; ; idx++) {
        ssize_t data_size;
        size_t packet_size;

        if (multifd_device_state_save_thread_should_exit()) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        data_size = read(migration->data_fd, &packet->data,
                         migration->data_buffer_size);
        if (data_size < 0) {
            error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, idx, errno);
            goto thread_exit;
        } else if (data_size == 0) {
            /* EOF: the whole device state has been read */
            break;
        }

        packet->idx = idx;
        packet_size = sizeof(*packet) + data_size;

        if (!multifd_queue_device_state(d->idstr, d->instance_id,
                                        (char *)packet, packet_size)) {
            error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
            goto thread_exit;
        }

        vfio_mig_add_bytes_transferred(packet_size);
    }

    /* the config state goes last, at index idx (one past the data buffers) */
    if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
                                                        d->idstr,
                                                        d->instance_id,
                                                        idx, errp)) {
        goto thread_exit;
   }

    ret = true;

thread_exit:
    trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);

    return ret;
}
656 
/*
 * Called (with the BQL held) when the switchover point marker is read
 * from the main migration stream: marks the load_bufs thread as running
 * and asks the migration core to spawn it.
 *
 * Always returns 0.
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}
676