xref: /qemu/hw/vfio/migration-multifd.c (revision c07cd110a1824e2d046581af7375f16dac26e96f)
1 /*
2  * Multifd VFIO migration
3  *
4  * Copyright (C) 2024,2025 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "hw/vfio/vfio-common.h"
14 #include "migration/misc.h"
15 #include "qapi/error.h"
16 #include "qemu/bswap.h"
17 #include "qemu/error-report.h"
18 #include "qemu/lockable.h"
19 #include "qemu/main-loop.h"
20 #include "qemu/thread.h"
21 #include "io/channel-buffer.h"
22 #include "migration/qemu-file.h"
23 #include "migration-multifd.h"
24 #include "trace.h"
25 
26 #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
27 
28 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
29 
30 typedef struct VFIODeviceStatePacket {
31     uint32_t version;
32     uint32_t idx;
33     uint32_t flags;
34     uint8_t data[0];
35 } QEMU_PACKED VFIODeviceStatePacket;
36 
/* type safety */
typedef struct VFIOStateBuffers {
    GArray *array; /* of VFIOStateBuffer; owned, NULL after destroy */
} VFIOStateBuffers;
41 
/* One received-but-not-yet-loaded device state buffer */
typedef struct VFIOStateBuffer {
    bool is_present; /* true once a packet for this index was received */
    char *data;      /* heap copy of the packet payload; owned here */
    size_t len;      /* payload length in bytes */
} VFIOStateBuffer;
47 
typedef struct VFIOMultifd {
    /* Load thread lifecycle flags, protected by load_bufs_mutex */
    bool load_bufs_thread_running;
    bool load_bufs_thread_want_exit;

    /* Per-index incoming device state buffers, filled by multifd channels */
    VFIOStateBuffers load_bufs;
    /* Signalled when a new buffer arrives (or the thread is asked to exit) */
    QemuCond load_bufs_buffer_ready_cond;
    /* Signalled by the load thread just before it exits */
    QemuCond load_bufs_thread_finished_cond;
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    /* Next buffer index the load thread will consume */
    uint32_t load_buf_idx;
    /* Index of the final (config state) buffer; UINT32_MAX until known */
    uint32_t load_buf_idx_last;
} VFIOMultifd;
59 
60 static void vfio_state_buffer_clear(gpointer data)
61 {
62     VFIOStateBuffer *lb = data;
63 
64     if (!lb->is_present) {
65         return;
66     }
67 
68     g_clear_pointer(&lb->data, g_free);
69     lb->is_present = false;
70 }
71 
72 static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
73 {
74     bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
75     g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
76 }
77 
78 static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
79 {
80     g_clear_pointer(&bufs->array, g_array_unref);
81 }
82 
/* Sanity-check that the buffer array was initialized and not yet destroyed. */
static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}
87 
/* Number of buffer slots currently allocated (filled or not). */
static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}
92 
/*
 * Resize the buffer array; newly added slots are zero-initialized
 * (the array was created with clear_ = TRUE).
 */
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}
98 
/* Return a pointer to slot @idx; the caller must ensure idx is in range. */
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
104 
105 /* called with load_bufs_mutex locked */
106 static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
107                                           VFIODeviceStatePacket *packet,
108                                           size_t packet_total_size,
109                                           Error **errp)
110 {
111     VFIOMigration *migration = vbasedev->migration;
112     VFIOMultifd *multifd = migration->multifd;
113     VFIOStateBuffer *lb;
114 
115     vfio_state_buffers_assert_init(&multifd->load_bufs);
116     if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
117         vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
118     }
119 
120     lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
121     if (lb->is_present) {
122         error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
123                    vbasedev->name, packet->idx);
124         return false;
125     }
126 
127     assert(packet->idx >= multifd->load_buf_idx);
128 
129     lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
130     lb->len = packet_total_size - sizeof(*packet);
131     lb->is_present = true;
132 
133     return true;
134 }
135 
/*
 * Multifd device state load handler: called for each received VFIO
 * device state packet, from a multifd channel thread (BQL not held).
 *
 * Validates the packet header, byte-swaps it in place from wire
 * (big-endian) order and stashes the payload for the load thread.
 * Returns true on success, false (with *errp set) on error.
 */
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    packet->version = be32_to_cpu(packet->version);
    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    packet->idx = be32_to_cpu(packet->idx);
    packet->flags = be32_to_cpu(packet->flags);

    /* UINT32_MAX is reserved as the "last index unknown" marker */
    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        /* Wake the load thread in case it is waiting for this buffer */
        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}
198 
/*
 * Load the device config state stashed in the final state buffer.
 *
 * The buffer is replayed through a memory-backed QEMUFile: first written
 * out to a QIOChannelBuffer, then read back, its
 * VFIO_MIG_FLAG_DEV_CONFIG_STATE header checked, and finally fed to
 * vfio_load_device_config_state() under BQL.
 *
 * Called from the load thread with load_bufs_mutex held.
 * Returns true on success, false (with *errp set) on error.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    /* Must only be reached once all other buffers were consumed */
    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* Rewind the buffer channel so the data can be read back */
    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    /* The lock order is load_bufs_mutex -> BQL, so taking BQL here is fine */
    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}
249 
250 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
251 {
252     VFIOStateBuffer *lb;
253     unsigned int bufs_len;
254 
255     bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
256     if (multifd->load_buf_idx >= bufs_len) {
257         assert(multifd->load_buf_idx == bufs_len);
258         return NULL;
259     }
260 
261     lb = vfio_state_buffers_at(&multifd->load_bufs,
262                                multifd->load_buf_idx);
263     if (!lb->is_present) {
264         return NULL;
265     }
266 
267     return lb;
268 }
269 
/*
 * Write one queued state buffer into the device via its migration
 * data_fd.
 *
 * Called from the load thread with load_bufs_mutex held; the lock is
 * dropped around each write() since loading into the device can take
 * a while.  Returns true on success, false (with *errp set) on error.
 */
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    if (!lb->len) {
        /* Nothing to write for an empty buffer */
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        errno_save = errno; /* capture before re-locking can clobber errno */
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        /* Short writes are fine; just advance and retry the remainder */
        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}
321 
322 static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
323                                             bool *should_quit)
324 {
325     return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
326 }
327 
328 /*
329  * This thread is spawned by vfio_multifd_switchover_start() which gets
330  * called upon encountering the switchover point marker in main migration
331  * stream.
332  *
333  * It exits after either:
334  * * completing loading the remaining device state and device config, OR:
335  * * encountering some error while doing the above, OR:
336  * * being forcefully aborted by the migration core by it setting should_quit
337  *   or by vfio_load_cleanup_load_bufs_thread() setting
338  *   multifd->load_bufs_thread_want_exit.
339  */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    /* Held for the whole loop; dropped only inside the write/wait calls */
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            /* Next buffer hasn't arrived yet; wait for producers */
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* The last buffer carries the config state, loaded after the loop */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        /* Note: temporarily drops load_bufs_mutex around device writes */
        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}
414 
415 static VFIOMultifd *vfio_multifd_new(void)
416 {
417     VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
418 
419     vfio_state_buffers_init(&multifd->load_bufs);
420 
421     qemu_mutex_init(&multifd->load_bufs_mutex);
422 
423     multifd->load_buf_idx = 0;
424     multifd->load_buf_idx_last = UINT32_MAX;
425     qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
426 
427     multifd->load_bufs_thread_running = false;
428     multifd->load_bufs_thread_want_exit = false;
429     qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
430 
431     return multifd;
432 }
433 
434 /*
435  * Terminates vfio_load_bufs_thread by setting
436  * multifd->load_bufs_thread_want_exit and signalling all the conditions
437  * the thread could be blocked on.
438  *
439  * Waits for the thread to signal that it had finished.
440  */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* Loop to tolerate spurious wakeups of the finished cond wait */
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            /* Wake the thread in case it is blocked waiting for a buffer */
            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}
456 
/*
 * Free a VFIOMultifd instance.
 *
 * The load thread (if any) must be stopped first since it uses the
 * synchronization objects destroyed below.
 */
static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}
468 
469 void vfio_multifd_cleanup(VFIODevice *vbasedev)
470 {
471     VFIOMigration *migration = vbasedev->migration;
472 
473     g_clear_pointer(&migration->multifd, vfio_multifd_free);
474 }
475 
/*
 * Multifd device state transfer needs both multifd device state support
 * in the migration core and the switchover-start notification enabled.
 */
bool vfio_multifd_transfer_supported(void)
{
    if (!multifd_device_state_supported()) {
        return false;
    }

    return migrate_send_switchover_start();
}
481 
482 bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
483 {
484     VFIOMigration *migration = vbasedev->migration;
485 
486     return migration->multifd_transfer;
487 }
488 
489 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
490 {
491     VFIOMigration *migration = vbasedev->migration;
492 
493     /*
494      * Make a copy of this setting at the start in case it is changed
495      * mid-migration.
496      */
497     if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
498         migration->multifd_transfer = vfio_multifd_transfer_supported();
499     } else {
500         migration->multifd_transfer =
501             vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
502     }
503 
504     if (!vfio_multifd_transfer_enabled(vbasedev)) {
505         /* Nothing further to check or do */
506         return true;
507     }
508 
509     if (!vfio_multifd_transfer_supported()) {
510         error_setg(errp,
511                    "%s: Multifd device transfer requested but unsupported in the current config",
512                    vbasedev->name);
513         return false;
514     }
515 
516     if (alloc_multifd) {
517         assert(!migration->multifd);
518         migration->multifd = vfio_multifd_new();
519     }
520 
521     return true;
522 }
523 
/*
 * Write a terminating VFIO_MIG_FLAG_END_OF_STATE marker on the main
 * migration channel for a device whose real state goes via multifd.
 */
void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    /*
     * Emit dummy NOP data on the main migration channel since the actual
     * device state transfer is done via multifd channels.
     */
    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}
534 
/*
 * Serialize the device config state into a memory QEMUFile and queue it
 * as the final multifd packet (index @idx, flagged
 * VFIO_DEVICE_STATE_CONFIG_STATE so the destination knows it is last).
 *
 * Returns true on success, false (with *errp set) on error.
 */
static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* Wrap the buffered config bytes in a wire packet (big-endian header) */
    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
    packet->idx = cpu_to_be32(idx);
    packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE);
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_mig_add_bytes_transferred(packet_len);

    return true;
}
582 
583 /*
584  * This thread is spawned by the migration core directly via
585  * .save_live_complete_precopy_thread SaveVMHandler.
586  *
587  * It exits after either:
588  * * completing saving the remaining device state and device config, OR:
589  * * encountering some error while doing the above, OR:
590  * * being forcefully aborted by the migration core by
591  *   multifd_device_state_save_thread_should_exit() returning true.
592  */
593 bool
594 vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
595                                           Error **errp)
596 {
597     VFIODevice *vbasedev = d->handler_opaque;
598     VFIOMigration *migration = vbasedev->migration;
599     bool ret = false;
600     g_autofree VFIODeviceStatePacket *packet = NULL;
601     uint32_t idx;
602 
603     if (!vfio_multifd_transfer_enabled(vbasedev)) {
604         /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
605         return true;
606     }
607 
608     trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
609                                                   d->idstr, d->instance_id);
610 
611     /* We reach here with device state STOP or STOP_COPY only */
612     if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
613                                  VFIO_DEVICE_STATE_STOP, errp)) {
614         goto thread_exit;
615     }
616 
617     packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
618     packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
619 
620     for (idx = 0; ; idx++) {
621         ssize_t data_size;
622         size_t packet_size;
623 
624         if (multifd_device_state_save_thread_should_exit()) {
625             error_setg(errp, "operation cancelled");
626             goto thread_exit;
627         }
628 
629         data_size = read(migration->data_fd, &packet->data,
630                          migration->data_buffer_size);
631         if (data_size < 0) {
632             error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
633                        vbasedev->name, idx, errno);
634             goto thread_exit;
635         } else if (data_size == 0) {
636             break;
637         }
638 
639         packet->idx = cpu_to_be32(idx);
640         packet_size = sizeof(*packet) + data_size;
641 
642         if (!multifd_queue_device_state(d->idstr, d->instance_id,
643                                         (char *)packet, packet_size)) {
644             error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
645             goto thread_exit;
646         }
647 
648         vfio_mig_add_bytes_transferred(packet_size);
649     }
650 
651     if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
652                                                         d->idstr,
653                                                         d->instance_id,
654                                                         idx, errp)) {
655         goto thread_exit;
656    }
657 
658     ret = true;
659 
660 thread_exit:
661     trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);
662 
663     return ret;
664 }
665 
/*
 * Called (with BQL held) upon encountering the switchover point marker
 * in the main migration stream: marks the load thread as running and
 * spawns it.  Returns 0 (currently cannot fail).
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        /* Set before spawning so cleanup knows it must wait for the thread */
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}
685