xref: /qemu/hw/vfio/migration-multifd.c (revision 6ff5da16000f908140723e164d33a0b51a6c4162)
1 /*
2  * Multifd VFIO migration
3  *
4  * Copyright (C) 2024,2025 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "hw/vfio/vfio-common.h"
14 #include "migration/misc.h"
15 #include "qapi/error.h"
16 #include "qemu/error-report.h"
17 #include "qemu/lockable.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/thread.h"
20 #include "io/channel-buffer.h"
21 #include "migration/qemu-file.h"
22 #include "migration-multifd.h"
23 #include "trace.h"
24 
25 #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
26 
27 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
28 
29 typedef struct VFIODeviceStatePacket {
30     uint32_t version;
31     uint32_t idx;
32     uint32_t flags;
33     uint8_t data[0];
34 } QEMU_PACKED VFIODeviceStatePacket;
35 
/*
 * Thin wrapper around the GArray of VFIOStateBuffer elements so that all
 * accesses go through the typed helpers below (type safety).
 */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;

/* One queued chunk of device state received over a multifd channel */
typedef struct VFIOStateBuffer {
    bool is_present;    /* true once data for this index has arrived */
    char *data;         /* heap copy of the packet payload */
    size_t len;         /* payload length in bytes */
} VFIOStateBuffer;
46 
/*
 * Load-side state for multifd VFIO migration of one device.
 *
 * Lock order: load_bufs_mutex -> BQL (never take load_bufs_mutex while
 * already holding the BQL).
 */
typedef struct VFIOMultifd {
    bool load_bufs_thread_running;      /* load thread is alive */
    bool load_bufs_thread_want_exit;    /* ask the load thread to bail out */

    VFIOStateBuffers load_bufs;         /* buffers indexed by packet idx */
    QemuCond load_bufs_buffer_ready_cond;    /* signalled when a buffer lands */
    QemuCond load_bufs_thread_finished_cond; /* signalled on thread exit */
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;      /* next buffer index to load into the device */
    uint32_t load_buf_idx_last; /* index of the final (config state) buffer,
                                 * UINT32_MAX until the config packet arrives */
} VFIOMultifd;
58 
59 static void vfio_state_buffer_clear(gpointer data)
60 {
61     VFIOStateBuffer *lb = data;
62 
63     if (!lb->is_present) {
64         return;
65     }
66 
67     g_clear_pointer(&lb->data, g_free);
68     lb->is_present = false;
69 }
70 
71 static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
72 {
73     bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
74     g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
75 }
76 
/*
 * Free the buffers array (the clear func releases each element's payload).
 * Idempotent: the array pointer is NULLed by g_clear_pointer().
 */
static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    g_clear_pointer(&bufs->array, g_array_unref);
}
81 
/* Sanity-check that vfio_state_buffers_init() was called on @bufs. */
static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}
86 
/* Number of buffer slots currently allocated (not necessarily filled). */
static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}
91 
/*
 * Grow (or shrink) the buffers array to @size slots.  New slots are
 * zero-initialized (is_present == false) since the array was created with
 * clear_ == TRUE; removed slots have their payload freed by the clear func.
 */
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}
97 
/* Pointer to the buffer slot at @idx; @idx must be below the array size. */
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
103 
104 /* called with load_bufs_mutex locked */
105 static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
106                                           VFIODeviceStatePacket *packet,
107                                           size_t packet_total_size,
108                                           Error **errp)
109 {
110     VFIOMigration *migration = vbasedev->migration;
111     VFIOMultifd *multifd = migration->multifd;
112     VFIOStateBuffer *lb;
113 
114     vfio_state_buffers_assert_init(&multifd->load_bufs);
115     if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
116         vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
117     }
118 
119     lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
120     if (lb->is_present) {
121         error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
122                    vbasedev->name, packet->idx);
123         return false;
124     }
125 
126     assert(packet->idx >= multifd->load_buf_idx);
127 
128     lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
129     lb->len = packet_total_size - sizeof(*packet);
130     lb->is_present = true;
131 
132     return true;
133 }
134 
/*
 * Handle one incoming VFIO device state packet: validate the header,
 * stash the payload for the load thread and wake that thread up.
 *
 * Called with the BQL not held (asserted below), since taking
 * load_bufs_mutex while holding the BQL would violate the lock order.
 *
 * Returns true on success, false (with *errp set) on a malformed packet
 * or insertion failure.
 */
bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
                                    Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        error_setg(errp,
                   "%s: got device state packet but not doing multifd transfer",
                   vbasedev->name);
        return false;
    }

    assert(multifd);

    /* The packet must at least carry the fixed-size header */
    if (data_size < sizeof(*packet)) {
        error_setg(errp, "%s: packet too short at %zu (min is %zu)",
                   vbasedev->name, data_size, sizeof(*packet));
        return false;
    }

    if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
        error_setg(errp, "%s: packet has unknown version %" PRIu32,
                   vbasedev->name, packet->version);
        return false;
    }

    /* UINT32_MAX doubles as the "last index unknown" marker, reject it */
    if (packet->idx == UINT32_MAX) {
        error_setg(errp, "%s: packet index is invalid", vbasedev->name);
        return false;
    }

    trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);

    /*
     * Holding BQL here would violate the lock order and can cause
     * a deadlock once we attempt to lock load_bufs_mutex below.
     */
    assert(!bql_locked());

    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        /* config state packet should be the last one in the stream */
        if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
            multifd->load_buf_idx_last = packet->idx;
        }

        if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
                                           errp)) {
            return false;
        }

        /* Wake the load thread in case it is waiting for this buffer */
        qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
    }

    return true;
}
193 
/*
 * Load the device config state stored in the final buffer
 * (load_buf_idx_last): replay the raw bytes through an in-memory QEMUFile,
 * verify the VFIO_MIG_FLAG_DEV_CONFIG_STATE header, then hand the stream
 * to vfio_load_device_config_state() under the BQL.
 *
 * Returns true on success, false (with *errp set) otherwise.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    VFIOStateBuffer *lb;
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
    uint64_t mig_header;
    int ret;

    /* Only reached once every earlier buffer has been consumed */
    assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
    lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
    assert(lb->is_present);

    bioc = qio_channel_buffer_new(lb->len);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");

    /* Copy the buffer into the channel... */
    f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
    qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);

    ret = qemu_fflush(f_out);
    if (ret) {
        error_setg(errp, "%s: load config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* ...then rewind and read it back as an input stream */
    qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
    f_in = qemu_file_new_input(QIO_CHANNEL(bioc));

    mig_header = qemu_get_be64(f_in);
    if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
        error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
                   vbasedev->name, mig_header);
        return false;
    }

    /* Config state load requires the BQL (lock order: load_bufs_mutex -> BQL) */
    bql_lock();
    ret = vfio_load_device_config_state(f_in, vbasedev);
    bql_unlock();

    if (ret < 0) {
        error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    return true;
}
244 
245 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
246 {
247     VFIOStateBuffer *lb;
248     unsigned int bufs_len;
249 
250     bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
251     if (multifd->load_buf_idx >= bufs_len) {
252         assert(multifd->load_buf_idx == bufs_len);
253         return NULL;
254     }
255 
256     lb = vfio_state_buffers_at(&multifd->load_bufs,
257                                multifd->load_buf_idx);
258     if (!lb->is_present) {
259         return NULL;
260     }
261 
262     return lb;
263 }
264 
/*
 * Write the payload of @lb into the device via migration->data_fd.
 *
 * Called with load_bufs_mutex held; the lock is dropped around each write()
 * since loading data into the device can take a while.  Ownership of the
 * payload is taken from the slot first, because the slot may be
 * re-allocated while the lock is dropped.
 *
 * Returns true on success, false (with *errp set) on write failure.
 */
static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
                                         VFIOStateBuffer *lb,
                                         Error **errp)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    g_autofree char *buf = NULL;
    char *buf_cur;
    size_t buf_len;

    /* Nothing to write for an empty buffer */
    if (!lb->len) {
        return true;
    }

    trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
                                                   multifd->load_buf_idx);

    /* lb might become re-allocated when we drop the lock */
    buf = g_steal_pointer(&lb->data);
    buf_cur = buf;
    buf_len = lb->len;
    while (buf_len > 0) {
        ssize_t wr_ret;
        int errno_save;

        /*
         * Loading data to the device takes a while,
         * drop the lock during this process.
         */
        qemu_mutex_unlock(&multifd->load_bufs_mutex);
        wr_ret = write(migration->data_fd, buf_cur, buf_len);
        errno_save = errno;   /* error_setg below could clobber errno */
        qemu_mutex_lock(&multifd->load_bufs_mutex);

        if (wr_ret < 0) {
            error_setg(errp,
                       "%s: writing state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, multifd->load_buf_idx, errno_save);
            return false;
        }

        /* Partial writes are fine - retry with whatever is left */
        assert(wr_ret <= buf_len);
        buf_len -= wr_ret;
        buf_cur += wr_ret;
    }

    trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
                                                 multifd->load_buf_idx);

    return true;
}
316 
317 static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
318                                             bool *should_quit)
319 {
320     return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
321 }
322 
/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in main migration
 * stream.
 *
 * It exits after either:
 * * completing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    /* Held for the whole loop, except inside qemu_cond_wait() and write() */
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        /* Wait until the buffer for the current index has arrived */
        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* The last buffer carries the config state - loaded after the loop */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}
409 
410 static VFIOMultifd *vfio_multifd_new(void)
411 {
412     VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
413 
414     vfio_state_buffers_init(&multifd->load_bufs);
415 
416     qemu_mutex_init(&multifd->load_bufs_mutex);
417 
418     multifd->load_buf_idx = 0;
419     multifd->load_buf_idx_last = UINT32_MAX;
420     qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
421 
422     multifd->load_bufs_thread_running = false;
423     multifd->load_bufs_thread_want_exit = false;
424     qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
425 
426     return multifd;
427 }
428 
/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it had finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            /* Wake the thread if it is blocked waiting for more buffers */
            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}
451 
/*
 * Tear down a VFIOMultifd: the load thread is stopped first since it uses
 * the condition variables, mutex and buffers destroyed below.
 */
static void vfio_multifd_free(VFIOMultifd *multifd)
{
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}
463 
464 void vfio_multifd_cleanup(VFIODevice *vbasedev)
465 {
466     VFIOMigration *migration = vbasedev->migration;
467 
468     g_clear_pointer(&migration->multifd, vfio_multifd_free);
469 }
470 
471 bool vfio_multifd_transfer_supported(void)
472 {
473     return multifd_device_state_supported() &&
474         migrate_send_switchover_start();
475 }
476 
477 bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
478 {
479     VFIOMigration *migration = vbasedev->migration;
480 
481     return migration->multifd_transfer;
482 }
483 
484 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
485 {
486     VFIOMigration *migration = vbasedev->migration;
487 
488     /*
489      * Make a copy of this setting at the start in case it is changed
490      * mid-migration.
491      */
492     if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
493         migration->multifd_transfer = vfio_multifd_transfer_supported();
494     } else {
495         migration->multifd_transfer =
496             vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
497     }
498 
499     if (!vfio_multifd_transfer_enabled(vbasedev)) {
500         /* Nothing further to check or do */
501         return true;
502     }
503 
504     if (!vfio_multifd_transfer_supported()) {
505         error_setg(errp,
506                    "%s: Multifd device transfer requested but unsupported in the current config",
507                    vbasedev->name);
508         return false;
509     }
510 
511     if (alloc_multifd) {
512         assert(!migration->multifd);
513         migration->multifd = vfio_multifd_new();
514     }
515 
516     return true;
517 }
518 
/*
 * Emit dummy NOP data (an END_OF_STATE marker) on the main migration
 * channel since the actual device state transfer is done via multifd
 * channels.  Only valid when multifd transfer is enabled for @vbasedev.
 */
void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
{
    assert(vfio_multifd_transfer_enabled(vbasedev));

    qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
}
529 
/*
 * Queue the device config state as the final packet (index @idx, flagged
 * VFIO_DEVICE_STATE_CONFIG_STATE) of the multifd device state stream.
 *
 * The config state produced by vfio_save_device_config_state() is captured
 * in an in-memory QEMUFile and its raw bytes become the packet payload.
 *
 * Returns true on success, false (with *errp set) otherwise.
 */
static bool
vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
                                               char *idstr,
                                               uint32_t instance_id,
                                               uint32_t idx,
                                               Error **errp)
{
    g_autoptr(QIOChannelBuffer) bioc = NULL;
    g_autoptr(QEMUFile) f = NULL;
    int ret;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    size_t packet_len;

    bioc = qio_channel_buffer_new(0);
    qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");

    f = qemu_file_new_output(QIO_CHANNEL(bioc));

    /* Non-zero return means failure here */
    if (vfio_save_device_config_state(f, vbasedev, errp)) {
        return false;
    }

    ret = qemu_fflush(f);
    if (ret) {
        error_setg(errp, "%s: save config state flush failed: %d",
                   vbasedev->name, ret);
        return false;
    }

    /* The whole captured byte stream becomes the packet payload */
    packet_len = sizeof(*packet) + bioc->usage;
    packet = g_malloc0(packet_len);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;
    packet->idx = idx;
    packet->flags = VFIO_DEVICE_STATE_CONFIG_STATE;
    memcpy(&packet->data, bioc->data, bioc->usage);

    if (!multifd_queue_device_state(idstr, instance_id,
                                    (char *)packet, packet_len)) {
        error_setg(errp, "%s: multifd config data queuing failed",
                   vbasedev->name);
        return false;
    }

    vfio_mig_add_bytes_transferred(packet_len);

    return true;
}
577 
/*
 * This thread is spawned by the migration core directly via
 * .save_live_complete_precopy_thread SaveVMHandler.
 *
 * It exits after either:
 * * completing saving the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by
 *   multifd_device_state_save_thread_should_exit() returning true.
 */
bool
vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
                                          Error **errp)
{
    VFIODevice *vbasedev = d->handler_opaque;
    VFIOMigration *migration = vbasedev->migration;
    bool ret = false;
    g_autofree VFIODeviceStatePacket *packet = NULL;
    uint32_t idx;

    if (!vfio_multifd_transfer_enabled(vbasedev)) {
        /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
        return true;
    }

    trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
                                                  d->idstr, d->instance_id);

    /* We reach here with device state STOP or STOP_COPY only */
    if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                 VFIO_DEVICE_STATE_STOP, errp)) {
        goto thread_exit;
    }

    /* One packet buffer is reused for every chunk read from the device */
    packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
    packet->version = VFIO_DEVICE_STATE_PACKET_VER_CURRENT;

    for (idx = 0; ; idx++) {
        ssize_t data_size;
        size_t packet_size;

        if (multifd_device_state_save_thread_should_exit()) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        data_size = read(migration->data_fd, &packet->data,
                         migration->data_buffer_size);
        if (data_size < 0) {
            error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
                       vbasedev->name, idx, errno);
            goto thread_exit;
        } else if (data_size == 0) {
            /* EOF: the whole device state has been read */
            break;
        }

        packet->idx = idx;
        packet_size = sizeof(*packet) + data_size;

        if (!multifd_queue_device_state(d->idstr, d->instance_id,
                                        (char *)packet, packet_size)) {
            error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
            goto thread_exit;
        }

        vfio_mig_add_bytes_transferred(packet_size);
    }

    /* Finish with the config state packet, taking the next free index */
    if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
                                                        d->idstr,
                                                        d->instance_id,
                                                        idx, errp)) {
        goto thread_exit;
   }

    ret = true;

thread_exit:
    trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);

    return ret;
}
660 
/*
 * Called on the load side when the switchover point marker is reached in
 * the main migration stream: marks the load thread as running and spawns
 * vfio_load_bufs_thread() to consume the queued device state buffers.
 *
 * Always returns 0.
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}
680