xref: /qemu/hw/vfio/migration-multifd.c (revision c59748c1ff924963a67af9efd7e1a1ee6f82d6d6)
1 /*
2  * Multifd VFIO migration
3  *
4  * Copyright (C) 2024,2025 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "hw/vfio/vfio-common.h"
14 #include "migration/misc.h"
15 #include "qapi/error.h"
16 #include "qemu/error-report.h"
17 #include "qemu/lockable.h"
18 #include "qemu/main-loop.h"
19 #include "qemu/thread.h"
20 #include "migration/qemu-file.h"
21 #include "migration-multifd.h"
22 #include "trace.h"
23 
24 #define VFIO_DEVICE_STATE_CONFIG_STATE (1)
25 
26 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
27 
28 typedef struct VFIODeviceStatePacket {
29     uint32_t version;
30     uint32_t idx;
31     uint32_t flags;
32     uint8_t data[0];
33 } QEMU_PACKED VFIODeviceStatePacket;
34 
/*
 * Thin wrapper around a GArray of VFIOStateBuffer elements so the helper
 * functions below are type safe.
 */
typedef struct VFIOStateBuffers {
    GArray *array;
} VFIOStateBuffers;
39 
/* A single queued chunk of received device state awaiting load. */
typedef struct VFIOStateBuffer {
    bool is_present;    /* slot holds valid data not yet consumed */
    char *data;         /* heap copy of the packet payload (owned here) */
    size_t len;         /* payload length in bytes */
} VFIOStateBuffer;
45 
/* Per-device state for the load (destination) side of multifd migration. */
typedef struct VFIOMultifd {
    /* Both flags below are protected by load_bufs_mutex */
    bool load_bufs_thread_running;
    bool load_bufs_thread_want_exit;

    VFIOStateBuffers load_bufs; /* received buffers, indexed by packet idx */
    QemuCond load_bufs_buffer_ready_cond;    /* signalled on buffer arrival */
    QemuCond load_bufs_thread_finished_cond; /* signalled on thread exit */
    QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
    uint32_t load_buf_idx;      /* next buffer index to load into the device */
    uint32_t load_buf_idx_last; /* idx of the config-state (final) packet */
} VFIOMultifd;
57 
58 static void vfio_state_buffer_clear(gpointer data)
59 {
60     VFIOStateBuffer *lb = data;
61 
62     if (!lb->is_present) {
63         return;
64     }
65 
66     g_clear_pointer(&lb->data, g_free);
67     lb->is_present = false;
68 }
69 
/*
 * Allocate the (zero-initializing) buffer array; elements removed later are
 * cleaned up via vfio_state_buffer_clear().
 */
static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
{
    bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
    g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
}
75 
/* Free the buffer array together with any still-present buffer payloads. */
static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
{
    g_clear_pointer(&bufs->array, g_array_unref);
}
80 
/* Sanity-check that vfio_state_buffers_init() has been called on @bufs. */
static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
{
    assert(bufs->array);
}
85 
/* Number of allocated buffer slots (whether or not their data is present). */
static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
{
    return bufs->array->len;
}
90 
/* Resize the buffer array to @size slots; newly added slots are zeroed. */
static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
                                        unsigned int size)
{
    g_array_set_size(bufs->array, size);
}
96 
/* Pointer to slot @idx; the caller must ensure @idx is within bounds. */
static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
                                              unsigned int idx)
{
    return &g_array_index(bufs->array, VFIOStateBuffer, idx);
}
102 
103 /* called with load_bufs_mutex locked */
104 static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
105                                           VFIODeviceStatePacket *packet,
106                                           size_t packet_total_size,
107                                           Error **errp)
108 {
109     VFIOMigration *migration = vbasedev->migration;
110     VFIOMultifd *multifd = migration->multifd;
111     VFIOStateBuffer *lb;
112 
113     vfio_state_buffers_assert_init(&multifd->load_bufs);
114     if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
115         vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
116     }
117 
118     lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
119     if (lb->is_present) {
120         error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
121                    vbasedev->name, packet->idx);
122         return false;
123     }
124 
125     assert(packet->idx >= multifd->load_buf_idx);
126 
127     lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
128     lb->len = packet_total_size - sizeof(*packet);
129     lb->is_present = true;
130 
131     return true;
132 }
133 
134 bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
135                                     Error **errp)
136 {
137     VFIODevice *vbasedev = opaque;
138     VFIOMigration *migration = vbasedev->migration;
139     VFIOMultifd *multifd = migration->multifd;
140     VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
141 
142     if (!vfio_multifd_transfer_enabled(vbasedev)) {
143         error_setg(errp,
144                    "%s: got device state packet but not doing multifd transfer",
145                    vbasedev->name);
146         return false;
147     }
148 
149     assert(multifd);
150 
151     if (data_size < sizeof(*packet)) {
152         error_setg(errp, "%s: packet too short at %zu (min is %zu)",
153                    vbasedev->name, data_size, sizeof(*packet));
154         return false;
155     }
156 
157     if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
158         error_setg(errp, "%s: packet has unknown version %" PRIu32,
159                    vbasedev->name, packet->version);
160         return false;
161     }
162 
163     if (packet->idx == UINT32_MAX) {
164         error_setg(errp, "%s: packet index is invalid", vbasedev->name);
165         return false;
166     }
167 
168     trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
169 
170     /*
171      * Holding BQL here would violate the lock order and can cause
172      * a deadlock once we attempt to lock load_bufs_mutex below.
173      */
174     assert(!bql_locked());
175 
176     WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
177         /* config state packet should be the last one in the stream */
178         if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
179             multifd->load_buf_idx_last = packet->idx;
180         }
181 
182         if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
183                                            errp)) {
184             return false;
185         }
186 
187         qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
188     }
189 
190     return true;
191 }
192 
/*
 * Load the device config state (carried by the final packet in the stream).
 *
 * Placeholder: config state loading is not implemented yet, so this always
 * fails with an error.
 */
static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
                                              Error **errp)
{
    error_setg(errp, "not yet there");
    return false;
}
199 
200 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
201 {
202     VFIOStateBuffer *lb;
203     unsigned int bufs_len;
204 
205     bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
206     if (multifd->load_buf_idx >= bufs_len) {
207         assert(multifd->load_buf_idx == bufs_len);
208         return NULL;
209     }
210 
211     lb = vfio_state_buffers_at(&multifd->load_bufs,
212                                multifd->load_buf_idx);
213     if (!lb->is_present) {
214         return NULL;
215     }
216 
217     return lb;
218 }
219 
220 static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
221                                          VFIOStateBuffer *lb,
222                                          Error **errp)
223 {
224     VFIOMigration *migration = vbasedev->migration;
225     VFIOMultifd *multifd = migration->multifd;
226     g_autofree char *buf = NULL;
227     char *buf_cur;
228     size_t buf_len;
229 
230     if (!lb->len) {
231         return true;
232     }
233 
234     trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
235                                                    multifd->load_buf_idx);
236 
237     /* lb might become re-allocated when we drop the lock */
238     buf = g_steal_pointer(&lb->data);
239     buf_cur = buf;
240     buf_len = lb->len;
241     while (buf_len > 0) {
242         ssize_t wr_ret;
243         int errno_save;
244 
245         /*
246          * Loading data to the device takes a while,
247          * drop the lock during this process.
248          */
249         qemu_mutex_unlock(&multifd->load_bufs_mutex);
250         wr_ret = write(migration->data_fd, buf_cur, buf_len);
251         errno_save = errno;
252         qemu_mutex_lock(&multifd->load_bufs_mutex);
253 
254         if (wr_ret < 0) {
255             error_setg(errp,
256                        "%s: writing state buffer %" PRIu32 " failed: %d",
257                        vbasedev->name, multifd->load_buf_idx, errno_save);
258             return false;
259         }
260 
261         assert(wr_ret <= buf_len);
262         buf_len -= wr_ret;
263         buf_cur += wr_ret;
264     }
265 
266     trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
267                                                  multifd->load_buf_idx);
268 
269     return true;
270 }
271 
/*
 * True if the load thread should abort: either the migration core set
 * *should_quit or vfio_load_cleanup_load_bufs_thread() requested an exit.
 */
static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
                                            bool *should_quit)
{
    return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
}
277 
/*
 * This thread is spawned by vfio_multifd_switchover_start() which gets
 * called upon encountering the switchover point marker in main migration
 * stream.
 *
 * It exits after either:
 * * completing loading the remaining device state and device config, OR:
 * * encountering some error while doing the above, OR:
 * * being forcefully aborted by the migration core by it setting should_quit
 *   or by vfio_load_cleanup_load_bufs_thread() setting
 *   multifd->load_bufs_thread_want_exit.
 *
 * Holds load_bufs_mutex for its whole lifetime, dropping it only inside
 * qemu_cond_wait() and vfio_load_state_buffer_write().
 * Returns true on success, false (with *errp set) on error or cancellation.
 */
static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
{
    VFIODevice *vbasedev = opaque;
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;
    bool ret = false;

    trace_vfio_load_bufs_thread_start(vbasedev->name);

    assert(multifd);
    QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);

    assert(multifd->load_bufs_thread_running);

    while (true) {
        VFIOStateBuffer *lb;

        /*
         * Always check cancellation first after the buffer_ready wait below in
         * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
         */
        if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
            error_setg(errp, "operation cancelled");
            goto thread_exit;
        }

        assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);

        /* NULL means the next in-order buffer has not been received yet. */
        lb = vfio_load_state_buffer_get(multifd);
        if (!lb) {
            trace_vfio_load_state_device_buffer_starved(vbasedev->name,
                                                        multifd->load_buf_idx);
            qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
                           &multifd->load_bufs_mutex);
            continue;
        }

        /* The final buffer is the config state, loaded separately below. */
        if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
            break;
        }

        if (multifd->load_buf_idx == 0) {
            trace_vfio_load_state_device_buffer_start(vbasedev->name);
        }

        /* Drops and re-takes load_bufs_mutex internally around write()s. */
        if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
            goto thread_exit;
        }

        if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
            trace_vfio_load_state_device_buffer_end(vbasedev->name);
        }

        multifd->load_buf_idx++;
    }

    if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
        goto thread_exit;
    }

    ret = true;

thread_exit:
    /*
     * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
     * this thread is exiting.
     */
    multifd->load_bufs_thread_running = false;
    qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);

    trace_vfio_load_bufs_thread_end(vbasedev->name);

    return ret;
}
364 
365 static VFIOMultifd *vfio_multifd_new(void)
366 {
367     VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
368 
369     vfio_state_buffers_init(&multifd->load_bufs);
370 
371     qemu_mutex_init(&multifd->load_bufs_mutex);
372 
373     multifd->load_buf_idx = 0;
374     multifd->load_buf_idx_last = UINT32_MAX;
375     qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
376 
377     multifd->load_bufs_thread_running = false;
378     multifd->load_bufs_thread_want_exit = false;
379     qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
380 
381     return multifd;
382 }
383 
/*
 * Terminates vfio_load_bufs_thread by setting
 * multifd->load_bufs_thread_want_exit and signalling all the conditions
 * the thread could be blocked on.
 *
 * Waits for the thread to signal that it had finished.
 */
static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
{
    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        while (multifd->load_bufs_thread_running) {
            multifd->load_bufs_thread_want_exit = true;

            /* Wake the thread if it is blocked waiting for a buffer,
             * then wait for it to report its exit. */
            qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
            qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
                           &multifd->load_bufs_mutex);
        }
    }
    bql_lock();
}
406 
/*
 * Free @multifd: first terminate its load thread (if running), then destroy
 * the buffers and synchronization primitives.
 */
static void vfio_multifd_free(VFIOMultifd *multifd)
{
    /* Must happen before destroying the conds/mutex the thread uses. */
    vfio_load_cleanup_load_bufs_thread(multifd);

    qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
    vfio_state_buffers_destroy(&multifd->load_bufs);
    qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
    qemu_mutex_destroy(&multifd->load_bufs_mutex);

    g_free(multifd);
}
418 
419 void vfio_multifd_cleanup(VFIODevice *vbasedev)
420 {
421     VFIOMigration *migration = vbasedev->migration;
422 
423     g_clear_pointer(&migration->multifd, vfio_multifd_free);
424 }
425 
/*
 * Multifd device state transfer needs both the multifd device state
 * capability in the migration core and the switchover-start marker
 * being sent in the main migration stream.
 */
bool vfio_multifd_transfer_supported(void)
{
    return multifd_device_state_supported() &&
        migrate_send_switchover_start();
}
431 
/*
 * Whether this device uses multifd state transfer.
 * Placeholder: always disabled for now.
 */
bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
{
    return false;
}
436 
437 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
438 {
439     VFIOMigration *migration = vbasedev->migration;
440 
441     if (!vfio_multifd_transfer_enabled(vbasedev)) {
442         /* Nothing further to check or do */
443         return true;
444     }
445 
446     if (alloc_multifd) {
447         assert(!migration->multifd);
448         migration->multifd = vfio_multifd_new();
449     }
450 
451     return true;
452 }
453 
/*
 * Called when the switchover point marker is reached in the main migration
 * stream: mark the load-bufs thread as running and spawn it.
 *
 * Returns 0 on success.
 */
int vfio_multifd_switchover_start(VFIODevice *vbasedev)
{
    VFIOMigration *migration = vbasedev->migration;
    VFIOMultifd *multifd = migration->multifd;

    assert(multifd);

    /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
    bql_unlock();
    WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
        assert(!multifd->load_bufs_thread_running);
        multifd->load_bufs_thread_running = true;
    }
    bql_lock();

    qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);

    return 0;
}
473