1 /* 2 * Multifd VFIO migration 3 * 4 * Copyright (C) 2024,2025 Oracle and/or its affiliates. 5 * 6 * This work is licensed under the terms of the GNU GPL, version 2 or later. 7 * See the COPYING file in the top-level directory. 8 * 9 * SPDX-License-Identifier: GPL-2.0-or-later 10 */ 11 12 #include "qemu/osdep.h" 13 #include "hw/vfio/vfio-common.h" 14 #include "migration/misc.h" 15 #include "qapi/error.h" 16 #include "qemu/error-report.h" 17 #include "qemu/lockable.h" 18 #include "qemu/main-loop.h" 19 #include "qemu/thread.h" 20 #include "migration/qemu-file.h" 21 #include "migration-multifd.h" 22 #include "trace.h" 23 24 #define VFIO_DEVICE_STATE_CONFIG_STATE (1) 25 26 #define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0) 27 28 typedef struct VFIODeviceStatePacket { 29 uint32_t version; 30 uint32_t idx; 31 uint32_t flags; 32 uint8_t data[0]; 33 } QEMU_PACKED VFIODeviceStatePacket; 34 35 /* type safety */ 36 typedef struct VFIOStateBuffers { 37 GArray *array; 38 } VFIOStateBuffers; 39 40 typedef struct VFIOStateBuffer { 41 bool is_present; 42 char *data; 43 size_t len; 44 } VFIOStateBuffer; 45 46 typedef struct VFIOMultifd { 47 bool load_bufs_thread_running; 48 bool load_bufs_thread_want_exit; 49 50 VFIOStateBuffers load_bufs; 51 QemuCond load_bufs_buffer_ready_cond; 52 QemuCond load_bufs_thread_finished_cond; 53 QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */ 54 uint32_t load_buf_idx; 55 uint32_t load_buf_idx_last; 56 } VFIOMultifd; 57 58 static void vfio_state_buffer_clear(gpointer data) 59 { 60 VFIOStateBuffer *lb = data; 61 62 if (!lb->is_present) { 63 return; 64 } 65 66 g_clear_pointer(&lb->data, g_free); 67 lb->is_present = false; 68 } 69 70 static void vfio_state_buffers_init(VFIOStateBuffers *bufs) 71 { 72 bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer)); 73 g_array_set_clear_func(bufs->array, vfio_state_buffer_clear); 74 } 75 76 static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs) 77 { 78 g_clear_pointer(&bufs->array, g_array_unref); 79 } 80 81 static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs) 82 { 83 assert(bufs->array); 84 } 85 86 static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs) 87 { 88 return bufs->array->len; 89 } 90 91 static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs, 92 unsigned int size) 93 { 94 g_array_set_size(bufs->array, size); 95 } 96 97 static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs, 98 unsigned int idx) 99 { 100 return &g_array_index(bufs->array, VFIOStateBuffer, idx); 101 } 102 103 /* called with load_bufs_mutex locked */ 104 static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev, 105 VFIODeviceStatePacket *packet, 106 size_t packet_total_size, 107 Error **errp) 108 { 109 VFIOMigration *migration = vbasedev->migration; 110 VFIOMultifd *multifd = migration->multifd; 111 VFIOStateBuffer *lb; 112 113 vfio_state_buffers_assert_init(&multifd->load_bufs); 114 if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) { 115 vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1); 116 } 117 118 lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx); 119 if (lb->is_present) { 120 error_setg(errp, "%s: state buffer %" PRIu32 " already filled", 121 vbasedev->name, packet->idx); 122 return false; 123 } 124 125 assert(packet->idx >= multifd->load_buf_idx); 126 127 lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet)); 128 lb->len = packet_total_size - sizeof(*packet); 129 lb->is_present = true; 130 131 return true; 132 } 133 134 bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size, 135 Error **errp) 136 { 137 VFIODevice *vbasedev = opaque; 138 VFIOMigration *migration = vbasedev->migration; 139 VFIOMultifd *multifd = migration->multifd; 140 VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data; 141 142 if (!vfio_multifd_transfer_enabled(vbasedev)) { 143 error_setg(errp, 144 "%s: got device state packet but not doing multifd transfer", 145 vbasedev->name); 146 return false; 147 } 148 149 assert(multifd); 150 151 if (data_size < sizeof(*packet)) { 152 error_setg(errp, "%s: packet too short at %zu (min is %zu)", 153 vbasedev->name, data_size, sizeof(*packet)); 154 return false; 155 } 156 157 if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) { 158 error_setg(errp, "%s: packet has unknown version %" PRIu32, 159 vbasedev->name, packet->version); 160 return false; 161 } 162 163 if (packet->idx == UINT32_MAX) { 164 error_setg(errp, "%s: packet index is invalid", vbasedev->name); 165 return false; 166 } 167 168 trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx); 169 170 /* 171 * Holding BQL here would violate the lock order and can cause 172 * a deadlock once we attempt to lock load_bufs_mutex below. 173 */ 174 assert(!bql_locked()); 175 176 WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { 177 /* config state packet should be the last one in the stream */ 178 if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) { 179 multifd->load_buf_idx_last = packet->idx; 180 } 181 182 if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size, 183 errp)) { 184 return false; 185 } 186 187 qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); 188 } 189 190 return true; 191 } 192 193 static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev, 194 Error **errp) 195 { 196 error_setg(errp, "not yet there"); 197 return false; 198 } 199 200 static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd) 201 { 202 VFIOStateBuffer *lb; 203 unsigned int bufs_len; 204 205 bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs); 206 if (multifd->load_buf_idx >= bufs_len) { 207 assert(multifd->load_buf_idx == bufs_len); 208 return NULL; 209 } 210 211 lb = vfio_state_buffers_at(&multifd->load_bufs, 212 multifd->load_buf_idx); 213 if (!lb->is_present) { 214 return NULL; 215 } 216 217 return lb; 218 } 219 220 static bool vfio_load_state_buffer_write(VFIODevice *vbasedev, 221 VFIOStateBuffer *lb, 222 Error **errp) 223 { 224 VFIOMigration *migration = vbasedev->migration; 225 VFIOMultifd *multifd = migration->multifd; 226 g_autofree char *buf = NULL; 227 char *buf_cur; 228 size_t buf_len; 229 230 if (!lb->len) { 231 return true; 232 } 233 234 trace_vfio_load_state_device_buffer_load_start(vbasedev->name, 235 multifd->load_buf_idx); 236 237 /* lb might become re-allocated when we drop the lock */ 238 buf = g_steal_pointer(&lb->data); 239 buf_cur = buf; 240 buf_len = lb->len; 241 while (buf_len > 0) { 242 ssize_t wr_ret; 243 int errno_save; 244 245 /* 246 * Loading data to the device takes a while, 247 * drop the lock during this process. 248 */ 249 qemu_mutex_unlock(&multifd->load_bufs_mutex); 250 wr_ret = write(migration->data_fd, buf_cur, buf_len); 251 errno_save = errno; 252 qemu_mutex_lock(&multifd->load_bufs_mutex); 253 254 if (wr_ret < 0) { 255 error_setg(errp, 256 "%s: writing state buffer %" PRIu32 " failed: %d", 257 vbasedev->name, multifd->load_buf_idx, errno_save); 258 return false; 259 } 260 261 assert(wr_ret <= buf_len); 262 buf_len -= wr_ret; 263 buf_cur += wr_ret; 264 } 265 266 trace_vfio_load_state_device_buffer_load_end(vbasedev->name, 267 multifd->load_buf_idx); 268 269 return true; 270 } 271 272 static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd, 273 bool *should_quit) 274 { 275 return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit); 276 } 277 278 /* 279 * This thread is spawned by vfio_multifd_switchover_start() which gets 280 * called upon encountering the switchover point marker in main migration 281 * stream. 282 * 283 * It exits after either: 284 * * completing loading the remaining device state and device config, OR: 285 * * encountering some error while doing the above, OR: 286 * * being forcefully aborted by the migration core by it setting should_quit 287 * or by vfio_load_cleanup_load_bufs_thread() setting 288 * multifd->load_bufs_thread_want_exit. 289 */ 290 static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp) 291 { 292 VFIODevice *vbasedev = opaque; 293 VFIOMigration *migration = vbasedev->migration; 294 VFIOMultifd *multifd = migration->multifd; 295 bool ret = false; 296 297 trace_vfio_load_bufs_thread_start(vbasedev->name); 298 299 assert(multifd); 300 QEMU_LOCK_GUARD(&multifd->load_bufs_mutex); 301 302 assert(multifd->load_bufs_thread_running); 303 304 while (true) { 305 VFIOStateBuffer *lb; 306 307 /* 308 * Always check cancellation first after the buffer_ready wait below in 309 * case that cond was signalled by vfio_load_cleanup_load_bufs_thread(). 310 */ 311 if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) { 312 error_setg(errp, "operation cancelled"); 313 goto thread_exit; 314 } 315 316 assert(multifd->load_buf_idx <= multifd->load_buf_idx_last); 317 318 lb = vfio_load_state_buffer_get(multifd); 319 if (!lb) { 320 trace_vfio_load_state_device_buffer_starved(vbasedev->name, 321 multifd->load_buf_idx); 322 qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond, 323 &multifd->load_bufs_mutex); 324 continue; 325 } 326 327 if (multifd->load_buf_idx == multifd->load_buf_idx_last) { 328 break; 329 } 330 331 if (multifd->load_buf_idx == 0) { 332 trace_vfio_load_state_device_buffer_start(vbasedev->name); 333 } 334 335 if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) { 336 goto thread_exit; 337 } 338 339 if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) { 340 trace_vfio_load_state_device_buffer_end(vbasedev->name); 341 } 342 343 multifd->load_buf_idx++; 344 } 345 346 if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) { 347 goto thread_exit; 348 } 349 350 ret = true; 351 352 thread_exit: 353 /* 354 * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that 355 * this thread is exiting. 356 */ 357 multifd->load_bufs_thread_running = false; 358 qemu_cond_signal(&multifd->load_bufs_thread_finished_cond); 359 360 trace_vfio_load_bufs_thread_end(vbasedev->name); 361 362 return ret; 363 } 364 365 static VFIOMultifd *vfio_multifd_new(void) 366 { 367 VFIOMultifd *multifd = g_new(VFIOMultifd, 1); 368 369 vfio_state_buffers_init(&multifd->load_bufs); 370 371 qemu_mutex_init(&multifd->load_bufs_mutex); 372 373 multifd->load_buf_idx = 0; 374 multifd->load_buf_idx_last = UINT32_MAX; 375 qemu_cond_init(&multifd->load_bufs_buffer_ready_cond); 376 377 multifd->load_bufs_thread_running = false; 378 multifd->load_bufs_thread_want_exit = false; 379 qemu_cond_init(&multifd->load_bufs_thread_finished_cond); 380 381 return multifd; 382 } 383 384 /* 385 * Terminates vfio_load_bufs_thread by setting 386 * multifd->load_bufs_thread_want_exit and signalling all the conditions 387 * the thread could be blocked on. 388 * 389 * Waits for the thread to signal that it had finished. 390 */ 391 static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd) 392 { 393 /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ 394 bql_unlock(); 395 WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { 396 while (multifd->load_bufs_thread_running) { 397 multifd->load_bufs_thread_want_exit = true; 398 399 qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond); 400 qemu_cond_wait(&multifd->load_bufs_thread_finished_cond, 401 &multifd->load_bufs_mutex); 402 } 403 } 404 bql_lock(); 405 } 406 407 static void vfio_multifd_free(VFIOMultifd *multifd) 408 { 409 vfio_load_cleanup_load_bufs_thread(multifd); 410 411 qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond); 412 vfio_state_buffers_destroy(&multifd->load_bufs); 413 qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond); 414 qemu_mutex_destroy(&multifd->load_bufs_mutex); 415 416 g_free(multifd); 417 } 418 419 void vfio_multifd_cleanup(VFIODevice *vbasedev) 420 { 421 VFIOMigration *migration = vbasedev->migration; 422 423 g_clear_pointer(&migration->multifd, vfio_multifd_free); 424 } 425 426 bool vfio_multifd_transfer_supported(void) 427 { 428 return multifd_device_state_supported() && 429 migrate_send_switchover_start(); 430 } 431 432 bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev) 433 { 434 return false; 435 } 436 437 bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp) 438 { 439 VFIOMigration *migration = vbasedev->migration; 440 441 if (!vfio_multifd_transfer_enabled(vbasedev)) { 442 /* Nothing further to check or do */ 443 return true; 444 } 445 446 if (alloc_multifd) { 447 assert(!migration->multifd); 448 migration->multifd = vfio_multifd_new(); 449 } 450 451 return true; 452 } 453 454 int vfio_multifd_switchover_start(VFIODevice *vbasedev) 455 { 456 VFIOMigration *migration = vbasedev->migration; 457 VFIOMultifd *multifd = migration->multifd; 458 459 assert(multifd); 460 461 /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */ 462 bql_unlock(); 463 WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) { 464 assert(!multifd->load_bufs_thread_running); 465 multifd->load_bufs_thread_running = true; 466 } 467 bql_lock(); 468 469 qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev); 470 471 return 0; 472 } 473