// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include "cmd.h"

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *query_cap = NULL, *cap;
	int ret;

	query_cap = kzalloc(query_sz, GFP_KERNEL);
	if (!query_cap)
		return -ENOMEM;

	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
					    MLX5_CAP_GENERAL_2);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
		ret = -EOPNOTSUPP;
out:
	kfree(query_cap);
	return ret;
}

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);

int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up with a failure in the save
	 * command once it tries to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		complete(&migf->save_comp);

	return err;
}

int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}

int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up with a failure in the
	 * incremental query command on an untracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		if (mvdev->saving_migf->state ==
		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * In case we had a PRE_COPY error, query only the
			 * full image, to be used as the final image.
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}

static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker as being in error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}

static int mlx5fv_vf_event(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(nb, struct mlx5vf_pci_core_device, nb);

	switch (event) {
	case MLX5_PF_NOTIFY_ENABLE_VF:
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = false;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	case MLX5_PF_NOTIFY_DISABLE_VF:
		mlx5vf_cmd_close_migratable(mvdev);
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = true;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	default:
		break;
	}

	return 0;
}

void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}

void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}

void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

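	/*
	 * Ordered workqueue for the asynchronous SAVE error/cleanup work
	 * queued from mlx5vf_save_callback().
	 */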
	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
		mvdev->core_device.vdev.migration_flags |=
			VFIO_MIGRATION_PRE_COPY;

end:
	mlx5_vf_put_core_dev(mvdev->mdev);
}

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	int out_size;
	void *out;
	int ret;

	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	out = kzalloc(out_size, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
	if (ret)
		goto err_exec;

	*vhca_id = MLX5_GET(query_hca_cap_out, out,
			    capability.cmd_hca_cap.vhca_id);

err_exec:
	kfree(out);
	return ret;
}

static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}

static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (buf->dmaed || !buf->allocated_length)
		return -EINVAL;

	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
		goto err;

	buf->dmaed = true;

	return 0;
err:
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}

void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}

struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}

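/*
 * Return a data buffer to the migration file's reuse (avail) list instead of
 * freeing it; mlx5vf_get_data_buffer() may hand it out again later.
 */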
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	buf->stop_copy_chunk_num = 0;
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}

struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put them on a
			 * free list and call mlx5vf_free_data_buffer() at the
			 * end, outside the spin lock (&migf->list_lock), as it
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}

static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}

void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}

static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}

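/*
 * Completion callback of the asynchronous SAVE_VHCA_STATE command. On success
 * the saved data (and its header) is published on the migration file's buffer
 * list and the file state is updated; on failure the error handling is
 * deferred to the cb_wq workqueue, as it cannot run from this context.
 */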
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
		struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t next_required_umem_size = 0;
		bool stop_copy_last_chunk;
		size_t image_size;
		unsigned long flags;
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->buf->stop_copy_chunk_num)
			next_required_umem_size = MLX5_GET(save_vhca_state_out,
					async_data->out, next_required_umem_size);
		stop_copy_last_chunk = async_data->stop_copy_chunk &&
				!next_required_umem_size;
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		if (async_data->buf->stop_copy_chunk_num) {
			migf->num_ready_chunks++;
			if (next_required_umem_size &&
			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
				/* Delay the next SAVE until one chunk is consumed */
				migf->next_required_umem_size = next_required_umem_size;
				next_required_umem_size = 0;
			}
		}
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy) {
			migf->pre_copy_initial_bytes += image_size;
			migf->state = MLX5_MIGF_STATE_PRE_COPY;
		}
		if (stop_copy_last_chunk)
			migf->state = MLX5_MIGF_STATE_COMPLETE;
		wake_up_interruptible(&migf->poll_wait);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf,
				/* Picking up the next chunk num */
				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
				next_required_umem_size);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}

int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, so read the full device image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		if (async_data->stop_copy_chunk) {
			u8 header_idx = buf->stop_copy_chunk_num ?
				buf->stop_copy_chunk_num - 1 : 0;

			header_buf = migf->buf_header[header_idx];
			migf->buf_header[header_idx] = NULL;
		}

		if (!header_buf) {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}

int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}

int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
{
	int err;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return -ENOTCONN;

	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
	return err;
}

void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}

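/*
 * Free all data and header buffers still held by the migration file (both the
 * per-chunk arrays and the buf/avail lists) and release its protection domain.
 */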
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;
	int i;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
		if (migf->buf[i]) {
			mlx5vf_free_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}

		if (migf->buf_header[i]) {
			mlx5vf_free_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	mlx5vf_cmd_dealloc_pd(migf);
}

static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}

static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
			     struct mlx5_vhca_cq_buf *buf, int nent,
			     int cqe_size)
{
	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
	u8 log_wq_sz = ilog2(cqe_size);
	int err;

	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
				       mdev->priv.numa_node);
	if (err)
		return err;

	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
	buf->cqe_size = cqe_size;
	buf->nent = nent;
	return 0;
}

static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
{
	struct mlx5_cqe64 *cqe64;
	void *cqe;
	int i;

	for (i = 0; i < buf->nent; i++) {
		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
		cqe64->op_own = MLX5_CQE_INVALID << 4;
	}
}

static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}

static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
{
	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
		return;

	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
				       tracker.cq.mcq));
}

static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
			       struct mlx5_eqe *eqe)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(mcq, struct mlx5vf_pci_core_device,
			     tracker.cq.mcq);

	complete(&mvdev->tracker_comp);
}

static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}

static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}

static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}

static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int ret;

	/* Init */
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (ret)
		return ret;

	if (host_qp) {
		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
		int i;

		for (i = 0; i < qp->rq.wqe_cnt; i++) {
			mlx5vf_post_recv(qp);
			recv_buf->next_rq_offset += qp->max_msg_size;
		}
	}

	/* RTR */
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	if (ret || host_qp)
		return ret;

	/* RTS */
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
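	/* Up to 7 transport retries on unacknowledged requests */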
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}

static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	mlx5_frag_buf_free(mdev, &qp->buf);
	mlx5_db_free(mdev, &qp->db);
	kfree(qp);
}

static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
{
	int i;

	/* Undo alloc_pages_bulk_array() */
	for (i = 0; i < recv_buf->npages; i++)
		__free_page(recv_buf->page_list[i]);

	kvfree(recv_buf->page_list);
}

static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
			    unsigned int npages)
{
	unsigned int filled = 0, done = 0;
	int i;

	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
				       GFP_KERNEL_ACCOUNT);
	if (!recv_buf->page_list)
		return -ENOMEM;

	for (;;) {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
						npages - done,
						recv_buf->page_list + done);
		if (!filled)
			goto err;

		done += filled;
		if (done == npages)
			break;
	}

	recv_buf->npages = npages;
	return 0;

err:
	for (i = 0; i < npages; i++) {
		if (recv_buf->page_list[i])
			__free_page(recv_buf->page_list[i]);
	}

	kvfree(recv_buf->page_list);
	return -ENOMEM;
}

static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
				   struct mlx5_vhca_recv_buf *recv_buf)
{
	int i, j;

	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
				       sizeof(*recv_buf->dma_addrs),
				       GFP_KERNEL_ACCOUNT);
	if (!recv_buf->dma_addrs)
		return -ENOMEM;

	for (i = 0; i < recv_buf->npages; i++) {
		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
						      recv_buf->page_list[i],
						      0, PAGE_SIZE,
						      DMA_FROM_DEVICE);
		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
			goto error;
	}
	return 0;

error:
	for (j = 0; j < i; j++)
		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
				 PAGE_SIZE, DMA_FROM_DEVICE);

	kvfree(recv_buf->dma_addrs);
	return -ENOMEM;
}

static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
				      struct mlx5_vhca_recv_buf *recv_buf)
{
	int i;

	for (i = 0; i < recv_buf->npages; i++)
		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
				 PAGE_SIZE, DMA_FROM_DEVICE);

	kvfree(recv_buf->dma_addrs);
}

static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp)
{
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
	unregister_dma_recv_pages(mdev, recv_buf);
	free_recv_pages(&qp->recv_buf);
}

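/*
 * Allocate the pages backing the tracker QP's receive queue, DMA-map them and
 * register them with the device through an MTT mkey.
 */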
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp, u32 pdn,
					  u64 rq_size)
{
	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
	int err;

	err = alloc_recv_pages(recv_buf, npages);
	if (err < 0)
		return err;

	err = register_dma_recv_pages(mdev, recv_buf);
	if (err)
		goto end;

	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
	if (err)
		goto err_create_mkey;

	return 0;

err_create_mkey:
	unregister_dma_recv_pages(mdev, recv_buf);
end:
	free_recv_pages(recv_buf);
	return err;
}

static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}

int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active)
		goto end;

	_mlx5vf_free_page_tracker_resources(mvdev);
	mvdev->log_active = false;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 max_msg_size = PAGE_SIZE;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}

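	/*
	 * tracked_page_size is the granularity at which dirty pages are
	 * reported; it is returned to the caller through *page_size.
	 */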
	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}

static void
set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
		  struct iova_bitmap *dirty)
{
	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
	u32 nent = size / entry_size;
	struct page *page;
	u64 addr;
	u64 *buf;
	int i;

	if (WARN_ON(index >= qp->recv_buf.npages ||
		    (nent > qp->max_msg_size / entry_size)))
		return;

	page = qp->recv_buf.page_list[index];
	buf = kmap_local_page(page);
	for (i = 0; i < nent; i++) {
		addr = MLX5_GET(page_track_report_entry, buf + i,
				dirty_address_low);
		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
				      dirty_address_high) << 32;
		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
	}
	kunmap_local(buf);
}

static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}

static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}

static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}

static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
		   struct iova_bitmap *dirty, int *tracker_status)
{
	struct mlx5_cqe64 *cqe;
	u8 opcode;

	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
	if (!cqe)
		return CQ_EMPTY;

	++cq->mcq.cons_index;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();
	opcode = get_cqe_opcode(cqe);
	switch (opcode) {
	case MLX5_CQE_RESP_SEND_IMM:
		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
		return CQ_OK;
	default:
		return CQ_POLL_ERR;
	}
}

int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				wait_for_completion(&mvdev->tracker_comp);
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}