// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include "cmd.h"

enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *query_cap = NULL, *cap;
	int ret;

	query_cap = kzalloc(query_sz, GFP_KERNEL);
	if (!query_cap)
		return -ENOMEM;

	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
					    MLX5_CAP_GENERAL_2);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
		ret = -EOPNOTSUPP;
out:
	kfree(query_cap);
	return ret;
}

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);

int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up with a failure in the save
	 * command once it tries to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	if (migf)
		complete(&migf->save_comp);

	return err;
}

int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}

int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel might end up with a failure in the
	 * incremental query command on an untracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		if (mvdev->saving_migf->state ==
		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * In case we had a PRE_COPY error, only query the full
			 * image for the final image.
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}

static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker as being in error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}

static int mlx5fv_vf_event(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(nb, struct mlx5vf_pci_core_device, nb);

	switch (event) {
	case MLX5_PF_NOTIFY_ENABLE_VF:
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = false;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	case MLX5_PF_NOTIFY_DISABLE_VF:
		mlx5vf_cmd_close_migratable(mvdev);
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = true;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	default:
		break;
	}

	return 0;
}

void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}

void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}

void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;
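
	/*
	 * The VF is migration capable; set up the callback workqueue and the
	 * SR-IOV enable/disable notifier before advertising the capability.
	 */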
	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
		mvdev->core_device.vdev.migration_flags |=
			VFIO_MIGRATION_PRE_COPY;

end:
	mlx5_vf_put_core_dev(mvdev->mdev);
}

static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	int out_size;
	void *out;
	int ret;

	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	out = kzalloc(out_size, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
	if (ret)
		goto err_exec;

	*vhca_id = MLX5_GET(query_hca_cap_out, out,
			    capability.cmd_hca_cap.vhca_id);

err_exec:
	kfree(out);
	return ret;
}

static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}

static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (buf->dmaed || !buf->allocated_length)
		return -EINVAL;

	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
		goto err;

	buf->dmaed = true;

	return 0;
err:
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}

void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}

struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}

void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}

struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put them on a
			 * free list and free them at the end, outside the
			 * spin lock (&migf->list_lock), since
			 * mlx5vf_free_data_buffer() might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}

static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}

void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}

static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}

static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
		struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t image_size;
		unsigned long flags;
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
			!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy)
			migf->pre_copy_initial_bytes += image_size;
		migf->state = async_data->stop_copy_chunk ?
			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
		wake_up_interruptible(&migf->poll_wait);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}

int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, so read the device's full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		if (async_data->stop_copy_chunk && migf->buf_header) {
			header_buf = migf->buf_header;
			migf->buf_header = NULL;
		} else {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}

int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}

int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
{
	int err;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return -ENOTCONN;

	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
	return err;
}

void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}

void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (migf->buf) {
		mlx5vf_free_data_buffer(migf->buf);
		migf->buf = NULL;
	}

	if (migf->buf_header) {
		mlx5vf_free_data_buffer(migf->buf_header);
		migf->buf_header = NULL;
	}

	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	mlx5vf_cmd_dealloc_pd(migf);
}

static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}

static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
			     struct mlx5_vhca_cq_buf *buf, int nent,
			     int cqe_size)
{
	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
	u8 log_wq_sz = ilog2(cqe_size);
	int err;

	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
				       mdev->priv.numa_node);
	if (err)
		return err;

	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
	buf->cqe_size = cqe_size;
	buf->nent = nent;
	return 0;
}

static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
{
	struct mlx5_cqe64 *cqe64;
	void *cqe;
	int i;

	for (i = 0; i < buf->nent; i++) {
		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
		cqe64->op_own = MLX5_CQE_INVALID << 4;
	}
}

static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}

static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
{
	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
		return;

	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
				       tracker.cq.mcq));
}

static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
			       struct mlx5_eqe *eqe)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(mcq, struct mlx5vf_pci_core_device,
			     tracker.cq.mcq);

	complete(&mvdev->tracker_comp);
}

static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	int cqe_size = cache_line_size() == 128 ? 128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}

static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	if (max_recv_wr) {
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
		mlx5_fill_page_frag_array(&qp->buf,
					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
								 in, pas));
	} else {
		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
	}

	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	kvfree(in);
	if (err)
		goto err_in;

	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
	return qp;

err_in:
	if (max_recv_wr)
		mlx5_frag_buf_free(mdev, &qp->buf);
err_db_free:
	mlx5_db_free(mdev, &qp->db);
err_free:
	kfree(qp);
	return ERR_PTR(err);
}

static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
	struct mlx5_wqe_data_seg *data;
	unsigned int ix;

	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
	data->byte_count = cpu_to_be32(qp->max_msg_size);
	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
	qp->rq.pc++;
	/* Make sure that descriptors are written before doorbell record. */
	dma_wmb();
	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}

static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
			      bool host_qp)
{
	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
	void *qpc;
	int ret;

	/* Init */
	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
	MLX5_SET(qpc, qpc, rre, 1);
	MLX5_SET(qpc, qpc, rwe, 1);
	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
	if (ret)
		return ret;

	if (host_qp) {
		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
		int i;

		for (i = 0; i < qp->rq.wqe_cnt; i++) {
			mlx5vf_post_recv(qp);
			recv_buf->next_rq_offset += qp->max_msg_size;
		}
	}

	/* RTR */
	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
	if (ret || host_qp)
		return ret;

	/* RTS */
	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
	MLX5_SET(qpc, qpc, retry_count, 7);
	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);

	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
}

static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_qp *qp)
{
	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
	mlx5_cmd_exec_in(mdev, destroy_qp, in);

	mlx5_frag_buf_free(mdev, &qp->buf);
	mlx5_db_free(mdev, &qp->db);
	kfree(qp);
}

static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
{
	int i;

	/* Undo alloc_pages_bulk_array() */
	for (i = 0; i < recv_buf->npages; i++)
		__free_page(recv_buf->page_list[i]);

	kvfree(recv_buf->page_list);
}

static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
			    unsigned int npages)
{
	unsigned int filled = 0, done = 0;
	int i;

	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
				       GFP_KERNEL_ACCOUNT);
	if (!recv_buf->page_list)
		return -ENOMEM;

	for (;;) {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
						npages - done,
						recv_buf->page_list + done);
		if (!filled)
			goto err;

		done += filled;
		if (done == npages)
			break;
	}

	recv_buf->npages = npages;
	return 0;

err:
	for (i = 0; i < npages; i++) {
		if (recv_buf->page_list[i])
			__free_page(recv_buf->page_list[i]);
	}

	kvfree(recv_buf->page_list);
	return -ENOMEM;
}

static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
				   struct mlx5_vhca_recv_buf *recv_buf)
{
	int i, j;

	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
				       sizeof(*recv_buf->dma_addrs),
				       GFP_KERNEL_ACCOUNT);
	if (!recv_buf->dma_addrs)
		return -ENOMEM;

	for (i = 0; i < recv_buf->npages; i++) {
		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
						      recv_buf->page_list[i],
						      0, PAGE_SIZE,
						      DMA_FROM_DEVICE);
		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
			goto error;
	}
	return 0;

error:
	for (j = 0; j < i; j++)
		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
				 PAGE_SIZE, DMA_FROM_DEVICE);

	kvfree(recv_buf->dma_addrs);
	return -ENOMEM;
}

static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
				      struct mlx5_vhca_recv_buf *recv_buf)
{
	int i;

	for (i = 0; i < recv_buf->npages; i++)
		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
				 PAGE_SIZE, DMA_FROM_DEVICE);

	kvfree(recv_buf->dma_addrs);
}

static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp)
{
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
	unregister_dma_recv_pages(mdev, recv_buf);
	free_recv_pages(&qp->recv_buf);
}

static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
					  struct mlx5_vhca_qp *qp, u32 pdn,
					  u64 rq_size)
{
	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
	int err;

	err = alloc_recv_pages(recv_buf, npages);
	if (err < 0)
		return err;

	err = register_dma_recv_pages(mdev, recv_buf);
	if (err)
		goto end;

	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
	if (err)
		goto err_create_mkey;

	return 0;

err_create_mkey:
	unregister_dma_recv_pages(mdev, recv_buf);
end:
	free_recv_pages(recv_buf);
	return err;
}

static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_core_dev *mdev = mvdev->mdev;

	lockdep_assert_held(&mvdev->state_mutex);

	if (!mvdev->log_active)
		return;

	WARN_ON(mvdev->mdev_detach);

	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
	mlx5vf_destroy_qp(mdev, tracker->host_qp);
	mlx5vf_destroy_cq(mdev, &tracker->cq);
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
	mlx5_put_uars_page(mdev, tracker->uar);
	mvdev->log_active = false;
}

int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active)
		goto end;

	_mlx5vf_free_page_tracker_resources(mvdev);
	mvdev->log_active = false;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

int mlx5vf_start_page_tracker(struct vfio_device *vdev,
			      struct rb_root_cached *ranges, u32 nnodes,
			      u64 *page_size)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	u8 log_tracked_page = ilog2(*page_size);
	struct mlx5_vhca_qp *host_qp;
	struct mlx5_vhca_qp *fw_qp;
	struct mlx5_core_dev *mdev;
	u32 max_msg_size = PAGE_SIZE;
	u64 rq_size = SZ_2M;
	u32 max_recv_wr;
	int err;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	if (mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	mdev = mvdev->mdev;
	memset(tracker, 0, sizeof(*tracker));
	tracker->uar = mlx5_get_uars_page(mdev);
	if (IS_ERR(tracker->uar)) {
		err = PTR_ERR(tracker->uar);
		goto end;
	}

	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
	if (err)
		goto err_uar;

	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
	if (err)
		goto err_dealloc_pd;

	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
	if (IS_ERR(host_qp)) {
		err = PTR_ERR(host_qp);
		goto err_cq;
	}

	host_qp->max_msg_size = max_msg_size;
	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_min_page_size);
	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size)) {
		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
				pg_track_log_max_page_size);
	}
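
	/*
	 * The device supports only a bounded range of tracked page sizes, so
	 * the requested size was clamped above; the value actually used is
	 * reported back to the caller via *page_size once tracking starts.
	 */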
	host_qp->tracked_page_size = (1ULL << log_tracked_page);
	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
					     rq_size);
	if (err)
		goto err_host_qp;

	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
	if (IS_ERR(fw_qp)) {
		err = PTR_ERR(fw_qp);
		goto err_recv_resources;
	}

	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
	if (err)
		goto err_activate;

	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
	if (err)
		goto err_activate;

	tracker->host_qp = host_qp;
	tracker->fw_qp = fw_qp;
	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
	if (err)
		goto err_activate;

	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
	mlx5_eq_notifier_register(mdev, &tracker->nb);
	*page_size = host_qp->tracked_page_size;
	mvdev->log_active = true;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;

err_activate:
	mlx5vf_destroy_qp(mdev, fw_qp);
err_recv_resources:
	mlx5vf_free_qp_recv_resources(mdev, host_qp);
err_host_qp:
	mlx5vf_destroy_qp(mdev, host_qp);
err_cq:
	mlx5vf_destroy_cq(mdev, &tracker->cq);
err_dealloc_pd:
	mlx5_core_dealloc_pd(mdev, tracker->pdn);
err_uar:
	mlx5_put_uars_page(mdev, tracker->uar);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}

static void
set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
		  struct iova_bitmap *dirty)
{
	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
	u32 nent = size / entry_size;
	struct page *page;
	u64 addr;
	u64 *buf;
	int i;

	if (WARN_ON(index >= qp->recv_buf.npages ||
		    (nent > qp->max_msg_size / entry_size)))
		return;

	page = qp->recv_buf.page_list[index];
	buf = kmap_local_page(page);
	for (i = 0; i < nent; i++) {
		addr = MLX5_GET(page_track_report_entry, buf + i,
				dirty_address_low);
		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
				      dirty_address_high) << 32;
		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
	}
	kunmap_local(buf);
}

static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
	      struct iova_bitmap *dirty, int *tracker_status)
{
	u32 size;
	int ix;

	qp->rq.cc++;
	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
	size = be32_to_cpu(cqe->byte_cnt);
	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

	/* zero length CQE, no data */
	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
	if (size)
		set_report_output(size, ix, qp, dirty);

	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
	mlx5vf_post_recv(qp);
}

static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
}

static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
	struct mlx5_cqe64 *cqe64;

	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
		return cqe64;
	} else {
		return NULL;
	}
}

static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
		   struct iova_bitmap *dirty, int *tracker_status)
{
	struct mlx5_cqe64 *cqe;
	u8 opcode;

	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
	if (!cqe)
		return CQ_EMPTY;

	++cq->mcq.cons_index;
	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	rmb();
	opcode = get_cqe_opcode(cqe);
	switch (opcode) {
	case MLX5_CQE_RESP_SEND_IMM:
		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
		return CQ_OK;
	default:
		return CQ_POLL_ERR;
	}
}

int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
				  unsigned long length,
				  struct iova_bitmap *dirty)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	struct mlx5_vhca_cq *cq = &tracker->cq;
	struct mlx5_core_dev *mdev;
	int poll_err, err;

	mutex_lock(&mvdev->state_mutex);
	if (!mvdev->log_active) {
		err = -EINVAL;
		goto end;
	}

	if (mvdev->mdev_detach) {
		err = -ENOTCONN;
		goto end;
	}

	mdev = mvdev->mdev;
	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
					MLX5_PAGE_TRACK_STATE_REPORTING);
	if (err)
		goto end;

	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
	       !tracker->is_err) {
		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
					      &tracker->status);
		if (poll_err == CQ_EMPTY) {
			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
				    cq->mcq.cons_index);
			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
						      dirty, &tracker->status);
			if (poll_err == CQ_EMPTY) {
				wait_for_completion(&mvdev->tracker_comp);
				continue;
			}
		}
		if (poll_err == CQ_POLL_ERR) {
			err = -EIO;
			goto end;
		}
		mlx5_cq_set_ci(&cq->mcq);
	}

	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
		tracker->is_err = true;

	if (tracker->is_err)
		err = -EIO;
end:
	mlx5vf_state_mutex_unlock(mvdev);
	return err;
}