// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include "cmd.h"

/*
 * Status codes.
 * NOTE(review): presumably returned by CQ polling code that is not visible
 * in this part of the file - confirm against the users.
 */
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };

/*
 * Check whether the function @func_id reports the 'migratable' capability.
 *
 * Queries the GENERAL_2 capabilities of the other function into a temporary
 * buffer and tests the 'migratable' bit of cmd_hca_cap_2.
 *
 * Return: 0 when the bit is set, -EOPNOTSUPP when it is clear, -ENOMEM when
 * the query buffer cannot be allocated, or the error returned by
 * mlx5_vport_get_other_func_cap().
 */
static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
{
	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	void *query_cap = NULL, *cap;
	int ret;

	query_cap = kzalloc(query_sz, GFP_KERNEL);
	if (!query_cap)
		return -ENOMEM;

	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
					    MLX5_CAP_GENERAL_2);
	if (ret)
		goto out;

	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
		ret = -EOPNOTSUPP;
out:
	/* Common exit: the query buffer is released on success and failure */
	kfree(query_cap);
	return ret;
}

/* Forward declarations of helpers that are used before their definition */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id);
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);

/*
 * Issue the SUSPEND_VHCA command for the vhca of @mvdev with @op_mod.
 *
 * Must be called with mvdev->state_mutex held (checked by lockdep).
 *
 * Return: 0 on success, -ENOTCONN when mdev_detach is set, the error of an
 * interrupted wait on the saving file's save_comp, or the command error.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the save
	 * command once it will try to turn on 'tracking' on a suspended device.
*/
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	/* Balance the wait above, whether or not the command succeeded */
	if (migf)
		complete(&migf->save_comp);

	return err;
}

/*
 * Issue the RESUME_VHCA command for the vhca of @mvdev with @op_mod.
 *
 * Must be called with mvdev->state_mutex held (checked by lockdep).
 *
 * Return: 0 on success, -ENOTCONN when mdev_detach is set, or the command
 * error.
 */
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);

	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
}

/*
 * Query the size required to hold the migration state of the vhca.
 *
 * @state_size:  output, set to the 'required_umem_size' reported by the
 *               command (or to 0 in the PRE_COPY error shortcut below).
 * @query_flags: MLX5VF_QUERY_INC requests an incremental query and
 *               serializes against an active save through save_comp;
 *               MLX5VF_QUERY_FINAL marks the query for the final image.
 *
 * Must be called with mvdev->state_mutex held (checked by lockdep).
 *
 * Return: 0 on success, -ENOTCONN when mdev_detach is set, the error of an
 * interrupted wait, or the command error.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	/* Latched here: query_flags may lose MLX5VF_QUERY_INC further down */
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the
	 * incremental query command on un-tracked vhca.
*/
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		if (mvdev->saving_migf->state ==
		    MLX5_MIGF_STATE_PRE_COPY_ERROR) {
			/*
			 * In case we had a PRE_COPY error, only query full
			 * image for final image
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	/*
	 * Uses the original 'inc' value, so the wait above is balanced even
	 * when MLX5VF_QUERY_INC was cleared from query_flags.
	 */
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	return 0;
}

/* Flag the page tracker as failed and signal tracker_comp. */
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}

/*
 * Notifier callback for PF events concerning this VF.
 *
 * MLX5_PF_NOTIFY_ENABLE_VF clears mdev_detach under state_mutex.
 * MLX5_PF_NOTIFY_DISABLE_VF first closes the migration resources and then
 * sets mdev_detach under state_mutex. Other events are ignored.
 *
 * Return: always 0.
 */
static int mlx5fv_vf_event(struct notifier_block *nb,
			   unsigned long event, void *data)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(nb, struct mlx5vf_pci_core_device, nb);

	switch (event) {
	case MLX5_PF_NOTIFY_ENABLE_VF:
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = false;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	case MLX5_PF_NOTIFY_DISABLE_VF:
		/* Takes state_mutex itself, so call it before locking here */
		mlx5vf_cmd_close_migratable(mvdev);
		mutex_lock(&mvdev->state_mutex);
		mvdev->mdev_detach = true;
		mlx5vf_state_mutex_unlock(mvdev);
		break;
	default:
		break;
	}

	return 0;
}

/*
 * Tear down the active migration state of @mvdev: flag the tracker as
 * failed, then disable the migration fds and free the page tracker
 * resources under state_mutex. Does nothing when migrate_cap is not set.
 */
void
mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}

/*
 * Undo what mlx5vf_cmd_set_migratable() registered: unregister the SR-IOV
 * notifier and destroy the callback workqueue. Does nothing when
 * migrate_cap is not set.
 */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}

/*
 * Probe whether @mvdev supports migration and, if so, set it up.
 *
 * @mig_ops: migration ops installed on the vfio device on success.
 * @log_ops: logging ops, installed only when the device reports the
 *           'adv_virtualization' capability.
 *
 * The function returns silently (leaving migrate_cap unset) when the PCI
 * device is not a VF, no core device is available, a required capability is
 * missing, or any setup step fails. On success it sets migrate_cap and the
 * vfio migration flags.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* The capability queries below are addressed to function vf_id + 1 */
	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		/* Nothing else was set up yet; only the workqueue to undo */
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P;
	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev,
adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	/* PRE_COPY is advertised only when both capabilities are present */
	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	    MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state))
		mvdev->core_device.vdev.migration_flags |=
			VFIO_MIGRATION_PRE_COPY;

end:
	/* Reached on success too: the core device reference is always put */
	mlx5_vf_put_core_dev(mvdev->mdev);
}

/*
 * Read the vhca_id of the function @function_id.
 *
 * Executes QUERY_HCA_CAP with 'other_function' set, asking for the current
 * general device capabilities, and stores cmd_hca_cap.vhca_id in @vhca_id.
 * @vhca_id is written only on success.
 *
 * Return: 0 on success, -ENOMEM when the output buffer cannot be allocated,
 * or the command error.
 */
static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
				  u16 *vhca_id)
{
	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
	int out_size;
	void *out;
	int ret;

	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
	out = kzalloc(out_size, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
	MLX5_SET(query_hca_cap_in, in, other_function, 1);
	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
	MLX5_SET(query_hca_cap_in, in, op_mod,
		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
		 HCA_CAP_OPMOD_GET_CUR);

	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
	if (ret)
		goto err_exec;

	*vhca_id = MLX5_GET(query_hca_cap_out, out,
			    capability.cmd_hca_cap.vhca_id);

err_exec:
	/* Also the success exit: 'out' is freed on every path */
	kfree(out);
	return ret;
}

/*
 * Create an MTT-mode mkey over the pages of @buf or, when @buf is NULL, of
 * @recv_buf.
 *
 * @pdn:  protection domain number programmed into the mkey.
 * @mkey: output, filled by mlx5_core_create_mkey().
 *
 * Return: 0 on success, -ENOMEM when the command buffer cannot be
 * allocated, or the error from mlx5_core_create_mkey().
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ?
DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/*
	 * One 8-byte MTT entry per page; the entry count is rounded up to an
	 * even number, matching the octword counts programmed below.
	 */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	if (buf) {
		struct sg_dma_page_iter dma_iter;

		/* Data buffer: take the DMA address of each mapped page */
		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		/* Receive buffer: addresses come from its dma_addrs array */
		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}

/*
 * DMA-map the scatter table of @buf and create an mkey over it.
 *
 * Must be called with the owning device's state_mutex held (checked by
 * lockdep). On success buf->dmaed is set and buf->mkey is valid.
 *
 * Return: 0 on success, -ENOTCONN when mdev_detach is set, -EINVAL when
 * the buffer is already mapped or has no allocated pages, or the error of
 * the mapping / mkey creation (the mapping is undone when the mkey fails).
 */
static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (buf->dmaed || !buf->allocated_length)
		return -EINVAL;

	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
goto err;

	buf->dmaed = true;

	return 0;
err:
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}

/*
 * Release @buf completely: destroy its mkey and DMA mapping when it was
 * mapped, free every page of its scatter table, free the table and finally
 * the buffer descriptor itself.
 *
 * Must be called with the owning device's state_mutex held (checked by
 * lockdep); warns when called while mdev_detach is set.
 */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}

/*
 * Allocate a data buffer for @migf.
 *
 * @length:  requested size in bytes; when 0 only the descriptor is
 *           allocated, with no pages and no mapping.
 * @dma_dir: DMA direction; with DMA_NONE the pages are allocated but not
 *           mapped.
 *
 * Return: the new buffer, or an ERR_PTR() carrying -ENOMEM or the error of
 * the page allocation / DMA mapping step.
 */
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	/* Error exit only: releases whatever was set up so far */
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}

/* Return @buf to its migration file's avail_list, under list_lock. */
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}

/*
 * Get a buffer of at least @length bytes with direction @dma_dir.
 *
 * A matching buffer from migf->avail_list is reused when one is large
 * enough; otherwise a new one is allocated. Buffers of the same direction
 * that turned out to be too small are freed on the way.
 *
 * Must be called with the owning device's state_mutex held (checked by
 * lockdep).
 *
 * Return: the buffer, ERR_PTR(-ENOTCONN) when mdev_detach is set, or the
 * ERR_PTR() returned by mlx5vf_alloc_data_buffer().
 */
struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			/* Taken off avail_list whether it is reused or freed */
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put in a free
			 * list and call at the end not under the spin lock
			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	/* Reached on both paths: drop the too-small buffers collected above */
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}

/*
 * Common tail of a finished save command: free the command output buffer,
 * signal save_comp and drop a reference on the migration file.
 */
static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}

/*
 * Work handler that finishes a save command outside the async callback.
 *
 * When async_data->status is non-zero, the data buffer (and header buffer,
 * if any) go back to the avail list, the file state becomes
 * MLX5_MIGF_STATE_PRE_COPY_ERROR for MLX5_CMD_STAT_BAD_RES_STATE_ERR and
 * MLX5_MIGF_STATE_ERROR otherwise, and poll waiters are woken. In every
 * case the save command is then completed.
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
mlx5vf_save_callback_complete(migf, async_data);
}

/*
 * Fill @header_buf with a migration header describing an image of
 * @image_size bytes and queue it on the file's buf_list.
 *
 * The header (mandatory flag, FW_DATA tag) is copied into the first page of
 * @header_buf; the buffer is positioned at the current max_pos, which is
 * then advanced by the header size.
 *
 * @initial_pre_copy: when true, the header size is also added to
 *                    pre_copy_initial_bytes.
 *
 * Return: 0 on success, -EINVAL when @header_buf has no page at offset 0.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}

/*
 * Completion callback of the asynchronous SAVE_VHCA_STATE command.
 *
 * On success (@status == 0) the optional header buffer and the data buffer
 * are queued on buf_list, the file state moves to COMPLETE for the last
 * chunk or PRE_COPY otherwise, poll waiters are woken and the save command
 * is completed. On failure the status is recorded and the rest of the
 * handling is deferred to the cb_wq work item.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
			struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
			struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t image_size;
		unsigned long flags;
		/* True only for a non-final chunk saved before PRE_COPY state */
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->last_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos +=
async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy)
			migf->pre_copy_initial_bytes += image_size;
		migf->state = async_data->last_chunk ?
			MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
		wake_up_interruptible(&migf->poll_wait);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	/* For -EREMOTEIO, report the status field of the command output */
	if (status == -EREMOTEIO)
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}

/*
 * Start an asynchronous SAVE_VHCA_STATE command into @buf.
 *
 * @inc:   request an incremental image; forced to false when the file is in
 *         MLX5_MIGF_STATE_PRE_COPY_ERROR.
 * @track: programmed into 'set_track'; when false this is treated as the
 *         last chunk.
 *
 * Must be called with mvdev->state_mutex held (checked by lockdep). The
 * function waits on migf->save_comp before issuing the command; on the
 * error paths here save_comp is completed again, otherwise that is left to
 * the completion handling of the command. A file reference is taken for the
 * duration of the command.
 *
 * Return: 0 when the command was submitted, -ENOTCONN when mdev_detach is
 * set, the error of an interrupted wait, -ENOMEM, the error of obtaining a
 * header buffer, or the submission error.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
*/
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->last_chunk = !track;
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		if (async_data->last_chunk && migf->buf_header) {
			/* Last chunk: use the file's own buf_header, if any */
			header_buf = migf->buf_header;
			migf->buf_header = NULL;
		} else {
			header_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
			if (IS_ERR(header_buf)) {
				err = PTR_ERR(header_buf);
				goto err_free;
			}
		}
	}

	if (async_data->last_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_LAST;

	async_data->header_buf = header_buf;
	/* Dropped by fput() below on failure, or when the command completes */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}

/*
 * Load the device state held in @buf into the vhca (LOAD_VHCA_STATE).
 *
 * The buffer is DMA-mapped first when it is not mapped yet; buf->length
 * bytes are loaded.
 *
 * Must be called with mvdev->state_mutex held (checked by lockdep).
 *
 * Return: 0 on success, -ENOTCONN when mdev_detach is set, the mapping
 * error, or the command error.
 */
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}

/*
 * Allocate a protection domain for @migf and store its number in migf->pdn.
 *
 * Must be called with the owning device's state_mutex held (checked by
 * lockdep).
 *
 * Return: 0 on success, -ENOTCONN when mdev_detach is set, or the error
 * from mlx5_core_alloc_pd().
 */
int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
{
	int err;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return -ENOTCONN;

	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
	return err;
}

/*
 * Free the protection domain of @migf. Silently does nothing when
 * mdev_detach is set.
 *
 * Must be called with the owning device's state_mutex held (checked by
 * lockdep).
 */
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}

/*
 * Free every buffer owned by @migf (buf, buf_header, and all entries of
 * avail_list and buf_list) and then its protection domain.
 *
 * Must be called with the owning device's state_mutex held (checked by
 * lockdep); warns when called while mdev_detach is set.
 */
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (migf->buf) {
		mlx5vf_free_data_buffer(migf->buf);
		migf->buf = NULL;
	}

	if (migf->buf_header) {
		mlx5vf_free_data_buffer(migf->buf_header);
		migf->buf_header = NULL;
	}

	/* Merge both lists so that a single loop frees all buffers */
	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	/* Buffers (and their mkeys) are gone before the PD is released */
	mlx5vf_cmd_dealloc_pd(migf);
}

/*
 * Create the firmware page-track object covering the IOVA ranges in
 * @ranges (@nnodes interval-tree nodes).
 *
 * When there are more nodes than the device's pg_track_max_num_range, the
 * ranges are first combined down to that maximum. On success the object id
 * is stored in mvdev->tracker.id.
 *
 * Return: 0 on success, -ENOMEM when the command buffer cannot be
 * allocated, -EOPNOTSUPP when the total tracked size is outside the
 * device's supported address-space bounds, or the command error.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct
mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	/* Too many ranges for the device: merge them down to the maximum */
	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	/* Fixed command layout followed by one range record per range */
	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		 record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	/* Emit one record per tree node and accumulate the total length */
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		/* node->last is inclusive, hence the + 1 */
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len +=
length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	/* All nodes are expected to be consumed by the loop above */
	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	/* Common exit: the command buffer is freed on success and failure */
	kfree(in);
	return err;
}

/*
 * Destroy the page-track object @tracker_id.
 *
 * Return: the result of the DESTROY_GENERAL_OBJECT command.
 */
static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
				      u32 tracker_id)
{
	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};

	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

/*
 * Modify the page-track object @tracker_id: set its range start address to
 * @iova, its length to @length and its state to @tracker_state.
 *
 * Return: the result of the MODIFY_GENERAL_OBJECT command.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	/* NOTE(review): 0x3 presumably selects the range and state fields - confirm */
	MLX5_SET64(page_track, obj_context,
modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}

/*
 * Allocate the fragmented buffer backing a CQ of @nent entries of
 * @cqe_size bytes each, on the device's NUMA node, and initialise the
 * fragment-buffer control in @buf.
 *
 * Return: 0 on success or the error from mlx5_frag_buf_alloc_node().
 */
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
			     struct mlx5_vhca_cq_buf *buf, int nent,
			     int cqe_size)
{
	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
	/* 6 for a 64-byte CQE stride, 7 for a 128-byte one */
	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
	/*
	 * NOTE(review): the log size is derived from cqe_size rather than
	 * from nent - confirm that this is intended.
	 */
	u8 log_wq_sz = ilog2(cqe_size);
	int err;

	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
				       mdev->priv.numa_node);
	if (err)
		return err;

	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
	buf->cqe_size = cqe_size;
	buf->nent = nent;
	return 0;
}

/* Mark every CQE of @buf as invalid by writing MLX5_CQE_INVALID to op_own. */
static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
{
	struct mlx5_cqe64 *cqe64;
	void *cqe;
	int i;

	for (i = 0; i < buf->nent; i++) {
		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
		/* For entries that are not 64 bytes, use the part at offset 64 */
		cqe64 = buf->cqe_size == 64 ?
cqe : cqe + 64;
		cqe64->op_own = MLX5_CQE_INVALID << 4;
	}
}

/* Destroy the CQ @cq, then free its fragment buffer and doorbell record. */
static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
			      struct mlx5_vhca_cq *cq)
{
	mlx5_core_destroy_cq(mdev, &cq->mcq);
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
	mlx5_db_free(mdev, &cq->db);
}

/*
 * CQ event handler: a MLX5_EVENT_TYPE_CQ_ERROR event puts the owning
 * device's tracker into the error state; any other event is ignored.
 */
static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
{
	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
		return;

	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
				       tracker.cq.mcq));
}

/*
 * Device event notifier for the page tracker.
 *
 * For the WQ catastrophic / access / invalid-request error events, the
 * tracker is put into the error state when the event refers to a QP whose
 * number matches the tracker's host QP or firmware QP. Everything else is
 * ignored.
 *
 * Return: always NOTIFY_OK.
 */
static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
				 void *data)
{
	struct mlx5_vhca_page_tracker *tracker =
		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
	struct mlx5vf_pci_core_device *mvdev = container_of(
		tracker, struct mlx5vf_pci_core_device, tracker);
	struct mlx5_eqe *eqe = data;
	u8 event_type = (u8)type;
	u8 queue_type;
	int qp_num;

	switch (event_type) {
	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
		queue_type = eqe->data.qp_srq.type;
		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
			break;
		/* The queue number is taken from the low 24 bits */
		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
		if (qp_num != tracker->host_qp->qpn &&
		    qp_num != tracker->fw_qp->qpn)
			break;
		set_tracker_error(mvdev);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

/* CQ completion handler: signal the owning device's tracker_comp. */
static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
			       struct mlx5_eqe *eqe)
{
	struct mlx5vf_pci_core_device *mvdev =
		container_of(mcq, struct mlx5vf_pci_core_device,
			     tracker.cq.mcq);

	complete(&mvdev->tracker_comp);
}

/*
 * Create and arm the tracker's completion queue.
 *
 * @ncqe: requested number of entries; rounded up to a power of two.
 *
 * Allocates the doorbell record and CQE buffer, marks all CQEs invalid,
 * creates the CQ with the handlers defined above and arms it.
 *
 * Return: 0 on success or a negative errno; everything allocated here is
 * released on failure.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
			    struct mlx5_vhca_page_tracker *tracker,
			    size_t ncqe)
{
	/* 128-byte CQEs only when the cache line is 128 bytes, else 64 */
	int cqe_size = cache_line_size() == 128 ?
128 : 64;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vhca_cq *cq;
	int inlen, err, eqn;
	void *cqc, *in;
	__be64 *pas;
	int vector;

	cq = &tracker->cq;
	ncqe = roundup_pow_of_two(ncqe);
	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
	if (err)
		return err;

	cq->ncqe = ncqe;
	/* First doorbell word is the consumer index, the second the arm word */
	cq->mcq.set_ci_db = cq->db.db;
	cq->mcq.arm_db = cq->db.db + 1;
	cq->mcq.cqe_sz = cqe_size;
	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
	if (err)
		goto err_db_free;

	init_cq_frag_buf(&cq->buf);
	/* Fixed command layout plus one 'pas' entry per buffer page */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
		cq->buf.frag_buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_buff;
	}

	/* Choose the completion vector from the CPU currently running */
	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
		 MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
	cq->mcq.comp = mlx5vf_cq_complete;
	cq->mcq.event = mlx5vf_cq_event;
	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
		    cq->mcq.cons_index);
	kvfree(in);
	return 0;

err_vec:
	kvfree(in);
err_buff:
	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
err_db_free:
	mlx5_db_free(mdev, &cq->db);
	return err;
}

/*
 * Create an RC QP without a send queue for the page tracker.
 *
 * @max_recv_wr: number of receive WQEs wanted; rounded up to a power of
 *               two. With 0 the QP is created with a zero-length RQ and no
 *               receive buffer is allocated.
 *
 * Return: the new QP, or an ERR_PTR() carrying -ENOMEM or the error of the
 * failing allocation / command.
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev
*mdev,
		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	struct mlx5_vhca_qp *qp;
	u8 log_rq_stride;
	u8 log_rq_sz;
	void *qpc;
	int inlen;
	void *in;
	int err;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
	if (!qp)
		return ERR_PTR(-ENOMEM);

	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
	if (err)
		goto err_free;

	/* The receive ring is allocated only when receive WQEs are wanted */
	if (max_recv_wr) {
		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
		log_rq_sz = ilog2(qp->rq.wqe_cnt);
		err = mlx5_frag_buf_alloc_node(mdev,
			wq_get_byte_sz(log_rq_sz, log_rq_stride),
			&qp->buf, mdev->priv.numa_node);
		if (err)
			goto err_db_free;
		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
	}

	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
	/* qp was zero-allocated, so npages is 0 when no ring was allocated */
	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
		qp->buf.npages;
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_in;
	}

	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, tracker->pdn);
	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
	MLX5_SET(qpc, qpc, log_page_size,
		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
	MLX5_SET(qpc, qpc, no_sq, 1);
	if (max_recv_wr) {
		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
mlx5_fill_page_frag_array(&qp->buf, 1083 (__be64 *)MLX5_ADDR_OF(create_qp_in, 1084 in, pas)); 1085 } else { 1086 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); 1087 } 1088 1089 MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); 1090 err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); 1091 kvfree(in); 1092 if (err) 1093 goto err_in; 1094 1095 qp->qpn = MLX5_GET(create_qp_out, out, qpn); 1096 return qp; 1097 1098 err_in: 1099 if (max_recv_wr) 1100 mlx5_frag_buf_free(mdev, &qp->buf); 1101 err_db_free: 1102 mlx5_db_free(mdev, &qp->db); 1103 err_free: 1104 kfree(qp); 1105 return ERR_PTR(err); 1106 } 1107 1108 static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp) 1109 { 1110 struct mlx5_wqe_data_seg *data; 1111 unsigned int ix; 1112 1113 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt); 1114 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1); 1115 data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix); 1116 data->byte_count = cpu_to_be32(qp->max_msg_size); 1117 data->lkey = cpu_to_be32(qp->recv_buf.mkey); 1118 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset); 1119 qp->rq.pc++; 1120 /* Make sure that descriptors are written before doorbell record. 
*/ 1121 dma_wmb(); 1122 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff); 1123 } 1124 1125 static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev, 1126 struct mlx5_vhca_qp *qp, u32 remote_qpn, 1127 bool host_qp) 1128 { 1129 u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; 1130 u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; 1131 u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; 1132 void *qpc; 1133 int ret; 1134 1135 /* Init */ 1136 qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc); 1137 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); 1138 MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); 1139 MLX5_SET(qpc, qpc, rre, 1); 1140 MLX5_SET(qpc, qpc, rwe, 1); 1141 MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP); 1142 MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn); 1143 ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in); 1144 if (ret) 1145 return ret; 1146 1147 if (host_qp) { 1148 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1149 int i; 1150 1151 for (i = 0; i < qp->rq.wqe_cnt; i++) { 1152 mlx5vf_post_recv(qp); 1153 recv_buf->next_rq_offset += qp->max_msg_size; 1154 } 1155 } 1156 1157 /* RTR */ 1158 qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc); 1159 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); 1160 MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); 1161 MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg)); 1162 MLX5_SET(qpc, qpc, remote_qpn, remote_qpn); 1163 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); 1164 MLX5_SET(qpc, qpc, primary_address_path.fl, 1); 1165 MLX5_SET(qpc, qpc, min_rnr_nak, 1); 1166 MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP); 1167 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); 1168 ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in); 1169 if (ret || host_qp) 1170 return ret; 1171 1172 /* RTS */ 1173 qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc); 1174 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); 1175 MLX5_SET(qpc, qpc, retry_count, 7); 1176 MLX5_SET(qpc, qpc, rnr_retry, 
7); /* Infinite retry if RNR NACK */ 1177 MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ 1178 MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); 1179 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); 1180 1181 return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in); 1182 } 1183 1184 static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, 1185 struct mlx5_vhca_qp *qp) 1186 { 1187 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; 1188 1189 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); 1190 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); 1191 mlx5_cmd_exec_in(mdev, destroy_qp, in); 1192 1193 mlx5_frag_buf_free(mdev, &qp->buf); 1194 mlx5_db_free(mdev, &qp->db); 1195 kfree(qp); 1196 } 1197 1198 static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) 1199 { 1200 int i; 1201 1202 /* Undo alloc_pages_bulk_array() */ 1203 for (i = 0; i < recv_buf->npages; i++) 1204 __free_page(recv_buf->page_list[i]); 1205 1206 kvfree(recv_buf->page_list); 1207 } 1208 1209 static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, 1210 unsigned int npages) 1211 { 1212 unsigned int filled = 0, done = 0; 1213 int i; 1214 1215 recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), 1216 GFP_KERNEL_ACCOUNT); 1217 if (!recv_buf->page_list) 1218 return -ENOMEM; 1219 1220 for (;;) { 1221 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, 1222 npages - done, 1223 recv_buf->page_list + done); 1224 if (!filled) 1225 goto err; 1226 1227 done += filled; 1228 if (done == npages) 1229 break; 1230 } 1231 1232 recv_buf->npages = npages; 1233 return 0; 1234 1235 err: 1236 for (i = 0; i < npages; i++) { 1237 if (recv_buf->page_list[i]) 1238 __free_page(recv_buf->page_list[i]); 1239 } 1240 1241 kvfree(recv_buf->page_list); 1242 return -ENOMEM; 1243 } 1244 1245 static int register_dma_recv_pages(struct mlx5_core_dev *mdev, 1246 struct mlx5_vhca_recv_buf *recv_buf) 1247 { 1248 int i, j; 1249 1250 recv_buf->dma_addrs = kvcalloc(recv_buf->npages, 1251 
sizeof(*recv_buf->dma_addrs), 1252 GFP_KERNEL_ACCOUNT); 1253 if (!recv_buf->dma_addrs) 1254 return -ENOMEM; 1255 1256 for (i = 0; i < recv_buf->npages; i++) { 1257 recv_buf->dma_addrs[i] = dma_map_page(mdev->device, 1258 recv_buf->page_list[i], 1259 0, PAGE_SIZE, 1260 DMA_FROM_DEVICE); 1261 if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) 1262 goto error; 1263 } 1264 return 0; 1265 1266 error: 1267 for (j = 0; j < i; j++) 1268 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], 1269 PAGE_SIZE, DMA_FROM_DEVICE); 1270 1271 kvfree(recv_buf->dma_addrs); 1272 return -ENOMEM; 1273 } 1274 1275 static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, 1276 struct mlx5_vhca_recv_buf *recv_buf) 1277 { 1278 int i; 1279 1280 for (i = 0; i < recv_buf->npages; i++) 1281 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], 1282 PAGE_SIZE, DMA_FROM_DEVICE); 1283 1284 kvfree(recv_buf->dma_addrs); 1285 } 1286 1287 static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, 1288 struct mlx5_vhca_qp *qp) 1289 { 1290 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1291 1292 mlx5_core_destroy_mkey(mdev, recv_buf->mkey); 1293 unregister_dma_recv_pages(mdev, recv_buf); 1294 free_recv_pages(&qp->recv_buf); 1295 } 1296 1297 static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, 1298 struct mlx5_vhca_qp *qp, u32 pdn, 1299 u64 rq_size) 1300 { 1301 unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE); 1302 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; 1303 int err; 1304 1305 err = alloc_recv_pages(recv_buf, npages); 1306 if (err < 0) 1307 return err; 1308 1309 err = register_dma_recv_pages(mdev, recv_buf); 1310 if (err) 1311 goto end; 1312 1313 err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); 1314 if (err) 1315 goto err_create_mkey; 1316 1317 return 0; 1318 1319 err_create_mkey: 1320 unregister_dma_recv_pages(mdev, recv_buf); 1321 end: 1322 free_recv_pages(recv_buf); 1323 return err; 1324 } 1325 1326 static void 
1327 _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) 1328 { 1329 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 1330 struct mlx5_core_dev *mdev = mvdev->mdev; 1331 1332 lockdep_assert_held(&mvdev->state_mutex); 1333 1334 if (!mvdev->log_active) 1335 return; 1336 1337 WARN_ON(mvdev->mdev_detach); 1338 1339 mlx5_eq_notifier_unregister(mdev, &tracker->nb); 1340 mlx5vf_cmd_destroy_tracker(mdev, tracker->id); 1341 mlx5vf_destroy_qp(mdev, tracker->fw_qp); 1342 mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); 1343 mlx5vf_destroy_qp(mdev, tracker->host_qp); 1344 mlx5vf_destroy_cq(mdev, &tracker->cq); 1345 mlx5_core_dealloc_pd(mdev, tracker->pdn); 1346 mlx5_put_uars_page(mdev, tracker->uar); 1347 mvdev->log_active = false; 1348 } 1349 1350 int mlx5vf_stop_page_tracker(struct vfio_device *vdev) 1351 { 1352 struct mlx5vf_pci_core_device *mvdev = container_of( 1353 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1354 1355 mutex_lock(&mvdev->state_mutex); 1356 if (!mvdev->log_active) 1357 goto end; 1358 1359 _mlx5vf_free_page_tracker_resources(mvdev); 1360 mvdev->log_active = false; 1361 end: 1362 mlx5vf_state_mutex_unlock(mvdev); 1363 return 0; 1364 } 1365 1366 int mlx5vf_start_page_tracker(struct vfio_device *vdev, 1367 struct rb_root_cached *ranges, u32 nnodes, 1368 u64 *page_size) 1369 { 1370 struct mlx5vf_pci_core_device *mvdev = container_of( 1371 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1372 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 1373 u8 log_tracked_page = ilog2(*page_size); 1374 struct mlx5_vhca_qp *host_qp; 1375 struct mlx5_vhca_qp *fw_qp; 1376 struct mlx5_core_dev *mdev; 1377 u32 max_msg_size = PAGE_SIZE; 1378 u64 rq_size = SZ_2M; 1379 u32 max_recv_wr; 1380 int err; 1381 1382 mutex_lock(&mvdev->state_mutex); 1383 if (mvdev->mdev_detach) { 1384 err = -ENOTCONN; 1385 goto end; 1386 } 1387 1388 if (mvdev->log_active) { 1389 err = -EINVAL; 1390 goto end; 1391 } 1392 1393 mdev = 
mvdev->mdev; 1394 memset(tracker, 0, sizeof(*tracker)); 1395 tracker->uar = mlx5_get_uars_page(mdev); 1396 if (IS_ERR(tracker->uar)) { 1397 err = PTR_ERR(tracker->uar); 1398 goto end; 1399 } 1400 1401 err = mlx5_core_alloc_pd(mdev, &tracker->pdn); 1402 if (err) 1403 goto err_uar; 1404 1405 max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size); 1406 err = mlx5vf_create_cq(mdev, tracker, max_recv_wr); 1407 if (err) 1408 goto err_dealloc_pd; 1409 1410 host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr); 1411 if (IS_ERR(host_qp)) { 1412 err = PTR_ERR(host_qp); 1413 goto err_cq; 1414 } 1415 1416 host_qp->max_msg_size = max_msg_size; 1417 if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1418 pg_track_log_min_page_size)) { 1419 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1420 pg_track_log_min_page_size); 1421 } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1422 pg_track_log_max_page_size)) { 1423 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, 1424 pg_track_log_max_page_size); 1425 } 1426 1427 host_qp->tracked_page_size = (1ULL << log_tracked_page); 1428 err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn, 1429 rq_size); 1430 if (err) 1431 goto err_host_qp; 1432 1433 fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0); 1434 if (IS_ERR(fw_qp)) { 1435 err = PTR_ERR(fw_qp); 1436 goto err_recv_resources; 1437 } 1438 1439 err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true); 1440 if (err) 1441 goto err_activate; 1442 1443 err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false); 1444 if (err) 1445 goto err_activate; 1446 1447 tracker->host_qp = host_qp; 1448 tracker->fw_qp = fw_qp; 1449 err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes); 1450 if (err) 1451 goto err_activate; 1452 1453 MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY); 1454 mlx5_eq_notifier_register(mdev, &tracker->nb); 1455 *page_size = host_qp->tracked_page_size; 1456 mvdev->log_active = true; 1457 mlx5vf_state_mutex_unlock(mvdev); 
1458 return 0; 1459 1460 err_activate: 1461 mlx5vf_destroy_qp(mdev, fw_qp); 1462 err_recv_resources: 1463 mlx5vf_free_qp_recv_resources(mdev, host_qp); 1464 err_host_qp: 1465 mlx5vf_destroy_qp(mdev, host_qp); 1466 err_cq: 1467 mlx5vf_destroy_cq(mdev, &tracker->cq); 1468 err_dealloc_pd: 1469 mlx5_core_dealloc_pd(mdev, tracker->pdn); 1470 err_uar: 1471 mlx5_put_uars_page(mdev, tracker->uar); 1472 end: 1473 mlx5vf_state_mutex_unlock(mvdev); 1474 return err; 1475 } 1476 1477 static void 1478 set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, 1479 struct iova_bitmap *dirty) 1480 { 1481 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); 1482 u32 nent = size / entry_size; 1483 struct page *page; 1484 u64 addr; 1485 u64 *buf; 1486 int i; 1487 1488 if (WARN_ON(index >= qp->recv_buf.npages || 1489 (nent > qp->max_msg_size / entry_size))) 1490 return; 1491 1492 page = qp->recv_buf.page_list[index]; 1493 buf = kmap_local_page(page); 1494 for (i = 0; i < nent; i++) { 1495 addr = MLX5_GET(page_track_report_entry, buf + i, 1496 dirty_address_low); 1497 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, 1498 dirty_address_high) << 32; 1499 iova_bitmap_set(dirty, addr, qp->tracked_page_size); 1500 } 1501 kunmap_local(buf); 1502 } 1503 1504 static void 1505 mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe, 1506 struct iova_bitmap *dirty, int *tracker_status) 1507 { 1508 u32 size; 1509 int ix; 1510 1511 qp->rq.cc++; 1512 *tracker_status = be32_to_cpu(cqe->immediate) >> 28; 1513 size = be32_to_cpu(cqe->byte_cnt); 1514 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1); 1515 1516 /* zero length CQE, no data */ 1517 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING); 1518 if (size) 1519 set_report_output(size, ix, qp, dirty); 1520 1521 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size; 1522 mlx5vf_post_recv(qp); 1523 } 1524 1525 static void *get_cqe(struct mlx5_vhca_cq *cq, int n) 1526 { 1527 return 
mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); 1528 } 1529 1530 static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n) 1531 { 1532 void *cqe = get_cqe(cq, n & (cq->ncqe - 1)); 1533 struct mlx5_cqe64 *cqe64; 1534 1535 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; 1536 1537 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && 1538 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) { 1539 return cqe64; 1540 } else { 1541 return NULL; 1542 } 1543 } 1544 1545 static int 1546 mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp, 1547 struct iova_bitmap *dirty, int *tracker_status) 1548 { 1549 struct mlx5_cqe64 *cqe; 1550 u8 opcode; 1551 1552 cqe = get_sw_cqe(cq, cq->mcq.cons_index); 1553 if (!cqe) 1554 return CQ_EMPTY; 1555 1556 ++cq->mcq.cons_index; 1557 /* 1558 * Make sure we read CQ entry contents after we've checked the 1559 * ownership bit. 1560 */ 1561 rmb(); 1562 opcode = get_cqe_opcode(cqe); 1563 switch (opcode) { 1564 case MLX5_CQE_RESP_SEND_IMM: 1565 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status); 1566 return CQ_OK; 1567 default: 1568 return CQ_POLL_ERR; 1569 } 1570 } 1571 1572 int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, 1573 unsigned long length, 1574 struct iova_bitmap *dirty) 1575 { 1576 struct mlx5vf_pci_core_device *mvdev = container_of( 1577 vdev, struct mlx5vf_pci_core_device, core_device.vdev); 1578 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; 1579 struct mlx5_vhca_cq *cq = &tracker->cq; 1580 struct mlx5_core_dev *mdev; 1581 int poll_err, err; 1582 1583 mutex_lock(&mvdev->state_mutex); 1584 if (!mvdev->log_active) { 1585 err = -EINVAL; 1586 goto end; 1587 } 1588 1589 if (mvdev->mdev_detach) { 1590 err = -ENOTCONN; 1591 goto end; 1592 } 1593 1594 mdev = mvdev->mdev; 1595 err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, 1596 MLX5_PAGE_TRACK_STATE_REPORTING); 1597 if (err) 1598 goto end; 1599 1600 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING; 1601 
while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING && 1602 !tracker->is_err) { 1603 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty, 1604 &tracker->status); 1605 if (poll_err == CQ_EMPTY) { 1606 mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, 1607 cq->mcq.cons_index); 1608 poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, 1609 dirty, &tracker->status); 1610 if (poll_err == CQ_EMPTY) { 1611 wait_for_completion(&mvdev->tracker_comp); 1612 continue; 1613 } 1614 } 1615 if (poll_err == CQ_POLL_ERR) { 1616 err = -EIO; 1617 goto end; 1618 } 1619 mlx5_cq_set_ci(&cq->mcq); 1620 } 1621 1622 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR) 1623 tracker->is_err = true; 1624 1625 if (tracker->is_err) 1626 err = -EIO; 1627 end: 1628 mlx5vf_state_mutex_unlock(mvdev); 1629 return err; 1630 } 1631