xref: /linux/drivers/infiniband/hw/mlx5/umr.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
3 
4 #include <rdma/ib_umem_odp.h>
5 #include "mlx5_ib.h"
6 #include "umr.h"
7 #include "wr.h"
8 
9 /*
10  * We can't use an array for xlt_emergency_page because dma_map_single doesn't
11  * work on kernel module memory.
12  */
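/*
 * Note: xlt_emergency_page is a single page, allocated at driver init outside
 * this file, that serves as a last-resort XLT buffer when the allocations in
 * mlx5r_umr_alloc_xlt() fail; xlt_emergency_page_mutex serializes its users.
 */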
13 void *xlt_emergency_page;
14 static DEFINE_MUTEX(xlt_emergency_page_mutex);
15 
16 static __be64 get_umr_enable_mr_mask(void)
17 {
18 	u64 result;
19 
20 	result = MLX5_MKEY_MASK_KEY |
21 		 MLX5_MKEY_MASK_FREE;
22 
23 	return cpu_to_be64(result);
24 }
25 
26 static __be64 get_umr_disable_mr_mask(void)
27 {
28 	u64 result;
29 
30 	result = MLX5_MKEY_MASK_FREE;
31 
32 	return cpu_to_be64(result);
33 }
34 
35 static __be64 get_umr_update_translation_mask(struct mlx5_ib_dev *dev)
36 {
37 	u64 result;
38 
39 	result = MLX5_MKEY_MASK_LEN |
40 		 MLX5_MKEY_MASK_PAGE_SIZE |
41 		 MLX5_MKEY_MASK_START_ADDR;
42 	if (MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5))
43 		result |= MLX5_MKEY_MASK_PAGE_SIZE_5;
44 
45 	return cpu_to_be64(result);
46 }
47 
48 static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
49 {
50 	u64 result;
51 
52 	result = MLX5_MKEY_MASK_LR |
53 		 MLX5_MKEY_MASK_LW |
54 		 MLX5_MKEY_MASK_RR |
55 		 MLX5_MKEY_MASK_RW;
56 
57 	if (MLX5_CAP_GEN(dev->mdev, atomic))
58 		result |= MLX5_MKEY_MASK_A;
59 
60 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
61 		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
62 
63 	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
64 		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
65 
66 	return cpu_to_be64(result);
67 }
68 
69 static __be64 get_umr_update_pd_mask(void)
70 {
71 	u64 result;
72 
73 	result = MLX5_MKEY_MASK_PD;
74 
75 	return cpu_to_be64(result);
76 }
77 
78 static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
79 {
80 	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
81 	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
82 		return -EPERM;
83 
84 	if (mask & MLX5_MKEY_MASK_A &&
85 	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
86 		return -EPERM;
87 
88 	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
89 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
90 		return -EPERM;
91 
92 	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
93 	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
94 		return -EPERM;
95 
96 	return 0;
97 }
98 
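/*
 * MAX_UMR_WR bounds the number of outstanding UMR work requests on the single
 * UMR QP: it sizes the QP send queue (max_send_wr) and the umrc.sem semaphore
 * that throttles mlx5r_umr_post_send_wait().
 */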
99 enum {
100 	MAX_UMR_WR = 128,
101 };
102 
103 static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
104 {
105 	struct ib_qp_attr attr = {};
106 	int ret;
107 
108 	attr.qp_state = IB_QPS_INIT;
109 	attr.port_num = 1;
110 	ret = ib_modify_qp(qp, &attr,
111 			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
112 	if (ret) {
113 		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
114 		return ret;
115 	}
116 
117 	memset(&attr, 0, sizeof(attr));
118 	attr.qp_state = IB_QPS_RTR;
119 
120 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
121 	if (ret) {
122 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
123 		return ret;
124 	}
125 
126 	memset(&attr, 0, sizeof(attr));
127 	attr.qp_state = IB_QPS_RTS;
128 	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
129 	if (ret) {
130 		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
131 		return ret;
132 	}
133 
134 	return 0;
135 }
136 
137 int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
138 {
139 	struct ib_qp_init_attr init_attr = {};
140 	struct ib_cq *cq;
141 	struct ib_qp *qp;
142 	int ret = 0;
143 
144 
145 	/*
146 	 * UMR qp is set once, never changed until device unload.
147 	 * Avoid taking the mutex if initialization is already done.
148 	 */
149 	if (dev->umrc.qp)
150 		return 0;
151 
152 	mutex_lock(&dev->umrc.init_lock);
153 	/* First user allocates the UMR resources. Skip if already allocated. */
154 	if (dev->umrc.qp)
155 		goto unlock;
156 
157 	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
158 	if (IS_ERR(cq)) {
159 		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
160 		ret = PTR_ERR(cq);
161 		goto unlock;
162 	}
163 
164 	init_attr.send_cq = cq;
165 	init_attr.recv_cq = cq;
166 	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
167 	init_attr.cap.max_send_wr = MAX_UMR_WR;
168 	init_attr.cap.max_send_sge = 1;
169 	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
170 	init_attr.port_num = 1;
171 	qp = ib_create_qp(dev->umrc.pd, &init_attr);
172 	if (IS_ERR(qp)) {
173 		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
174 		ret = PTR_ERR(qp);
175 		goto destroy_cq;
176 	}
177 
178 	ret = mlx5r_umr_qp_rst2rts(dev, qp);
179 	if (ret)
180 		goto destroy_qp;
181 
182 	dev->umrc.cq = cq;
183 
184 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
185 	mutex_init(&dev->umrc.lock);
186 	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
187 	dev->umrc.qp = qp;
188 
189 	mutex_unlock(&dev->umrc.init_lock);
190 	return 0;
191 
192 destroy_qp:
193 	ib_destroy_qp(qp);
194 destroy_cq:
195 	ib_free_cq(cq);
196 unlock:
197 	mutex_unlock(&dev->umrc.init_lock);
198 	return ret;
199 }
200 
201 void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
202 {
203 	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
204 		return;
205 	mutex_destroy(&dev->umrc.lock);
206 	/* After device init, UMR cq/qp are not unset during the lifetime. */
207 	ib_destroy_qp(dev->umrc.qp);
208 	ib_free_cq(dev->umrc.cq);
209 }
210 
211 int mlx5r_umr_init(struct mlx5_ib_dev *dev)
212 {
213 	struct ib_pd *pd;
214 
215 	pd = ib_alloc_pd(&dev->ib_dev, 0);
216 	if (IS_ERR(pd)) {
217 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
218 		return PTR_ERR(pd);
219 	}
220 	dev->umrc.pd = pd;
221 
222 	mutex_init(&dev->umrc.init_lock);
223 
224 	return 0;
225 }
226 
227 void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev)
228 {
229 	if (!dev->umrc.pd)
230 		return;
231 
232 	mutex_destroy(&dev->umrc.init_lock);
233 	ib_dealloc_pd(dev->umrc.pd);
234 }
235 
236 
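/*
 * Post a single UMR WQE on the kernel UMR QP's send queue. The WQE (control,
 * mkey and optional data segment) is written inline with the wr.c helpers
 * rather than through ib_post_send(), and the caller's ib_cqe is stored as
 * the wr_id so mlx5r_umr_done() can signal completion.
 */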
237 static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
238 			       struct mlx5r_umr_wqe *wqe, bool with_data)
239 {
240 	unsigned int wqe_size =
241 		with_data ? sizeof(struct mlx5r_umr_wqe) :
242 			    sizeof(struct mlx5r_umr_wqe) -
243 				    sizeof(struct mlx5_wqe_data_seg);
244 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
245 	struct mlx5_core_dev *mdev = dev->mdev;
246 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
247 	struct mlx5_wqe_ctrl_seg *ctrl;
248 	union {
249 		struct ib_cqe *ib_cqe;
250 		u64 wr_id;
251 	} id;
252 	void *cur_edge, *seg;
253 	unsigned long flags;
254 	unsigned int idx;
255 	int size, err;
256 
257 	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
258 		return -EIO;
259 
260 	spin_lock_irqsave(&qp->sq.lock, flags);
261 
262 	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
263 			      cpu_to_be32(mkey), false, false);
264 	if (WARN_ON(err))
265 		goto out;
266 
267 	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
268 
269 	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);
270 
271 	id.ib_cqe = cqe;
272 	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
273 			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
274 
275 	mlx5r_ring_db(qp, 1, ctrl);
276 
277 out:
278 	spin_unlock_irqrestore(&qp->sq.lock, flags);
279 
280 	return err;
281 }
282 
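/*
 * Recovery path for a UMR that completed with an unexpected error: move the
 * UMR machinery to MLX5_UMR_STATE_RECOVER so no new WRs are posted, re-post
 * the failed WR as a flush barrier and wait for it, then cycle the QP through
 * RESET back to RTS and return to MLX5_UMR_STATE_ACTIVE.
 */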
283 static int mlx5r_umr_recover(struct mlx5_ib_dev *dev, u32 mkey,
284 			     struct mlx5r_umr_context *umr_context,
285 			     struct mlx5r_umr_wqe *wqe, bool with_data)
286 {
287 	struct umr_common *umrc = &dev->umrc;
288 	struct ib_qp_attr attr;
289 	int err;
290 
291 	mutex_lock(&umrc->lock);
292 	/* Prevent any further WRs from being sent now */
293 	if (umrc->state != MLX5_UMR_STATE_RECOVER) {
294 		mlx5_ib_warn(dev, "UMR recovery encountered an unexpected state=%d\n",
295 			     umrc->state);
296 		umrc->state = MLX5_UMR_STATE_RECOVER;
297 	}
298 	mutex_unlock(&umrc->lock);
299 
300 	/* Send a final/barrier WR (the failed one) and wait for its completion.
301 	 * This will ensure that all the previous WRs got a completion before
302 	 * we set the QP state to RESET.
303 	 */
304 	err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context->cqe, wqe,
305 				  with_data);
306 	if (err) {
307 		mlx5_ib_warn(dev, "UMR recovery post send failed, err %d\n", err);
308 		goto err;
309 	}
310 
311 	/* Since the QP is in an error state, it will only receive
312 	 * IB_WC_WR_FLUSH_ERR. However, as it serves only as a barrier,
313 	 * we don't care about its status.
314 	 */
315 	wait_for_completion(&umr_context->done);
316 
317 	attr.qp_state = IB_QPS_RESET;
318 	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
319 	if (err) {
320 		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RESET, err=%d\n", err);
321 		goto err;
322 	}
323 
324 	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
325 	if (err) {
326 		mlx5_ib_warn(dev, "Couldn't modify UMR QP to RTS, err=%d\n", err);
327 		goto err;
328 	}
329 
330 	umrc->state = MLX5_UMR_STATE_ACTIVE;
331 	return 0;
332 
333 err:
334 	umrc->state = MLX5_UMR_STATE_ERR;
335 	return err;
336 }
337 
338 static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
339 {
340 	struct mlx5_ib_umr_context *context =
341 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
342 
343 	context->status = wc->status;
344 	complete(&context->done);
345 }
346 
347 static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
348 {
349 	context->cqe.done = mlx5r_umr_done;
350 	init_completion(&context->done);
351 }
352 
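/*
 * Post a UMR WQE and wait for its completion. Concurrency is bounded by
 * umrc->sem; a WR flushed by a QP error (IB_WC_WR_FLUSH_ERR) is retried,
 * while any other failure triggers mlx5r_umr_recover() and is reported to
 * the caller as -EFAULT.
 */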
353 static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
354 				   struct mlx5r_umr_wqe *wqe, bool with_data)
355 {
356 	struct umr_common *umrc = &dev->umrc;
357 	struct mlx5r_umr_context umr_context;
358 	int err;
359 
360 	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
361 	if (WARN_ON(err))
362 		return err;
363 
364 	mlx5r_umr_init_context(&umr_context);
365 
366 	down(&umrc->sem);
367 	while (true) {
368 		mutex_lock(&umrc->lock);
369 		if (umrc->state == MLX5_UMR_STATE_ERR) {
370 			mutex_unlock(&umrc->lock);
371 			err = -EFAULT;
372 			break;
373 		}
374 
375 		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
376 			mutex_unlock(&umrc->lock);
377 			usleep_range(3000, 5000);
378 			continue;
379 		}
380 
381 		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
382 					  with_data);
383 		mutex_unlock(&umrc->lock);
384 		if (err) {
385 			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
386 				     err);
387 			break;
388 		}
389 
390 		wait_for_completion(&umr_context.done);
391 
392 		if (umr_context.status == IB_WC_SUCCESS)
393 			break;
394 
395 		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
396 			continue;
397 
398 		WARN_ON_ONCE(1);
399 		mlx5_ib_warn(dev,
400 			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
401 			umr_context.status, mkey);
402 		err = mlx5r_umr_recover(dev, mkey, &umr_context, wqe, with_data);
403 		if (err)
404 			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
405 				     err);
406 		err = -EFAULT;
407 		break;
408 	}
409 	up(&umrc->sem);
410 	return err;
411 }
412 
413 /**
414  * mlx5r_umr_revoke_mr - Fence all DMA on the MR
415  * @mr: The MR to fence
416  *
417  * Upon return the NIC will not be doing any DMA to the pages under the MR,
418  * and any DMA in progress will be completed. Failure of this function
419  * indicates the HW has failed catastrophically.
420  */
421 int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
422 {
423 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
424 	struct mlx5r_umr_wqe wqe = {};
425 
426 	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
427 		return 0;
428 
429 	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
430 	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
431 	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
432 
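	/*
	 * Disabling is done by setting the mkey's free bit and moving the mkey
	 * to the internal UMR PD; the translation entries themselves are left
	 * untouched.
	 */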
433 	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
434 	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
435 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
436 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
437 		 mlx5_mkey_variant(mr->mmkey.key));
438 
439 	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
440 }
441 
442 static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
443 				       struct mlx5_mkey_seg *seg,
444 				       unsigned int access_flags)
445 {
446 	bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
447 		       (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
448 			pcie_relaxed_ordering_enabled(dev->mdev->pdev));
449 
450 	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
451 	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
452 	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
453 	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
454 	MLX5_SET(mkc, seg, lr, 1);
455 	MLX5_SET(mkc, seg, relaxed_ordering_write,
456 		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
457 	MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
458 }
459 
460 int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
461 			      int access_flags)
462 {
463 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
464 	struct mlx5r_umr_wqe wqe = {};
465 	int err;
466 
467 	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
468 	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
469 	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
470 	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
471 
472 	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
473 	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
474 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
475 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
476 		 mlx5_mkey_variant(mr->mmkey.key));
477 
478 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
479 	if (err)
480 		return err;
481 
482 	mr->access_flags = access_flags;
483 	return 0;
484 }
485 
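/*
 * Upper bounds for a single XLT transfer: MLX5_MAX_UMR_CHUNK is the largest
 * buffer a UMR WQE can reference (about 1M, see the comment in
 * mlx5r_umr_alloc_xlt() below), while MLX5_SPARE_UMR_CHUNK (64K) is the
 * smaller size tried when the large allocation fails.
 */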
486 #define MLX5_MAX_UMR_CHUNK                                                     \
487 	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
488 #define MLX5_SPARE_UMR_CHUNK 0x10000
489 
490 /*
491  * Allocate a temporary buffer to hold the per-page information to transfer to
492  * HW. For efficiency this should be as large as it can be, but buffer
493  * allocation failure is not allowed, so try smaller sizes.
494  */
495 static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
496 {
497 	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
498 	size_t size;
499 	void *res = NULL;
500 
501 	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);
502 
503 	/*
504 	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context, just that the
505 	 * allocation can't trigger any kind of reclaim.
506 	 */
507 	might_sleep();
508 
509 	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
510 
511 	/*
512 	 * If the system already has a suitable high order page then just use
513 	 * that, but don't try hard to create one. This max is about 1M, so a
514 	 * free x86 huge page will satisfy it.
515 	 */
516 	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
517 		     MLX5_MAX_UMR_CHUNK);
518 	*nents = size / ent_size;
519 	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
520 				       get_order(size));
521 	if (res)
522 		return res;
523 
524 	if (size > MLX5_SPARE_UMR_CHUNK) {
525 		size = MLX5_SPARE_UMR_CHUNK;
526 		*nents = size / ent_size;
527 		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
528 					       get_order(size));
529 		if (res)
530 			return res;
531 	}
532 
533 	*nents = PAGE_SIZE / ent_size;
534 	res = (void *)__get_free_page(gfp_mask);
535 	if (res)
536 		return res;
537 
538 	mutex_lock(&xlt_emergency_page_mutex);
539 	memset(xlt_emergency_page, 0, PAGE_SIZE);
540 	return xlt_emergency_page;
541 }
542 
543 static void mlx5r_umr_free_xlt(void *xlt, size_t length)
544 {
545 	if (xlt == xlt_emergency_page) {
546 		mutex_unlock(&xlt_emergency_page_mutex);
547 		return;
548 	}
549 
550 	free_pages((unsigned long)xlt, get_order(length));
551 }
552 
553 static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
554 				     struct ib_sge *sg)
555 {
556 	struct device *ddev = &dev->mdev->pdev->dev;
557 
558 	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
559 	mlx5r_umr_free_xlt(xlt, sg->length);
560 }
561 
562 /*
563  * Create an XLT buffer ready for submission.
564  */
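/*
 * The returned buffer is filled with MTT/KSM/KLM entries by the caller,
 * synced for the device, posted with mlx5r_umr_post_send_wait() and finally
 * released with mlx5r_umr_unmap_free_xlt(); see _mlx5r_umr_update_mr_pas()
 * and mlx5r_umr_update_xlt() below.
 */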
565 static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
566 				  size_t nents, size_t ent_size,
567 				  unsigned int flags)
568 {
569 	struct device *ddev = &dev->mdev->pdev->dev;
570 	dma_addr_t dma;
571 	void *xlt;
572 
573 	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
574 				 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
575 								  GFP_KERNEL);
576 	sg->length = nents * ent_size;
577 	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
578 	if (dma_mapping_error(ddev, dma)) {
579 		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
580 		mlx5r_umr_free_xlt(xlt, sg->length);
581 		return NULL;
582 	}
583 	sg->addr = dma;
584 	sg->lkey = dev->umrc.pd->local_dma_lkey;
585 
586 	return xlt;
587 }
588 
589 static void
590 mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
591 				  unsigned int flags, struct ib_sge *sg)
592 {
593 	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
594 		/* fail if free */
595 		ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
596 	else
597 		/* fail if not free */
598 		ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
599 	ctrl_seg->xlt_octowords =
600 		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
601 }
602 
603 static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
604 					      struct mlx5_mkey_seg *mkey_seg,
605 					      struct mlx5_ib_mr *mr,
606 					      unsigned int page_shift)
607 {
608 	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
609 	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
610 	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
611 	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
612 	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
613 	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
614 	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
615 }
616 
617 static void
618 mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
619 				  struct ib_sge *sg)
620 {
621 	data_seg->byte_count = cpu_to_be32(sg->length);
622 	data_seg->lkey = cpu_to_be32(sg->lkey);
623 	data_seg->addr = cpu_to_be64(sg->addr);
624 }
625 
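/*
 * Program the translation offset of a UMR WQE. The byte offset is converted
 * to 16-byte octowords and split between the 16-bit xlt_offset field and the
 * xlt_offset_47_16 extension, with MLX5_UMR_TRANSLATION_OFFSET_EN marking
 * that an offset is in use.
 */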
626 static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
627 				    u64 offset)
628 {
629 	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);
630 
631 	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
632 	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
633 	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
634 }
635 
636 static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
637 				       struct mlx5r_umr_wqe *wqe,
638 				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
639 				       unsigned int flags)
640 {
641 	bool update_pd_access, update_translation;
642 
643 	if (flags & MLX5_IB_UPD_XLT_ENABLE)
644 		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();
645 
646 	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
647 			   flags & MLX5_IB_UPD_XLT_PD ||
648 			   flags & MLX5_IB_UPD_XLT_ACCESS;
649 
650 	if (update_pd_access) {
651 		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
652 		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
653 	}
654 
655 	update_translation =
656 		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
657 
658 	if (update_translation) {
659 		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(dev);
660 		if (!mr->ibmr.length)
661 			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
662 		if (flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)
663 			wqe->ctrl_seg.mkey_mask &=
664 				cpu_to_be64(~MLX5_MKEY_MASK_PAGE_SIZE);
665 	}
666 
667 	wqe->ctrl_seg.xlt_octowords =
668 		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
669 	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
670 }
671 
672 static void
673 _mlx5r_umr_init_wqe(struct mlx5_ib_mr *mr, struct mlx5r_umr_wqe *wqe,
674 		    struct ib_sge *sg, unsigned int flags,
675 		    unsigned int page_shift, bool dd)
676 {
677 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
678 
679 	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe->ctrl_seg, flags, sg);
680 	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe->mkey_seg, mr, page_shift);
681 	if (dd) /* Use the data direct internal kernel PD */
682 		MLX5_SET(mkc, &wqe->mkey_seg, pd, dev->ddr.pdn);
683 	mlx5r_umr_set_update_xlt_data_seg(&wqe->data_seg, sg);
684 }
685 
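/*
 * Stream the MR's DMA addresses to HW as KSM (data-direct) or MTT entries.
 * Entries are staged in an XLT buffer and flushed with one UMR per buffer
 * full; start_block/nblocks optionally restrict the update to a sub-range,
 * with nblocks == 0 meaning "everything from start_block onwards".
 */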
686 static int
687 _mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd,
688 			 size_t start_block, size_t nblocks)
689 {
690 	size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
691 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
692 	struct device *ddev = &dev->mdev->pdev->dev;
693 	struct mlx5r_umr_wqe wqe = {};
694 	size_t processed_blocks = 0;
695 	struct ib_block_iter biter;
696 	size_t cur_block_idx = 0;
697 	struct mlx5_ksm *cur_ksm;
698 	struct mlx5_mtt *cur_mtt;
699 	size_t orig_sg_length;
700 	size_t total_blocks;
701 	size_t final_size;
702 	void *curr_entry;
703 	struct ib_sge sg;
704 	void *entry;
705 	u64 offset;
706 	int err = 0;
707 
708 	total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
709 	if (start_block > total_blocks)
710 		return -EINVAL;
711 
712 	/* nblocks 0 means update all blocks starting from start_block */
713 	if (nblocks)
714 		total_blocks = nblocks;
715 
716 	entry = mlx5r_umr_create_xlt(dev, &sg, total_blocks, ent_size, flags);
717 	if (!entry)
718 		return -ENOMEM;
719 
720 	orig_sg_length = sg.length;
721 
722 	_mlx5r_umr_init_wqe(mr, &wqe, &sg, flags, mr->page_shift, dd);
723 
724 	/* Set initial translation offset to start_block */
725 	offset = (u64)start_block * ent_size;
726 	mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
727 
728 	if (dd)
729 		cur_ksm = entry;
730 	else
731 		cur_mtt = entry;
732 
733 	curr_entry = entry;
734 
735 	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
736 		if (cur_block_idx < start_block) {
737 			cur_block_idx++;
738 			continue;
739 		}
740 
741 		if (nblocks && processed_blocks >= nblocks)
742 			break;
743 
744 		if (curr_entry == entry + sg.length) {
745 			dma_sync_single_for_device(ddev, sg.addr, sg.length,
746 						   DMA_TO_DEVICE);
747 
748 			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
749 						       true);
750 			if (err)
751 				goto err;
752 			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
753 						DMA_TO_DEVICE);
754 			offset += sg.length;
755 			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
756 			if (dd)
757 				cur_ksm = entry;
758 			else
759 				cur_mtt = entry;
760 		}
761 
762 		if (dd) {
763 			cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
764 			cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
765 			if (mr->umem->is_dmabuf &&
766 			    (flags & MLX5_IB_UPD_XLT_ZAP)) {
767 				cur_ksm->va = 0;
768 				cur_ksm->key = 0;
769 			}
770 			cur_ksm++;
771 			curr_entry = cur_ksm;
772 		} else {
773 			cur_mtt->ptag =
774 				cpu_to_be64(rdma_block_iter_dma_address(&biter) |
775 					    MLX5_IB_MTT_PRESENT);
776 			if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
777 				cur_mtt->ptag = 0;
778 			cur_mtt++;
779 			curr_entry = cur_mtt;
780 		}
781 
782 		processed_blocks++;
783 	}
784 
785 	final_size = curr_entry - entry;
786 	sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
787 	memset(curr_entry, 0, sg.length - final_size);
788 	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
789 
790 	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
791 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
792 
793 err:
794 	sg.length = orig_sg_length;
795 	mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
796 	return err;
797 }
798 
799 int mlx5r_umr_update_data_direct_ksm_pas_range(struct mlx5_ib_mr *mr,
800 					       unsigned int flags,
801 					       size_t start_block,
802 					       size_t nblocks)
803 {
804 	/* No invalidation flow is expected */
805 	if (WARN_ON(!mr->umem->is_dmabuf) || ((flags & MLX5_IB_UPD_XLT_ZAP) &&
806 	    !(flags & MLX5_IB_UPD_XLT_KEEP_PGSZ)))
807 		return -EINVAL;
808 
809 	return _mlx5r_umr_update_mr_pas(mr, flags, true, start_block, nblocks);
810 }
811 
812 int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr,
813 					 unsigned int flags)
814 {
815 	return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags, 0, 0);
816 }
817 
818 int mlx5r_umr_update_mr_pas_range(struct mlx5_ib_mr *mr, unsigned int flags,
819 				  size_t start_block, size_t nblocks)
820 {
821 	if (WARN_ON(mr->umem->is_odp))
822 		return -EINVAL;
823 
824 	return _mlx5r_umr_update_mr_pas(mr, flags, false, start_block, nblocks);
825 }
826 
827 /*
828  * Send the DMA list to the HW for a normal MR using UMR.
829  * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
830  * flag may be used.
831  */
832 int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
833 {
834 	return mlx5r_umr_update_mr_pas_range(mr, flags, 0, 0);
835 }
836 
837 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
838 {
839 	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
840 }
841 
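/*
 * ODP flavour of the XLT update: translation entries are produced by
 * mlx5_odp_populate_xlt() in chunks of at most one XLT buffer and pushed to
 * HW with a UMR per chunk; MLX5_IB_UPD_XLT_INDIRECT switches the entry
 * format from MTT to KLM for indirect mkeys.
 */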
842 int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
843 			 int page_shift, int flags)
844 {
845 	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
846 			       ? sizeof(struct mlx5_klm)
847 			       : sizeof(struct mlx5_mtt);
848 	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
849 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
850 	struct device *ddev = &dev->mdev->pdev->dev;
851 	const int page_mask = page_align - 1;
852 	struct mlx5r_umr_wqe wqe = {};
853 	size_t pages_mapped = 0;
854 	size_t pages_to_map = 0;
855 	size_t size_to_map = 0;
856 	size_t orig_sg_length;
857 	size_t pages_iter;
858 	struct ib_sge sg;
859 	int err = 0;
860 	void *xlt;
861 
862 	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
863 	    !umr_can_use_indirect_mkey(dev))
864 		return -EPERM;
865 
866 	if (WARN_ON(!mr->umem->is_odp))
867 		return -EINVAL;
868 
869 	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
870 	 * so we need to align the offset and length accordingly.
871 	 */
872 	if (idx & page_mask) {
873 		npages += idx & page_mask;
874 		idx &= ~page_mask;
875 	}
876 	pages_to_map = ALIGN(npages, page_align);
877 
878 	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
879 	if (!xlt)
880 		return -ENOMEM;
881 
882 	pages_iter = sg.length / desc_size;
883 	orig_sg_length = sg.length;
884 
885 	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
886 		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
887 		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
888 
889 		pages_to_map = min_t(size_t, pages_to_map, max_pages);
890 	}
891 
892 	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
893 	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
894 	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
895 
896 	for (pages_mapped = 0;
897 	     pages_mapped < pages_to_map && !err;
898 	     pages_mapped += pages_iter, idx += pages_iter) {
899 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
900 		size_to_map = npages * desc_size;
901 		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
902 					DMA_TO_DEVICE);
903 		/*
904 		 * npages is the maximum number of pages to map, but we
905 		 * can't guarantee that all pages are actually mapped.
906 		 *
907 		 * For example, if a page is p2p of a type that is not supported
908 		 * for mapping, the number of pages mapped will be less than
909 		 * requested.
910 		 */
911 		err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
912 		if (err)
913 			return err;
914 		dma_sync_single_for_device(ddev, sg.addr, sg.length,
915 					   DMA_TO_DEVICE);
916 		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
917 
918 		if (pages_mapped + pages_iter >= pages_to_map)
919 			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
920 		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
921 		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
922 	}
923 	sg.length = orig_sg_length;
924 	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
925 	return err;
926 }
927 
928 /*
929  * Update only the page-size (log_page_size) field of an existing memory key
930  * using UMR.  This is useful when the MR's physical layout stays the same
931  * but the optimal page shift has changed (e.g. dmabuf after pages are
932  * pinned and the HW can switch from 4K to huge-page alignment).
933  */
934 int mlx5r_umr_update_mr_page_shift(struct mlx5_ib_mr *mr,
935 				   unsigned int page_shift,
936 				   bool dd)
937 {
938 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
939 	struct mlx5r_umr_wqe wqe = {};
940 	int err;
941 
942 	/* Build UMR wqe: we touch only PAGE_SIZE, so use the dedicated mask */
943 	wqe.ctrl_seg.mkey_mask = get_umr_update_translation_mask(dev);
944 
945 	/* MR must be free while page size is modified */
946 	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE | MLX5_UMR_INLINE;
947 
948 	/* Fill mkey segment with the new page size, keep the rest unchanged */
949 	MLX5_SET(mkc, &wqe.mkey_seg, log_page_size, page_shift);
950 
951 	if (dd)
952 		MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
953 	else
954 		MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
955 
956 	MLX5_SET64(mkc, &wqe.mkey_seg, start_addr, mr->ibmr.iova);
957 	MLX5_SET64(mkc, &wqe.mkey_seg, len, mr->ibmr.length);
958 	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
959 	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
960 		 mlx5_mkey_variant(mr->mmkey.key));
961 
962 	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
963 	if (!err)
964 		mr->page_shift = page_shift;
965 
966 	return err;
967 }
968 
969 static inline int
970 _mlx5r_dmabuf_umr_update_pas(struct mlx5_ib_mr *mr, unsigned int flags,
971 			     size_t start_block, size_t nblocks, bool dd)
972 {
973 	if (dd)
974 		return mlx5r_umr_update_data_direct_ksm_pas_range(mr, flags,
975 								  start_block,
976 								  nblocks);
977 	else
978 		return mlx5r_umr_update_mr_pas_range(mr, flags, start_block,
979 						     nblocks);
980 }
981 
982 /**
983  * This function makes an mkey non-present by zapping (zeroing out) its first
984  * N translation entries, where N is determined by the largest page size
985  * supported by the device and the MR length. It then updates the mkey's page
986  * size to the largest possible value, ensuring the MR is completely
987  * non-present and safe for further updates. It is useful for updating the
988  * page size of a dmabuf MR on a page fault.
989  *
990  * Return: 0 on success or a negative error code on failure. On success,
991  *         *nblocks holds the number of entries zapped (0 means the whole mkey).
992  */
993 static int _mlx5r_umr_zap_mkey(struct mlx5_ib_mr *mr,
994 			       unsigned int flags,
995 			       unsigned int page_shift,
996 			       size_t *nblocks,
997 			       bool dd)
998 {
999 	unsigned int old_page_shift = mr->page_shift;
1000 	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
1001 	unsigned int max_page_shift;
1002 	size_t page_shift_nblocks;
1003 	unsigned int max_log_size;
1004 	int access_mode;
1005 	int err;
1006 
1007 	access_mode = dd ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT;
1008 	flags |= MLX5_IB_UPD_XLT_KEEP_PGSZ | MLX5_IB_UPD_XLT_ZAP |
1009 		 MLX5_IB_UPD_XLT_ATOMIC;
1010 	max_log_size = get_max_log_entity_size_cap(dev, access_mode);
1011 	max_page_shift = order_base_2(mr->ibmr.length);
1012 	max_page_shift = min(max(max_page_shift, page_shift), max_log_size);
1013 	/* Count blocks in units of max_page_shift; we will zap exactly this
1014 	 * many to make the whole MR non-present.
1015 	 * Block size must be aligned to MLX5_UMR_FLEX_ALIGNMENT since it may
1016 	 * be used as offset into the XLT later on.
1017 	 */
1018 	*nblocks = ib_umem_num_dma_blocks(mr->umem, 1UL << max_page_shift);
1019 	if (dd)
1020 		*nblocks = ALIGN(*nblocks, MLX5_UMR_KSM_NUM_ENTRIES_ALIGNMENT);
1021 	else
1022 		*nblocks = ALIGN(*nblocks, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
1023 	page_shift_nblocks = ib_umem_num_dma_blocks(mr->umem,
1024 						    1UL << page_shift);
1025 	/* If the number of blocks at the max possible page shift is not smaller
1026 	 * than the number of blocks at the new page size, simply update all of
1027 	 * the mkey's entries.
1028 	 */
1029 	if (*nblocks >= page_shift_nblocks)
1030 		*nblocks = 0;
1031 
1032 	/* Make the first nblocks entries non-present without changing
1033 	 * page size yet.
1034 	 */
1035 	if (*nblocks)
1036 		mr->page_shift = max_page_shift;
1037 	err = _mlx5r_dmabuf_umr_update_pas(mr, flags, 0, *nblocks, dd);
1038 	if (err) {
1039 		mr->page_shift = old_page_shift;
1040 		return err;
1041 	}
1042 
1043 	/* Change page size to the max page size now that the MR is completely
1044 	 * non-present.
1045 	 */
1046 	if (*nblocks) {
1047 		err = mlx5r_umr_update_mr_page_shift(mr, max_page_shift, dd);
1048 		if (err) {
1049 			mr->page_shift = old_page_shift;
1050 			return err;
1051 		}
1052 	}
1053 
1054 	return 0;
1055 }
1056 
1057 /**
1058  * mlx5r_umr_dmabuf_update_pgsz - Safely update DMABUF MR page size and its
1059  * entries accordingly
1060  * @mr:        The memory region to update
1061  * @xlt_flags: Translation table update flags
1062  * @page_shift: The new (optimized) page shift to use
1063  *
1064  * This function updates the page size and mkey translation entries for a DMABUF
1065  * MR in a safe, multi-step process to avoid exposing partially updated mappings.
1066  * The update is performed in 5 steps:
1067  *   1. Make the first X entries non-present, while X is calculated to be
1068  *        minimal according to a large page shift that can be used to cover the
1069  *        MR length.
1070  *   2. Update the page size to the large supported page size
1071  *   3. Load the remaining N-X entries according to the (optimized) page_shift
1072  *   4. Update the page size according to the (optimized) page_shift
1073  *   5. Load the first X entries with the correct translations
1074  *
1075  * This ensures that at no point is the MR accessible with a partially updated
1076  * translation table, maintaining correctness and preventing access to stale or
1077  * inconsistent mappings.
1078  *
1079  * Returns 0 on success or a negative error code on failure.
1080  */
1081 int mlx5r_umr_dmabuf_update_pgsz(struct mlx5_ib_mr *mr, u32 xlt_flags,
1082 				 unsigned int page_shift)
1083 {
1084 	unsigned int old_page_shift = mr->page_shift;
1085 	size_t zapped_blocks;
1086 	size_t total_blocks;
1087 	int err;
1088 
1089 	err = _mlx5r_umr_zap_mkey(mr, xlt_flags, page_shift, &zapped_blocks,
1090 				  mr->data_direct);
1091 	if (err)
1092 		return err;
1093 
1094 	/* _mlx5r_umr_zap_mkey already enables the mkey */
1095 	xlt_flags &= ~MLX5_IB_UPD_XLT_ENABLE;
1096 	mr->page_shift = page_shift;
1097 	total_blocks = ib_umem_num_dma_blocks(mr->umem, 1UL << mr->page_shift);
1098 	if (zapped_blocks && zapped_blocks < total_blocks) {
1099 		/* Update PAS according to the new page size but don't update
1100 		 * the page size in the mkey yet.
1101 		 */
1102 		err = _mlx5r_dmabuf_umr_update_pas(
1103 			mr,
1104 			xlt_flags | MLX5_IB_UPD_XLT_KEEP_PGSZ,
1105 			zapped_blocks,
1106 			total_blocks - zapped_blocks,
1107 			mr->data_direct);
1108 		if (err)
1109 			goto err;
1110 	}
1111 
1112 	err = mlx5r_umr_update_mr_page_shift(mr, mr->page_shift,
1113 					     mr->data_direct);
1114 	if (err)
1115 		goto err;
1116 	err = _mlx5r_dmabuf_umr_update_pas(mr, xlt_flags, 0, zapped_blocks,
1117 					   mr->data_direct);
1118 	if (err)
1119 		goto err;
1120 
1121 	return 0;
1122 err:
1123 	mr->page_shift = old_page_shift;
1124 	return err;
1125 }
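/*
 * Hypothetical usage sketch (not taken from this file): a dmabuf page-fault
 * handler, after re-pinning the umem, might switch the MR to a better page
 * size roughly like this, where best_page_shift() stands in for whatever
 * logic selects the optimal shift:
 *
 *	unsigned int new_shift = best_page_shift(mr);
 *	int err;
 *
 *	if (new_shift != mr->page_shift)
 *		err = mlx5r_umr_dmabuf_update_pgsz(mr, xlt_flags, new_shift);
 *	else
 *		err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
 */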
1126