xref: /linux/drivers/iommu/iommufd/io_pagetable.c (revision c93529ad4fa8d8d8cb21649e70a46991a1dda0f8)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
3  *
4  * The io_pagetable is the top of the data structure that maps IOVAs to PFNs. The
5  * PFNs can be placed into an iommu_domain, or returned to the caller as a page
6  * list for access by an in-kernel user.
7  *
8  * The data structure uses the iopt_pages to optimize the storage of the PFNs
9  * between the domains and the xarray.
10  */
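/*
 * Reading aid only (not compiled code): a rough sketch, following the
 * definitions in io_pagetable.h, of how the objects used below relate.
 * Field lists are abbreviated.
 *
 *   struct io_pagetable                protected by iova_rwsem/domains_rwsem
 *     ->area_itree      interval tree of struct iopt_area, keyed by IOVA
 *     ->reserved_itree  IOVA ranges that must never be allocated
 *     ->allowed_itree   optional set of IOVA ranges that may be used
 *     ->domains         xarray of attached iommu_domains [0, next_domain_id)
 *     ->access_list     xarray of in-kernel iommufd_access users
 *
 *   struct iopt_area                   one mapped slice of IOVA
 *     ->node            IOVA interval in area_itree
 *     ->pages_node      page-index interval linked into pages->domains_itree
 *     ->pages           refcounted struct iopt_pages holding the PFNs
 *     ->storage_domain  domain currently acting as PFN storage
 */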
11 #include <linux/err.h>
12 #include <linux/errno.h>
13 #include <linux/iommu.h>
14 #include <linux/iommufd.h>
15 #include <linux/lockdep.h>
16 #include <linux/sched/mm.h>
17 #include <linux/slab.h>
18 #include <uapi/linux/iommufd.h>
19 
20 #include "double_span.h"
21 #include "io_pagetable.h"
22 
23 struct iopt_pages_list {
24 	struct iopt_pages *pages;
25 	struct iopt_area *area;
26 	struct list_head next;
27 	unsigned long start_byte;
28 	unsigned long length;
29 };
30 
31 struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
32 					struct io_pagetable *iopt,
33 					unsigned long iova,
34 					unsigned long last_iova)
35 {
36 	lockdep_assert_held(&iopt->iova_rwsem);
37 
38 	iter->cur_iova = iova;
39 	iter->last_iova = last_iova;
40 	iter->area = iopt_area_iter_first(iopt, iova, iova);
41 	if (!iter->area)
42 		return NULL;
43 	if (!iter->area->pages) {
44 		iter->area = NULL;
45 		return NULL;
46 	}
47 	return iter->area;
48 }
49 
50 struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
51 {
52 	unsigned long last_iova;
53 
54 	if (!iter->area)
55 		return NULL;
56 	last_iova = iopt_area_last_iova(iter->area);
57 	if (iter->last_iova <= last_iova)
58 		return NULL;
59 
60 	iter->cur_iova = last_iova + 1;
61 	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
62 					 iter->last_iova);
63 	if (!iter->area)
64 		return NULL;
65 	if (iter->cur_iova != iopt_area_iova(iter->area) ||
66 	    !iter->area->pages) {
67 		iter->area = NULL;
68 		return NULL;
69 	}
70 	return iter->area;
71 }
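/*
 * Minimal usage sketch of the contiguous-area iterator built from the two
 * helpers above; iopt_for_each_contig_area() and iopt_area_contig_done()
 * come from io_pagetable.h and this mirrors the callers later in this file:
 *
 *	struct iopt_area_contig_iter iter;
 *	struct iopt_area *area;
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova,
 *					 iopt_area_last_iova(area));
 *
 *		// act on [iter.cur_iova, last] within this area
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		return -ENOENT;	// a hole interrupted the range
 */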
72 
73 static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
74 				     unsigned long length,
75 				     unsigned long iova_alignment,
76 				     unsigned long page_offset)
77 {
78 	unsigned long aligned_start;
79 
80 	/* ALIGN_UP() */
81 	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
82 		return false;
83 	aligned_start &= ~(iova_alignment - 1);
84 	aligned_start |= page_offset;
85 
86 	if (aligned_start >= last || last - aligned_start < length - 1)
87 		return false;
88 	*start = aligned_start;
89 	return true;
90 }
91 
92 static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
93 				    unsigned long length,
94 				    unsigned long iova_alignment,
95 				    unsigned long page_offset)
96 {
97 	if (span->is_used)
98 		return false;
99 	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
100 					length, iova_alignment, page_offset);
101 }
102 
103 static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
104 				    unsigned long length,
105 				    unsigned long iova_alignment,
106 				    unsigned long page_offset)
107 {
108 	if (span->is_hole)
109 		return false;
110 	return __alloc_iova_check_range(&span->start_used, span->last_used,
111 					length, iova_alignment, page_offset);
112 }
113 
114 /*
115  * Automatically find a block of IOVA that is not being used and not reserved.
116  * Does not return a 0 IOVA even if it is valid.
117  */
118 static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
119 			   unsigned long addr, unsigned long length)
120 {
121 	unsigned long page_offset = addr % PAGE_SIZE;
122 	struct interval_tree_double_span_iter used_span;
123 	struct interval_tree_span_iter allowed_span;
124 	unsigned long max_alignment = PAGE_SIZE;
125 	unsigned long iova_alignment;
126 
127 	lockdep_assert_held(&iopt->iova_rwsem);
128 
129 	/* Protect roundup_pow_of_two() from overflow */
130 	if (length == 0 || length >= ULONG_MAX / 2)
131 		return -EOVERFLOW;
132 
133 	/*
134 	 * Keep alignment present in addr when building the IOVA, which
135 	 * increases the chance we can map a THP.
136 	 */
137 	if (!addr)
138 		iova_alignment = roundup_pow_of_two(length);
139 	else
140 		iova_alignment = min_t(unsigned long,
141 				       roundup_pow_of_two(length),
142 				       1UL << __ffs64(addr));
143 
144 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
145 	max_alignment = HPAGE_SIZE;
146 #endif
147 	/* Protect against ALIGN() overflow */
148 	if (iova_alignment >= max_alignment)
149 		iova_alignment = max_alignment;
150 
151 	if (iova_alignment < iopt->iova_alignment)
152 		return -EINVAL;
153 
154 	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
155 				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
156 		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
157 			allowed_span.start_used = PAGE_SIZE;
158 			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
159 			allowed_span.is_hole = false;
160 		}
161 
162 		if (!__alloc_iova_check_used(&allowed_span, length,
163 					     iova_alignment, page_offset))
164 			continue;
165 
166 		interval_tree_for_each_double_span(
167 			&used_span, &iopt->reserved_itree, &iopt->area_itree,
168 			allowed_span.start_used, allowed_span.last_used) {
169 			if (!__alloc_iova_check_hole(&used_span, length,
170 						     iova_alignment,
171 						     page_offset))
172 				continue;
173 
174 			*iova = used_span.start_hole;
175 			return 0;
176 		}
177 	}
178 	return -ENOSPC;
179 }
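/*
 * Worked example of the alignment heuristic in iopt_alloc_iova(), assuming
 * x86_64 with a 4KiB PAGE_SIZE and a 2MiB HPAGE_SIZE (illustrative only):
 *
 *   addr = 0x7f0000200000, length = 0x200000:
 *     roundup_pow_of_two(length) = 0x200000
 *     1UL << __ffs64(addr)       = 0x200000
 *     iova_alignment = 0x200000 (not capped, it equals HPAGE_SIZE), so the
 *     chosen IOVA keeps 2MiB alignment and the domain may be able to use a
 *     huge page mapping.
 *
 *   addr = 0x201000, length = 0x30000:
 *     roundup_pow_of_two(length) = 0x40000
 *     1UL << __ffs64(addr)       = 0x1000
 *     iova_alignment = 0x1000 and page_offset = 0, so any page-aligned hole
 *     of at least 0x30000 bytes in the allowed/unreserved space is usable.
 */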
180 
181 static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
182 			   unsigned long length)
183 {
184 	unsigned long last;
185 
186 	lockdep_assert_held(&iopt->iova_rwsem);
187 
188 	if ((iova & (iopt->iova_alignment - 1)))
189 		return -EINVAL;
190 
191 	if (check_add_overflow(iova, length - 1, &last))
192 		return -EOVERFLOW;
193 
194 	/* No reserved IOVA intersects the range */
195 	if (iopt_reserved_iter_first(iopt, iova, last))
196 		return -EINVAL;
197 
198 	/* Check that there is not already a mapping in the range */
199 	if (iopt_area_iter_first(iopt, iova, last))
200 		return -EEXIST;
201 	return 0;
202 }
203 
204 /*
205  * The area takes a slice of the pages from start_byte to start_byte + length
206  */
207 static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
208 			    struct iopt_pages *pages, unsigned long iova,
209 			    unsigned long start_byte, unsigned long length,
210 			    int iommu_prot)
211 {
212 	lockdep_assert_held_write(&iopt->iova_rwsem);
213 
214 	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
215 		return -EPERM;
216 
217 	area->iommu_prot = iommu_prot;
218 	area->page_offset = start_byte % PAGE_SIZE;
219 	if (area->page_offset & (iopt->iova_alignment - 1))
220 		return -EINVAL;
221 
222 	area->node.start = iova;
223 	if (check_add_overflow(iova, length - 1, &area->node.last))
224 		return -EOVERFLOW;
225 
226 	area->pages_node.start = start_byte / PAGE_SIZE;
227 	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
228 		return -EOVERFLOW;
229 	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
230 	if (WARN_ON(area->pages_node.last >= pages->npages))
231 		return -EOVERFLOW;
232 
233 	/*
234 	 * The area is inserted with a NULL pages indicating it is not fully
235 	 * initialized yet.
236 	 */
237 	area->iopt = iopt;
238 	interval_tree_insert(&area->node, &iopt->area_itree);
239 	return 0;
240 }
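/*
 * Worked example of the byte-to-page index math in iopt_insert_area(),
 * assuming a 4KiB PAGE_SIZE (illustrative only):
 *
 *   start_byte = 0x1800, length = 0x3000:
 *     area->page_offset      = 0x1800 % 0x1000                = 0x800
 *     area->pages_node.start = 0x1800 / 0x1000                = 1
 *     area->pages_node.last  = (0x1800 + 0x3000 - 1) / 0x1000 = 4
 *
 * The area therefore spans page indexes 1..4 of the iopt_pages, so
 * pages->npages must be at least 5. A page_offset like 0x800 is only
 * accepted while iova_alignment is 1, i.e. before an iommu_domain or an
 * access has raised the alignment.
 */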
241 
242 static struct iopt_area *iopt_area_alloc(void)
243 {
244 	struct iopt_area *area;
245 
246 	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
247 	if (!area)
248 		return NULL;
249 	RB_CLEAR_NODE(&area->node.rb);
250 	RB_CLEAR_NODE(&area->pages_node.rb);
251 	return area;
252 }
253 
254 static int iopt_alloc_area_pages(struct io_pagetable *iopt,
255 				 struct list_head *pages_list,
256 				 unsigned long length, unsigned long *dst_iova,
257 				 int iommu_prot, unsigned int flags)
258 {
259 	struct iopt_pages_list *elm;
260 	unsigned long start;
261 	unsigned long iova;
262 	int rc = 0;
263 
264 	list_for_each_entry(elm, pages_list, next) {
265 		elm->area = iopt_area_alloc();
266 		if (!elm->area)
267 			return -ENOMEM;
268 	}
269 
270 	down_write(&iopt->iova_rwsem);
271 	if ((length & (iopt->iova_alignment - 1)) || !length) {
272 		rc = -EINVAL;
273 		goto out_unlock;
274 	}
275 
276 	if (flags & IOPT_ALLOC_IOVA) {
277 		/* Use the first entry to guess the ideal IOVA alignment */
278 		elm = list_first_entry(pages_list, struct iopt_pages_list,
279 				       next);
280 		switch (elm->pages->type) {
281 		case IOPT_ADDRESS_USER:
282 			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
283 			break;
284 		case IOPT_ADDRESS_FILE:
285 			start = elm->start_byte + elm->pages->start;
286 			break;
287 		}
288 		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
289 		if (rc)
290 			goto out_unlock;
291 		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
292 		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
293 			rc = -EINVAL;
294 			goto out_unlock;
295 		}
296 	} else {
297 		rc = iopt_check_iova(iopt, *dst_iova, length);
298 		if (rc)
299 			goto out_unlock;
300 	}
301 
302 	/*
303 	 * Areas are created with a NULL pages so that the IOVA space is
304 	 * reserved and we can unlock the iova_rwsem.
305 	 */
306 	iova = *dst_iova;
307 	list_for_each_entry(elm, pages_list, next) {
308 		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
309 				      elm->start_byte, elm->length, iommu_prot);
310 		if (rc)
311 			goto out_unlock;
312 		iova += elm->length;
313 	}
314 
315 out_unlock:
316 	up_write(&iopt->iova_rwsem);
317 	return rc;
318 }
319 
320 static void iopt_abort_area(struct iopt_area *area)
321 {
322 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
323 		WARN_ON(area->pages);
324 	if (area->iopt) {
325 		down_write(&area->iopt->iova_rwsem);
326 		interval_tree_remove(&area->node, &area->iopt->area_itree);
327 		up_write(&area->iopt->iova_rwsem);
328 	}
329 	kfree(area);
330 }
331 
332 void iopt_free_pages_list(struct list_head *pages_list)
333 {
334 	struct iopt_pages_list *elm;
335 
336 	while ((elm = list_first_entry_or_null(pages_list,
337 					       struct iopt_pages_list, next))) {
338 		if (elm->area)
339 			iopt_abort_area(elm->area);
340 		if (elm->pages)
341 			iopt_put_pages(elm->pages);
342 		list_del(&elm->next);
343 		kfree(elm);
344 	}
345 }
346 
347 static int iopt_fill_domains_pages(struct list_head *pages_list)
348 {
349 	struct iopt_pages_list *undo_elm;
350 	struct iopt_pages_list *elm;
351 	int rc;
352 
353 	list_for_each_entry(elm, pages_list, next) {
354 		rc = iopt_area_fill_domains(elm->area, elm->pages);
355 		if (rc)
356 			goto err_undo;
357 	}
358 	return 0;
359 
360 err_undo:
361 	list_for_each_entry(undo_elm, pages_list, next) {
362 		if (undo_elm == elm)
363 			break;
364 		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
365 	}
366 	return rc;
367 }
368 
369 int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
370 		   unsigned long length, unsigned long *dst_iova,
371 		   int iommu_prot, unsigned int flags)
372 {
373 	struct iopt_pages_list *elm;
374 	int rc;
375 
376 	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
377 				   iommu_prot, flags);
378 	if (rc)
379 		return rc;
380 
381 	down_read(&iopt->domains_rwsem);
382 	rc = iopt_fill_domains_pages(pages_list);
383 	if (rc)
384 		goto out_unlock_domains;
385 
386 	down_write(&iopt->iova_rwsem);
387 	list_for_each_entry(elm, pages_list, next) {
388 		/*
389 		 * area->pages must be set inside the domains_rwsem to ensure
390 		 * any newly added domains will get filled. Moves the reference
391 		 * in from the list.
392 		 */
393 		elm->area->pages = elm->pages;
394 		elm->pages = NULL;
395 		elm->area = NULL;
396 	}
397 	up_write(&iopt->iova_rwsem);
398 out_unlock_domains:
399 	up_read(&iopt->domains_rwsem);
400 	return rc;
401 }
402 
403 static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
404 			   struct iopt_pages *pages, unsigned long *iova,
405 			   unsigned long length, unsigned long start_byte,
406 			   int iommu_prot, unsigned int flags)
407 {
408 	struct iopt_pages_list elm = {};
409 	LIST_HEAD(pages_list);
410 	int rc;
411 
412 	elm.pages = pages;
413 	elm.start_byte = start_byte;
414 	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
415 	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
416 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
417 	elm.length = length;
418 	list_add(&elm.next, &pages_list);
419 
420 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
421 	if (rc) {
422 		if (elm.area)
423 			iopt_abort_area(elm.area);
424 		if (elm.pages)
425 			iopt_put_pages(elm.pages);
426 		return rc;
427 	}
428 	return 0;
429 }
430 
431 /**
432  * iopt_map_user_pages() - Map a user VA to an iova in the io page table
433  * @ictx: iommufd_ctx the iopt is part of
434  * @iopt: io_pagetable to act on
435  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
436  *        the chosen iova on output. Otherwise it is the iova to map to on input
437  * @uptr: User VA to map
438  * @length: Number of bytes to map
439  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
440  * @flags: IOPT_ALLOC_IOVA or zero
441  *
442  * iova, uptr, and length must be aligned to iova_alignment. For domain backed
443  * page tables this will pin the pages and load them into the domain at iova.
444  * For non-domain page tables this will only setup a lazy reference and the
445  * caller must use iopt_access_pages() to touch them.
446  *
447  * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
448  * destroyed.
449  */
450 int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
451 			unsigned long *iova, void __user *uptr,
452 			unsigned long length, int iommu_prot,
453 			unsigned int flags)
454 {
455 	struct iopt_pages *pages;
456 
457 	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
458 	if (IS_ERR(pages))
459 		return PTR_ERR(pages);
460 
461 	return iopt_map_common(ictx, iopt, pages, iova, length,
462 			       uptr - pages->uptr, iommu_prot, flags);
463 }
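/*
 * Minimal caller sketch (hypothetical values; the real callers live in
 * ioas.c and take their arguments from the IOMMU_IOAS_MAP ioctl):
 *
 *	unsigned long iova = 0;
 *	int rc;
 *
 *	rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	// on success iova holds the allocated IOVA; undo with
 *	// iopt_unmap_iova(iopt, iova, length, NULL)
 */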
464 
465 /**
466  * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
467  * @ictx: iommufd_ctx the iopt is part of
468  * @iopt: io_pagetable to act on
469  * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
470  *        the chosen iova on output. Otherwise it is the iova to map to on input
471  * @file: file to map
472  * @start: map file starting at this byte offset
473  * @length: Number of bytes to map
474  * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
475  * @flags: IOPT_ALLOC_IOVA or zero
476  */
477 int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
478 			unsigned long *iova, struct file *file,
479 			unsigned long start, unsigned long length,
480 			int iommu_prot, unsigned int flags)
481 {
482 	struct iopt_pages *pages;
483 
484 	pages = iopt_alloc_file_pages(file, start, length,
485 				      iommu_prot & IOMMU_WRITE);
486 	if (IS_ERR(pages))
487 		return PTR_ERR(pages);
488 	return iopt_map_common(ictx, iopt, pages, iova, length,
489 			       start - pages->start, iommu_prot, flags);
490 }
491 
492 struct iova_bitmap_fn_arg {
493 	unsigned long flags;
494 	struct io_pagetable *iopt;
495 	struct iommu_domain *domain;
496 	struct iommu_dirty_bitmap *dirty;
497 };
498 
499 static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
500 					unsigned long iova, size_t length,
501 					void *opaque)
502 {
503 	struct iopt_area *area;
504 	struct iopt_area_contig_iter iter;
505 	struct iova_bitmap_fn_arg *arg = opaque;
506 	struct iommu_domain *domain = arg->domain;
507 	struct iommu_dirty_bitmap *dirty = arg->dirty;
508 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
509 	unsigned long last_iova = iova + length - 1;
510 	unsigned long flags = arg->flags;
511 	int ret;
512 
513 	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
514 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
515 
516 		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
517 						last - iter.cur_iova + 1, flags,
518 						dirty);
519 		if (ret)
520 			return ret;
521 	}
522 
523 	if (!iopt_area_contig_done(&iter))
524 		return -EINVAL;
525 	return 0;
526 }
527 
528 static int
529 iommu_read_and_clear_dirty(struct iommu_domain *domain,
530 			   struct io_pagetable *iopt, unsigned long flags,
531 			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
532 {
533 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
534 	struct iommu_iotlb_gather gather;
535 	struct iommu_dirty_bitmap dirty;
536 	struct iova_bitmap_fn_arg arg;
537 	struct iova_bitmap *iter;
538 	int ret = 0;
539 
540 	if (!ops || !ops->read_and_clear_dirty)
541 		return -EOPNOTSUPP;
542 
543 	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
544 				 bitmap->page_size,
545 				 u64_to_user_ptr(bitmap->data));
546 	if (IS_ERR(iter))
547 		return -ENOMEM;
548 
549 	iommu_dirty_bitmap_init(&dirty, iter, &gather);
550 
551 	arg.flags = flags;
552 	arg.iopt = iopt;
553 	arg.domain = domain;
554 	arg.dirty = &dirty;
555 	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
556 
557 	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
558 		iommu_iotlb_sync(domain, &gather);
559 
560 	iova_bitmap_free(iter);
561 
562 	return ret;
563 }
564 
565 int iommufd_check_iova_range(struct io_pagetable *iopt,
566 			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
567 {
568 	size_t iommu_pgsize = iopt->iova_alignment;
569 	u64 last_iova;
570 
571 	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
572 		return -EOVERFLOW;
573 
574 	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
575 		return -EOVERFLOW;
576 
577 	if ((bitmap->iova & (iommu_pgsize - 1)) ||
578 	    ((last_iova + 1) & (iommu_pgsize - 1)))
579 		return -EINVAL;
580 
581 	if (!bitmap->page_size)
582 		return -EINVAL;
583 
584 	if ((bitmap->iova & (bitmap->page_size - 1)) ||
585 	    ((last_iova + 1) & (bitmap->page_size - 1)))
586 		return -EINVAL;
587 
588 	return 0;
589 }
590 
591 int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
592 				   struct iommu_domain *domain,
593 				   unsigned long flags,
594 				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
595 {
596 	int ret;
597 
598 	ret = iommufd_check_iova_range(iopt, bitmap);
599 	if (ret)
600 		return ret;
601 
602 	down_read(&iopt->iova_rwsem);
603 	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
604 	up_read(&iopt->iova_rwsem);
605 
606 	return ret;
607 }
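/*
 * Sketch of how dirty-tracking readback is driven (hypothetical values;
 * the real caller is the IOMMU_HWPT_GET_DIRTY_BITMAP path, which copies
 * struct iommu_hwpt_get_dirty_bitmap in from userspace):
 *
 *	struct iommu_hwpt_get_dirty_bitmap bitmap = {
 *		.iova = map_iova,
 *		.length = map_length,
 *		.page_size = SZ_4K,
 *		.data = user_bitmap_u64,	// __u64-encoded user pointer
 *	};
 *
 *	rc = iopt_read_and_clear_dirty_data(iopt, domain, 0, &bitmap);
 *	// pass IOMMU_DIRTY_NO_CLEAR in flags to snapshot without clearing
 */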
608 
609 static int iopt_clear_dirty_data(struct io_pagetable *iopt,
610 				 struct iommu_domain *domain)
611 {
612 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
613 	struct iommu_iotlb_gather gather;
614 	struct iommu_dirty_bitmap dirty;
615 	struct iopt_area *area;
616 	int ret = 0;
617 
618 	lockdep_assert_held_read(&iopt->iova_rwsem);
619 
620 	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
621 
622 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
623 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
624 		if (!area->pages)
625 			continue;
626 
627 		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
628 						iopt_area_length(area), 0,
629 						&dirty);
630 		if (ret)
631 			break;
632 	}
633 
634 	iommu_iotlb_sync(domain, &gather);
635 	return ret;
636 }
637 
638 int iopt_set_dirty_tracking(struct io_pagetable *iopt,
639 			    struct iommu_domain *domain, bool enable)
640 {
641 	const struct iommu_dirty_ops *ops = domain->dirty_ops;
642 	int ret = 0;
643 
644 	if (!ops)
645 		return -EOPNOTSUPP;
646 
647 	down_read(&iopt->iova_rwsem);
648 
649 	/* Clear dirty bits from PTEs to ensure a clean snapshot */
650 	if (enable) {
651 		ret = iopt_clear_dirty_data(iopt, domain);
652 		if (ret)
653 			goto out_unlock;
654 	}
655 
656 	ret = ops->set_dirty_tracking(domain, enable);
657 
658 out_unlock:
659 	up_read(&iopt->iova_rwsem);
660 	return ret;
661 }
662 
663 int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
664 		   unsigned long length, struct list_head *pages_list)
665 {
666 	struct iopt_area_contig_iter iter;
667 	unsigned long last_iova;
668 	struct iopt_area *area;
669 	int rc;
670 
671 	if (!length)
672 		return -EINVAL;
673 	if (check_add_overflow(iova, length - 1, &last_iova))
674 		return -EOVERFLOW;
675 
676 	down_read(&iopt->iova_rwsem);
677 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
678 		struct iopt_pages_list *elm;
679 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
680 
681 		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
682 		if (!elm) {
683 			rc = -ENOMEM;
684 			goto err_free;
685 		}
686 		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
687 		elm->pages = area->pages;
688 		elm->length = (last - iter.cur_iova) + 1;
689 		kref_get(&elm->pages->kref);
690 		list_add_tail(&elm->next, pages_list);
691 	}
692 	if (!iopt_area_contig_done(&iter)) {
693 		rc = -ENOENT;
694 		goto err_free;
695 	}
696 	up_read(&iopt->iova_rwsem);
697 	return 0;
698 err_free:
699 	up_read(&iopt->iova_rwsem);
700 	iopt_free_pages_list(pages_list);
701 	return rc;
702 }
703 
704 static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
705 				 unsigned long last, unsigned long *unmapped)
706 {
707 	struct iopt_area *area;
708 	unsigned long unmapped_bytes = 0;
709 	unsigned int tries = 0;
710 	int rc = -ENOENT;
711 
712 	/*
713 	 * The domains_rwsem must be held in read mode any time any area->pages
714 	 * is NULL. This prevents domain attach/detach from running
715 	 * concurrently with cleaning up the area.
716 	 */
717 again:
718 	down_read(&iopt->domains_rwsem);
719 	down_write(&iopt->iova_rwsem);
720 	while ((area = iopt_area_iter_first(iopt, start, last))) {
721 		unsigned long area_last = iopt_area_last_iova(area);
722 		unsigned long area_first = iopt_area_iova(area);
723 		struct iopt_pages *pages;
724 
725 		/* Userspace should not race map/unmap's of the same area */
726 		if (!area->pages) {
727 			rc = -EBUSY;
728 			goto out_unlock_iova;
729 		}
730 
731 		/* The area is locked by an object that has not been destroyed */
732 		if (area->num_locks) {
733 			rc = -EBUSY;
734 			goto out_unlock_iova;
735 		}
736 
737 		if (area_first < start || area_last > last) {
738 			rc = -ENOENT;
739 			goto out_unlock_iova;
740 		}
741 
742 		if (area_first != start)
743 			tries = 0;
744 
745 		/*
746 		 * num_accesses writers must hold the iova_rwsem too, so we can
747 		 * safely read it under the write side of the iova_rwsem
748 		 * without the pages->mutex.
749 		 */
750 		if (area->num_accesses) {
751 			size_t length = iopt_area_length(area);
752 
753 			start = area_first;
754 			area->prevent_access = true;
755 			up_write(&iopt->iova_rwsem);
756 			up_read(&iopt->domains_rwsem);
757 
758 			iommufd_access_notify_unmap(iopt, area_first, length);
759 			/* Something is not responding to unmap requests. */
760 			tries++;
761 			if (WARN_ON(tries > 100)) {
762 				rc = -EDEADLOCK;
763 				goto out_unmapped;
764 			}
765 			goto again;
766 		}
767 
768 		pages = area->pages;
769 		area->pages = NULL;
770 		up_write(&iopt->iova_rwsem);
771 
772 		iopt_area_unfill_domains(area, pages);
773 		iopt_abort_area(area);
774 		iopt_put_pages(pages);
775 
776 		unmapped_bytes += area_last - area_first + 1;
777 
778 		down_write(&iopt->iova_rwsem);
779 	}
780 	if (unmapped_bytes)
781 		rc = 0;
782 
783 out_unlock_iova:
784 	up_write(&iopt->iova_rwsem);
785 	up_read(&iopt->domains_rwsem);
786 out_unmapped:
787 	if (unmapped)
788 		*unmapped = unmapped_bytes;
789 	return rc;
790 }
791 
792 /**
793  * iopt_unmap_iova() - Remove a range of iova
794  * @iopt: io_pagetable to act on
795  * @iova: Starting iova to unmap
796  * @length: Number of bytes to unmap
797  * @unmapped: Return number of bytes unmapped
798  *
799  * The requested range must be a superset of existing ranges.
800  * Splitting/truncating IOVA mappings is not allowed.
801  */
802 int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
803 		    unsigned long length, unsigned long *unmapped)
804 {
805 	unsigned long iova_last;
806 
807 	if (!length)
808 		return -EINVAL;
809 
810 	if (check_add_overflow(iova, length - 1, &iova_last))
811 		return -EOVERFLOW;
812 
813 	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
814 }
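/*
 * Usage sketch: the range passed to iopt_unmap_iova() must cover whole
 * areas, so undoing a prior map of [iova, iova + length) is simply
 * (hypothetical values):
 *
 *	unsigned long unmapped = 0;
 *
 *	rc = iopt_unmap_iova(iopt, iova, length, &unmapped);
 *	// -ENOENT if nothing is mapped there or the range would truncate an
 *	// area; use iopt_cut_iova() first if a hole must be punched (VFIO
 *	// compatibility).
 */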
815 
816 int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
817 {
818 	int rc;
819 
820 	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
821 	/* If the IOVAs are empty then unmap all succeeds */
822 	if (rc == -ENOENT)
823 		return 0;
824 	return rc;
825 }
826 
827 /* The caller must always free all the nodes in the allowed_iova rb_root. */
828 int iopt_set_allow_iova(struct io_pagetable *iopt,
829 			struct rb_root_cached *allowed_iova)
830 {
831 	struct iopt_allowed *allowed;
832 
833 	down_write(&iopt->iova_rwsem);
834 	swap(*allowed_iova, iopt->allowed_itree);
835 
836 	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
837 	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
838 		if (iopt_reserved_iter_first(iopt, allowed->node.start,
839 					     allowed->node.last)) {
840 			swap(*allowed_iova, iopt->allowed_itree);
841 			up_write(&iopt->iova_rwsem);
842 			return -EADDRINUSE;
843 		}
844 	}
845 	up_write(&iopt->iova_rwsem);
846 	return 0;
847 }
848 
849 int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
850 		      unsigned long last, void *owner)
851 {
852 	struct iopt_reserved *reserved;
853 
854 	lockdep_assert_held_write(&iopt->iova_rwsem);
855 
856 	if (iopt_area_iter_first(iopt, start, last) ||
857 	    iopt_allowed_iter_first(iopt, start, last))
858 		return -EADDRINUSE;
859 
860 	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
861 	if (!reserved)
862 		return -ENOMEM;
863 	reserved->node.start = start;
864 	reserved->node.last = last;
865 	reserved->owner = owner;
866 	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
867 	return 0;
868 }
869 
870 static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
871 {
872 	struct iopt_reserved *reserved, *next;
873 
874 	lockdep_assert_held_write(&iopt->iova_rwsem);
875 
876 	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
877 	     reserved = next) {
878 		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);
879 
880 		if (reserved->owner == owner) {
881 			interval_tree_remove(&reserved->node,
882 					     &iopt->reserved_itree);
883 			kfree(reserved);
884 		}
885 	}
886 }
887 
888 void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
889 {
890 	down_write(&iopt->iova_rwsem);
891 	__iopt_remove_reserved_iova(iopt, owner);
892 	up_write(&iopt->iova_rwsem);
893 }
894 
895 void iopt_init_table(struct io_pagetable *iopt)
896 {
897 	init_rwsem(&iopt->iova_rwsem);
898 	init_rwsem(&iopt->domains_rwsem);
899 	iopt->area_itree = RB_ROOT_CACHED;
900 	iopt->allowed_itree = RB_ROOT_CACHED;
901 	iopt->reserved_itree = RB_ROOT_CACHED;
902 	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
903 	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);
904 
905 	/*
906 	 * iopts start as SW tables that can use the entire size_t IOVA space
907 	 * due to the use of size_t in the APIs. They have no alignment
908 	 * restriction.
909 	 */
910 	iopt->iova_alignment = 1;
911 }
912 
913 void iopt_destroy_table(struct io_pagetable *iopt)
914 {
915 	struct interval_tree_node *node;
916 
917 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
918 		iopt_remove_reserved_iova(iopt, NULL);
919 
920 	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
921 						ULONG_MAX))) {
922 		interval_tree_remove(node, &iopt->allowed_itree);
923 		kfree(container_of(node, struct iopt_allowed, node));
924 	}
925 
926 	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
927 	WARN_ON(!xa_empty(&iopt->domains));
928 	WARN_ON(!xa_empty(&iopt->access_list));
929 	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
930 }
931 
932 /**
933  * iopt_unfill_domain() - Unfill a domain with PFNs
934  * @iopt: io_pagetable to act on
935  * @domain: domain to unfill
936  *
937  * This is used when removing a domain from the iopt. Every area in the iopt
938  * will be unmapped from the domain. The domain must already be removed from the
939  * domains xarray.
940  */
941 static void iopt_unfill_domain(struct io_pagetable *iopt,
942 			       struct iommu_domain *domain)
943 {
944 	struct iopt_area *area;
945 
946 	lockdep_assert_held(&iopt->iova_rwsem);
947 	lockdep_assert_held_write(&iopt->domains_rwsem);
948 
949 	/*
950 	 * Some other domain is still holding all the pfns, so rapidly unmap this
951 	 * domain.
952 	 */
953 	if (iopt->next_domain_id != 0) {
954 		/* Pick an arbitrary remaining domain to act as storage */
955 		struct iommu_domain *storage_domain =
956 			xa_load(&iopt->domains, 0);
957 
958 		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
959 		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
960 			struct iopt_pages *pages = area->pages;
961 
962 			if (!pages)
963 				continue;
964 
965 			mutex_lock(&pages->mutex);
966 			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
967 				WARN_ON(!area->storage_domain);
968 			if (area->storage_domain == domain)
969 				area->storage_domain = storage_domain;
970 			mutex_unlock(&pages->mutex);
971 
972 			iopt_area_unmap_domain(area, domain);
973 		}
974 		return;
975 	}
976 
977 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
978 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
979 		struct iopt_pages *pages = area->pages;
980 
981 		if (!pages)
982 			continue;
983 
984 		mutex_lock(&pages->mutex);
985 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
986 		WARN_ON(area->storage_domain != domain);
987 		area->storage_domain = NULL;
988 		iopt_area_unfill_domain(area, pages, domain);
989 		mutex_unlock(&pages->mutex);
990 	}
991 }
992 
993 /**
994  * iopt_fill_domain() - Fill a domain with PFNs
995  * @iopt: io_pagetable to act on
996  * @domain: domain to fill
997  *
998  * Fill the domain with PFNs from every area in the iopt. On failure the domain
999  * is left unchanged.
1000  */
1001 static int iopt_fill_domain(struct io_pagetable *iopt,
1002 			    struct iommu_domain *domain)
1003 {
1004 	struct iopt_area *end_area;
1005 	struct iopt_area *area;
1006 	int rc;
1007 
1008 	lockdep_assert_held(&iopt->iova_rwsem);
1009 	lockdep_assert_held_write(&iopt->domains_rwsem);
1010 
1011 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1012 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1013 		struct iopt_pages *pages = area->pages;
1014 
1015 		if (!pages)
1016 			continue;
1017 
1018 		mutex_lock(&pages->mutex);
1019 		rc = iopt_area_fill_domain(area, domain);
1020 		if (rc) {
1021 			mutex_unlock(&pages->mutex);
1022 			goto out_unfill;
1023 		}
1024 		if (!area->storage_domain) {
1025 			WARN_ON(iopt->next_domain_id != 0);
1026 			area->storage_domain = domain;
1027 			interval_tree_insert(&area->pages_node,
1028 					     &pages->domains_itree);
1029 		}
1030 		mutex_unlock(&pages->mutex);
1031 	}
1032 	return 0;
1033 
1034 out_unfill:
1035 	end_area = area;
1036 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1037 	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
1038 		struct iopt_pages *pages = area->pages;
1039 
1040 		if (area == end_area)
1041 			break;
1042 		if (!pages)
1043 			continue;
1044 		mutex_lock(&pages->mutex);
1045 		if (iopt->next_domain_id == 0) {
1046 			interval_tree_remove(&area->pages_node,
1047 					     &pages->domains_itree);
1048 			area->storage_domain = NULL;
1049 		}
1050 		iopt_area_unfill_domain(area, pages, domain);
1051 		mutex_unlock(&pages->mutex);
1052 	}
1053 	return rc;
1054 }
1055 
1056 /* Check that all existing areas conform to an increased page size */
1057 static int iopt_check_iova_alignment(struct io_pagetable *iopt,
1058 				     unsigned long new_iova_alignment)
1059 {
1060 	unsigned long align_mask = new_iova_alignment - 1;
1061 	struct iopt_area *area;
1062 
1063 	lockdep_assert_held(&iopt->iova_rwsem);
1064 	lockdep_assert_held(&iopt->domains_rwsem);
1065 
1066 	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
1067 	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
1068 		if ((iopt_area_iova(area) & align_mask) ||
1069 		    (iopt_area_length(area) & align_mask) ||
1070 		    (area->page_offset & align_mask))
1071 			return -EADDRINUSE;
1072 
1073 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
1074 		struct iommufd_access *access;
1075 		unsigned long index;
1076 
1077 		xa_for_each(&iopt->access_list, index, access)
1078 			if (WARN_ON(access->iova_alignment >
1079 				    new_iova_alignment))
1080 				return -EADDRINUSE;
1081 	}
1082 	return 0;
1083 }
1084 
1085 int iopt_table_add_domain(struct io_pagetable *iopt,
1086 			  struct iommu_domain *domain)
1087 {
1088 	const struct iommu_domain_geometry *geometry = &domain->geometry;
1089 	struct iommu_domain *iter_domain;
1090 	unsigned int new_iova_alignment;
1091 	unsigned long index;
1092 	int rc;
1093 
1094 	down_write(&iopt->domains_rwsem);
1095 	down_write(&iopt->iova_rwsem);
1096 
1097 	xa_for_each(&iopt->domains, index, iter_domain) {
1098 		if (WARN_ON(iter_domain == domain)) {
1099 			rc = -EEXIST;
1100 			goto out_unlock;
1101 		}
1102 	}
1103 
1104 	/*
1105 	 * The io page size drives the iova_alignment. Internally the iopt_pages
1106 	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
1107 	 * objects into the iommu_domain.
1108 	 *
1109 	 * An iommu_domain must always be able to accept PAGE_SIZE to be
1110 	 * compatible as we can't guarantee higher contiguity.
1111 	 */
1112 	new_iova_alignment = max_t(unsigned long,
1113 				   1UL << __ffs(domain->pgsize_bitmap),
1114 				   iopt->iova_alignment);
1115 	if (new_iova_alignment > PAGE_SIZE) {
1116 		rc = -EINVAL;
1117 		goto out_unlock;
1118 	}
1119 	if (new_iova_alignment != iopt->iova_alignment) {
1120 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1121 		if (rc)
1122 			goto out_unlock;
1123 	}
1124 
1125 	/* No area exists that is outside the allowed domain aperture */
1126 	if (geometry->aperture_start != 0) {
1127 		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
1128 				       domain);
1129 		if (rc)
1130 			goto out_reserved;
1131 	}
1132 	if (geometry->aperture_end != ULONG_MAX) {
1133 		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
1134 				       ULONG_MAX, domain);
1135 		if (rc)
1136 			goto out_reserved;
1137 	}
1138 
1139 	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
1140 	if (rc)
1141 		goto out_reserved;
1142 
1143 	rc = iopt_fill_domain(iopt, domain);
1144 	if (rc)
1145 		goto out_release;
1146 
1147 	iopt->iova_alignment = new_iova_alignment;
1148 	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
1149 	iopt->next_domain_id++;
1150 	up_write(&iopt->iova_rwsem);
1151 	up_write(&iopt->domains_rwsem);
1152 	return 0;
1153 out_release:
1154 	xa_release(&iopt->domains, iopt->next_domain_id);
1155 out_reserved:
1156 	__iopt_remove_reserved_iova(iopt, domain);
1157 out_unlock:
1158 	up_write(&iopt->iova_rwsem);
1159 	up_write(&iopt->domains_rwsem);
1160 	return rc;
1161 }
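/*
 * Worked example of the alignment negotiation in iopt_table_add_domain()
 * (illustrative only): a domain advertising pgsize_bitmap = SZ_4K | SZ_2M |
 * SZ_1G yields 1UL << __ffs(pgsize_bitmap) = SZ_4K, so new_iova_alignment
 * becomes 4KiB (assuming the iopt previously had byte alignment). A
 * hypothetical domain whose smallest IO page size were 16KiB on a 4KiB
 * PAGE_SIZE kernel would fail the new_iova_alignment > PAGE_SIZE test with
 * -EINVAL, because iopt_pages tracks PFNs in PAGE_SIZE units and cannot
 * guarantee more physical contiguity than that.
 */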
1162 
1163 static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
1164 {
1165 	unsigned long new_iova_alignment;
1166 	struct iommufd_access *access;
1167 	struct iommu_domain *domain;
1168 	unsigned long index;
1169 
1170 	lockdep_assert_held_write(&iopt->iova_rwsem);
1171 	lockdep_assert_held(&iopt->domains_rwsem);
1172 
1173 	/* See batch_iommu_map_small() */
1174 	if (iopt->disable_large_pages)
1175 		new_iova_alignment = PAGE_SIZE;
1176 	else
1177 		new_iova_alignment = 1;
1178 
1179 	xa_for_each(&iopt->domains, index, domain)
1180 		new_iova_alignment = max_t(unsigned long,
1181 					   1UL << __ffs(domain->pgsize_bitmap),
1182 					   new_iova_alignment);
1183 	xa_for_each(&iopt->access_list, index, access)
1184 		new_iova_alignment = max_t(unsigned long,
1185 					   access->iova_alignment,
1186 					   new_iova_alignment);
1187 
1188 	if (new_iova_alignment > iopt->iova_alignment) {
1189 		int rc;
1190 
1191 		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
1192 		if (rc)
1193 			return rc;
1194 	}
1195 	iopt->iova_alignment = new_iova_alignment;
1196 	return 0;
1197 }
1198 
1199 void iopt_table_remove_domain(struct io_pagetable *iopt,
1200 			      struct iommu_domain *domain)
1201 {
1202 	struct iommu_domain *iter_domain = NULL;
1203 	unsigned long index;
1204 
1205 	down_write(&iopt->domains_rwsem);
1206 	down_write(&iopt->iova_rwsem);
1207 
1208 	xa_for_each(&iopt->domains, index, iter_domain)
1209 		if (iter_domain == domain)
1210 			break;
1211 	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
1212 		goto out_unlock;
1213 
1214 	/*
1215 	 * Compress the xarray to keep it linear by swapping the entry to erase
1216 	 * with the tail entry and shrinking the tail.
1217 	 */
1218 	iopt->next_domain_id--;
1219 	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
1220 	if (index != iopt->next_domain_id)
1221 		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);
1222 
1223 	iopt_unfill_domain(iopt, domain);
1224 	__iopt_remove_reserved_iova(iopt, domain);
1225 
1226 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1227 out_unlock:
1228 	up_write(&iopt->iova_rwsem);
1229 	up_write(&iopt->domains_rwsem);
1230 }
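/*
 * Illustrative example of the xarray compression above: with domains
 * [0] = A, [1] = B, [2] = C and next_domain_id == 3, removing A erases the
 * tail entry C from index 2 and stores it at index 0, leaving [0] = C,
 * [1] = B with next_domain_id == 2, so the domains stay densely packed
 * from index 0.
 */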
1231 
1232 /**
1233  * iopt_area_split - Split an area into two parts at iova
1234  * @area: The area to split
1235  * @iova: Becomes the last of a new area
1236  *
1237  * This splits an area into two. It is part of the VFIO compatibility to allow
1238  * poking a hole in the mapping. The two areas continue to point at the same
1239  * iopt_pages, just with different starting bytes.
1240  */
1241 static int iopt_area_split(struct iopt_area *area, unsigned long iova)
1242 {
1243 	unsigned long alignment = area->iopt->iova_alignment;
1244 	unsigned long last_iova = iopt_area_last_iova(area);
1245 	unsigned long start_iova = iopt_area_iova(area);
1246 	unsigned long new_start = iova + 1;
1247 	struct io_pagetable *iopt = area->iopt;
1248 	struct iopt_pages *pages = area->pages;
1249 	struct iopt_area *lhs;
1250 	struct iopt_area *rhs;
1251 	int rc;
1252 
1253 	lockdep_assert_held_write(&iopt->iova_rwsem);
1254 
1255 	if (iova == start_iova || iova == last_iova)
1256 		return 0;
1257 
1258 	if (!pages || area->prevent_access)
1259 		return -EBUSY;
1260 
1261 	if (new_start & (alignment - 1) ||
1262 	    iopt_area_start_byte(area, new_start) & (alignment - 1))
1263 		return -EINVAL;
1264 
1265 	lhs = iopt_area_alloc();
1266 	if (!lhs)
1267 		return -ENOMEM;
1268 
1269 	rhs = iopt_area_alloc();
1270 	if (!rhs) {
1271 		rc = -ENOMEM;
1272 		goto err_free_lhs;
1273 	}
1274 
1275 	mutex_lock(&pages->mutex);
1276 	/*
1277 	 * Splitting is not permitted if an access exists; we don't track enough
1278 	 * information to split existing accesses.
1279 	 */
1280 	if (area->num_accesses) {
1281 		rc = -EINVAL;
1282 		goto err_unlock;
1283 	}
1284 
1285 	/*
1286 	 * Splitting is not permitted if a domain could have been mapped with
1287 	 * huge pages.
1288 	 */
1289 	if (area->storage_domain && !iopt->disable_large_pages) {
1290 		rc = -EINVAL;
1291 		goto err_unlock;
1292 	}
1293 
1294 	interval_tree_remove(&area->node, &iopt->area_itree);
1295 	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
1296 			      iopt_area_start_byte(area, start_iova),
1297 			      (new_start - 1) - start_iova + 1,
1298 			      area->iommu_prot);
1299 	if (WARN_ON(rc))
1300 		goto err_insert;
1301 
1302 	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
1303 			      iopt_area_start_byte(area, new_start),
1304 			      last_iova - new_start + 1, area->iommu_prot);
1305 	if (WARN_ON(rc))
1306 		goto err_remove_lhs;
1307 
1308 	/*
1309 	 * If the original area has filled a domain, domains_itree has to be
1310 	 * updated.
1311 	 */
1312 	if (area->storage_domain) {
1313 		interval_tree_remove(&area->pages_node, &pages->domains_itree);
1314 		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
1315 		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
1316 	}
1317 
1318 	lhs->storage_domain = area->storage_domain;
1319 	lhs->pages = area->pages;
1320 	rhs->storage_domain = area->storage_domain;
1321 	rhs->pages = area->pages;
1322 	kref_get(&rhs->pages->kref);
1323 	kfree(area);
1324 	mutex_unlock(&pages->mutex);
1325 
1326 	/*
1327 	 * No change to domains or accesses because the pages haven't been
1328 	 * changed
1329 	 */
1330 	return 0;
1331 
1332 err_remove_lhs:
1333 	interval_tree_remove(&lhs->node, &iopt->area_itree);
1334 err_insert:
1335 	interval_tree_insert(&area->node, &iopt->area_itree);
1336 err_unlock:
1337 	mutex_unlock(&pages->mutex);
1338 	kfree(rhs);
1339 err_free_lhs:
1340 	kfree(lhs);
1341 	return rc;
1342 }
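/*
 * Worked example of a split (illustrative only): an area mapping IOVA
 * [0x1000, 0x4fff] to bytes [0x0000, 0x3fff] of its iopt_pages, cut at
 * iova == 0x2fff, becomes:
 *
 *   lhs: IOVA [0x1000, 0x2fff] -> bytes [0x0000, 0x1fff]
 *   rhs: IOVA [0x3000, 0x4fff] -> bytes [0x2000, 0x3fff]
 *
 * Both halves keep pointing at the same refcounted iopt_pages (hence the
 * kref_get() above); no PFNs move, only the interval tree bookkeeping
 * changes.
 */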
1343 
1344 int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
1345 		  size_t num_iovas)
1346 {
1347 	int rc = 0;
1348 	int i;
1349 
1350 	down_write(&iopt->iova_rwsem);
1351 	for (i = 0; i < num_iovas; i++) {
1352 		struct iopt_area *area;
1353 
1354 		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
1355 		if (!area)
1356 			continue;
1357 		rc = iopt_area_split(area, iovas[i]);
1358 		if (rc)
1359 			break;
1360 	}
1361 	up_write(&iopt->iova_rwsem);
1362 	return rc;
1363 }
1364 
1365 void iopt_enable_large_pages(struct io_pagetable *iopt)
1366 {
1367 	int rc;
1368 
1369 	down_write(&iopt->domains_rwsem);
1370 	down_write(&iopt->iova_rwsem);
1371 	WRITE_ONCE(iopt->disable_large_pages, false);
1372 	rc = iopt_calculate_iova_alignment(iopt);
1373 	WARN_ON(rc);
1374 	up_write(&iopt->iova_rwsem);
1375 	up_write(&iopt->domains_rwsem);
1376 }
1377 
1378 int iopt_disable_large_pages(struct io_pagetable *iopt)
1379 {
1380 	int rc = 0;
1381 
1382 	down_write(&iopt->domains_rwsem);
1383 	down_write(&iopt->iova_rwsem);
1384 	if (iopt->disable_large_pages)
1385 		goto out_unlock;
1386 
1387 	/* Won't do it if domains already have pages mapped in them */
1388 	if (!xa_empty(&iopt->domains) &&
1389 	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
1390 		rc = -EINVAL;
1391 		goto out_unlock;
1392 	}
1393 
1394 	WRITE_ONCE(iopt->disable_large_pages, true);
1395 	rc = iopt_calculate_iova_alignment(iopt);
1396 	if (rc)
1397 		WRITE_ONCE(iopt->disable_large_pages, false);
1398 out_unlock:
1399 	up_write(&iopt->iova_rwsem);
1400 	up_write(&iopt->domains_rwsem);
1401 	return rc;
1402 }
1403 
1404 int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
1405 {
1406 	u32 new_id;
1407 	int rc;
1408 
1409 	down_write(&iopt->domains_rwsem);
1410 	down_write(&iopt->iova_rwsem);
1411 	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
1412 		      GFP_KERNEL_ACCOUNT);
1413 
1414 	if (rc)
1415 		goto out_unlock;
1416 
1417 	rc = iopt_calculate_iova_alignment(iopt);
1418 	if (rc) {
1419 		xa_erase(&iopt->access_list, new_id);
1420 		goto out_unlock;
1421 	}
1422 	access->iopt_access_list_id = new_id;
1423 
1424 out_unlock:
1425 	up_write(&iopt->iova_rwsem);
1426 	up_write(&iopt->domains_rwsem);
1427 	return rc;
1428 }
1429 
1430 void iopt_remove_access(struct io_pagetable *iopt,
1431 			struct iommufd_access *access, u32 iopt_access_list_id)
1432 {
1433 	down_write(&iopt->domains_rwsem);
1434 	down_write(&iopt->iova_rwsem);
1435 	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
1436 	WARN_ON(iopt_calculate_iova_alignment(iopt));
1437 	up_write(&iopt->iova_rwsem);
1438 	up_write(&iopt->domains_rwsem);
1439 }
1440 
1441 /* Narrow the valid_iova_itree to include reserved ranges from a device. */
1442 int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
1443 					struct device *dev,
1444 					phys_addr_t *sw_msi_start)
1445 {
1446 	struct iommu_resv_region *resv;
1447 	LIST_HEAD(resv_regions);
1448 	unsigned int num_hw_msi = 0;
1449 	unsigned int num_sw_msi = 0;
1450 	int rc;
1451 
1452 	if (iommufd_should_fail())
1453 		return -EINVAL;
1454 
1455 	down_write(&iopt->iova_rwsem);
1456 	/* FIXME: drivers allocate memory but there is no failure propagated */
1457 	iommu_get_resv_regions(dev, &resv_regions);
1458 
1459 	list_for_each_entry(resv, &resv_regions, list) {
1460 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1461 			continue;
1462 
1463 		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
1464 			num_hw_msi++;
1465 		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
1466 			*sw_msi_start = resv->start;
1467 			num_sw_msi++;
1468 		}
1469 
1470 		rc = iopt_reserve_iova(iopt, resv->start,
1471 				       resv->length - 1 + resv->start, dev);
1472 		if (rc)
1473 			goto out_reserved;
1474 	}
1475 
1476 	/* Drivers must offer sane combinations of regions */
1477 	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
1478 		rc = -EINVAL;
1479 		goto out_reserved;
1480 	}
1481 
1482 	rc = 0;
1483 	goto out_free_resv;
1484 
1485 out_reserved:
1486 	__iopt_remove_reserved_iova(iopt, dev);
1487 out_free_resv:
1488 	iommu_put_resv_regions(dev, &resv_regions);
1489 	up_write(&iopt->iova_rwsem);
1490 	return rc;
1491 }
1492