// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
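 *
 * Locking overview (summarized from the code below): the iova_rwsem protects
 * the area_itree, allowed_itree and reserved_itree interval trees, while the
 * domains_rwsem protects the domains xarray and is held whenever areas are
 * filled into or unfilled from the attached iommu_domains.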
 */
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <uapi/linux/iommufd.h>

#include "double_span.h"
#include "io_pagetable.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
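
/*
 * The two helpers above implement the iopt_for_each_contig_area() iterator
 * from io_pagetable.h. A minimal usage sketch (see also
 * __iommu_read_and_clear_dirty() and iopt_get_pages() below):
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova, iopt_area_last_iova(area));
 *
 *		... operate on [iter.cur_iova, last] within this area ...
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		return -EINVAL;	(a hole or pageless area stopped the walk)
 */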

static bool __alloc_iova_check_range(unsigned long *start, unsigned long last,
				     unsigned long length,
				     unsigned long iova_alignment,
				     unsigned long page_offset)
{
	unsigned long aligned_start;

	/* ALIGN_UP() */
	if (check_add_overflow(*start, iova_alignment - 1, &aligned_start))
		return false;
	aligned_start &= ~(iova_alignment - 1);
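	/*
	 * OR the source's sub-page offset back in so the candidate IOVA and
	 * the address being mapped share the same offset within a page.
	 */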
	aligned_start |= page_offset;

	if (aligned_start >= last || last - aligned_start < length - 1)
		return false;
	*start = aligned_start;
	return true;
}

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used)
		return false;
	return __alloc_iova_check_range(&span->start_hole, span->last_hole,
					length, iova_alignment, page_offset);
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole)
		return false;
	return __alloc_iova_check_range(&span->start_used, span->last_used,
					length, iova_alignment, page_offset);
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long addr, unsigned long length)
{
	unsigned long page_offset = addr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long max_alignment = PAGE_SIZE;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in addr when building the IOVA, which
	 * increases the chance we can map a THP.
	 */
	if (!addr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(addr));
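	/*
	 * For example, a 2MB-aligned user address mapped with a 2MB length
	 * yields a 2MB iova_alignment here, so the IOVA picked below can be
	 * backed by a single huge-page IOPTE when THP is in use.
	 */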

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	max_alignment = HPAGE_SIZE;
#endif
	/* Protect against ALIGN() overflow */
	if (iova_alignment >= max_alignment)
		iova_alignment = max_alignment;

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
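		/*
		 * If no allowed ranges were registered, treat the whole
		 * [PAGE_SIZE, ULONG_MAX - PAGE_SIZE] span as allowed instead
		 * of the single hole the empty tree reports.
		 */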
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

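	/*
	 * node indexes the area by IOVA in the iopt's area_itree, while
	 * pages_node indexes it by PAGE_SIZE units within the iopt_pages for
	 * the pages' domains_itree.
	 */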
	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long start;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		switch (elm->pages->type) {
		case IOPT_ADDRESS_USER:
			start = elm->start_byte + (uintptr_t)elm->pages->uptr;
			break;
		case IOPT_ADDRESS_FILE:
			start = elm->start_byte + elm->pages->start;
			break;
		}
		rc = iopt_alloc_iova(iopt, dst_iova, start, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			   struct iopt_pages *pages, unsigned long *iova,
			   unsigned long length, unsigned long start_byte,
			   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = pages;
	elm.start_byte = start_byte;
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only setup a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	return iopt_map_common(ictx, iopt, pages, iova, length,
			       uptr - pages->uptr, iommu_prot, flags);
}
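
/*
 * Example use of the above (a sketch; the ioas variable and error handling
 * are illustrative, not taken from a real caller):
 *
 *	unsigned long iova = 0, unmapped;
 *
 *	rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, uptr, length,
 *				 IOMMU_READ | IOMMU_WRITE, IOPT_ALLOC_IOVA);
 *	if (rc)
 *		return rc;
 *	...
 *	rc = iopt_unmap_iova(&ioas->iopt, iova, length, &unmapped);
 */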

/**
 * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file.
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @file: file to map
 * @start: map file starting at this byte offset
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 */
int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, struct file *file,
			unsigned long start, unsigned long length,
			int iommu_prot, unsigned int flags)
{
	struct iopt_pages *pages;

	pages = iopt_alloc_file_pages(file, start, length,
				      iommu_prot & IOMMU_WRITE);
	if (IS_ERR(pages))
		return PTR_ERR(pages);
	return iopt_map_common(ictx, iopt, pages, iova, length,
			       start - pages->start, iommu_prot, flags);
}

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

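	/*
	 * If dirty bits were cleared in the IOPTEs then flush the accumulated
	 * IOTLB invalidations so the cleared state takes effect; with
	 * IOMMU_DIRTY_NO_CLEAR nothing was modified and no flush is needed.
	 */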
	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}

static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

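/*
 * Take a reference on the iopt_pages covering [iova, iova + length) and add
 * them to pages_list. The range must be fully covered by areas with pages or
 * -ENOENT is returned. The caller must free the list with
 * iopt_free_pages_list(), which drops the references.
 */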
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmap's of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		/* The area is locked by an object that has not been destroyed */
		if (area->num_locks) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
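			/*
			 * Block further access to this area, drop the locks so
			 * the access owners can unpin without deadlocking, and
			 * then retry the whole range from this area.
			 */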
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100)) {
				rc = -EDEADLOCK;
				goto out_unmapped;
			}
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
out_unmapped:
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from the
 * domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, rapidly unmap this
	 * domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

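	/*
	 * Reserve the xarray slot first so the final xa_store() below cannot
	 * fail; the fill step in between can then fail without leaving a
	 * partially visible domain.
	 */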
	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
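	/* lhs keeps the reference the original area held; rhs needs its own */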
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the pages have not been
	 * changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}

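/*
 * Split any mapped area that straddles one of the given IOVAs so that a
 * subsequent partial unmap (the VFIO compatibility path) can succeed.
 */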
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);

	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access, u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}