1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  *
12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14  * VT-d, but that makes it harder to re-use as theoretically anyone
15  * implementing a similar IOMMU could make use of this.  We expect the
16  * IOMMU to support the IOMMU API and have few to no restrictions around
17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
18  * optimized for relatively static mappings of a userspace process with
19  * userspace pages pinned into memory.  We also assume devices and IOMMU
20  * domains are PCI based as the IOMMU API is still centered around a
21  * device/bus interface rather than a group interface.
22  */
23 
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/highmem.h>
28 #include <linux/iommu.h>
29 #include <linux/module.h>
30 #include <linux/mm.h>
31 #include <linux/kthread.h>
32 #include <linux/rbtree.h>
33 #include <linux/sched/signal.h>
34 #include <linux/sched/mm.h>
35 #include <linux/slab.h>
36 #include <linux/uaccess.h>
37 #include <linux/vfio.h>
38 #include <linux/workqueue.h>
39 #include <linux/notifier.h>
40 #include "vfio.h"
41 
42 #define DRIVER_VERSION  "0.2"
43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
45 
46 static bool allow_unsafe_interrupts;
47 module_param_named(allow_unsafe_interrupts,
48 		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
49 MODULE_PARM_DESC(allow_unsafe_interrupts,
50 		 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
51 
52 static bool disable_hugepages;
53 module_param_named(disable_hugepages,
54 		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(disable_hugepages,
56 		 "Disable VFIO IOMMU support for IOMMU hugepages.");
57 
58 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
59 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
60 MODULE_PARM_DESC(dma_entry_limit,
61 		 "Maximum number of user DMA mappings per container (65535).");
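
/*
 * Example usage of the parameters above (editorial illustration): they can
 * be set at module load time, e.g.
 *
 *	modprobe vfio_iommu_type1 allow_unsafe_interrupts=1 dma_entry_limit=131072
 *
 * or, since they are world-readable and root-writable (0644), inspected and
 * updated at runtime through /sys/module/vfio_iommu_type1/parameters/<name>.
 */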
62 
63 struct vfio_iommu {
64 	struct list_head	domain_list;
65 	struct list_head	iova_list;
66 	struct mutex		lock;
67 	struct rb_root		dma_list;
68 	struct list_head	device_list;
69 	struct mutex		device_list_lock;
70 	unsigned int		dma_avail;
71 	unsigned int		vaddr_invalid_count;
72 	uint64_t		pgsize_bitmap;
73 	uint64_t		num_non_pinned_groups;
74 	bool			v2;
75 	bool			dirty_page_tracking;
76 	struct list_head	emulated_iommu_groups;
77 };
78 
79 struct vfio_domain {
80 	struct iommu_domain	*domain;
81 	struct list_head	next;
82 	struct list_head	group_list;
83 	bool			fgsp : 1;	/* Fine-grained super pages */
84 	bool			enforce_cache_coherency : 1;
85 };
86 
87 struct vfio_dma {
88 	struct rb_node		node;
89 	dma_addr_t		iova;		/* Device address */
90 	unsigned long		vaddr;		/* Process virtual addr */
91 	size_t			size;		/* Map size (bytes) */
92 	int			prot;		/* IOMMU_READ/WRITE */
93 	bool			iommu_mapped;
94 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
95 	bool			vaddr_invalid;
96 	struct task_struct	*task;
97 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
98 	unsigned long		*bitmap;
99 	struct mm_struct	*mm;
100 	size_t			locked_vm;
101 };
102 
103 struct vfio_batch {
104 	struct page		**pages;	/* for pin_user_pages_remote */
105 	struct page		*fallback_page; /* if pages alloc fails */
106 	unsigned int		capacity;	/* length of pages array */
107 	unsigned int		size;		/* of batch currently */
108 	unsigned int		offset;		/* of next entry in pages */
109 };
110 
111 struct vfio_iommu_group {
112 	struct iommu_group	*iommu_group;
113 	struct list_head	next;
114 	bool			pinned_page_dirty_scope;
115 };
116 
117 struct vfio_iova {
118 	struct list_head	list;
119 	dma_addr_t		start;
120 	dma_addr_t		end;
121 };
122 
123 /*
124  * Guest RAM pinning working set or DMA target
125  */
126 struct vfio_pfn {
127 	struct rb_node		node;
128 	dma_addr_t		iova;		/* Device address */
129 	unsigned long		pfn;		/* Host pfn */
130 	unsigned int		ref_count;
131 };
132 
133 struct vfio_regions {
134 	struct list_head list;
135 	dma_addr_t iova;
136 	phys_addr_t phys;
137 	size_t len;
138 };
139 
140 #define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
141 
142 /*
143  * The number-of-bits argument to bitmap_set() is an unsigned integer, which
144  * is further cast to a signed integer for the unaligned multi-bit operation,
145  * __bitmap_set().
146  * The maximum bitmap size supported is therefore 2^31 bits, i.e. 2^31 / 2^3
147  * bits/byte = 2^28 bytes (256 MB), which maps 2^31 * 2^12 = 2^43 bytes (8 TB)
148  * of memory on a 4K page system.
149  */
150 #define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
151 #define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
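
/*
 * Worked example of the limits above (editorial illustration): dirty-tracking
 * a 1 GiB vfio_dma at 4 KiB granularity needs 2^30 / 2^12 = 2^18 pages, so
 * DIRTY_BITMAP_BYTES(2^18) = 2^18 / 8 = 32 KiB of bitmap.  At the
 * DIRTY_BITMAP_PAGES_MAX limit of 2^31 pages the bitmap reaches
 * DIRTY_BITMAP_SIZE_MAX = 256 MB and covers 2^31 * 4 KiB = 8 TB of mapped
 * memory.
 */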
152 
153 static int put_pfn(unsigned long pfn, int prot);
154 
155 static struct vfio_iommu_group*
156 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
157 			    struct iommu_group *iommu_group);
158 
159 /*
160  * This code handles mapping and unmapping of user data buffers
161  * into DMA'ble space using the IOMMU
162  */
163 
164 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
165 				      dma_addr_t start, size_t size)
166 {
167 	struct rb_node *node = iommu->dma_list.rb_node;
168 
169 	while (node) {
170 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
171 
172 		if (start + size <= dma->iova)
173 			node = node->rb_left;
174 		else if (start >= dma->iova + dma->size)
175 			node = node->rb_right;
176 		else
177 			return dma;
178 	}
179 
180 	return NULL;
181 }
182 
183 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
184 						dma_addr_t start, u64 size)
185 {
186 	struct rb_node *res = NULL;
187 	struct rb_node *node = iommu->dma_list.rb_node;
188 	struct vfio_dma *dma_res = NULL;
189 
190 	while (node) {
191 		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
192 
193 		if (start < dma->iova + dma->size) {
194 			res = node;
195 			dma_res = dma;
196 			if (start >= dma->iova)
197 				break;
198 			node = node->rb_left;
199 		} else {
200 			node = node->rb_right;
201 		}
202 	}
203 	if (res && size && dma_res->iova >= start + size)
204 		res = NULL;
205 	return res;
206 }
207 
208 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
209 {
210 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
211 	struct vfio_dma *dma;
212 
213 	while (*link) {
214 		parent = *link;
215 		dma = rb_entry(parent, struct vfio_dma, node);
216 
217 		if (new->iova + new->size <= dma->iova)
218 			link = &(*link)->rb_left;
219 		else
220 			link = &(*link)->rb_right;
221 	}
222 
223 	rb_link_node(&new->node, parent, link);
224 	rb_insert_color(&new->node, &iommu->dma_list);
225 }
226 
227 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
228 {
229 	rb_erase(&old->node, &iommu->dma_list);
230 }
231 
232 
233 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
234 {
235 	uint64_t npages = dma->size / pgsize;
236 
237 	if (npages > DIRTY_BITMAP_PAGES_MAX)
238 		return -EINVAL;
239 
240 	/*
241 	 * Allocate extra 64 bits that are used to calculate shift required for
242 	 * bitmap_shift_left() to manipulate and club unaligned number of pages
243 	 * in adjacent vfio_dma ranges.
244 	 */
245 	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
246 			       GFP_KERNEL);
247 	if (!dma->bitmap)
248 		return -ENOMEM;
249 
250 	return 0;
251 }
252 
253 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
254 {
255 	kvfree(dma->bitmap);
256 	dma->bitmap = NULL;
257 }
258 
259 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
260 {
261 	struct rb_node *p;
262 	unsigned long pgshift = __ffs(pgsize);
263 
264 	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
265 		struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
266 
267 		bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
268 	}
269 }
270 
271 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
272 {
273 	struct rb_node *n;
274 	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
275 
276 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
277 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
278 
279 		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
280 	}
281 }
282 
283 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
284 {
285 	struct rb_node *n;
286 
287 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
288 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
289 		int ret;
290 
291 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
292 		if (ret) {
293 			struct rb_node *p;
294 
295 			for (p = rb_prev(n); p; p = rb_prev(p)) {
296 				struct vfio_dma *dma = rb_entry(p,
297 							struct vfio_dma, node);
298 
299 				vfio_dma_bitmap_free(dma);
300 			}
301 			return ret;
302 		}
303 		vfio_dma_populate_bitmap(dma, pgsize);
304 	}
305 	return 0;
306 }
307 
308 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
309 {
310 	struct rb_node *n;
311 
312 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
313 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
314 
315 		vfio_dma_bitmap_free(dma);
316 	}
317 }
318 
319 /*
320  * Helper Functions for host iova-pfn list
321  */
322 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
323 {
324 	struct vfio_pfn *vpfn;
325 	struct rb_node *node = dma->pfn_list.rb_node;
326 
327 	while (node) {
328 		vpfn = rb_entry(node, struct vfio_pfn, node);
329 
330 		if (iova < vpfn->iova)
331 			node = node->rb_left;
332 		else if (iova > vpfn->iova)
333 			node = node->rb_right;
334 		else
335 			return vpfn;
336 	}
337 	return NULL;
338 }
339 
340 static void vfio_link_pfn(struct vfio_dma *dma,
341 			  struct vfio_pfn *new)
342 {
343 	struct rb_node **link, *parent = NULL;
344 	struct vfio_pfn *vpfn;
345 
346 	link = &dma->pfn_list.rb_node;
347 	while (*link) {
348 		parent = *link;
349 		vpfn = rb_entry(parent, struct vfio_pfn, node);
350 
351 		if (new->iova < vpfn->iova)
352 			link = &(*link)->rb_left;
353 		else
354 			link = &(*link)->rb_right;
355 	}
356 
357 	rb_link_node(&new->node, parent, link);
358 	rb_insert_color(&new->node, &dma->pfn_list);
359 }
360 
361 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
362 {
363 	rb_erase(&old->node, &dma->pfn_list);
364 }
365 
366 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
367 				unsigned long pfn)
368 {
369 	struct vfio_pfn *vpfn;
370 
371 	vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
372 	if (!vpfn)
373 		return -ENOMEM;
374 
375 	vpfn->iova = iova;
376 	vpfn->pfn = pfn;
377 	vpfn->ref_count = 1;
378 	vfio_link_pfn(dma, vpfn);
379 	return 0;
380 }
381 
382 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
383 				      struct vfio_pfn *vpfn)
384 {
385 	vfio_unlink_pfn(dma, vpfn);
386 	kfree(vpfn);
387 }
388 
389 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
390 					       unsigned long iova)
391 {
392 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
393 
394 	if (vpfn)
395 		vpfn->ref_count++;
396 	return vpfn;
397 }
398 
399 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
400 {
401 	int ret = 0;
402 
403 	vpfn->ref_count--;
404 	if (!vpfn->ref_count) {
405 		ret = put_pfn(vpfn->pfn, dma->prot);
406 		vfio_remove_from_pfn_list(dma, vpfn);
407 	}
408 	return ret;
409 }
410 
411 static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
412 			bool lock_cap, long npage)
413 {
414 	int ret = mmap_write_lock_killable(mm);
415 
416 	if (ret)
417 		return ret;
418 
419 	ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
420 	mmap_write_unlock(mm);
421 	return ret;
422 }
423 
424 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
425 {
426 	struct mm_struct *mm;
427 	int ret;
428 
429 	if (!npage)
430 		return 0;
431 
432 	mm = dma->mm;
433 	if (async && !mmget_not_zero(mm))
434 		return -ESRCH; /* process exited */
435 
436 	ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
437 	if (!ret)
438 		dma->locked_vm += npage;
439 
440 	if (async)
441 		mmput(mm);
442 
443 	return ret;
444 }
445 
446 /*
447  * Some mappings aren't backed by a struct page, for example an mmap'd
448  * MMIO range for our own or another device.  These use a different
449  * pfn conversion and shouldn't be tracked as locked pages.
450  * For compound pages, any driver that sets the reserved bit in head
451  * page needs to set the reserved bit in all subpages to be safe.
452  */
453 static bool is_invalid_reserved_pfn(unsigned long pfn)
454 {
455 	if (pfn_valid(pfn))
456 		return PageReserved(pfn_to_page(pfn));
457 
458 	return true;
459 }
460 
461 static int put_pfn(unsigned long pfn, int prot)
462 {
463 	if (!is_invalid_reserved_pfn(pfn)) {
464 		struct page *page = pfn_to_page(pfn);
465 
466 		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
467 		return 1;
468 	}
469 	return 0;
470 }
471 
472 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
473 
474 static void __vfio_batch_init(struct vfio_batch *batch, bool single)
475 {
476 	batch->size = 0;
477 	batch->offset = 0;
478 
479 	if (single || unlikely(disable_hugepages))
480 		goto fallback;
481 
482 	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
483 	if (!batch->pages)
484 		goto fallback;
485 
486 	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
487 	return;
488 
489 fallback:
490 	batch->pages = &batch->fallback_page;
491 	batch->capacity = 1;
492 }
493 
494 static void vfio_batch_init(struct vfio_batch *batch)
495 {
496 	__vfio_batch_init(batch, false);
497 }
498 
499 static void vfio_batch_init_single(struct vfio_batch *batch)
500 {
501 	__vfio_batch_init(batch, true);
502 }
503 
504 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
505 {
506 	while (batch->size) {
507 		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
508 
509 		put_pfn(pfn, dma->prot);
510 		batch->offset++;
511 		batch->size--;
512 	}
513 }
514 
515 static void vfio_batch_fini(struct vfio_batch *batch)
516 {
517 	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
518 		free_page((unsigned long)batch->pages);
519 }
520 
521 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
522 			    unsigned long vaddr, unsigned long *pfn,
523 			    unsigned long *addr_mask, bool write_fault)
524 {
525 	struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
526 	int ret;
527 
528 	ret = follow_pfnmap_start(&args);
529 	if (ret) {
530 		bool unlocked = false;
531 
532 		ret = fixup_user_fault(mm, vaddr,
533 				       FAULT_FLAG_REMOTE |
534 				       (write_fault ?  FAULT_FLAG_WRITE : 0),
535 				       &unlocked);
536 		if (unlocked)
537 			return -EAGAIN;
538 
539 		if (ret)
540 			return ret;
541 
542 		ret = follow_pfnmap_start(&args);
543 		if (ret)
544 			return ret;
545 	}
546 
547 	if (write_fault && !args.writable) {
548 		ret = -EFAULT;
549 	} else {
550 		*pfn = args.pfn;
551 		*addr_mask = args.addr_mask;
552 	}
553 
554 	follow_pfnmap_end(&args);
555 	return ret;
556 }
557 
558 /*
559  * Returns the positive number of pfns successfully obtained or a negative
560  * error code.  The initial pfn is stored in the pfn arg.  For page-backed
561  * pfns, the provided batch is also updated to indicate the filled pages and
562  * initial offset.  For VM_PFNMAP pfns, only the returned number of pfns and
563  * returned initial pfn are provided; subsequent pfns are contiguous.
564  */
565 static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
566 			   unsigned long npages, int prot, unsigned long *pfn,
567 			   struct vfio_batch *batch)
568 {
569 	unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity);
570 	struct vm_area_struct *vma;
571 	unsigned int flags = 0;
572 	long ret;
573 
574 	if (prot & IOMMU_WRITE)
575 		flags |= FOLL_WRITE;
576 
577 	mmap_read_lock(mm);
578 	ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM,
579 				    batch->pages, NULL);
580 	if (ret > 0) {
581 		*pfn = page_to_pfn(batch->pages[0]);
582 		batch->size = ret;
583 		batch->offset = 0;
584 		goto done;
585 	} else if (!ret) {
586 		ret = -EFAULT;
587 	}
588 
589 	vaddr = untagged_addr_remote(mm, vaddr);
590 
591 retry:
592 	vma = vma_lookup(mm, vaddr);
593 
594 	if (vma && vma->vm_flags & VM_PFNMAP) {
595 		unsigned long addr_mask;
596 
597 		ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask,
598 				       prot & IOMMU_WRITE);
599 		if (ret == -EAGAIN)
600 			goto retry;
601 
602 		if (!ret) {
603 			if (is_invalid_reserved_pfn(*pfn)) {
604 				unsigned long epfn;
605 
606 				epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1;
607 				ret = min_t(long, npages, epfn - *pfn);
608 			} else {
609 				ret = -EFAULT;
610 			}
611 		}
612 	}
613 done:
614 	mmap_read_unlock(mm);
615 	return ret;
616 }
617 
618 /*
619  * Attempt to pin pages.  We really don't want to track all the pfns and
620  * the iommu can only map chunks of consecutive pfns anyway, so get the
621  * first page and all consecutive pages with the same locking.
622  */
623 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
624 				  unsigned long npage, unsigned long *pfn_base,
625 				  unsigned long limit, struct vfio_batch *batch)
626 {
627 	unsigned long pfn;
628 	struct mm_struct *mm = current->mm;
629 	long ret, pinned = 0, lock_acct = 0;
630 	bool rsvd;
631 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
632 
633 	/* This code path is only user initiated */
634 	if (!mm)
635 		return -ENODEV;
636 
637 	if (batch->size) {
638 		/* Leftover pages in batch from an earlier call. */
639 		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
640 		pfn = *pfn_base;
641 		rsvd = is_invalid_reserved_pfn(*pfn_base);
642 	} else {
643 		*pfn_base = 0;
644 	}
645 
646 	if (unlikely(disable_hugepages))
647 		npage = 1;
648 
649 	while (npage) {
650 		if (!batch->size) {
651 			/* Empty batch, so refill it. */
652 			ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot,
653 					     &pfn, batch);
654 			if (ret < 0)
655 				goto unpin_out;
656 
657 			if (!*pfn_base) {
658 				*pfn_base = pfn;
659 				rsvd = is_invalid_reserved_pfn(*pfn_base);
660 			}
661 
662 			/* Handle pfnmap */
663 			if (!batch->size) {
664 				if (pfn != *pfn_base + pinned || !rsvd)
665 					goto out;
666 
667 				pinned += ret;
668 				npage -= ret;
669 				vaddr += (PAGE_SIZE * ret);
670 				iova += (PAGE_SIZE * ret);
671 				continue;
672 			}
673 		}
674 
675 		/*
676 		 * pfn is preset for the first iteration of this inner loop
677 		 * due to the fact that vaddr_get_pfns() needs to provide the
678 		 * initial pfn for pfnmaps.  Therefore to reduce redundancy,
679 		 * the next pfn is fetched at the end of the loop.
680 		 * A PageReserved() page could still qualify as page backed
681 		 * and rsvd here, and therefore continues to use the batch.
682 		 */
683 		while (true) {
684 			if (pfn != *pfn_base + pinned ||
685 			    rsvd != is_invalid_reserved_pfn(pfn))
686 				goto out;
687 
688 			/*
689 			 * Reserved pages aren't counted against the user,
690 			 * externally pinned pages are already counted against
691 			 * the user.
692 			 */
693 			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
694 				if (!dma->lock_cap &&
695 				    mm->locked_vm + lock_acct + 1 > limit) {
696 					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
697 						__func__, limit << PAGE_SHIFT);
698 					ret = -ENOMEM;
699 					goto unpin_out;
700 				}
701 				lock_acct++;
702 			}
703 
704 			pinned++;
705 			npage--;
706 			vaddr += PAGE_SIZE;
707 			iova += PAGE_SIZE;
708 			batch->offset++;
709 			batch->size--;
710 
711 			if (!batch->size)
712 				break;
713 
714 			pfn = page_to_pfn(batch->pages[batch->offset]);
715 		}
716 	}
717 
718 out:
719 	ret = vfio_lock_acct(dma, lock_acct, false);
720 
721 unpin_out:
722 	if (ret < 0) {
723 		if (pinned && !rsvd) {
724 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
725 				put_pfn(pfn, dma->prot);
726 		}
727 		vfio_batch_unpin(batch, dma);
728 
729 		return ret;
730 	}
731 
732 	return pinned;
733 }
734 
735 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
736 				    unsigned long pfn, unsigned long npage,
737 				    bool do_accounting)
738 {
739 	long unlocked = 0, locked = 0;
740 	long i;
741 
742 	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
743 		if (put_pfn(pfn++, dma->prot)) {
744 			unlocked++;
745 			if (vfio_find_vpfn(dma, iova))
746 				locked++;
747 		}
748 	}
749 
750 	if (do_accounting)
751 		vfio_lock_acct(dma, locked - unlocked, true);
752 
753 	return unlocked;
754 }
755 
756 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
757 				  unsigned long *pfn_base, bool do_accounting)
758 {
759 	struct vfio_batch batch;
760 	struct mm_struct *mm;
761 	int ret;
762 
763 	mm = dma->mm;
764 	if (!mmget_not_zero(mm))
765 		return -ENODEV;
766 
767 	vfio_batch_init_single(&batch);
768 
769 	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch);
770 	if (ret != 1)
771 		goto out;
772 
773 	ret = 0;
774 
775 	if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
776 		ret = vfio_lock_acct(dma, 1, false);
777 		if (ret) {
778 			put_pfn(*pfn_base, dma->prot);
779 			if (ret == -ENOMEM)
780 				pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
781 					"(%ld) exceeded\n", __func__,
782 					dma->task->comm, task_pid_nr(dma->task),
783 					task_rlimit(dma->task, RLIMIT_MEMLOCK));
784 		}
785 	}
786 
787 out:
788 	vfio_batch_fini(&batch);
789 	mmput(mm);
790 	return ret;
791 }
792 
793 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
794 				    bool do_accounting)
795 {
796 	int unlocked;
797 	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
798 
799 	if (!vpfn)
800 		return 0;
801 
802 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
803 
804 	if (do_accounting)
805 		vfio_lock_acct(dma, -unlocked, true);
806 
807 	return unlocked;
808 }
809 
810 static int vfio_iommu_type1_pin_pages(void *iommu_data,
811 				      struct iommu_group *iommu_group,
812 				      dma_addr_t user_iova,
813 				      int npage, int prot,
814 				      struct page **pages)
815 {
816 	struct vfio_iommu *iommu = iommu_data;
817 	struct vfio_iommu_group *group;
818 	int i, j, ret;
819 	unsigned long remote_vaddr;
820 	struct vfio_dma *dma;
821 	bool do_accounting;
822 
823 	if (!iommu || !pages)
824 		return -EINVAL;
825 
826 	/* Supported for v2 version only */
827 	if (!iommu->v2)
828 		return -EACCES;
829 
830 	mutex_lock(&iommu->lock);
831 
832 	if (WARN_ONCE(iommu->vaddr_invalid_count,
833 		      "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
834 		ret = -EBUSY;
835 		goto pin_done;
836 	}
837 
838 	/* Fail if no dma_unmap callback is registered */
839 	if (list_empty(&iommu->device_list)) {
840 		ret = -EINVAL;
841 		goto pin_done;
842 	}
843 
844 	/*
845 	 * If an iommu-capable domain exists in the container then all pages are
846 	 * already pinned and accounted. Accounting should only be done if there
847 	 * is no iommu-capable domain in the container.
848 	 */
849 	do_accounting = list_empty(&iommu->domain_list);
850 
851 	for (i = 0; i < npage; i++) {
852 		unsigned long phys_pfn;
853 		dma_addr_t iova;
854 		struct vfio_pfn *vpfn;
855 
856 		iova = user_iova + PAGE_SIZE * i;
857 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
858 		if (!dma) {
859 			ret = -EINVAL;
860 			goto pin_unwind;
861 		}
862 
863 		if ((dma->prot & prot) != prot) {
864 			ret = -EPERM;
865 			goto pin_unwind;
866 		}
867 
868 		vpfn = vfio_iova_get_vfio_pfn(dma, iova);
869 		if (vpfn) {
870 			pages[i] = pfn_to_page(vpfn->pfn);
871 			continue;
872 		}
873 
874 		remote_vaddr = dma->vaddr + (iova - dma->iova);
875 		ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
876 					     do_accounting);
877 		if (ret)
878 			goto pin_unwind;
879 
880 		if (!pfn_valid(phys_pfn)) {
881 			ret = -EINVAL;
882 			goto pin_unwind;
883 		}
884 
885 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
886 		if (ret) {
887 			if (put_pfn(phys_pfn, dma->prot) && do_accounting)
888 				vfio_lock_acct(dma, -1, true);
889 			goto pin_unwind;
890 		}
891 
892 		pages[i] = pfn_to_page(phys_pfn);
893 
894 		if (iommu->dirty_page_tracking) {
895 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
896 
897 			/*
898 			 * Bitmap populated with the smallest supported page
899 			 * size
900 			 */
901 			bitmap_set(dma->bitmap,
902 				   (iova - dma->iova) >> pgshift, 1);
903 		}
904 	}
905 	ret = i;
906 
907 	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
908 	if (!group->pinned_page_dirty_scope) {
909 		group->pinned_page_dirty_scope = true;
910 		iommu->num_non_pinned_groups--;
911 	}
912 
913 	goto pin_done;
914 
915 pin_unwind:
916 	pages[i] = NULL;
917 	for (j = 0; j < i; j++) {
918 		dma_addr_t iova;
919 
920 		iova = user_iova + PAGE_SIZE * j;
921 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
922 		vfio_unpin_page_external(dma, iova, do_accounting);
923 		pages[j] = NULL;
924 	}
925 pin_done:
926 	mutex_unlock(&iommu->lock);
927 	return ret;
928 }
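
/*
 * Illustrative sketch (not part of this driver) of how the pin/unpin
 * callbacks above are reached: an emulated-IOMMU ("mdev"-style) vfio driver
 * calls the vfio core helpers, which route here through the container's
 * iommu driver ops.  The "my_state" structure, its "vdev" member, and the
 * single-page DMA are assumptions made for the example.
 *
 *	struct page *page;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my_state->vdev, iova, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	// ... device DMA backed by page_address(page) ...
 *
 *	vfio_unpin_pages(&my_state->vdev, iova, 1);
 *
 * Such a driver must also provide a ->dma_unmap() callback (see
 * vfio_notify_dma_unmap() below) and drop its pins from there.
 */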
929 
930 static void vfio_iommu_type1_unpin_pages(void *iommu_data,
931 					 dma_addr_t user_iova, int npage)
932 {
933 	struct vfio_iommu *iommu = iommu_data;
934 	bool do_accounting;
935 	int i;
936 
937 	/* Supported for v2 version only */
938 	if (WARN_ON(!iommu->v2))
939 		return;
940 
941 	mutex_lock(&iommu->lock);
942 
943 	do_accounting = list_empty(&iommu->domain_list);
944 	for (i = 0; i < npage; i++) {
945 		dma_addr_t iova = user_iova + PAGE_SIZE * i;
946 		struct vfio_dma *dma;
947 
948 		dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
949 		if (!dma)
950 			break;
951 
952 		vfio_unpin_page_external(dma, iova, do_accounting);
953 	}
954 
955 	mutex_unlock(&iommu->lock);
956 
957 	WARN_ON(i != npage);
958 }
959 
960 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
961 			    struct list_head *regions,
962 			    struct iommu_iotlb_gather *iotlb_gather)
963 {
964 	long unlocked = 0;
965 	struct vfio_regions *entry, *next;
966 
967 	iommu_iotlb_sync(domain->domain, iotlb_gather);
968 
969 	list_for_each_entry_safe(entry, next, regions, list) {
970 		unlocked += vfio_unpin_pages_remote(dma,
971 						    entry->iova,
972 						    entry->phys >> PAGE_SHIFT,
973 						    entry->len >> PAGE_SHIFT,
974 						    false);
975 		list_del(&entry->list);
976 		kfree(entry);
977 	}
978 
979 	cond_resched();
980 
981 	return unlocked;
982 }
983 
984 /*
985  * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
986  * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep track
987  * of these regions (currently using a list).
988  *
989  * This value specifies maximum number of regions for each IOTLB flush sync.
990  */
991 #define VFIO_IOMMU_TLB_SYNC_MAX		512
992 
993 static size_t unmap_unpin_fast(struct vfio_domain *domain,
994 			       struct vfio_dma *dma, dma_addr_t *iova,
995 			       size_t len, phys_addr_t phys, long *unlocked,
996 			       struct list_head *unmapped_list,
997 			       int *unmapped_cnt,
998 			       struct iommu_iotlb_gather *iotlb_gather)
999 {
1000 	size_t unmapped = 0;
1001 	struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1002 
1003 	if (entry) {
1004 		unmapped = iommu_unmap_fast(domain->domain, *iova, len,
1005 					    iotlb_gather);
1006 
1007 		if (!unmapped) {
1008 			kfree(entry);
1009 		} else {
1010 			entry->iova = *iova;
1011 			entry->phys = phys;
1012 			entry->len  = unmapped;
1013 			list_add_tail(&entry->list, unmapped_list);
1014 
1015 			*iova += unmapped;
1016 			(*unmapped_cnt)++;
1017 		}
1018 	}
1019 
1020 	/*
1021 	 * Sync if the number of fast-unmap regions hits the limit
1022 	 * or in case of errors.
1023 	 */
1024 	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
1025 		*unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
1026 					     iotlb_gather);
1027 		*unmapped_cnt = 0;
1028 	}
1029 
1030 	return unmapped;
1031 }
1032 
1033 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1034 			       struct vfio_dma *dma, dma_addr_t *iova,
1035 			       size_t len, phys_addr_t phys,
1036 			       long *unlocked)
1037 {
1038 	size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1039 
1040 	if (unmapped) {
1041 		*unlocked += vfio_unpin_pages_remote(dma, *iova,
1042 						     phys >> PAGE_SHIFT,
1043 						     unmapped >> PAGE_SHIFT,
1044 						     false);
1045 		*iova += unmapped;
1046 		cond_resched();
1047 	}
1048 	return unmapped;
1049 }
1050 
1051 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
1052 			     bool do_accounting)
1053 {
1054 	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1055 	struct vfio_domain *domain, *d;
1056 	LIST_HEAD(unmapped_region_list);
1057 	struct iommu_iotlb_gather iotlb_gather;
1058 	int unmapped_region_cnt = 0;
1059 	long unlocked = 0;
1060 
1061 	if (!dma->size)
1062 		return 0;
1063 
1064 	if (list_empty(&iommu->domain_list))
1065 		return 0;
1066 
1067 	/*
1068 	 * We use the IOMMU to track the physical addresses, otherwise we'd
1069 	 * need a much more complicated tracking system.  Unfortunately that
1070 	 * means we need to use one of the iommu domains to figure out the
1071 	 * pfns to unpin.  The rest need to be unmapped in advance so we have
1072 	 * no iommu translations remaining when the pages are unpinned.
1073 	 */
1074 	domain = d = list_first_entry(&iommu->domain_list,
1075 				      struct vfio_domain, next);
1076 
1077 	list_for_each_entry_continue(d, &iommu->domain_list, next) {
1078 		iommu_unmap(d->domain, dma->iova, dma->size);
1079 		cond_resched();
1080 	}
1081 
1082 	iommu_iotlb_gather_init(&iotlb_gather);
1083 	while (iova < end) {
1084 		size_t unmapped, len;
1085 		phys_addr_t phys, next;
1086 
1087 		phys = iommu_iova_to_phys(domain->domain, iova);
1088 		if (WARN_ON(!phys)) {
1089 			iova += PAGE_SIZE;
1090 			continue;
1091 		}
1092 
1093 		/*
1094 		 * To optimize for fewer iommu_unmap() calls, each of which
1095 		 * may require hardware cache flushing, try to find the
1096 		 * largest contiguous physical memory chunk to unmap.
1097 		 */
1098 		for (len = PAGE_SIZE;
1099 		     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
1100 			next = iommu_iova_to_phys(domain->domain, iova + len);
1101 			if (next != phys + len)
1102 				break;
1103 		}
1104 
1105 		/*
1106 		 * First, try to use fast unmap/unpin. In case of failure,
1107 		 * switch to slow unmap/unpin path.
1108 		 */
1109 		unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
1110 					    &unlocked, &unmapped_region_list,
1111 					    &unmapped_region_cnt,
1112 					    &iotlb_gather);
1113 		if (!unmapped) {
1114 			unmapped = unmap_unpin_slow(domain, dma, &iova, len,
1115 						    phys, &unlocked);
1116 			if (WARN_ON(!unmapped))
1117 				break;
1118 		}
1119 	}
1120 
1121 	dma->iommu_mapped = false;
1122 
1123 	if (unmapped_region_cnt) {
1124 		unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
1125 					    &iotlb_gather);
1126 	}
1127 
1128 	if (do_accounting) {
1129 		vfio_lock_acct(dma, -unlocked, true);
1130 		return 0;
1131 	}
1132 	return unlocked;
1133 }
1134 
1135 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
1136 {
1137 	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1138 	vfio_unmap_unpin(iommu, dma, true);
1139 	vfio_unlink_dma(iommu, dma);
1140 	put_task_struct(dma->task);
1141 	mmdrop(dma->mm);
1142 	vfio_dma_bitmap_free(dma);
1143 	if (dma->vaddr_invalid)
1144 		iommu->vaddr_invalid_count--;
1145 	kfree(dma);
1146 	iommu->dma_avail++;
1147 }
1148 
1149 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1150 {
1151 	struct vfio_domain *domain;
1152 
1153 	iommu->pgsize_bitmap = ULONG_MAX;
1154 
1155 	list_for_each_entry(domain, &iommu->domain_list, next)
1156 		iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1157 
1158 	/*
1159 	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1160 	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1161 	 * That way the user will be able to map/unmap buffers whose size/
1162 	 * start address is aligned with PAGE_SIZE. Pinning code uses that
1163 	 * granularity while the iommu driver can use sub-PAGE_SIZE sizes
1164 	 * to map the buffer.
1165 	 */
1166 	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1167 		iommu->pgsize_bitmap &= PAGE_MASK;
1168 		iommu->pgsize_bitmap |= PAGE_SIZE;
1169 	}
1170 }
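
/*
 * Example of the adjustment above (editorial illustration): on a 64K
 * PAGE_SIZE kernel whose IOMMU reports 4K | 2M | 1G (0x40201000), the
 * sub-PAGE_SIZE bit is masked off and PAGE_SIZE is advertised instead, so
 * userspace sees 64K | 2M | 1G (0x40210000) and must align map/unmap
 * requests to 64K even though the IOMMU could map at 4K granularity.
 */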
1171 
1172 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1173 			      struct vfio_dma *dma, dma_addr_t base_iova,
1174 			      size_t pgsize)
1175 {
1176 	unsigned long pgshift = __ffs(pgsize);
1177 	unsigned long nbits = dma->size >> pgshift;
1178 	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1179 	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1180 	unsigned long shift = bit_offset % BITS_PER_LONG;
1181 	unsigned long leftover;
1182 
1183 	/*
1184 	 * mark all pages dirty if any IOMMU capable device is not able
1185 	 * to report dirty pages and all pages are pinned and mapped.
1186 	 */
1187 	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1188 		bitmap_set(dma->bitmap, 0, nbits);
1189 
1190 	if (shift) {
1191 		bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1192 				  nbits + shift);
1193 
1194 		if (copy_from_user(&leftover,
1195 				   (void __user *)(bitmap + copy_offset),
1196 				   sizeof(leftover)))
1197 			return -EFAULT;
1198 
1199 		bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1200 	}
1201 
1202 	if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1203 			 DIRTY_BITMAP_BYTES(nbits + shift)))
1204 		return -EFAULT;
1205 
1206 	return 0;
1207 }
1208 
1209 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1210 				  dma_addr_t iova, size_t size, size_t pgsize)
1211 {
1212 	struct vfio_dma *dma;
1213 	struct rb_node *n;
1214 	unsigned long pgshift = __ffs(pgsize);
1215 	int ret;
1216 
1217 	/*
1218 	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1219 	 * vfio_dma mappings may be clubbed by specifying large ranges, but
1220 	 * there must not be any previous mappings bisected by the range.
1221 	 * An error will be returned if these conditions are not met.
1222 	 */
1223 	dma = vfio_find_dma(iommu, iova, 1);
1224 	if (dma && dma->iova != iova)
1225 		return -EINVAL;
1226 
1227 	dma = vfio_find_dma(iommu, iova + size - 1, 0);
1228 	if (dma && dma->iova + dma->size != iova + size)
1229 		return -EINVAL;
1230 
1231 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1232 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1233 
1234 		if (dma->iova < iova)
1235 			continue;
1236 
1237 		if (dma->iova > iova + size - 1)
1238 			break;
1239 
1240 		ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1241 		if (ret)
1242 			return ret;
1243 
1244 		/*
1245 		 * Re-populate bitmap to include all pinned pages which are
1246 		 * considered as dirty but exclude pages which are unpinned and
1247 		 * pages which are marked dirty by vfio_dma_rw()
1248 		 */
1249 		bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1250 		vfio_dma_populate_bitmap(dma, pgsize);
1251 	}
1252 	return 0;
1253 }
1254 
1255 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1256 {
1257 	if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1258 	    (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1259 		return -EINVAL;
1260 
1261 	return 0;
1262 }
1263 
1264 /*
1265  * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
1266  * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
1267  * pages in response to an invalidation.
1268  */
1269 static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
1270 				  struct vfio_dma *dma)
1271 {
1272 	struct vfio_device *device;
1273 
1274 	if (list_empty(&iommu->device_list))
1275 		return;
1276 
1277 	/*
1278 	 * The device is expected to call vfio_unpin_pages() for any IOVA it has
1279 	 * pinned within the range. Since vfio_unpin_pages() will eventually
1280 	 * call back down to this code and try to obtain the iommu->lock we must
1281 	 * drop it.
1282 	 */
1283 	mutex_lock(&iommu->device_list_lock);
1284 	mutex_unlock(&iommu->lock);
1285 
1286 	list_for_each_entry(device, &iommu->device_list, iommu_entry)
1287 		device->ops->dma_unmap(device, dma->iova, dma->size);
1288 
1289 	mutex_unlock(&iommu->device_list_lock);
1290 	mutex_lock(&iommu->lock);
1291 }
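
/*
 * Illustrative sketch (hypothetical driver code, not part of this file) of
 * the ->dma_unmap() contract relied on above; "my_state" and its pin
 * tracking helper are assumptions:
 *
 *	static void my_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length)
 *	{
 *		struct my_state *s = container_of(vdev, struct my_state, vdev);
 *
 *		// Walk the driver's own record of pinned IOVAs and release
 *		// every pin that falls inside [iova, iova + length),
 *		// ultimately via vfio_unpin_pages().
 *		my_state_unpin_range(s, iova, length);
 *	}
 */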
1292 
1293 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1294 			     struct vfio_iommu_type1_dma_unmap *unmap,
1295 			     struct vfio_bitmap *bitmap)
1296 {
1297 	struct vfio_dma *dma, *dma_last = NULL;
1298 	size_t unmapped = 0, pgsize;
1299 	int ret = -EINVAL, retries = 0;
1300 	unsigned long pgshift;
1301 	dma_addr_t iova = unmap->iova;
1302 	u64 size = unmap->size;
1303 	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1304 	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1305 	struct rb_node *n, *first_n;
1306 
1307 	mutex_lock(&iommu->lock);
1308 
1309 	/* Cannot update vaddr if mdev is present. */
1310 	if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
1311 		ret = -EBUSY;
1312 		goto unlock;
1313 	}
1314 
1315 	pgshift = __ffs(iommu->pgsize_bitmap);
1316 	pgsize = (size_t)1 << pgshift;
1317 
1318 	if (iova & (pgsize - 1))
1319 		goto unlock;
1320 
1321 	if (unmap_all) {
1322 		if (iova || size)
1323 			goto unlock;
1324 		size = U64_MAX;
1325 	} else if (!size || size & (pgsize - 1) ||
1326 		   iova + size - 1 < iova || size > SIZE_MAX) {
1327 		goto unlock;
1328 	}
1329 
1330 	/* When dirty tracking is enabled, allow only min supported pgsize */
1331 	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1332 	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1333 		goto unlock;
1334 	}
1335 
1336 	WARN_ON((pgsize - 1) & PAGE_MASK);
1337 again:
1338 	/*
1339 	 * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1340 	 * avoid tracking individual mappings.  This means that the granularity
1341 	 * of the original mapping was lost and the user was allowed to attempt
1342 	 * to unmap any range.  Depending on the contiguousness of physical
1343 	 * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1344 	 * or may not have worked.  We only guaranteed unmap granularity
1345 	 * matching the original mapping; even though it was untracked here,
1346 	 * the original mappings are reflected in IOMMU mappings.  This
1347 	 * resulted in a couple unusual behaviors.  First, if a range is not
1348 	 * able to be unmapped, ex. a set of 4k pages that was mapped as a
1349 	 * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1350 	 * a zero sized unmap.  Also, if an unmap request overlaps the first
1351 	 * address of a hugepage, the IOMMU will unmap the entire hugepage.
1352 	 * This also returns success and the returned unmap size reflects the
1353 	 * actual size unmapped.
1354 	 *
1355 	 * We attempt to maintain compatibility with this "v1" interface, but
1356 	 * we take control out of the hands of the IOMMU.  Therefore, an unmap
1357 	 * request offset from the beginning of the original mapping will
1358 	 * return success with zero sized unmap.  And an unmap request covering
1359 	 * the first iova of mapping will unmap the entire range.
1360 	 *
1361 	 * The v2 version of this interface intends to be more deterministic.
1362 	 * Unmap requests must fully cover previous mappings.  Multiple
1363 	 * mappings may still be unmapped by specifying large ranges, but there
1364 	 * must not be any previous mappings bisected by the range.  An error
1365 	 * will be returned if these conditions are not met.  The v2 interface
1366 	 * will only return success and a size of zero if there were no
1367 	 * mappings within the range.
1368 	 */
1369 	if (iommu->v2 && !unmap_all) {
1370 		dma = vfio_find_dma(iommu, iova, 1);
1371 		if (dma && dma->iova != iova)
1372 			goto unlock;
1373 
1374 		dma = vfio_find_dma(iommu, iova + size - 1, 0);
1375 		if (dma && dma->iova + dma->size != iova + size)
1376 			goto unlock;
1377 	}
1378 
1379 	ret = 0;
1380 	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
1381 
1382 	while (n) {
1383 		dma = rb_entry(n, struct vfio_dma, node);
1384 		if (dma->iova >= iova + size)
1385 			break;
1386 
1387 		if (!iommu->v2 && iova > dma->iova)
1388 			break;
1389 
1390 		if (invalidate_vaddr) {
1391 			if (dma->vaddr_invalid) {
1392 				struct rb_node *last_n = n;
1393 
1394 				for (n = first_n; n != last_n; n = rb_next(n)) {
1395 					dma = rb_entry(n,
1396 						       struct vfio_dma, node);
1397 					dma->vaddr_invalid = false;
1398 					iommu->vaddr_invalid_count--;
1399 				}
1400 				ret = -EINVAL;
1401 				unmapped = 0;
1402 				break;
1403 			}
1404 			dma->vaddr_invalid = true;
1405 			iommu->vaddr_invalid_count++;
1406 			unmapped += dma->size;
1407 			n = rb_next(n);
1408 			continue;
1409 		}
1410 
1411 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1412 			if (dma_last == dma) {
1413 				BUG_ON(++retries > 10);
1414 			} else {
1415 				dma_last = dma;
1416 				retries = 0;
1417 			}
1418 
1419 			vfio_notify_dma_unmap(iommu, dma);
1420 			goto again;
1421 		}
1422 
1423 		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1424 			ret = update_user_bitmap(bitmap->data, iommu, dma,
1425 						 iova, pgsize);
1426 			if (ret)
1427 				break;
1428 		}
1429 
1430 		unmapped += dma->size;
1431 		n = rb_next(n);
1432 		vfio_remove_dma(iommu, dma);
1433 	}
1434 
1435 unlock:
1436 	mutex_unlock(&iommu->lock);
1437 
1438 	/* Report how much was unmapped */
1439 	unmap->size = unmapped;
1440 
1441 	return ret;
1442 }
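
/*
 * Userspace view of the v2 semantics implemented above (illustrative
 * sketch; "container_fd", the IOVA and the size are assumptions):
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000000ULL,   // must start on a mapping boundary
 *		.size  = 0x40000000ULL,    // must cover whole mappings
 *	};
 *
 *	if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
 *		perror("VFIO_IOMMU_UNMAP_DMA");
 *	// On return, unmap.size reports how many bytes were actually unmapped.
 */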
1443 
1444 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1445 			  unsigned long pfn, long npage, int prot)
1446 {
1447 	struct vfio_domain *d;
1448 	int ret;
1449 
1450 	list_for_each_entry(d, &iommu->domain_list, next) {
1451 		ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1452 				npage << PAGE_SHIFT, prot | IOMMU_CACHE,
1453 				GFP_KERNEL_ACCOUNT);
1454 		if (ret)
1455 			goto unwind;
1456 
1457 		cond_resched();
1458 	}
1459 
1460 	return 0;
1461 
1462 unwind:
1463 	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1464 		iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1465 		cond_resched();
1466 	}
1467 
1468 	return ret;
1469 }
1470 
1471 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1472 			    size_t map_size)
1473 {
1474 	dma_addr_t iova = dma->iova;
1475 	unsigned long vaddr = dma->vaddr;
1476 	struct vfio_batch batch;
1477 	size_t size = map_size;
1478 	long npage;
1479 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1480 	int ret = 0;
1481 
1482 	vfio_batch_init(&batch);
1483 
1484 	while (size) {
1485 		/* Pin a contiguous chunk of memory */
1486 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1487 					      size >> PAGE_SHIFT, &pfn, limit,
1488 					      &batch);
1489 		if (npage <= 0) {
1490 			WARN_ON(!npage);
1491 			ret = (int)npage;
1492 			break;
1493 		}
1494 
1495 		/* Map it! */
1496 		ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1497 				     dma->prot);
1498 		if (ret) {
1499 			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1500 						npage, true);
1501 			vfio_batch_unpin(&batch, dma);
1502 			break;
1503 		}
1504 
1505 		size -= npage << PAGE_SHIFT;
1506 		dma->size += npage << PAGE_SHIFT;
1507 	}
1508 
1509 	vfio_batch_fini(&batch);
1510 	dma->iommu_mapped = true;
1511 
1512 	if (ret)
1513 		vfio_remove_dma(iommu, dma);
1514 
1515 	return ret;
1516 }
1517 
1518 /*
1519  * Check dma map request is within a valid iova range
1520  */
1521 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1522 				      dma_addr_t start, dma_addr_t end)
1523 {
1524 	struct list_head *iova = &iommu->iova_list;
1525 	struct vfio_iova *node;
1526 
1527 	list_for_each_entry(node, iova, list) {
1528 		if (start >= node->start && end <= node->end)
1529 			return true;
1530 	}
1531 
1532 	/*
1533 	 * Check for list_empty() as well since a container with
1534 	 * a single mdev device will have an empty list.
1535 	 */
1536 	return list_empty(iova);
1537 }
1538 
1539 static int vfio_change_dma_owner(struct vfio_dma *dma)
1540 {
1541 	struct task_struct *task = current->group_leader;
1542 	struct mm_struct *mm = current->mm;
1543 	long npage = dma->locked_vm;
1544 	bool lock_cap;
1545 	int ret;
1546 
1547 	if (mm == dma->mm)
1548 		return 0;
1549 
1550 	lock_cap = capable(CAP_IPC_LOCK);
1551 	ret = mm_lock_acct(task, mm, lock_cap, npage);
1552 	if (ret)
1553 		return ret;
1554 
1555 	if (mmget_not_zero(dma->mm)) {
1556 		mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
1557 		mmput(dma->mm);
1558 	}
1559 
1560 	if (dma->task != task) {
1561 		put_task_struct(dma->task);
1562 		dma->task = get_task_struct(task);
1563 	}
1564 	mmdrop(dma->mm);
1565 	dma->mm = mm;
1566 	mmgrab(dma->mm);
1567 	dma->lock_cap = lock_cap;
1568 	return 0;
1569 }
1570 
1571 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1572 			   struct vfio_iommu_type1_dma_map *map)
1573 {
1574 	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1575 	dma_addr_t iova = map->iova;
1576 	unsigned long vaddr = map->vaddr;
1577 	size_t size = map->size;
1578 	int ret = 0, prot = 0;
1579 	size_t pgsize;
1580 	struct vfio_dma *dma;
1581 
1582 	/* Verify that none of our __u64 fields overflow */
1583 	if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1584 		return -EINVAL;
1585 
1586 	/* READ/WRITE from device perspective */
1587 	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1588 		prot |= IOMMU_WRITE;
1589 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1590 		prot |= IOMMU_READ;
1591 
1592 	if ((prot && set_vaddr) || (!prot && !set_vaddr))
1593 		return -EINVAL;
1594 
1595 	mutex_lock(&iommu->lock);
1596 
1597 	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1598 
1599 	WARN_ON((pgsize - 1) & PAGE_MASK);
1600 
1601 	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
1602 		ret = -EINVAL;
1603 		goto out_unlock;
1604 	}
1605 
1606 	/* Don't allow IOVA or virtual address wrap */
1607 	if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1608 		ret = -EINVAL;
1609 		goto out_unlock;
1610 	}
1611 
1612 	dma = vfio_find_dma(iommu, iova, size);
1613 	if (set_vaddr) {
1614 		if (!dma) {
1615 			ret = -ENOENT;
1616 		} else if (!dma->vaddr_invalid || dma->iova != iova ||
1617 			   dma->size != size) {
1618 			ret = -EINVAL;
1619 		} else {
1620 			ret = vfio_change_dma_owner(dma);
1621 			if (ret)
1622 				goto out_unlock;
1623 			dma->vaddr = vaddr;
1624 			dma->vaddr_invalid = false;
1625 			iommu->vaddr_invalid_count--;
1626 		}
1627 		goto out_unlock;
1628 	} else if (dma) {
1629 		ret = -EEXIST;
1630 		goto out_unlock;
1631 	}
1632 
1633 	if (!iommu->dma_avail) {
1634 		ret = -ENOSPC;
1635 		goto out_unlock;
1636 	}
1637 
1638 	if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1639 		ret = -EINVAL;
1640 		goto out_unlock;
1641 	}
1642 
1643 	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1644 	if (!dma) {
1645 		ret = -ENOMEM;
1646 		goto out_unlock;
1647 	}
1648 
1649 	iommu->dma_avail--;
1650 	dma->iova = iova;
1651 	dma->vaddr = vaddr;
1652 	dma->prot = prot;
1653 
1654 	/*
1655 	 * We need to be able to both add to a task's locked memory and test
1656 	 * against the locked memory limit and we need to be able to do both
1657 	 * outside of this call path as pinning can be asynchronous via the
1658 	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1659 	 * task_struct. Save the group_leader so that all DMA tracking uses
1660 	 * the same task, to make debugging easier.  VM locked pages requires
1661 	 * an mm_struct, so grab the mm in case the task dies.
1662 	 */
1663 	get_task_struct(current->group_leader);
1664 	dma->task = current->group_leader;
1665 	dma->lock_cap = capable(CAP_IPC_LOCK);
1666 	dma->mm = current->mm;
1667 	mmgrab(dma->mm);
1668 
1669 	dma->pfn_list = RB_ROOT;
1670 
1671 	/* Insert zero-sized and grow as we map chunks of it */
1672 	vfio_link_dma(iommu, dma);
1673 
1674 	/* Don't pin and map if container doesn't contain IOMMU capable domain */
1675 	if (list_empty(&iommu->domain_list))
1676 		dma->size = size;
1677 	else
1678 		ret = vfio_pin_map_dma(iommu, dma, size);
1679 
1680 	if (!ret && iommu->dirty_page_tracking) {
1681 		ret = vfio_dma_bitmap_alloc(dma, pgsize);
1682 		if (ret)
1683 			vfio_remove_dma(iommu, dma);
1684 	}
1685 
1686 out_unlock:
1687 	mutex_unlock(&iommu->lock);
1688 	return ret;
1689 }
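
/*
 * Userspace view of the map path above (illustrative sketch; the buffer
 * size, IOVA and "container_fd" are assumptions):
 *
 *	size_t sz = 1UL << 30;
 *	void *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0x100000000ULL,
 *		.size  = sz,
 *	};
 *
 *	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
 *		perror("VFIO_IOMMU_MAP_DMA");
 */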
1690 
1691 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1692 			     struct vfio_domain *domain)
1693 {
1694 	struct vfio_batch batch;
1695 	struct vfio_domain *d = NULL;
1696 	struct rb_node *n;
1697 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1698 	int ret;
1699 
1700 	/* Arbitrarily pick the first domain in the list for lookups */
1701 	if (!list_empty(&iommu->domain_list))
1702 		d = list_first_entry(&iommu->domain_list,
1703 				     struct vfio_domain, next);
1704 
1705 	vfio_batch_init(&batch);
1706 
1707 	n = rb_first(&iommu->dma_list);
1708 
1709 	for (; n; n = rb_next(n)) {
1710 		struct vfio_dma *dma;
1711 		dma_addr_t iova;
1712 
1713 		dma = rb_entry(n, struct vfio_dma, node);
1714 		iova = dma->iova;
1715 
1716 		while (iova < dma->iova + dma->size) {
1717 			phys_addr_t phys;
1718 			size_t size;
1719 
1720 			if (dma->iommu_mapped) {
1721 				phys_addr_t p;
1722 				dma_addr_t i;
1723 
1724 				if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1725 					ret = -EINVAL;
1726 					goto unwind;
1727 				}
1728 
1729 				phys = iommu_iova_to_phys(d->domain, iova);
1730 
1731 				if (WARN_ON(!phys)) {
1732 					iova += PAGE_SIZE;
1733 					continue;
1734 				}
1735 
1736 				size = PAGE_SIZE;
1737 				p = phys + size;
1738 				i = iova + size;
1739 				while (i < dma->iova + dma->size &&
1740 				       p == iommu_iova_to_phys(d->domain, i)) {
1741 					size += PAGE_SIZE;
1742 					p += PAGE_SIZE;
1743 					i += PAGE_SIZE;
1744 				}
1745 			} else {
1746 				unsigned long pfn;
1747 				unsigned long vaddr = dma->vaddr +
1748 						     (iova - dma->iova);
1749 				size_t n = dma->iova + dma->size - iova;
1750 				long npage;
1751 
1752 				npage = vfio_pin_pages_remote(dma, vaddr,
1753 							      n >> PAGE_SHIFT,
1754 							      &pfn, limit,
1755 							      &batch);
1756 				if (npage <= 0) {
1757 					WARN_ON(!npage);
1758 					ret = (int)npage;
1759 					goto unwind;
1760 				}
1761 
1762 				phys = pfn << PAGE_SHIFT;
1763 				size = npage << PAGE_SHIFT;
1764 			}
1765 
1766 			ret = iommu_map(domain->domain, iova, phys, size,
1767 					dma->prot | IOMMU_CACHE,
1768 					GFP_KERNEL_ACCOUNT);
1769 			if (ret) {
1770 				if (!dma->iommu_mapped) {
1771 					vfio_unpin_pages_remote(dma, iova,
1772 							phys >> PAGE_SHIFT,
1773 							size >> PAGE_SHIFT,
1774 							true);
1775 					vfio_batch_unpin(&batch, dma);
1776 				}
1777 				goto unwind;
1778 			}
1779 
1780 			iova += size;
1781 		}
1782 	}
1783 
1784 	/* All dmas are now mapped, defer to second tree walk for unwind */
1785 	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1786 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1787 
1788 		dma->iommu_mapped = true;
1789 	}
1790 
1791 	vfio_batch_fini(&batch);
1792 	return 0;
1793 
1794 unwind:
1795 	for (; n; n = rb_prev(n)) {
1796 		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1797 		dma_addr_t iova;
1798 
1799 		if (dma->iommu_mapped) {
1800 			iommu_unmap(domain->domain, dma->iova, dma->size);
1801 			continue;
1802 		}
1803 
1804 		iova = dma->iova;
1805 		while (iova < dma->iova + dma->size) {
1806 			phys_addr_t phys, p;
1807 			size_t size;
1808 			dma_addr_t i;
1809 
1810 			phys = iommu_iova_to_phys(domain->domain, iova);
1811 			if (!phys) {
1812 				iova += PAGE_SIZE;
1813 				continue;
1814 			}
1815 
1816 			size = PAGE_SIZE;
1817 			p = phys + size;
1818 			i = iova + size;
1819 			while (i < dma->iova + dma->size &&
1820 			       p == iommu_iova_to_phys(domain->domain, i)) {
1821 				size += PAGE_SIZE;
1822 				p += PAGE_SIZE;
1823 				i += PAGE_SIZE;
1824 			}
1825 
1826 			iommu_unmap(domain->domain, iova, size);
1827 			vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1828 						size >> PAGE_SHIFT, true);
1829 		}
1830 	}
1831 
1832 	vfio_batch_fini(&batch);
1833 	return ret;
1834 }
1835 
1836 /*
1837  * We change our unmap behavior slightly depending on whether the IOMMU
1838  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1839  * for practically any contiguous power-of-two mapping we give it.  This means
1840  * we don't need to look for contiguous chunks ourselves to make unmapping
1841  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1842  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1843  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1844  * hugetlbfs is in use.
1845  */
1846 static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
1847 {
1848 	int ret, order = get_order(PAGE_SIZE * 2);
1849 	struct vfio_iova *region;
1850 	struct page *pages;
1851 	dma_addr_t start;
1852 
1853 	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1854 	if (!pages)
1855 		return;
1856 
1857 	list_for_each_entry(region, regions, list) {
1858 		start = ALIGN(region->start, PAGE_SIZE * 2);
1859 		if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
1860 			continue;
1861 
1862 		ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
1863 				IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
1864 				GFP_KERNEL_ACCOUNT);
1865 		if (!ret) {
1866 			size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
1867 
1868 			if (unmapped == PAGE_SIZE)
1869 				iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
1870 			else
1871 				domain->fgsp = true;
1872 		}
1873 		break;
1874 	}
1875 
1876 	__free_pages(pages, order);
1877 }
1878 
1879 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1880 						 struct iommu_group *iommu_group)
1881 {
1882 	struct vfio_iommu_group *g;
1883 
1884 	list_for_each_entry(g, &domain->group_list, next) {
1885 		if (g->iommu_group == iommu_group)
1886 			return g;
1887 	}
1888 
1889 	return NULL;
1890 }
1891 
1892 static struct vfio_iommu_group*
1893 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1894 			    struct iommu_group *iommu_group)
1895 {
1896 	struct vfio_iommu_group *group;
1897 	struct vfio_domain *domain;
1898 
1899 	list_for_each_entry(domain, &iommu->domain_list, next) {
1900 		group = find_iommu_group(domain, iommu_group);
1901 		if (group)
1902 			return group;
1903 	}
1904 
1905 	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
1906 		if (group->iommu_group == iommu_group)
1907 			return group;
1908 	return NULL;
1909 }
1910 
1911 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1912 				  phys_addr_t *base)
1913 {
1914 	struct iommu_resv_region *region;
1915 	bool ret = false;
1916 
1917 	list_for_each_entry(region, group_resv_regions, list) {
1918 		/*
1919 		 * The presence of any 'real' MSI regions should take
1920 		 * precedence over the software-managed one if the
1921 		 * IOMMU driver happens to advertise both types.
1922 		 */
1923 		if (region->type == IOMMU_RESV_MSI) {
1924 			ret = false;
1925 			break;
1926 		}
1927 
1928 		if (region->type == IOMMU_RESV_SW_MSI) {
1929 			*base = region->start;
1930 			ret = true;
1931 		}
1932 	}
1933 
1934 	return ret;
1935 }
1936 
1937 /*
1938  * This is a helper function to insert an address range into the iova list.
1939  * The list is initially created with a single entry corresponding to
1940  * the IOMMU domain geometry to which the device group is attached.
1941  * The list aperture gets modified when a new domain is added to the
1942  * container if the new aperture doesn't conflict with the current one
1943  * or with any existing dma mappings. The list is also modified to
1944  * exclude any reserved regions associated with the device group.
1945  */
1946 static int vfio_iommu_iova_insert(struct list_head *head,
1947 				  dma_addr_t start, dma_addr_t end)
1948 {
1949 	struct vfio_iova *region;
1950 
1951 	region = kmalloc(sizeof(*region), GFP_KERNEL);
1952 	if (!region)
1953 		return -ENOMEM;
1954 
1955 	INIT_LIST_HEAD(&region->list);
1956 	region->start = start;
1957 	region->end = end;
1958 
1959 	list_add_tail(&region->list, head);
1960 	return 0;
1961 }
1962 
1963 /*
1964  * Check whether the new iommu aperture conflicts with the existing
1965  * aperture or with any existing dma mappings.
1966  */
1967 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1968 				     dma_addr_t start, dma_addr_t end)
1969 {
1970 	struct vfio_iova *first, *last;
1971 	struct list_head *iova = &iommu->iova_list;
1972 
1973 	if (list_empty(iova))
1974 		return false;
1975 
1976 	/* Disjoint sets, return conflict */
1977 	first = list_first_entry(iova, struct vfio_iova, list);
1978 	last = list_last_entry(iova, struct vfio_iova, list);
1979 	if (start > last->end || end < first->start)
1980 		return true;
1981 
1982 	/* Check for any existing dma mappings below the new start */
1983 	if (start > first->start) {
1984 		if (vfio_find_dma(iommu, first->start, start - first->start))
1985 			return true;
1986 	}
1987 
1988 	/* Check for any existing dma mappings beyond the new end */
1989 	if (end < last->end) {
1990 		if (vfio_find_dma(iommu, end + 1, last->end - end))
1991 			return true;
1992 	}
1993 
1994 	return false;
1995 }
1996 
1997 /*
1998  * Resize iommu iova aperture window. This is called only if the new
1999  * aperture has no conflict with existing aperture and dma mappings.
2000  */
2001 static int vfio_iommu_aper_resize(struct list_head *iova,
2002 				  dma_addr_t start, dma_addr_t end)
2003 {
2004 	struct vfio_iova *node, *next;
2005 
2006 	if (list_empty(iova))
2007 		return vfio_iommu_iova_insert(iova, start, end);
2008 
2009 	/* Adjust iova list start */
2010 	list_for_each_entry_safe(node, next, iova, list) {
2011 		if (start < node->start)
2012 			break;
2013 		if (start >= node->start && start < node->end) {
2014 			node->start = start;
2015 			break;
2016 		}
2017 		/* Delete nodes before new start */
2018 		list_del(&node->list);
2019 		kfree(node);
2020 	}
2021 
2022 	/* Adjust iova list end */
2023 	list_for_each_entry_safe(node, next, iova, list) {
2024 		if (end > node->end)
2025 			continue;
2026 		if (end > node->start && end <= node->end) {
2027 			node->end = end;
2028 			continue;
2029 		}
2030 		/* Delete nodes after new end */
2031 		list_del(&node->list);
2032 		kfree(node);
2033 	}
2034 
2035 	return 0;
2036 }
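/*
 * Illustrative sketch (not part of the driver, hypothetical numbers): if the
 * copied iova list holds a single node [0x0, 0xffffffffffff] and a newly
 * attached domain reports an aperture of [0x1000, 0x7fffffffff], the resize
 * above trims that node in place to [0x1000, 0x7fffffffff]; any node lying
 * entirely outside the new aperture would simply be deleted.
 */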
2037 
2038 /*
2039  * Check reserved region conflicts with existing dma mappings
2040  */
2041 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2042 				     struct list_head *resv_regions)
2043 {
2044 	struct iommu_resv_region *region;
2045 
2046 	/* Check for conflict with existing dma mappings */
2047 	list_for_each_entry(region, resv_regions, list) {
2048 		if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2049 			continue;
2050 
2051 		if (vfio_find_dma(iommu, region->start, region->length))
2052 			return true;
2053 	}
2054 
2055 	return false;
2056 }
2057 
2058 /*
2059  * Check iova regions for overlap with reserved regions and exclude
2060  * any overlapping ranges from the iommu iova range
2061  */
2062 static int vfio_iommu_resv_exclude(struct list_head *iova,
2063 				   struct list_head *resv_regions)
2064 {
2065 	struct iommu_resv_region *resv;
2066 	struct vfio_iova *n, *next;
2067 
2068 	list_for_each_entry(resv, resv_regions, list) {
2069 		phys_addr_t start, end;
2070 
2071 		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2072 			continue;
2073 
2074 		start = resv->start;
2075 		end = resv->start + resv->length - 1;
2076 
2077 		list_for_each_entry_safe(n, next, iova, list) {
2078 			int ret = 0;
2079 
2080 			/* No overlap */
2081 			if (start > n->end || end < n->start)
2082 				continue;
2083 			/*
2084 			 * Insert a new node if the current node overlaps with the
2085 			 * reserved region, to exclude that range from the valid
2086 			 * iova space.  Note that the new node is inserted before
2087 			 * the current node and the current node is finally
2088 			 * deleted, keeping the list updated and sorted.
2089 			 */
2090 			if (start > n->start)
2091 				ret = vfio_iommu_iova_insert(&n->list, n->start,
2092 							     start - 1);
2093 			if (!ret && end < n->end)
2094 				ret = vfio_iommu_iova_insert(&n->list, end + 1,
2095 							     n->end);
2096 			if (ret)
2097 				return ret;
2098 
2099 			list_del(&n->list);
2100 			kfree(n);
2101 		}
2102 	}
2103 
2104 	if (list_empty(iova))
2105 		return -EINVAL;
2106 
2107 	return 0;
2108 }
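/*
 * Illustrative sketch (hypothetical numbers): with a single valid node
 * [0x0, 0xffffffff] and a reserved MSI window of [0xfee00000, 0xfeefffff],
 * the exclusion above inserts [0x0, 0xfedfffff] and [0xfef00000, 0xffffffff]
 * ahead of the original node and then deletes it, leaving two valid ranges
 * around the hole.
 */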
2109 
2110 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2111 {
2112 	struct iommu_resv_region *n, *next;
2113 
2114 	list_for_each_entry_safe(n, next, resv_regions, list) {
2115 		list_del(&n->list);
2116 		kfree(n);
2117 	}
2118 }
2119 
2120 static void vfio_iommu_iova_free(struct list_head *iova)
2121 {
2122 	struct vfio_iova *n, *next;
2123 
2124 	list_for_each_entry_safe(n, next, iova, list) {
2125 		list_del(&n->list);
2126 		kfree(n);
2127 	}
2128 }
2129 
2130 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2131 				    struct list_head *iova_copy)
2132 {
2133 	struct list_head *iova = &iommu->iova_list;
2134 	struct vfio_iova *n;
2135 	int ret;
2136 
2137 	list_for_each_entry(n, iova, list) {
2138 		ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2139 		if (ret)
2140 			goto out_free;
2141 	}
2142 
2143 	return 0;
2144 
2145 out_free:
2146 	vfio_iommu_iova_free(iova_copy);
2147 	return ret;
2148 }
2149 
2150 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2151 					struct list_head *iova_copy)
2152 {
2153 	struct list_head *iova = &iommu->iova_list;
2154 
2155 	vfio_iommu_iova_free(iova);
2156 
2157 	list_splice_tail(iova_copy, iova);
2158 }
2159 
2160 static int vfio_iommu_domain_alloc(struct device *dev, void *data)
2161 {
2162 	struct iommu_domain **domain = data;
2163 
2164 	*domain = iommu_paging_domain_alloc(dev);
2165 	return 1; /* Don't iterate */
2166 }
2167 
2168 static int vfio_iommu_type1_attach_group(void *iommu_data,
2169 		struct iommu_group *iommu_group, enum vfio_group_type type)
2170 {
2171 	struct vfio_iommu *iommu = iommu_data;
2172 	struct vfio_iommu_group *group;
2173 	struct vfio_domain *domain, *d;
2174 	bool resv_msi;
2175 	phys_addr_t resv_msi_base = 0;
2176 	struct iommu_domain_geometry *geo;
2177 	LIST_HEAD(iova_copy);
2178 	LIST_HEAD(group_resv_regions);
2179 	int ret = -EBUSY;
2180 
2181 	mutex_lock(&iommu->lock);
2182 
2183 	/* Attach could require pinning, so disallow while vaddr is invalid. */
2184 	if (iommu->vaddr_invalid_count)
2185 		goto out_unlock;
2186 
2187 	/* Check for duplicates */
2188 	ret = -EINVAL;
2189 	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
2190 		goto out_unlock;
2191 
2192 	ret = -ENOMEM;
2193 	group = kzalloc(sizeof(*group), GFP_KERNEL);
2194 	if (!group)
2195 		goto out_unlock;
2196 	group->iommu_group = iommu_group;
2197 
2198 	if (type == VFIO_EMULATED_IOMMU) {
2199 		list_add(&group->next, &iommu->emulated_iommu_groups);
2200 		/*
2201 		 * An emulated IOMMU group cannot dirty memory directly; it can
2202 		 * only use interfaces that provide dirty tracking.
2203 		 * The iommu scope can only be promoted with the addition of a
2204 		 * dirty tracking group.
2205 		 */
2206 		group->pinned_page_dirty_scope = true;
2207 		ret = 0;
2208 		goto out_unlock;
2209 	}
2210 
2211 	ret = -ENOMEM;
2212 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2213 	if (!domain)
2214 		goto out_free_group;
2215 
2216 	/*
2217 	 * Going via the iommu_group iterator avoids races, and trivially gives
2218 	 * us a representative device for the IOMMU API call. We don't actually
2219 	 * want to iterate beyond the first device (if any).
2220 	 */
2221 	iommu_group_for_each_dev(iommu_group, &domain->domain,
2222 				 vfio_iommu_domain_alloc);
2223 	if (IS_ERR(domain->domain)) {
2224 		ret = PTR_ERR(domain->domain);
2225 		goto out_free_domain;
2226 	}
2227 
2228 	ret = iommu_attach_group(domain->domain, group->iommu_group);
2229 	if (ret)
2230 		goto out_domain;
2231 
2232 	/* Get aperture info */
2233 	geo = &domain->domain->geometry;
2234 	if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
2235 				     geo->aperture_end)) {
2236 		ret = -EINVAL;
2237 		goto out_detach;
2238 	}
2239 
2240 	ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2241 	if (ret)
2242 		goto out_detach;
2243 
2244 	if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2245 		ret = -EINVAL;
2246 		goto out_detach;
2247 	}
2248 
2249 	/*
2250 	 * We don't want to work on the original iova list as the list
2251 	 * gets modified and in case of failure we have to retain the
2252 	 * original list. Get a copy here.
2253 	 */
2254 	ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2255 	if (ret)
2256 		goto out_detach;
2257 
2258 	ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
2259 				     geo->aperture_end);
2260 	if (ret)
2261 		goto out_detach;
2262 
2263 	ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2264 	if (ret)
2265 		goto out_detach;
2266 
2267 	resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2268 
2269 	INIT_LIST_HEAD(&domain->group_list);
2270 	list_add(&group->next, &domain->group_list);
2271 
2272 	if (!allow_unsafe_interrupts &&
2273 	    !iommu_group_has_isolated_msi(iommu_group)) {
2274 		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2275 		       __func__);
2276 		ret = -EPERM;
2277 		goto out_detach;
2278 	}
2279 
2280 	/*
2281 	 * If the IOMMU can block non-coherent operations (i.e. PCIe TLPs with
2282 	 * no-snoop set) then VFIO always turns this feature on because on Intel
2283 	 * platforms it optimizes KVM to disable wbinvd emulation.
2284 	 */
2285 	if (domain->domain->ops->enforce_cache_coherency)
2286 		domain->enforce_cache_coherency =
2287 			domain->domain->ops->enforce_cache_coherency(
2288 				domain->domain);
2289 
2290 	/*
2291 	 * Try to match an existing compatible domain.  We don't want to
2292 	 * preclude an IOMMU driver supporting multiple bus_types and being
2293 	 * able to include different bus_types in the same IOMMU domain, so
2294 	 * we test whether the domains use the same iommu_ops rather than
2295 	 * testing if they're on the same bus_type.
2296 	 */
2297 	list_for_each_entry(d, &iommu->domain_list, next) {
2298 		if (d->domain->ops == domain->domain->ops &&
2299 		    d->enforce_cache_coherency ==
2300 			    domain->enforce_cache_coherency) {
2301 			iommu_detach_group(domain->domain, group->iommu_group);
2302 			if (!iommu_attach_group(d->domain,
2303 						group->iommu_group)) {
2304 				list_add(&group->next, &d->group_list);
2305 				iommu_domain_free(domain->domain);
2306 				kfree(domain);
2307 				goto done;
2308 			}
2309 
2310 			ret = iommu_attach_group(domain->domain,
2311 						 group->iommu_group);
2312 			if (ret)
2313 				goto out_domain;
2314 		}
2315 	}
2316 
2317 	vfio_test_domain_fgsp(domain, &iova_copy);
2318 
2319 	/* replay mappings on new domains */
2320 	ret = vfio_iommu_replay(iommu, domain);
2321 	if (ret)
2322 		goto out_detach;
2323 
2324 	if (resv_msi) {
2325 		ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2326 		if (ret && ret != -ENODEV)
2327 			goto out_detach;
2328 	}
2329 
2330 	list_add(&domain->next, &iommu->domain_list);
2331 	vfio_update_pgsize_bitmap(iommu);
2332 done:
2333 	/* Delete the old one and insert the new iova list */
2334 	vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2335 
2336 	/*
2337 	 * An iommu backed group can dirty memory directly and therefore
2338 	 * demotes the iommu scope until it declares itself dirty tracking
2339 	 * capable via the page pinning interface.
2340 	 */
2341 	iommu->num_non_pinned_groups++;
2342 	mutex_unlock(&iommu->lock);
2343 	vfio_iommu_resv_free(&group_resv_regions);
2344 
2345 	return 0;
2346 
2347 out_detach:
2348 	iommu_detach_group(domain->domain, group->iommu_group);
2349 out_domain:
2350 	iommu_domain_free(domain->domain);
2351 	vfio_iommu_iova_free(&iova_copy);
2352 	vfio_iommu_resv_free(&group_resv_regions);
2353 out_free_domain:
2354 	kfree(domain);
2355 out_free_group:
2356 	kfree(group);
2357 out_unlock:
2358 	mutex_unlock(&iommu->lock);
2359 	return ret;
2360 }
2361 
2362 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2363 {
2364 	struct rb_node *node;
2365 
2366 	while ((node = rb_first(&iommu->dma_list)))
2367 		vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2368 }
2369 
2370 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2371 {
2372 	struct rb_node *n, *p;
2373 
2374 	n = rb_first(&iommu->dma_list);
2375 	for (; n; n = rb_next(n)) {
2376 		struct vfio_dma *dma;
2377 		long locked = 0, unlocked = 0;
2378 
2379 		dma = rb_entry(n, struct vfio_dma, node);
2380 		unlocked += vfio_unmap_unpin(iommu, dma, false);
2381 		p = rb_first(&dma->pfn_list);
2382 		for (; p; p = rb_next(p)) {
2383 			struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2384 							 node);
2385 
2386 			if (!is_invalid_reserved_pfn(vpfn->pfn))
2387 				locked++;
2388 		}
2389 		vfio_lock_acct(dma, locked - unlocked, true);
2390 	}
2391 }
2392 
2393 /*
2394  * Called when a domain is removed in detach. It is possible that
2395  * the removed domain determined the iova aperture window. Modify the
2396  * iova aperture with the smallest window among existing domains.
2397  */
2398 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2399 				   struct list_head *iova_copy)
2400 {
2401 	struct vfio_domain *domain;
2402 	struct vfio_iova *node;
2403 	dma_addr_t start = 0;
2404 	dma_addr_t end = (dma_addr_t)~0;
2405 
2406 	if (list_empty(iova_copy))
2407 		return;
2408 
2409 	list_for_each_entry(domain, &iommu->domain_list, next) {
2410 		struct iommu_domain_geometry *geo = &domain->domain->geometry;
2411 
2412 		if (geo->aperture_start > start)
2413 			start = geo->aperture_start;
2414 		if (geo->aperture_end < end)
2415 			end = geo->aperture_end;
2416 	}
2417 
2418 	/* Modify aperture limits. The new aperture is either the same or bigger */
2419 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2420 	node->start = start;
2421 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2422 	node->end = end;
2423 }
2424 
2425 /*
2426  * Called when a group is detached. The reserved regions for that
2427  * group can now become part of the valid iova space. But since reserved
2428  * regions may be duplicated among groups, populate the iova valid regions
2429  * list again.
2430  */
2431 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2432 				   struct list_head *iova_copy)
2433 {
2434 	struct vfio_domain *d;
2435 	struct vfio_iommu_group *g;
2436 	struct vfio_iova *node;
2437 	dma_addr_t start, end;
2438 	LIST_HEAD(resv_regions);
2439 	int ret;
2440 
2441 	if (list_empty(iova_copy))
2442 		return -EINVAL;
2443 
2444 	list_for_each_entry(d, &iommu->domain_list, next) {
2445 		list_for_each_entry(g, &d->group_list, next) {
2446 			ret = iommu_get_group_resv_regions(g->iommu_group,
2447 							   &resv_regions);
2448 			if (ret)
2449 				goto done;
2450 		}
2451 	}
2452 
2453 	node = list_first_entry(iova_copy, struct vfio_iova, list);
2454 	start = node->start;
2455 	node = list_last_entry(iova_copy, struct vfio_iova, list);
2456 	end = node->end;
2457 
2458 	/* purge the iova list and create a new one */
2459 	vfio_iommu_iova_free(iova_copy);
2460 
2461 	ret = vfio_iommu_aper_resize(iova_copy, start, end);
2462 	if (ret)
2463 		goto done;
2464 
2465 	/* Exclude current reserved regions from iova ranges */
2466 	ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2467 done:
2468 	vfio_iommu_resv_free(&resv_regions);
2469 	return ret;
2470 }
2471 
2472 static void vfio_iommu_type1_detach_group(void *iommu_data,
2473 					  struct iommu_group *iommu_group)
2474 {
2475 	struct vfio_iommu *iommu = iommu_data;
2476 	struct vfio_domain *domain;
2477 	struct vfio_iommu_group *group;
2478 	bool update_dirty_scope = false;
2479 	LIST_HEAD(iova_copy);
2480 
2481 	mutex_lock(&iommu->lock);
2482 	list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
2483 		if (group->iommu_group != iommu_group)
2484 			continue;
2485 		update_dirty_scope = !group->pinned_page_dirty_scope;
2486 		list_del(&group->next);
2487 		kfree(group);
2488 
2489 		if (list_empty(&iommu->emulated_iommu_groups) &&
2490 		    list_empty(&iommu->domain_list)) {
2491 			WARN_ON(!list_empty(&iommu->device_list));
2492 			vfio_iommu_unmap_unpin_all(iommu);
2493 		}
2494 		goto detach_group_done;
2495 	}
2496 
2497 	/*
2498 	 * Get a copy of the iova list. This will be used to update
2499 	 * and to replace the current one later. Please note that
2500 	 * we will leave the original list as it is if the update fails.
2501 	 */
2502 	vfio_iommu_iova_get_copy(iommu, &iova_copy);
2503 
2504 	list_for_each_entry(domain, &iommu->domain_list, next) {
2505 		group = find_iommu_group(domain, iommu_group);
2506 		if (!group)
2507 			continue;
2508 
2509 		iommu_detach_group(domain->domain, group->iommu_group);
2510 		update_dirty_scope = !group->pinned_page_dirty_scope;
2511 		list_del(&group->next);
2512 		kfree(group);
2513 		/*
2514 		 * Group ownership provides privilege; if the group list is
2515 		 * empty, the domain goes away. If it's the last domain with an
2516 		 * iommu and no emulated IOMMU groups exist, then all the
2517 		 * mappings go away too. If it's the last domain with an iommu
2518 		 * and emulated IOMMU groups do exist, update accounting.
2519 		 */
2520 		if (list_empty(&domain->group_list)) {
2521 			if (list_is_singular(&iommu->domain_list)) {
2522 				if (list_empty(&iommu->emulated_iommu_groups)) {
2523 					WARN_ON(!list_empty(
2524 						&iommu->device_list));
2525 					vfio_iommu_unmap_unpin_all(iommu);
2526 				} else {
2527 					vfio_iommu_unmap_unpin_reaccount(iommu);
2528 				}
2529 			}
2530 			iommu_domain_free(domain->domain);
2531 			list_del(&domain->next);
2532 			kfree(domain);
2533 			vfio_iommu_aper_expand(iommu, &iova_copy);
2534 			vfio_update_pgsize_bitmap(iommu);
2535 		}
2536 		break;
2537 	}
2538 
2539 	if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2540 		vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2541 	else
2542 		vfio_iommu_iova_free(&iova_copy);
2543 
2544 detach_group_done:
2545 	/*
2546 	 * Removal of a group without dirty tracking may allow the iommu scope
2547 	 * to be promoted.
2548 	 */
2549 	if (update_dirty_scope) {
2550 		iommu->num_non_pinned_groups--;
2551 		if (iommu->dirty_page_tracking)
2552 			vfio_iommu_populate_bitmap_full(iommu);
2553 	}
2554 	mutex_unlock(&iommu->lock);
2555 }
2556 
2557 static void *vfio_iommu_type1_open(unsigned long arg)
2558 {
2559 	struct vfio_iommu *iommu;
2560 
2561 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2562 	if (!iommu)
2563 		return ERR_PTR(-ENOMEM);
2564 
2565 	switch (arg) {
2566 	case VFIO_TYPE1_IOMMU:
2567 		break;
2568 	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
2569 	case VFIO_TYPE1v2_IOMMU:
2570 		iommu->v2 = true;
2571 		break;
2572 	default:
2573 		kfree(iommu);
2574 		return ERR_PTR(-EINVAL);
2575 	}
2576 
2577 	INIT_LIST_HEAD(&iommu->domain_list);
2578 	INIT_LIST_HEAD(&iommu->iova_list);
2579 	iommu->dma_list = RB_ROOT;
2580 	iommu->dma_avail = dma_entry_limit;
2581 	mutex_init(&iommu->lock);
2582 	mutex_init(&iommu->device_list_lock);
2583 	INIT_LIST_HEAD(&iommu->device_list);
2584 	iommu->pgsize_bitmap = PAGE_MASK;
2585 	INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
2586 
2587 	return iommu;
2588 }
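/*
 * Illustrative sketch (userspace, not kernel code): the open above runs when a
 * container selects this backend, roughly:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);	// hypothetical group number
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 * The VFIO_SET_IOMMU argument is what arrives here as "arg".
 */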
2589 
2590 static void vfio_release_domain(struct vfio_domain *domain)
2591 {
2592 	struct vfio_iommu_group *group, *group_tmp;
2593 
2594 	list_for_each_entry_safe(group, group_tmp,
2595 				 &domain->group_list, next) {
2596 		iommu_detach_group(domain->domain, group->iommu_group);
2597 		list_del(&group->next);
2598 		kfree(group);
2599 	}
2600 
2601 	iommu_domain_free(domain->domain);
2602 }
2603 
2604 static void vfio_iommu_type1_release(void *iommu_data)
2605 {
2606 	struct vfio_iommu *iommu = iommu_data;
2607 	struct vfio_domain *domain, *domain_tmp;
2608 	struct vfio_iommu_group *group, *next_group;
2609 
2610 	list_for_each_entry_safe(group, next_group,
2611 			&iommu->emulated_iommu_groups, next) {
2612 		list_del(&group->next);
2613 		kfree(group);
2614 	}
2615 
2616 	vfio_iommu_unmap_unpin_all(iommu);
2617 
2618 	list_for_each_entry_safe(domain, domain_tmp,
2619 				 &iommu->domain_list, next) {
2620 		vfio_release_domain(domain);
2621 		list_del(&domain->next);
2622 		kfree(domain);
2623 	}
2624 
2625 	vfio_iommu_iova_free(&iommu->iova_list);
2626 
2627 	kfree(iommu);
2628 }
2629 
2630 static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
2631 {
2632 	struct vfio_domain *domain;
2633 	int ret = 1;
2634 
2635 	mutex_lock(&iommu->lock);
2636 	list_for_each_entry(domain, &iommu->domain_list, next) {
2637 		if (!(domain->enforce_cache_coherency)) {
2638 			ret = 0;
2639 			break;
2640 		}
2641 	}
2642 	mutex_unlock(&iommu->lock);
2643 
2644 	return ret;
2645 }
2646 
2647 static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
2648 {
2649 	bool ret;
2650 
2651 	mutex_lock(&iommu->lock);
2652 	ret = !list_empty(&iommu->emulated_iommu_groups);
2653 	mutex_unlock(&iommu->lock);
2654 	return ret;
2655 }
2656 
2657 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2658 					    unsigned long arg)
2659 {
2660 	switch (arg) {
2661 	case VFIO_TYPE1_IOMMU:
2662 	case VFIO_TYPE1v2_IOMMU:
2663 	case VFIO_UNMAP_ALL:
2664 		return 1;
2665 	case VFIO_UPDATE_VADDR:
2666 		/*
2667 		 * Disable this feature if mdevs are present.  They cannot
2668 		 * safely pin/unpin/rw while vaddrs are being updated.
2669 		 */
2670 		return iommu && !vfio_iommu_has_emulated(iommu);
2671 	case VFIO_DMA_CC_IOMMU:
2672 		if (!iommu)
2673 			return 0;
2674 		return vfio_domains_have_enforce_cache_coherency(iommu);
2675 	default:
2676 		return 0;
2677 	}
2678 }
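/*
 * Illustrative sketch (userspace, not kernel code): callers typically probe
 * with VFIO_CHECK_EXTENSION before selecting a backend, e.g.
 *
 *	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) == 1)
 *		ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 * where "container" is assumed to be an open /dev/vfio/vfio fd with at least
 * one group attached.
 */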
2679 
2680 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2681 		 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2682 		 size_t size)
2683 {
2684 	struct vfio_info_cap_header *header;
2685 	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2686 
2687 	header = vfio_info_cap_add(caps, size,
2688 				   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2689 	if (IS_ERR(header))
2690 		return PTR_ERR(header);
2691 
2692 	iova_cap = container_of(header,
2693 				struct vfio_iommu_type1_info_cap_iova_range,
2694 				header);
2695 	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2696 	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2697 	       cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2698 	return 0;
2699 }
2700 
2701 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2702 				      struct vfio_info_cap *caps)
2703 {
2704 	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2705 	struct vfio_iova *iova;
2706 	size_t size;
2707 	int iovas = 0, i = 0, ret;
2708 
2709 	list_for_each_entry(iova, &iommu->iova_list, list)
2710 		iovas++;
2711 
2712 	if (!iovas) {
2713 		/*
2714 		 * Return 0 as a container with a single mdev device
2715 		 * will have an empty list
2716 		 */
2717 		return 0;
2718 	}
2719 
2720 	size = struct_size(cap_iovas, iova_ranges, iovas);
2721 
2722 	cap_iovas = kzalloc(size, GFP_KERNEL);
2723 	if (!cap_iovas)
2724 		return -ENOMEM;
2725 
2726 	cap_iovas->nr_iovas = iovas;
2727 
2728 	list_for_each_entry(iova, &iommu->iova_list, list) {
2729 		cap_iovas->iova_ranges[i].start = iova->start;
2730 		cap_iovas->iova_ranges[i].end = iova->end;
2731 		i++;
2732 	}
2733 
2734 	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2735 
2736 	kfree(cap_iovas);
2737 	return ret;
2738 }
2739 
2740 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2741 					   struct vfio_info_cap *caps)
2742 {
2743 	struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2744 
2745 	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2746 	cap_mig.header.version = 1;
2747 
2748 	cap_mig.flags = 0;
2749 	/* support minimum pgsize */
2750 	cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2751 	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2752 
2753 	return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2754 }
2755 
2756 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2757 					   struct vfio_info_cap *caps)
2758 {
2759 	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2760 
2761 	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2762 	cap_dma_avail.header.version = 1;
2763 
2764 	cap_dma_avail.avail = iommu->dma_avail;
2765 
2766 	return vfio_info_add_capability(caps, &cap_dma_avail.header,
2767 					sizeof(cap_dma_avail));
2768 }
2769 
2770 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2771 				     unsigned long arg)
2772 {
2773 	struct vfio_iommu_type1_info info = {};
2774 	unsigned long minsz;
2775 	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2776 	int ret;
2777 
2778 	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2779 
2780 	if (copy_from_user(&info, (void __user *)arg, minsz))
2781 		return -EFAULT;
2782 
2783 	if (info.argsz < minsz)
2784 		return -EINVAL;
2785 
2786 	minsz = min_t(size_t, info.argsz, sizeof(info));
2787 
2788 	mutex_lock(&iommu->lock);
2789 	info.flags = VFIO_IOMMU_INFO_PGSIZES;
2790 
2791 	info.iova_pgsizes = iommu->pgsize_bitmap;
2792 
2793 	ret = vfio_iommu_migration_build_caps(iommu, &caps);
2794 
2795 	if (!ret)
2796 		ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2797 
2798 	if (!ret)
2799 		ret = vfio_iommu_iova_build_caps(iommu, &caps);
2800 
2801 	mutex_unlock(&iommu->lock);
2802 
2803 	if (ret)
2804 		return ret;
2805 
2806 	if (caps.size) {
2807 		info.flags |= VFIO_IOMMU_INFO_CAPS;
2808 
2809 		if (info.argsz < sizeof(info) + caps.size) {
2810 			info.argsz = sizeof(info) + caps.size;
2811 		} else {
2812 			vfio_info_cap_shift(&caps, sizeof(info));
2813 			if (copy_to_user((void __user *)arg +
2814 					sizeof(info), caps.buf,
2815 					caps.size)) {
2816 				kfree(caps.buf);
2817 				return -EFAULT;
2818 			}
2819 			info.cap_offset = sizeof(info);
2820 		}
2821 
2822 		kfree(caps.buf);
2823 	}
2824 
2825 	return copy_to_user((void __user *)arg, &info, minsz) ?
2826 			-EFAULT : 0;
2827 }
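/*
 * Illustrative sketch (userspace, not kernel code) of consuming the info
 * ioctl above, assuming "container" is a configured /dev/vfio/vfio fd:
 *
 *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
 *
 *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);
 *
 * info.iova_pgsizes then holds the supported page-size bitmap.  If
 * VFIO_IOMMU_INFO_CAPS is set and info.argsz was enlarged, userspace is
 * expected to re-issue the ioctl with a buffer of info.argsz bytes and walk
 * the capability chain starting at info.cap_offset to find the IOVA range,
 * migration and DMA-avail capabilities built above.
 */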
2828 
2829 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2830 				    unsigned long arg)
2831 {
2832 	struct vfio_iommu_type1_dma_map map;
2833 	unsigned long minsz;
2834 	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2835 			VFIO_DMA_MAP_FLAG_VADDR;
2836 
2837 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2838 
2839 	if (copy_from_user(&map, (void __user *)arg, minsz))
2840 		return -EFAULT;
2841 
2842 	if (map.argsz < minsz || map.flags & ~mask)
2843 		return -EINVAL;
2844 
2845 	return vfio_dma_do_map(iommu, &map);
2846 }
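/*
 * Illustrative sketch (userspace, not kernel code) of the map ioctl handled
 * above; the iova/size values are hypothetical and "buf" is assumed to be a
 * page-aligned allocation of at least map.size bytes:
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (uintptr_t)buf,
 *		.iova  = 0x100000,
 *		.size  = 0x100000,
 *	};
 *
 *	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
 *		perror("VFIO_IOMMU_MAP_DMA");
 */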
2847 
2848 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2849 				      unsigned long arg)
2850 {
2851 	struct vfio_iommu_type1_dma_unmap unmap;
2852 	struct vfio_bitmap bitmap = { 0 };
2853 	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2854 			VFIO_DMA_UNMAP_FLAG_VADDR |
2855 			VFIO_DMA_UNMAP_FLAG_ALL;
2856 	unsigned long minsz;
2857 	int ret;
2858 
2859 	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2860 
2861 	if (copy_from_user(&unmap, (void __user *)arg, minsz))
2862 		return -EFAULT;
2863 
2864 	if (unmap.argsz < minsz || unmap.flags & ~mask)
2865 		return -EINVAL;
2866 
2867 	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2868 	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
2869 			    VFIO_DMA_UNMAP_FLAG_VADDR)))
2870 		return -EINVAL;
2871 
2872 	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2873 		unsigned long pgshift;
2874 
2875 		if (unmap.argsz < (minsz + sizeof(bitmap)))
2876 			return -EINVAL;
2877 
2878 		if (copy_from_user(&bitmap,
2879 				   (void __user *)(arg + minsz),
2880 				   sizeof(bitmap)))
2881 			return -EFAULT;
2882 
2883 		if (!access_ok((void __user *)bitmap.data, bitmap.size))
2884 			return -EINVAL;
2885 
2886 		pgshift = __ffs(bitmap.pgsize);
2887 		ret = verify_bitmap_size(unmap.size >> pgshift,
2888 					 bitmap.size);
2889 		if (ret)
2890 			return ret;
2891 	}
2892 
2893 	ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2894 	if (ret)
2895 		return ret;
2896 
2897 	return copy_to_user((void __user *)arg, &unmap, minsz) ?
2898 			-EFAULT : 0;
2899 }
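/*
 * Illustrative sketch (userspace, not kernel code) matching the map example
 * above; on success the kernel writes the number of bytes actually unmapped
 * back into unmap.size:
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = 0x100000,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */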
2900 
2901 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2902 					unsigned long arg)
2903 {
2904 	struct vfio_iommu_type1_dirty_bitmap dirty;
2905 	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2906 			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2907 			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2908 	unsigned long minsz;
2909 	int ret = 0;
2910 
2911 	if (!iommu->v2)
2912 		return -EACCES;
2913 
2914 	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2915 
2916 	if (copy_from_user(&dirty, (void __user *)arg, minsz))
2917 		return -EFAULT;
2918 
2919 	if (dirty.argsz < minsz || dirty.flags & ~mask)
2920 		return -EINVAL;
2921 
2922 	/* only one flag should be set at a time */
2923 	if (__ffs(dirty.flags) != __fls(dirty.flags))
2924 		return -EINVAL;
2925 
2926 	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2927 		size_t pgsize;
2928 
2929 		mutex_lock(&iommu->lock);
2930 		pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2931 		if (!iommu->dirty_page_tracking) {
2932 			ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2933 			if (!ret)
2934 				iommu->dirty_page_tracking = true;
2935 		}
2936 		mutex_unlock(&iommu->lock);
2937 		return ret;
2938 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2939 		mutex_lock(&iommu->lock);
2940 		if (iommu->dirty_page_tracking) {
2941 			iommu->dirty_page_tracking = false;
2942 			vfio_dma_bitmap_free_all(iommu);
2943 		}
2944 		mutex_unlock(&iommu->lock);
2945 		return 0;
2946 	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2947 		struct vfio_iommu_type1_dirty_bitmap_get range;
2948 		unsigned long pgshift;
2949 		size_t data_size = dirty.argsz - minsz;
2950 		size_t iommu_pgsize;
2951 
2952 		if (!data_size || data_size < sizeof(range))
2953 			return -EINVAL;
2954 
2955 		if (copy_from_user(&range, (void __user *)(arg + minsz),
2956 				   sizeof(range)))
2957 			return -EFAULT;
2958 
2959 		if (range.iova + range.size < range.iova)
2960 			return -EINVAL;
2961 		if (!access_ok((void __user *)range.bitmap.data,
2962 			       range.bitmap.size))
2963 			return -EINVAL;
2964 
2965 		pgshift = __ffs(range.bitmap.pgsize);
2966 		ret = verify_bitmap_size(range.size >> pgshift,
2967 					 range.bitmap.size);
2968 		if (ret)
2969 			return ret;
2970 
2971 		mutex_lock(&iommu->lock);
2972 
2973 		iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2974 
2975 		/* allow only smallest supported pgsize */
2976 		if (range.bitmap.pgsize != iommu_pgsize) {
2977 			ret = -EINVAL;
2978 			goto out_unlock;
2979 		}
2980 		if (range.iova & (iommu_pgsize - 1)) {
2981 			ret = -EINVAL;
2982 			goto out_unlock;
2983 		}
2984 		if (!range.size || range.size & (iommu_pgsize - 1)) {
2985 			ret = -EINVAL;
2986 			goto out_unlock;
2987 		}
2988 
2989 		if (iommu->dirty_page_tracking)
2990 			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2991 						     iommu, range.iova,
2992 						     range.size,
2993 						     range.bitmap.pgsize);
2994 		else
2995 			ret = -EINVAL;
2996 out_unlock:
2997 		mutex_unlock(&iommu->lock);
2998 
2999 		return ret;
3000 	}
3001 
3002 	return -EINVAL;
3003 }
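/*
 * Illustrative sketch (userspace, not kernel code): starting or stopping dirty
 * tracking only needs the header, e.g.
 *
 *	struct vfio_iommu_type1_dirty_bitmap dirty = {
 *		.argsz = sizeof(dirty),
 *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &dirty);
 *
 * For VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP, a
 * struct vfio_iommu_type1_dirty_bitmap_get must follow the header (with argsz
 * covering both), bitmap.pgsize must equal the smallest supported page size
 * and bitmap.data must point at a user buffer of bitmap.size bytes.
 */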
3004 
3005 static long vfio_iommu_type1_ioctl(void *iommu_data,
3006 				   unsigned int cmd, unsigned long arg)
3007 {
3008 	struct vfio_iommu *iommu = iommu_data;
3009 
3010 	switch (cmd) {
3011 	case VFIO_CHECK_EXTENSION:
3012 		return vfio_iommu_type1_check_extension(iommu, arg);
3013 	case VFIO_IOMMU_GET_INFO:
3014 		return vfio_iommu_type1_get_info(iommu, arg);
3015 	case VFIO_IOMMU_MAP_DMA:
3016 		return vfio_iommu_type1_map_dma(iommu, arg);
3017 	case VFIO_IOMMU_UNMAP_DMA:
3018 		return vfio_iommu_type1_unmap_dma(iommu, arg);
3019 	case VFIO_IOMMU_DIRTY_PAGES:
3020 		return vfio_iommu_type1_dirty_pages(iommu, arg);
3021 	default:
3022 		return -ENOTTY;
3023 	}
3024 }
3025 
3026 static void vfio_iommu_type1_register_device(void *iommu_data,
3027 					     struct vfio_device *vdev)
3028 {
3029 	struct vfio_iommu *iommu = iommu_data;
3030 
3031 	if (!vdev->ops->dma_unmap)
3032 		return;
3033 
3034 	/*
3035 	 * list_empty(&iommu->device_list) is tested under the iommu->lock while
3036 	 * iteration for dma_unmap must be done under the device_list_lock.
3037 	 * Holding both locks here allows avoiding the device_list_lock in
3038 	 * several fast paths. See vfio_notify_dma_unmap()
3039 	 */
3040 	mutex_lock(&iommu->lock);
3041 	mutex_lock(&iommu->device_list_lock);
3042 	list_add(&vdev->iommu_entry, &iommu->device_list);
3043 	mutex_unlock(&iommu->device_list_lock);
3044 	mutex_unlock(&iommu->lock);
3045 }
3046 
3047 static void vfio_iommu_type1_unregister_device(void *iommu_data,
3048 					       struct vfio_device *vdev)
3049 {
3050 	struct vfio_iommu *iommu = iommu_data;
3051 
3052 	if (!vdev->ops->dma_unmap)
3053 		return;
3054 
3055 	mutex_lock(&iommu->lock);
3056 	mutex_lock(&iommu->device_list_lock);
3057 	list_del(&vdev->iommu_entry);
3058 	mutex_unlock(&iommu->device_list_lock);
3059 	mutex_unlock(&iommu->lock);
3060 }
3061 
3062 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3063 					 dma_addr_t user_iova, void *data,
3064 					 size_t count, bool write,
3065 					 size_t *copied)
3066 {
3067 	struct mm_struct *mm;
3068 	unsigned long vaddr;
3069 	struct vfio_dma *dma;
3070 	bool kthread = current->mm == NULL;
3071 	size_t offset;
3072 
3073 	*copied = 0;
3074 
3075 	dma = vfio_find_dma(iommu, user_iova, 1);
3076 	if (!dma)
3077 		return -EINVAL;
3078 
3079 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
3080 			!(dma->prot & IOMMU_READ))
3081 		return -EPERM;
3082 
3083 	mm = dma->mm;
3084 	if (!mmget_not_zero(mm))
3085 		return -EPERM;
3086 
3087 	if (kthread)
3088 		kthread_use_mm(mm);
3089 	else if (current->mm != mm)
3090 		goto out;
3091 
3092 	offset = user_iova - dma->iova;
3093 
3094 	if (count > dma->size - offset)
3095 		count = dma->size - offset;
3096 
3097 	vaddr = dma->vaddr + offset;
3098 
3099 	if (write) {
3100 		*copied = copy_to_user((void __user *)vaddr, data,
3101 					 count) ? 0 : count;
3102 		if (*copied && iommu->dirty_page_tracking) {
3103 			unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3104 			/*
3105 			 * Bitmap populated with the smallest supported page
3106 			 * size
3107 			 */
3108 			bitmap_set(dma->bitmap, offset >> pgshift,
3109 				   ((offset + *copied - 1) >> pgshift) -
3110 				   (offset >> pgshift) + 1);
3111 		}
3112 	} else
3113 		*copied = copy_from_user(data, (void __user *)vaddr,
3114 					   count) ? 0 : count;
3115 	if (kthread)
3116 		kthread_unuse_mm(mm);
3117 out:
3118 	mmput(mm);
3119 	return *copied ? 0 : -EFAULT;
3120 }
3121 
3122 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3123 				   void *data, size_t count, bool write)
3124 {
3125 	struct vfio_iommu *iommu = iommu_data;
3126 	int ret = 0;
3127 	size_t done;
3128 
3129 	mutex_lock(&iommu->lock);
3130 
3131 	if (WARN_ONCE(iommu->vaddr_invalid_count,
3132 		      "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
3133 		ret = -EBUSY;
3134 		goto out;
3135 	}
3136 
3137 	while (count > 0) {
3138 		ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3139 						    count, write, &done);
3140 		if (ret)
3141 			break;
3142 
3143 		count -= done;
3144 		data += done;
3145 		user_iova += done;
3146 	}
3147 
3148 out:
3149 	mutex_unlock(&iommu->lock);
3150 	return ret;
3151 }
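/*
 * Note: in-kernel users are not expected to call the op above directly;
 * emulated/mdev drivers typically reach it through the exported vfio_dma_rw()
 * helper, which resolves the container from a struct vfio_device before
 * invoking ->dma_rw().
 */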
3152 
3153 static struct iommu_domain *
3154 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3155 				    struct iommu_group *iommu_group)
3156 {
3157 	struct iommu_domain *domain = ERR_PTR(-ENODEV);
3158 	struct vfio_iommu *iommu = iommu_data;
3159 	struct vfio_domain *d;
3160 
3161 	if (!iommu || !iommu_group)
3162 		return ERR_PTR(-EINVAL);
3163 
3164 	mutex_lock(&iommu->lock);
3165 	list_for_each_entry(d, &iommu->domain_list, next) {
3166 		if (find_iommu_group(d, iommu_group)) {
3167 			domain = d->domain;
3168 			break;
3169 		}
3170 	}
3171 	mutex_unlock(&iommu->lock);
3172 
3173 	return domain;
3174 }
3175 
3176 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3177 	.name			= "vfio-iommu-type1",
3178 	.owner			= THIS_MODULE,
3179 	.open			= vfio_iommu_type1_open,
3180 	.release		= vfio_iommu_type1_release,
3181 	.ioctl			= vfio_iommu_type1_ioctl,
3182 	.attach_group		= vfio_iommu_type1_attach_group,
3183 	.detach_group		= vfio_iommu_type1_detach_group,
3184 	.pin_pages		= vfio_iommu_type1_pin_pages,
3185 	.unpin_pages		= vfio_iommu_type1_unpin_pages,
3186 	.register_device	= vfio_iommu_type1_register_device,
3187 	.unregister_device	= vfio_iommu_type1_unregister_device,
3188 	.dma_rw			= vfio_iommu_type1_dma_rw,
3189 	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
3190 };
3191 
3192 static int __init vfio_iommu_type1_init(void)
3193 {
3194 	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3195 }
3196 
3197 static void __exit vfio_iommu_type1_cleanup(void)
3198 {
3199 	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3200 }
3201 
3202 module_init(vfio_iommu_type1_init);
3203 module_exit(vfio_iommu_type1_cleanup);
3204 
3205 MODULE_VERSION(DRIVER_VERSION);
3206 MODULE_LICENSE("GPL v2");
3207 MODULE_AUTHOR(DRIVER_AUTHOR);
3208 MODULE_DESCRIPTION(DRIVER_DESC);
3209