xref: /linux/drivers/vfio/vfio_main.c (revision f0bf3eac92b2be5f34b944cb82f1c23db642c7f5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12 
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #if IS_ENABLED(CONFIG_KVM)
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mount.h>
26 #include <linux/mutex.h>
27 #include <linux/pci.h>
28 #include <linux/pseudo_fs.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/seq_file.h>
32 #include <linux/slab.h>
33 #include <linux/stat.h>
34 #include <linux/string.h>
35 #include <linux/uaccess.h>
36 #include <linux/vfio.h>
37 #include <linux/wait.h>
38 #include <linux/sched/signal.h>
39 #include <linux/pm_runtime.h>
40 #include <linux/interval_tree.h>
41 #include <linux/iova_bitmap.h>
42 #include <linux/iommufd.h>
43 #include "vfio.h"
44 
/* Module identification strings reported via MODULE_* macros below */
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

/* Superblock magic for the vfio pseudo filesystem, ASCII "VFIO" */
#define VFIO_MAGIC 0x5646494f /* "VFIO" */
50 
/*
 * Global VFIO core state: the device class, the index allocator used to
 * name "vfio%d" devices, and the pinned pseudo-filesystem providing
 * anonymous inodes for devices.
 */
static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
	struct vfsmount			*vfs_mount;
	int				fs_count;
} vfio;
57 
#ifdef CONFIG_VFIO_NOIOMMU
/* Global opt-in for unsafe no-IOMMU mode; runtime-writable by root (S_IWUSR) */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
64 
/* Maps set_id to its singleton vfio_device_set, see vfio_assign_device_set() */
static DEFINE_XARRAY(vfio_device_set_xa);
66 
vfio_assign_device_set(struct vfio_device * device,void * set_id)67 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
68 {
69 	unsigned long idx = (unsigned long)set_id;
70 	struct vfio_device_set *new_dev_set;
71 	struct vfio_device_set *dev_set;
72 
73 	if (WARN_ON(!set_id))
74 		return -EINVAL;
75 
76 	/*
77 	 * Atomically acquire a singleton object in the xarray for this set_id
78 	 */
79 	xa_lock(&vfio_device_set_xa);
80 	dev_set = xa_load(&vfio_device_set_xa, idx);
81 	if (dev_set)
82 		goto found_get_ref;
83 	xa_unlock(&vfio_device_set_xa);
84 
85 	new_dev_set = kzalloc_obj(*new_dev_set);
86 	if (!new_dev_set)
87 		return -ENOMEM;
88 	mutex_init(&new_dev_set->lock);
89 	INIT_LIST_HEAD(&new_dev_set->device_list);
90 	new_dev_set->set_id = set_id;
91 
92 	xa_lock(&vfio_device_set_xa);
93 	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
94 			       GFP_KERNEL);
95 	if (!dev_set) {
96 		dev_set = new_dev_set;
97 		goto found_get_ref;
98 	}
99 
100 	kfree(new_dev_set);
101 	if (xa_is_err(dev_set)) {
102 		xa_unlock(&vfio_device_set_xa);
103 		return xa_err(dev_set);
104 	}
105 
106 found_get_ref:
107 	dev_set->device_count++;
108 	xa_unlock(&vfio_device_set_xa);
109 	mutex_lock(&dev_set->lock);
110 	device->dev_set = dev_set;
111 	list_add_tail(&device->dev_set_list, &dev_set->device_list);
112 	mutex_unlock(&dev_set->lock);
113 	return 0;
114 }
115 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
116 
/* Remove @device from its set; the set is freed when the last member leaves. */
static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	/* Tolerates devices that never had a set assigned */
	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	/* device_count is protected by the xarray lock, see vfio_assign_device_set() */
	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}
137 
vfio_device_set_open_count(struct vfio_device_set * dev_set)138 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
139 {
140 	struct vfio_device *cur;
141 	unsigned int open_count = 0;
142 
143 	lockdep_assert_held(&dev_set->lock);
144 
145 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
146 		open_count += cur->open_count;
147 	return open_count;
148 }
149 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
150 
151 struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set * dev_set,struct device * dev)152 vfio_find_device_in_devset(struct vfio_device_set *dev_set,
153 			   struct device *dev)
154 {
155 	struct vfio_device *cur;
156 
157 	lockdep_assert_held(&dev_set->lock);
158 
159 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
160 		if (cur->dev == dev)
161 			return cur;
162 	return NULL;
163 }
164 EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);
165 
166 /*
167  * Device objects - create, release, get, put, search
168  */
169 /* Device reference always implies a group reference */
vfio_device_put_registration(struct vfio_device * device)170 void vfio_device_put_registration(struct vfio_device *device)
171 {
172 	if (refcount_dec_and_test(&device->refcount))
173 		complete(&device->comp);
174 }
175 EXPORT_SYMBOL_GPL(vfio_device_put_registration);
176 
vfio_device_try_get_registration(struct vfio_device * device)177 bool vfio_device_try_get_registration(struct vfio_device *device)
178 {
179 	return refcount_inc_not_zero(&device->refcount);
180 }
181 EXPORT_SYMBOL_GPL(vfio_device_try_get_registration);
182 
183 /*
184  * VFIO driver API
185  */
186 /* Release helper called by vfio_put_device() */
/*
 * Final release of the struct device embedded in a vfio_device.  Runs when
 * the last reference on device->device is dropped; undoes vfio_init_device().
 */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	/* Driver-private teardown, if the driver registered one */
	if (device->ops->release)
		device->ops->release(device);

	/* Matches vfio_fs_inode_new() called from vfio_init_device() */
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
	kvfree(device);
}
202 
203 static int vfio_init_device(struct vfio_device *device, struct device *dev,
204 			    const struct vfio_device_ops *ops);
205 
206 /*
207  * Allocate and initialize vfio_device so it can be registered to vfio
208  * core.
209  *
210  * Drivers should use the wrapper vfio_alloc_device() for allocation.
211  * @size is the size of the structure to be allocated, including any
212  * private data used by the driver.
213  *
214  * Driver may provide an @init callback to cover device private data.
215  *
216  * Use vfio_put_device() to release the structure after success return.
217  */
/*
 * Allocate and initialize a vfio_device of @size bytes (the structure plus
 * driver private data).  Returns the device or an ERR_PTR; release with
 * vfio_put_device().
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	/* The allocation must at least hold the core structure */
	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret) {
		kvfree(device);
		return ERR_PTR(ret);
	}

	return device;
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
241 
vfio_fs_init_fs_context(struct fs_context * fc)242 static int vfio_fs_init_fs_context(struct fs_context *fc)
243 {
244 	return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM;
245 }
246 
/* Internal pseudo filesystem supplying anonymous inodes for vfio devices */
static struct file_system_type vfio_fs_type = {
	.name = "vfio",
	.owner = THIS_MODULE,
	.init_fs_context = vfio_fs_init_fs_context,
	.kill_sb = kill_anon_super,
};
253 
vfio_fs_inode_new(void)254 static struct inode *vfio_fs_inode_new(void)
255 {
256 	struct inode *inode;
257 	int ret;
258 
259 	ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
260 	if (ret)
261 		return ERR_PTR(ret);
262 
263 	inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
264 	if (IS_ERR(inode))
265 		simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
266 
267 	return inode;
268 }
269 
270 /*
271  * Initialize a vfio_device so it can be registered to vfio core.
272  */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	/* The index becomes the "vfio%d" device name, bounded by MINORMASK */
	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Error to alloc index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;
	device->inode = vfio_fs_inode_new();
	if (IS_ERR(device->inode)) {
		ret = PTR_ERR(device->inode);
		goto out_inode;
	}

	/* Optional driver hook to set up device private data */
	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	/* Unwind in reverse order of acquisition; inode first, then index */
	iput(device->inode);
	simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
out_inode:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}
314 
/*
 * Common registration path for group-backed and emulated devices.  On
 * success the device is visible to userspace; undone by
 * vfio_unregister_group_dev().
 */
static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	/* iommufd-capable builds require drivers to provide all four ops */
	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	/* Balances vfio_device_set_group() above */
	vfio_device_remove_group(device);
	return ret;
}
368 
/* Register a device that is backed by a real IOMMU group. */
int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);
374 
375 /*
376  * Register a virtual device without IOMMU backing.  The user of this
377  * device must not be able to directly trigger unmediated DMA.
378  */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	/* Same flow as vfio_register_group_dev(), with an emulated IOMMU type */
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
384 
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device hold registration
 * references, so the caller blocks until all of them are released.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device opened by userspace via the
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in register path, also prevents
	 * new device opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	/* Drop the registration ref; device->comp fires when it reaches zero */
	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		/* Ask the driver to nudge users into releasing the device */
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				/* Signal received: warn once, then keep waiting */
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
434 
435 #if IS_ENABLED(CONFIG_KVM)
/*
 * Record @kvm on @device while the device fd is open.  The kvm symbols are
 * resolved at runtime via symbol_get() so vfio carries no hard module
 * dependency on kvm; the kvm_put_kvm symbol reference is held until
 * vfio_device_put_kvm().
 */
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	/* kvm_get_kvm_safe() returns false if the kvm is going away */
	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	/* The kvm_put_kvm symbol ref stays held alongside device->kvm */
	device->put_kvm = pfn;
	device->kvm = kvm;
}
467 
/* Drop the kvm reference taken by vfio_device_get_kvm_safe(), if any. */
void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	/* kvm set without put_kvm indicates broken pairing; just clear */
	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	/* Balances symbol_get(kvm_put_kvm) in vfio_device_get_kvm_safe() */
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
485 #endif
486 
487 /* true if the vfio_device has open_device() called but not close_device() */
vfio_assert_device_open(struct vfio_device * device)488 static bool vfio_assert_device_open(struct vfio_device *device)
489 {
490 	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
491 }
492 
493 struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device * device)494 vfio_allocate_device_file(struct vfio_device *device)
495 {
496 	struct vfio_device_file *df;
497 
498 	df = kzalloc_obj(*df, GFP_KERNEL_ACCOUNT);
499 	if (!df)
500 		return ERR_PTR(-ENOMEM);
501 
502 	df->device = device;
503 	spin_lock_init(&df->kvm_ref_lock);
504 
505 	return df;
506 }
507 
/* First open of the device: pin the driver, claim the IOMMU, call the driver. */
static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	/* Keep the driver module loaded while the device is open */
	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	/* Bind to iommufd (cdev path) or use the group's iommu (group path) */
	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}
542 
/* Last close of the device: undo vfio_df_device_first_open() in reverse. */
static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	/* Drop the opt-in made via the precopy-info-v2 feature ioctl */
	device->precopy_info_v2 = 0;
	module_put(device->dev->driver->owner);
}
559 
vfio_df_open(struct vfio_device_file * df)560 int vfio_df_open(struct vfio_device_file *df)
561 {
562 	struct vfio_device *device = df->device;
563 	int ret = 0;
564 
565 	lockdep_assert_held(&device->dev_set->lock);
566 
567 	/*
568 	 * Only the group path allows the device to be opened multiple
569 	 * times.  The device cdev path doesn't have a secure way for it.
570 	 */
571 	if (device->open_count != 0 && !df->group)
572 		return -EINVAL;
573 
574 	device->open_count++;
575 	if (device->open_count == 1) {
576 		ret = vfio_df_device_first_open(df);
577 		if (ret)
578 			device->open_count--;
579 	}
580 
581 	return ret;
582 }
583 
/* Close one open of the device; the last close tears down device state. */
void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	/* Closing a device that was never opened indicates a caller bug */
	if (!vfio_assert_device_open(device))
		return;
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}
596 
597 /*
598  * Wrapper around pm_runtime_resume_and_get().
599  * Return error code on failure or 0 on success.
600  */
vfio_device_pm_runtime_get(struct vfio_device * device)601 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
602 {
603 	struct device *dev = device->dev;
604 
605 	if (dev->driver && dev->driver->pm) {
606 		int ret;
607 
608 		ret = pm_runtime_resume_and_get(dev);
609 		if (ret) {
610 			dev_info_ratelimited(dev,
611 				"vfio: runtime resume failed %d\n", ret);
612 			return -EIO;
613 		}
614 	}
615 
616 	return 0;
617 }
618 
619 /*
620  * Wrapper around pm_runtime_put().
621  */
vfio_device_pm_runtime_put(struct vfio_device * device)622 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
623 {
624 	struct device *dev = device->dev;
625 
626 	if (dev->driver && dev->driver->pm)
627 		pm_runtime_put(dev);
628 }
629 
630 /*
631  * VFIO Device fd
632  */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Tear down via whichever path (group vs cdev) opened this fd */
	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	/* Balances the registration ref taken when the fd was created */
	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}
649 
650 /*
651  * vfio_mig_get_next_state - Compute the next step in the FSM
652  * @cur_fsm - The current state the device is in
653  * @new_fsm - The target state to reach
654  * @next_fsm - Pointer to the next step to get to new_fsm
655  *
656  * Return 0 upon success, otherwise -errno
657  * Upon success the next step in the state progression between cur_fsm and
658  * new_fsm will be set in next_fsm.
659  *
660  * This breaks down requests for combination transitions into smaller steps and
661  * returns the next step to get to new_fsm. The function may need to be called
662  * multiple times before reaching new_fsm.
663  *
664  */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 *  The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	/* vfio_from_fsm_table[cur][target] gives the next single step to take */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	/* Migration feature flags the device must advertise to use each state */
	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	/* cur_fsm must be a valid state that this device supports */
	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	/* new_fsm may come from userspace; reject without warning */
	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	   (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
859 
/*
 * Convert the driver's struct file into an FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	/* Publish the file only after userspace has the fd number */
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	/* On error this function consumes the filp reference */
	fput(filp);
	return ret;
}
889 
890 static int
/* Handle VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE (GET current / SET new state). */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	/* vfio_check_feature() returns 1 when the feature should be handled */
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	/* No data transfer fd is associated with this transition */
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}
939 
940 static int
/* Report the driver's estimated STOP_COPY data size to userspace (GET only). */
static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device,
						       &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	return copy_to_user(arg, &data_size, sizeof(data_size)) ? -EFAULT : 0;
}
967 
968 static int
vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device * device,u32 flags,size_t argsz)969 vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device,
970 						    u32 flags, size_t argsz)
971 {
972 	int ret;
973 
974 	if (!(device->migration_flags & VFIO_MIGRATION_PRE_COPY))
975 		return -EINVAL;
976 
977 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
978 	if (ret != 1)
979 		return ret;
980 
981 	device->precopy_info_v2 = 1;
982 	return 0;
983 }
984 
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration report = {
		.flags = device->migration_flags,
	};
	int rc;

	/* No mig_ops means migration is not implemented by this driver */
	if (!device->mig_ops)
		return -ENOTTY;

	rc = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				sizeof(report));
	if (rc != 1)
		return rc;

	/* Report the driver's migration capability flags to userspace */
	return copy_to_user(arg, &report, sizeof(report)) ? -EFAULT : 0;
}
1005 
/*
 * Reduce the ranges in @root from @cur_nodes to at most @req_nodes by
 * repeatedly merging the two adjacent ranges separated by the smallest
 * gap.  Merged-away nodes are removed from the tree; their memory is
 * still owned by the caller.
 */
void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		/* Remove all but the first node, tracking the final 'last' */
		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		/* First node now spans the entire original extent */
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		/* Extend the earlier range across the gap, drop the later */
		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
1062 
/*
 * Ranges should fit into a single kernel page: bounds the num_ranges a
 * user may pass to VFIO_DEVICE_FEATURE_DMA_LOGGING_START.
 */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
1066 
static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	/* DMA dirty logging requires driver-provided log_ops */
	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	/* Bounded so all tracked ranges fit in one page (LOG_MAX_RANGES) */
	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_objs(struct interval_tree_node, nnodes);
	if (!nodes)
		return -ENOMEM;

	/* Validate each user range and build a non-overlapping interval tree */
	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		/* Ranges must be aligned to the requested logging page size */
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		/* Reject ranges whose end overflows or exceeds ULONG_MAX */
		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	/* page_size is passed by reference so the driver can update it */
	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	/* Report the resulting page_size; undo log_start if we can't */
	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}
1150 
1151 static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device * device,u32 flags,void __user * arg,size_t argsz)1152 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
1153 				       u32 flags, void __user *arg,
1154 				       size_t argsz)
1155 {
1156 	int ret;
1157 
1158 	if (!device->log_ops)
1159 		return -ENOTTY;
1160 
1161 	ret = vfio_check_feature(flags, argsz,
1162 				 VFIO_DEVICE_FEATURE_SET, 0);
1163 	if (ret != 1)
1164 		return ret;
1165 
1166 	return device->log_ops->log_stop(device);
1167 }
1168 
/* iova_bitmap_for_each() callback: forward one chunk to the driver */
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *vdev = opaque;

	return vdev->log_ops->log_read_and_clear(vdev, iova, length, iter);
}
1177 
static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	/* DMA dirty logging requires driver-provided log_ops */
	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	/* Bitmap granularity: a power-of-two page size of at least 4K */
	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	/* Reject a range whose end overflows or exceeds ULONG_MAX */
	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	/* Wrap the user-supplied bitmap so the driver can mark dirty bits */
	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	/* Driver fills (and clears) dirty state chunk by chunk */
	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}
1222 
/*
 * VFIO_DEVICE_FEATURE dispatcher: validates the common header flags and
 * routes core-handled features, deferring everything else to the driver.
 */
static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	/*
	 * Each handler receives the flags, a pointer to the feature
	 * payload (arg->data), and the payload size (argsz - minsz).
	 */
	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2:
		return vfio_ioctl_device_feature_migration_precopy_info_v2(
			device, feature.flags, feature.argsz - minsz);
	default:
		/* Unknown features fall through to the driver, if any */
		if (unlikely(!device->ops->device_feature))
			return -ENOTTY;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
1283 
/*
 * VFIO_DEVICE_GET_REGION_INFO: query a device region and optionally return
 * a capability chain describing it, appended after the fixed info struct.
 */
static long vfio_get_region_info(struct vfio_device *device,
				 struct vfio_region_info __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_region_info, offset);
	struct vfio_region_info info = {};
	struct vfio_info_cap caps = {};
	int ret;

	if (unlikely(!device->ops->get_region_info_caps))
		return -EINVAL;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;

	ret = device->ops->get_region_info_caps(device, &info, &caps);
	if (ret)
		goto out_free;

	if (caps.size) {
		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
		if (info.argsz < sizeof(info) + caps.size) {
			/*
			 * User buffer too small for the chain: report the
			 * required argsz with no cap_offset so userspace
			 * can retry with a bigger buffer.
			 */
			info.argsz = sizeof(info) + caps.size;
			info.cap_offset = 0;
		} else {
			/* Chain lives right after the fixed info struct */
			vfio_info_cap_shift(&caps, sizeof(info));
			if (copy_to_user(arg + 1, caps.buf, caps.size)) {
				ret = -EFAULT;
				goto out_free;
			}
			info.cap_offset = sizeof(info);
		}
	}

	if (copy_to_user(arg, &info, minsz))
		ret = -EFAULT;

out_free:
	kfree(caps.buf);
	return ret;
}
1328 
static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	/* BIND_IOMMUFD is the one ioctl allowed before access is granted */
	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	/* Keep the device runtime-resumed for the duration of the ioctl */
	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	case VFIO_DEVICE_GET_REGION_INFO:
		ret = vfio_get_region_info(device, uptr);
		break;

	default:
		/* Everything else is driver-specific */
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}
1381 
vfio_device_fops_read(struct file * filep,char __user * buf,size_t count,loff_t * ppos)1382 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1383 				     size_t count, loff_t *ppos)
1384 {
1385 	struct vfio_device_file *df = filep->private_data;
1386 	struct vfio_device *device = df->device;
1387 
1388 	/* Paired with smp_store_release() following vfio_df_open() */
1389 	if (!smp_load_acquire(&df->access_granted))
1390 		return -EINVAL;
1391 
1392 	if (unlikely(!device->ops->read))
1393 		return -EINVAL;
1394 
1395 	return device->ops->read(device, buf, count, ppos);
1396 }
1397 
vfio_device_fops_write(struct file * filep,const char __user * buf,size_t count,loff_t * ppos)1398 static ssize_t vfio_device_fops_write(struct file *filep,
1399 				      const char __user *buf,
1400 				      size_t count, loff_t *ppos)
1401 {
1402 	struct vfio_device_file *df = filep->private_data;
1403 	struct vfio_device *device = df->device;
1404 
1405 	/* Paired with smp_store_release() following vfio_df_open() */
1406 	if (!smp_load_acquire(&df->access_granted))
1407 		return -EINVAL;
1408 
1409 	if (unlikely(!device->ops->write))
1410 		return -EINVAL;
1411 
1412 	return device->ops->write(device, buf, count, ppos);
1413 }
1414 
vfio_device_fops_mmap(struct file * filep,struct vm_area_struct * vma)1415 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1416 {
1417 	struct vfio_device_file *df = filep->private_data;
1418 	struct vfio_device *device = df->device;
1419 
1420 	/* Paired with smp_store_release() following vfio_df_open() */
1421 	if (!smp_load_acquire(&df->access_granted))
1422 		return -EINVAL;
1423 
1424 	if (unlikely(!device->ops->mmap))
1425 		return -EINVAL;
1426 
1427 	return device->ops->mmap(device, vma);
1428 }
1429 
#ifdef CONFIG_PROC_FS
/* Expose the device's sysfs path through /proc/<pid>/fdinfo */
static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep)
{
	struct vfio_device_file *vdf = filep->private_data;
	struct vfio_device *vdev = vdf->device;
	char *syspath;

	syspath = kobject_get_path(&vdev->dev->kobj, GFP_KERNEL);
	if (!syspath)
		return;

	seq_printf(m, "vfio-device-syspath: /sys%s\n", syspath);
	kfree(syspath);
}
#endif
1445 
/*
 * file_operations for vfio device files; vfio_device_from_file() uses
 * this table's address to recognize such files.
 */
const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_device_fops_cdev_open,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.mmap		= vfio_device_fops_mmap,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= vfio_device_show_fdinfo,
#endif
};
1459 
vfio_device_from_file(struct file * file)1460 static struct vfio_device *vfio_device_from_file(struct file *file)
1461 {
1462 	struct vfio_device_file *df = file->private_data;
1463 
1464 	if (file->f_op != &vfio_device_fops)
1465 		return NULL;
1466 	return df->device;
1467 }
1468 
1469 /**
1470  * vfio_file_is_valid - True if the file is valid vfio file
1471  * @file: VFIO group file or VFIO device file
1472  */
vfio_file_is_valid(struct file * file)1473 bool vfio_file_is_valid(struct file *file)
1474 {
1475 	return vfio_group_from_file(file) ||
1476 	       vfio_device_from_file(file);
1477 }
1478 EXPORT_SYMBOL_GPL(vfio_file_is_valid);
1479 
1480 /**
1481  * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
1482  *        is always CPU cache coherent
1483  * @file: VFIO group file or VFIO device file
1484  *
1485  * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
1486  * bit in DMA transactions. A return of false indicates that the user has
1487  * rights to access additional instructions such as wbinvd on x86.
1488  */
vfio_file_enforced_coherent(struct file * file)1489 bool vfio_file_enforced_coherent(struct file *file)
1490 {
1491 	struct vfio_device *device;
1492 	struct vfio_group *group;
1493 
1494 	group = vfio_group_from_file(file);
1495 	if (group)
1496 		return vfio_group_enforced_coherent(group);
1497 
1498 	device = vfio_device_from_file(file);
1499 	if (device)
1500 		return device_iommu_capable(device->dev,
1501 					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
1502 
1503 	return true;
1504 }
1505 EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);
1506 
static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 *
	 * NOTE(review): kvm_ref_lock presumably serializes this store
	 * against concurrent readers of df->kvm — confirm in vfio.h.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}
1520 
/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group = vfio_group_from_file(file);

	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);
1541 
/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		/*
		 * On allocation failure free the old buffer and reset the
		 * caps state, so the caller's eventual kfree(caps->buf)
		 * stays safe (kfree(NULL) is a no-op).
		 */
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	/* Last entry points at the newly appended capability */
	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1590 
/*
 * Fix up the chain's next offsets by @offset, for when the buffer is
 * copied to userspace at @offset bytes past the start of the info struct.
 */
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	/*
	 * tmp->next was already bumped by @offset when we follow it, so
	 * subtract @offset again to keep indexing into the local buffer.
	 */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
1603 
/* Append a fully-formed capability @cap of @size bytes to the chain */
int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *hdr;

	hdr = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(hdr))
		return PTR_ERR(hdr);

	/* vfio_info_cap_add() filled in the header; copy only the payload */
	memcpy(hdr + 1, cap + 1, size - sizeof(*hdr));
	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);
1618 
/*
 * Validate a VFIO_DEVICE_SET_IRQS header and compute the size of the
 * trailing data userspace must have supplied.  *data_size is left 0 when
 * the selected DATA type carries no payload.
 */
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	/* The count check rejects start + count wrapping past U32_MAX */
	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	/* Requested IRQ window must fall within the device's IRQ count */
	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	/* Per-IRQ payload size depends on the DATA type */
	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		/* argsz must cover the header plus count payload entries */
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1666 
/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	/* Pinning requires the driver to provide a dma_unmap callback */
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		/* iommufd access only handles unsigned long IOVAs */
		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 *     pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	/* Neither a container nor an iommufd access is attached */
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
1714 
1715 /*
1716  * Unpin contiguous host pages for local domain only.
1717  * @device [in]  : device
1718  * @iova [in]    : starting address of user pages to be unpinned.
1719  * @npage [in]   : count of pages to be unpinned.  This count should not
1720  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1721  */
vfio_unpin_pages(struct vfio_device * device,dma_addr_t iova,int npage)1722 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1723 {
1724 	if (WARN_ON(!vfio_assert_device_open(device)))
1725 		return;
1726 	if (WARN_ON(!device->ops->dma_unmap))
1727 		return;
1728 
1729 	if (vfio_device_has_container(device)) {
1730 		vfio_device_container_unpin_pages(device, iova, npage);
1731 		return;
1732 	}
1733 	if (device->iommufd_access) {
1734 		if (WARN_ON(iova > ULONG_MAX))
1735 			return;
1736 		iommufd_access_unpin_pages(device->iommufd_access,
1737 					   ALIGN_DOWN(iova, PAGE_SIZE),
1738 					   npage * PAGE_SIZE);
1739 		return;
1740 	}
1741 }
1742 EXPORT_SYMBOL(vfio_unpin_pages);
1743 
1744 /*
1745  * This interface allows the CPUs to perform some sort of virtual DMA on
1746  * behalf of the device.
1747  *
1748  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1749  * into/from a kernel buffer.
1750  *
1751  * As the read/write of user space memory is conducted via the CPUs and is
1752  * not a real device DMA, it is not necessary to pin the user space memory.
1753  *
1754  * @device [in]		: VFIO device
1755  * @iova [in]		: base IOVA of a user space buffer
1756  * @data [in]		: pointer to kernel buffer
1757  * @len [in]		: kernel buffer length
1758  * @write		: indicate read or write
1759  * Return error code on failure or 0 on success.
1760  */
vfio_dma_rw(struct vfio_device * device,dma_addr_t iova,void * data,size_t len,bool write)1761 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1762 		size_t len, bool write)
1763 {
1764 	if (!data || len <= 0 || !vfio_assert_device_open(device))
1765 		return -EINVAL;
1766 
1767 	if (vfio_device_has_container(device))
1768 		return vfio_device_container_dma_rw(device, iova,
1769 						    data, len, write);
1770 
1771 	if (device->iommufd_access) {
1772 		unsigned int flags = 0;
1773 
1774 		if (iova > ULONG_MAX)
1775 			return -EINVAL;
1776 
1777 		/* VFIO historically tries to auto-detect a kthread */
1778 		if (!current->mm)
1779 			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1780 		if (write)
1781 			flags |= IOMMUFD_ACCESS_RW_WRITE;
1782 		return iommufd_access_rw(device->iommufd_access, iova, data,
1783 					 len, flags);
1784 	}
1785 	return -EINVAL;
1786 }
1787 EXPORT_SYMBOL(vfio_dma_rw);
1788 
/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	/* debugfs creation is best-effort; no return value is checked */
	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

	/* Unwind in reverse order of the setup above */
err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}
1830 
static void __exit vfio_cleanup(void)
{
	/* Tear down in roughly the reverse order of vfio_init() */
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}
1842 
1843 module_init(vfio_init);
1844 module_exit(vfio_cleanup);
1845 
1846 MODULE_IMPORT_NS("IOMMUFD");
1847 MODULE_VERSION(DRIVER_VERSION);
1848 MODULE_LICENSE("GPL v2");
1849 MODULE_AUTHOR(DRIVER_AUTHOR);
1850 MODULE_DESCRIPTION(DRIVER_DESC);
1851 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1852