xref: /linux/virt/kvm/guest_memfd.c (revision fc825e513cd494cfcbeb47acf5738fe64f3a9051)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/anon_inodes.h>
3 #include <linux/backing-dev.h>
4 #include <linux/falloc.h>
5 #include <linux/fs.h>
6 #include <linux/kvm_host.h>
7 #include <linux/mempolicy.h>
8 #include <linux/pseudo_fs.h>
9 #include <linux/pagemap.h>
10 
11 #include "kvm_mm.h"
12 
13 static struct vfsmount *kvm_gmem_mnt;
14 
15 /*
16  * A guest_memfd instance can be associated with multiple VMs, each with its own
17  * "view" of the underlying physical memory.
18  *
19  * The gmem's inode is effectively the raw underlying physical storage, and is
20  * used to track properties of the physical memory, while each gmem file is
21  * effectively a single VM's view of that storage, and is used to track assets
22  * specific to its associated VM, e.g. memslots=>gmem bindings.
23  */
24 struct gmem_file {
25 	struct kvm *kvm;
26 	struct xarray bindings;
27 	struct list_head entry;
28 };
29 
30 struct gmem_inode {
31 	struct shared_policy policy;
32 	struct inode vfs_inode;
33 	struct list_head gmem_file_list;
34 
35 	u64 flags;
36 };
37 
38 static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
39 {
40 	return container_of(inode, struct gmem_inode, vfs_inode);
41 }
42 
43 #define kvm_gmem_for_each_file(f, inode) \
44 	list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)
45 
46 /**
47  * folio_file_pfn - like folio_file_page, but return a pfn.
48  * @folio: The folio which contains this index.
49  * @index: The index we want to look up.
50  *
51  * Return: The pfn for this index.
52  */
53 static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
54 {
55 	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
56 }
57 
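/*
 * Translate a gfn in @slot to the corresponding page index (pgoff) within the
 * guest_memfd instance bound to that slot.
 */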
58 static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
59 {
60 	return gfn - slot->base_gfn + slot->gmem.pgoff;
61 }
62 
63 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
64 				    pgoff_t index, struct folio *folio)
65 {
66 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
67 	kvm_pfn_t pfn = folio_file_pfn(folio, index);
68 	gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
69 	int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
70 	if (rc) {
71 		pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
72 				    index, gfn, pfn, rc);
73 		return rc;
74 	}
75 #endif
76 
77 	return 0;
78 }
79 
80 /*
81  * Process @folio, which contains @gfn, so that the guest can use it.
82  * The folio must be locked and the gfn must be contained in @slot.
83  * On successful return the guest sees a zero page so as to avoid
84  * leaking host data and the up-to-date flag is set.
85  */
86 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
87 				  gfn_t gfn, struct folio *folio)
88 {
89 	pgoff_t index;
90 
91 	/*
92 	 * Preparing huge folios should always be safe, since it should
93 	 * be possible to split them later if needed.
94 	 *
95 	 * Right now the folio order is always going to be zero, but the
96 	 * code is ready for huge folios.  The only assumption is that
97 	 * the base pgoff of memslots is naturally aligned with the
98 	 * requested page order, ensuring that huge folios can also use
99 	 * huge page table entries for GPA->HPA mapping.
100 	 *
101 	 * The order will be passed when creating the guest_memfd, and
102 	 * checked when creating memslots.
103 	 */
104 	WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
105 	index = kvm_gmem_get_index(slot, gfn);
106 	index = ALIGN_DOWN(index, folio_nr_pages(folio));
107 
108 	return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
109 }
110 
111 /*
112  * Returns a locked folio on success.  The caller is responsible for
113  * setting the up-to-date flag before the memory is mapped into the guest.
114  * There is no backing storage for the memory, so the folio will remain
115  * up-to-date until it's removed.
116  *
117  * Ignore accessed, referenced, and dirty flags.  The memory is
118  * unevictable and there is no storage to write back to.
119  */
120 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
121 {
122 	/* TODO: Support huge pages. */
123 	struct mempolicy *policy;
124 	struct folio *folio;
125 
126 	/*
127 	 * Fast-path: See if folio is already present in mapping to avoid
128 	 * policy_lookup.
129 	 */
130 	folio = __filemap_get_folio(inode->i_mapping, index,
131 				    FGP_LOCK | FGP_ACCESSED, 0);
132 	if (!IS_ERR(folio))
133 		return folio;
134 
135 	policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
136 	folio = __filemap_get_folio_mpol(inode->i_mapping, index,
137 					 FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
138 					 mapping_gfp_mask(inode->i_mapping), policy);
139 	mpol_cond_put(policy);
140 
141 	/*
142 	 * External interfaces like kvm_gmem_get_pfn() support dealing
143 	 * with hugepages to a degree, but internally, guest_memfd currently
144 	 * assumes that all folios are order-0 and handling would need
145 	 * to be updated for anything otherwise (e.g. page-clearing
146 	 * operations).
147 	 */
148 	WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));
149 
150 	return folio;
151 }
152 
153 static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
154 {
155 	if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
156 		return KVM_FILTER_SHARED;
157 
158 	return KVM_FILTER_PRIVATE;
159 }
160 
161 static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
162 					pgoff_t end,
163 					enum kvm_gfn_range_filter attr_filter)
164 {
165 	bool flush = false, found_memslot = false;
166 	struct kvm_memory_slot *slot;
167 	struct kvm *kvm = f->kvm;
168 	unsigned long index;
169 
170 	xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
171 		pgoff_t pgoff = slot->gmem.pgoff;
172 
173 		struct kvm_gfn_range gfn_range = {
174 			.start = slot->base_gfn + max(pgoff, start) - pgoff,
175 			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
176 			.slot = slot,
177 			.may_block = true,
178 			.attr_filter = attr_filter,
179 		};
180 
181 		if (!found_memslot) {
182 			found_memslot = true;
183 
184 			KVM_MMU_LOCK(kvm);
185 			kvm_mmu_invalidate_begin(kvm);
186 		}
187 
188 		flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
189 	}
190 
191 	if (flush)
192 		kvm_flush_remote_tlbs(kvm);
193 
194 	if (found_memslot)
195 		KVM_MMU_UNLOCK(kvm);
196 }
197 
198 static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
199 				      pgoff_t end)
200 {
201 	enum kvm_gfn_range_filter attr_filter;
202 	struct gmem_file *f;
203 
204 	attr_filter = kvm_gmem_get_invalidate_filter(inode);
205 
206 	kvm_gmem_for_each_file(f, inode)
207 		__kvm_gmem_invalidate_begin(f, start, end, attr_filter);
208 }
209 
210 static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
211 				      pgoff_t end)
212 {
213 	struct kvm *kvm = f->kvm;
214 
215 	if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
216 		KVM_MMU_LOCK(kvm);
217 		kvm_mmu_invalidate_end(kvm);
218 		KVM_MMU_UNLOCK(kvm);
219 	}
220 }
221 
222 static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
223 				    pgoff_t end)
224 {
225 	struct gmem_file *f;
226 
227 	kvm_gmem_for_each_file(f, inode)
228 		__kvm_gmem_invalidate_end(f, start, end);
229 }
230 
231 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
232 {
233 	pgoff_t start = offset >> PAGE_SHIFT;
234 	pgoff_t end = (offset + len) >> PAGE_SHIFT;
235 
236 	/*
237 	 * Bindings must be stable across invalidation to ensure the start+end
238 	 * are balanced.
239 	 */
240 	filemap_invalidate_lock(inode->i_mapping);
241 
242 	kvm_gmem_invalidate_begin(inode, start, end);
243 
244 	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
245 
246 	kvm_gmem_invalidate_end(inode, start, end);
247 
248 	filemap_invalidate_unlock(inode->i_mapping);
249 
250 	return 0;
251 }
252 
253 static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
254 {
255 	struct address_space *mapping = inode->i_mapping;
256 	pgoff_t start, index, end;
257 	int r;
258 
259 	/*
260 	 * Dedicated guest memory is immutable by default: the file size is
261 	 * fixed at creation, so allocation cannot extend past EOF.
262 	 */
260 	if (offset + len > i_size_read(inode))
261 		return -EINVAL;
262 
263 	filemap_invalidate_lock_shared(mapping);
264 
265 	start = offset >> PAGE_SHIFT;
266 	end = (offset + len) >> PAGE_SHIFT;
267 
268 	r = 0;
269 	for (index = start; index < end; ) {
270 		struct folio *folio;
271 
272 		if (signal_pending(current)) {
273 			r = -EINTR;
274 			break;
275 		}
276 
277 		folio = kvm_gmem_get_folio(inode, index);
278 		if (IS_ERR(folio)) {
279 			r = PTR_ERR(folio);
280 			break;
281 		}
282 
283 		index = folio_next_index(folio);
284 
285 		folio_unlock(folio);
286 		folio_put(folio);
287 
288 		/* 64-bit only, wrapping the index should be impossible. */
289 		if (WARN_ON_ONCE(!index))
290 			break;
291 
292 		cond_resched();
293 	}
294 
295 	filemap_invalidate_unlock_shared(mapping);
296 
297 	return r;
298 }
299 
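/*
 * fallocate() on a guest_memfd either preallocates folios or, with
 * FALLOC_FL_PUNCH_HOLE, frees the backing pages (after invalidating any
 * GPA mappings).  FALLOC_FL_KEEP_SIZE is mandatory as the file size is
 * fixed at creation, and offset/len must be page-aligned.
 */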
300 static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
301 			       loff_t len)
302 {
303 	int ret;
304 
305 	if (!(mode & FALLOC_FL_KEEP_SIZE))
306 		return -EOPNOTSUPP;
307 
308 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
309 		return -EOPNOTSUPP;
310 
311 	if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
312 		return -EINVAL;
313 
314 	if (mode & FALLOC_FL_PUNCH_HOLE)
315 		ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
316 	else
317 		ret = kvm_gmem_allocate(file_inode(file), offset, len);
318 
319 	if (!ret)
320 		file_modified(file);
321 	return ret;
322 }
323 
324 static int kvm_gmem_release(struct inode *inode, struct file *file)
325 {
326 	struct gmem_file *f = file->private_data;
327 	struct kvm_memory_slot *slot;
328 	struct kvm *kvm = f->kvm;
329 	unsigned long index;
330 
331 	/*
332 	 * Prevent concurrent attempts to *unbind* a memslot.  This is the last
333 	 * reference to the file and thus no new bindings can be created, but
334 	 * dereferencing the slot for existing bindings needs to be protected
335 	 * against memslot updates, specifically so that unbind doesn't race
336 	 * and free the memslot (kvm_gmem_get_file() will return NULL).
337 	 *
338 	 * Since .release is called only when the reference count is zero,
339 	 * after which file_ref_get() and get_file_active() fail,
340 	 * kvm_gmem_get_pfn() cannot be using the file concurrently.
341 	 * file_ref_put() provides a full barrier, and get_file_active() the
342 	 * matching acquire barrier.
343 	 */
344 	mutex_lock(&kvm->slots_lock);
345 
346 	filemap_invalidate_lock(inode->i_mapping);
347 
348 	xa_for_each(&f->bindings, index, slot)
349 		WRITE_ONCE(slot->gmem.file, NULL);
350 
351 	/*
352 	 * All in-flight operations are gone and new bindings can be created.
353 	 * Zap all SPTEs pointed at by this file.  Do not free the backing
354 	 * memory, as its lifetime is associated with the inode, not the file.
355 	 */
356 	__kvm_gmem_invalidate_begin(f, 0, -1ul,
357 				    kvm_gmem_get_invalidate_filter(inode));
358 	__kvm_gmem_invalidate_end(f, 0, -1ul);
359 
360 	list_del(&f->entry);
361 
362 	filemap_invalidate_unlock(inode->i_mapping);
363 
364 	mutex_unlock(&kvm->slots_lock);
365 
366 	xa_destroy(&f->bindings);
367 	kfree(f);
368 
369 	kvm_put_kvm(kvm);
370 
371 	return 0;
372 }
373 
374 static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
375 {
376 	/*
377 	 * Do not return slot->gmem.file if it has already been closed;
378 	 * there might be some time between the last fput() and when
379 	 * kvm_gmem_release() clears slot->gmem.file.
380 	 */
381 	return get_file_active(&slot->gmem.file);
382 }
383 
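/*
 * Scope-based guard: CLASS(gmem_get_file, file)(slot) grabs a reference to
 * the slot's guest_memfd file via kvm_gmem_get_file() and, if the lookup
 * succeeded, fput()s it automatically when 'file' goes out of scope.
 */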
384 DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
385 	     kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
386 
387 static bool kvm_gmem_supports_mmap(struct inode *inode)
388 {
389 	return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
390 }
391 
392 static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
393 {
394 	struct inode *inode = file_inode(vmf->vma->vm_file);
395 	struct folio *folio;
396 	vm_fault_t ret = VM_FAULT_LOCKED;
397 
398 	if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
399 		return VM_FAULT_SIGBUS;
400 
401 	if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
402 		return VM_FAULT_SIGBUS;
403 
404 	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
405 	if (IS_ERR(folio)) {
406 		if (PTR_ERR(folio) == -EAGAIN)
407 			return VM_FAULT_RETRY;
408 
409 		return vmf_error(PTR_ERR(folio));
410 	}
411 
412 	if (WARN_ON_ONCE(folio_test_large(folio))) {
413 		ret = VM_FAULT_SIGBUS;
414 		goto out_folio;
415 	}
416 
417 	if (!folio_test_uptodate(folio)) {
418 		clear_highpage(folio_page(folio, 0));
419 		folio_mark_uptodate(folio);
420 	}
421 
422 	vmf->page = folio_file_page(folio, vmf->pgoff);
423 
424 out_folio:
425 	if (ret != VM_FAULT_LOCKED) {
426 		folio_unlock(folio);
427 		folio_put(folio);
428 	}
429 
430 	return ret;
431 }
432 
433 #ifdef CONFIG_NUMA
434 static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
435 {
436 	struct inode *inode = file_inode(vma->vm_file);
437 
438 	return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
439 }
440 
441 static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
442 					     unsigned long addr, pgoff_t *pgoff)
443 {
444 	struct inode *inode = file_inode(vma->vm_file);
445 
446 	*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
447 
448 	/*
449 	 * Return the memory policy for this index, or NULL if none is set.
450 	 *
451 	 * Returning NULL, e.g. instead of the current task's memory policy, is
452 	 * important for the .get_policy kernel ABI: it indicates that no
453 	 * explicit policy has been set via mbind() for this memory. The caller
454 	 * can then replace NULL with the default memory policy instead of the
455 	 * current task's memory policy.
456 	 */
457 	return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
458 }
459 #endif /* CONFIG_NUMA */
460 
461 static const struct vm_operations_struct kvm_gmem_vm_ops = {
462 	.fault		= kvm_gmem_fault_user_mapping,
463 #ifdef CONFIG_NUMA
464 	.get_policy	= kvm_gmem_get_policy,
465 	.set_policy	= kvm_gmem_set_policy,
466 #endif
467 };
468 
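/*
 * mmap() is allowed only if the guest_memfd was created with
 * GUEST_MEMFD_FLAG_MMAP, and only for shared (MAP_SHARED) mappings.
 */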
469 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
470 {
471 	if (!kvm_gmem_supports_mmap(file_inode(file)))
472 		return -ENODEV;
473 
474 	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
475 	    (VM_SHARED | VM_MAYSHARE)) {
476 		return -EINVAL;
477 	}
478 
479 	vma->vm_ops = &kvm_gmem_vm_ops;
480 
481 	return 0;
482 }
483 
484 static struct file_operations kvm_gmem_fops = {
485 	.mmap		= kvm_gmem_mmap,
486 	.open		= generic_file_open,
487 	.release	= kvm_gmem_release,
488 	.fallocate	= kvm_gmem_fallocate,
489 };
490 
491 static int kvm_gmem_migrate_folio(struct address_space *mapping,
492 				  struct folio *dst, struct folio *src,
493 				  enum migrate_mode mode)
494 {
495 	WARN_ON_ONCE(1);
496 	return -EINVAL;
497 }
498 
499 static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
500 {
501 	pgoff_t start, end;
502 
503 	filemap_invalidate_lock_shared(mapping);
504 
505 	start = folio->index;
506 	end = start + folio_nr_pages(folio);
507 
508 	kvm_gmem_invalidate_begin(mapping->host, start, end);
509 
510 	/*
511 	 * Do not truncate the range, what action is taken in response to the
512 	 * error is userspace's decision (assuming the architecture supports
513 	 * gracefully handling memory errors).  If/when the guest attempts to
514 	 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
515 	 * at which point KVM can either terminate the VM or propagate the
516 	 * error to userspace.
517 	 */
518 
519 	kvm_gmem_invalidate_end(mapping->host, start, end);
520 
521 	filemap_invalidate_unlock_shared(mapping);
522 
523 	return MF_DELAYED;
524 }
525 
526 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
527 static void kvm_gmem_free_folio(struct folio *folio)
528 {
529 	struct page *page = folio_page(folio, 0);
530 	kvm_pfn_t pfn = page_to_pfn(page);
531 	int order = folio_order(folio);
532 
533 	kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
534 }
535 #endif
536 
537 static const struct address_space_operations kvm_gmem_aops = {
538 	.dirty_folio = noop_dirty_folio,
539 	.migrate_folio	= kvm_gmem_migrate_folio,
540 	.error_remove_folio = kvm_gmem_error_folio,
541 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
542 	.free_folio = kvm_gmem_free_folio,
543 #endif
544 };
545 
546 static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
547 			    struct iattr *attr)
548 {
549 	return -EINVAL;
550 }
551 static const struct inode_operations kvm_gmem_iops = {
552 	.setattr	= kvm_gmem_setattr,
553 };
554 
555 bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
556 {
557 	return true;
558 }
559 
560 static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
561 {
562 	static const char *name = "[kvm-gmem]";
563 	struct gmem_file *f;
564 	struct inode *inode;
565 	struct file *file;
566 	int fd, err;
567 
568 	fd = get_unused_fd_flags(0);
569 	if (fd < 0)
570 		return fd;
571 
572 	f = kzalloc_obj(*f);
573 	if (!f) {
574 		err = -ENOMEM;
575 		goto err_fd;
576 	}
577 
578 	/* __fput() will take care of fops_put(). */
579 	if (!fops_get(&kvm_gmem_fops)) {
580 		err = -ENOENT;
581 		goto err_gmem;
582 	}
583 
584 	inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
585 	if (IS_ERR(inode)) {
586 		err = PTR_ERR(inode);
587 		goto err_fops;
588 	}
589 
590 	inode->i_op = &kvm_gmem_iops;
591 	inode->i_mapping->a_ops = &kvm_gmem_aops;
592 	inode->i_mode |= S_IFREG;
593 	inode->i_size = size;
594 	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
595 	mapping_set_inaccessible(inode->i_mapping);
596 	/* Unmovable mappings are supposed to be marked unevictable as well. */
597 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
598 
599 	GMEM_I(inode)->flags = flags;
600 
601 	file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
602 	if (IS_ERR(file)) {
603 		err = PTR_ERR(file);
604 		goto err_inode;
605 	}
606 
607 	file->f_flags |= O_LARGEFILE;
608 	file->private_data = f;
609 
610 	kvm_get_kvm(kvm);
611 	f->kvm = kvm;
612 	xa_init(&f->bindings);
613 	list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);
614 
615 	fd_install(fd, file);
616 	return fd;
617 
618 err_inode:
619 	iput(inode);
620 err_fops:
621 	fops_put(&kvm_gmem_fops);
622 err_gmem:
623 	kfree(f);
624 err_fd:
625 	put_unused_fd(fd);
626 	return err;
627 }
628 
629 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
630 {
631 	loff_t size = args->size;
632 	u64 flags = args->flags;
633 
634 	if (flags & ~kvm_gmem_get_supported_flags(kvm))
635 		return -EINVAL;
636 
637 	if (size <= 0 || !PAGE_ALIGNED(size))
638 		return -EINVAL;
639 
640 	return __kvm_gmem_create(kvm, size, flags);
641 }
642 
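/*
 * Bind @slot to the guest_memfd file referenced by @fd, starting at @offset.
 * The range must lie within the file, must not overlap an existing binding
 * for this file, and the file must belong to @kvm.  On success the binding
 * is recorded in the file's xarray so invalidations can find the memslot.
 */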
643 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
644 		  unsigned int fd, loff_t offset)
645 {
646 	loff_t size = slot->npages << PAGE_SHIFT;
647 	unsigned long start, end;
648 	struct gmem_file *f;
649 	struct inode *inode;
650 	struct file *file;
651 	int r = -EINVAL;
652 
653 	BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
654 
655 	file = fget(fd);
656 	if (!file)
657 		return -EBADF;
658 
659 	if (file->f_op != &kvm_gmem_fops)
660 		goto err;
661 
662 	f = file->private_data;
663 	if (f->kvm != kvm)
664 		goto err;
665 
666 	inode = file_inode(file);
667 
668 	if (offset < 0 || !PAGE_ALIGNED(offset) ||
669 	    offset + size > i_size_read(inode))
670 		goto err;
671 
672 	filemap_invalidate_lock(inode->i_mapping);
673 
674 	start = offset >> PAGE_SHIFT;
675 	end = start + slot->npages;
676 
677 	if (!xa_empty(&f->bindings) &&
678 	    xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
679 		filemap_invalidate_unlock(inode->i_mapping);
680 		goto err;
681 	}
682 
683 	/*
684 	 * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once created, so
685 	 * kvm_gmem_bind() must occur on a new memslot.  Because the memslot
686 	 * is not visible yet, kvm_gmem_get_pfn() is guaranteed to see the file.
687 	 */
688 	WRITE_ONCE(slot->gmem.file, file);
689 	slot->gmem.pgoff = start;
690 	if (kvm_gmem_supports_mmap(inode))
691 		slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
692 
693 	xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
694 	filemap_invalidate_unlock(inode->i_mapping);
695 
696 	/*
697 	 * Drop the reference to the file, even on success.  The file pins KVM,
698 	 * not the other way 'round.  Active bindings are invalidated if the
699 	 * file is closed before memslots are destroyed.
700 	 */
701 	r = 0;
702 err:
703 	fput(file);
704 	return r;
705 }
706 
707 static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
708 {
709 	unsigned long start = slot->gmem.pgoff;
710 	unsigned long end = start + slot->npages;
711 
712 	xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
713 
714 	/*
715 	 * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
716 	 * cannot see this memslot.
717 	 */
718 	WRITE_ONCE(slot->gmem.file, NULL);
719 }
720 
721 void kvm_gmem_unbind(struct kvm_memory_slot *slot)
722 {
723 	/*
724 	 * Nothing to do if the underlying file was _already_ closed, as
725 	 * kvm_gmem_release() invalidates and nullifies all bindings.
726 	 */
727 	if (!slot->gmem.file)
728 		return;
729 
730 	CLASS(gmem_get_file, file)(slot);
731 
732 	/*
733 	 * However, if the file is _being_ closed, then the bindings need to be
734 	 * removed as kvm_gmem_release() might not run until after the memslot
735 	 * is freed.  Note, modifying the bindings is safe even though the file
736 	 * is dying as kvm_gmem_release() nullifies slot->gmem.file under
737 	 * slots_lock, and only puts its reference to KVM after destroying all
738 	 * bindings.  I.e. reaching this point means kvm_gmem_release() hasn't
739 	 * yet destroyed the bindings or freed the gmem_file, and can't do so
740 	 * until the caller drops slots_lock.
741 	 */
742 	if (!file) {
743 		__kvm_gmem_unbind(slot, slot->gmem.file->private_data);
744 		return;
745 	}
746 
747 	filemap_invalidate_lock(file->f_mapping);
748 	__kvm_gmem_unbind(slot, file->private_data);
749 	filemap_invalidate_unlock(file->f_mapping);
750 }
751 
752 /* Returns a locked folio on success.  */
753 static struct folio *__kvm_gmem_get_pfn(struct file *file,
754 					struct kvm_memory_slot *slot,
755 					pgoff_t index, kvm_pfn_t *pfn,
756 					int *max_order)
757 {
758 	struct file *slot_file = READ_ONCE(slot->gmem.file);
759 	struct gmem_file *f = file->private_data;
760 	struct folio *folio;
761 
762 	if (file != slot_file) {
763 		WARN_ON_ONCE(slot_file);
764 		return ERR_PTR(-EFAULT);
765 	}
766 
767 	if (xa_load(&f->bindings, index) != slot) {
768 		WARN_ON_ONCE(xa_load(&f->bindings, index));
769 		return ERR_PTR(-EIO);
770 	}
771 
772 	folio = kvm_gmem_get_folio(file_inode(file), index);
773 	if (IS_ERR(folio))
774 		return folio;
775 
776 	if (folio_test_hwpoison(folio)) {
777 		folio_unlock(folio);
778 		folio_put(folio);
779 		return ERR_PTR(-EHWPOISON);
780 	}
781 
782 	*pfn = folio_file_pfn(folio, index);
783 	if (max_order)
784 		*max_order = 0;
785 
786 	return folio;
787 }
788 
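/*
 * Look up (allocating if necessary) the folio backing @gfn in @slot's
 * guest_memfd, zero it on first use, run any arch-specific preparation,
 * and return the pfn/page with a reference held.  Returns 0 on success or
 * a negative errno (e.g. -EHWPOISON for a poisoned folio).
 */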
789 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
790 		     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
791 		     int *max_order)
792 {
793 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
794 	struct folio *folio;
795 	int r = 0;
796 
797 	CLASS(gmem_get_file, file)(slot);
798 	if (!file)
799 		return -EFAULT;
800 
801 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
802 	if (IS_ERR(folio))
803 		return PTR_ERR(folio);
804 
805 	if (!folio_test_uptodate(folio)) {
806 		clear_highpage(folio_page(folio, 0));
807 		folio_mark_uptodate(folio);
808 	}
809 
810 	r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
811 
812 	folio_unlock(folio);
813 
814 	if (!r)
815 		*page = folio_file_page(folio, index);
816 	else
817 		folio_put(folio);
818 
819 	return r;
820 }
821 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
822 
823 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
824 
825 static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
826 				struct file *file, gfn_t gfn, struct page *src_page,
827 				kvm_gmem_populate_cb post_populate, void *opaque)
828 {
829 	pgoff_t index = kvm_gmem_get_index(slot, gfn);
830 	struct folio *folio;
831 	kvm_pfn_t pfn;
832 	int ret;
833 
834 	filemap_invalidate_lock(file->f_mapping);
835 
836 	folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
837 	if (IS_ERR(folio)) {
838 		ret = PTR_ERR(folio);
839 		goto out_unlock;
840 	}
841 
842 	folio_unlock(folio);
843 
844 	if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
845 					     KVM_MEMORY_ATTRIBUTE_PRIVATE,
846 					     KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
847 		ret = -EINVAL;
848 		goto out_put_folio;
849 	}
850 
851 	ret = post_populate(kvm, gfn, pfn, src_page, opaque);
852 	if (!ret)
853 		folio_mark_uptodate(folio);
854 
855 out_put_folio:
856 	folio_put(folio);
857 out_unlock:
858 	filemap_invalidate_unlock(file->f_mapping);
859 	return ret;
860 }
861 
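/*
 * Populate up to @npages pages of guest_memfd-backed memory starting at
 * @start_gfn, invoking @post_populate on each page (with the corresponding
 * source page from @src, if provided).  Returns the number of pages
 * processed, or a negative errno if the very first page fails.
 */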
862 long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
863 		       kvm_gmem_populate_cb post_populate, void *opaque)
864 {
865 	struct kvm_memory_slot *slot;
866 	int ret = 0;
867 	long i;
868 
869 	lockdep_assert_held(&kvm->slots_lock);
870 
871 	if (WARN_ON_ONCE(npages <= 0))
872 		return -EINVAL;
873 
874 	if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
875 		return -EINVAL;
876 
877 	slot = gfn_to_memslot(kvm, start_gfn);
878 	if (!kvm_slot_has_gmem(slot))
879 		return -EINVAL;
880 
881 	CLASS(gmem_get_file, file)(slot);
882 	if (!file)
883 		return -EFAULT;
884 
885 	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
886 	for (i = 0; i < npages; i++) {
887 		struct page *src_page = NULL;
888 
889 		if (signal_pending(current)) {
890 			ret = -EINTR;
891 			break;
892 		}
893 
894 		if (src) {
895 			unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
896 
897 			ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
898 			if (ret < 0)
899 				break;
900 			if (ret != 1) {
901 				ret = -ENOMEM;
902 				break;
903 			}
904 		}
905 
906 		ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
907 					  post_populate, opaque);
908 
909 		if (src_page)
910 			put_page(src_page);
911 
912 		if (ret)
913 			break;
914 	}
915 
916 	return ret && !i ? ret : i;
917 }
918 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
919 #endif
920 
921 static struct kmem_cache *kvm_gmem_inode_cachep;
922 
923 static void kvm_gmem_init_inode_once(void *__gi)
924 {
925 	struct gmem_inode *gi = __gi;
926 
927 	/*
928 	 * Note!  Don't initialize the inode with anything specific to the
929 	 * guest_memfd instance, or that might be specific to how the inode is
930 	 * used (from the VFS-layer's perspective).  This hook is called only
931 	 * during the initial slab allocation, i.e. only fields/state that are
932 	 * idempotent across _all_ use of the inode _object_ can be initialized
933 	 * at this time!
934 	 */
935 	inode_init_once(&gi->vfs_inode);
936 }
937 
938 static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
939 {
940 	struct gmem_inode *gi;
941 
942 	gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
943 	if (!gi)
944 		return NULL;
945 
946 	mpol_shared_policy_init(&gi->policy, NULL);
947 
948 	gi->flags = 0;
949 	INIT_LIST_HEAD(&gi->gmem_file_list);
950 	return &gi->vfs_inode;
951 }
952 
953 static void kvm_gmem_destroy_inode(struct inode *inode)
954 {
955 	mpol_free_shared_policy(&GMEM_I(inode)->policy);
956 }
957 
958 static void kvm_gmem_free_inode(struct inode *inode)
959 {
960 	kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
961 }
962 
963 static const struct super_operations kvm_gmem_super_operations = {
964 	.statfs		= simple_statfs,
965 	.alloc_inode	= kvm_gmem_alloc_inode,
966 	.destroy_inode	= kvm_gmem_destroy_inode,
967 	.free_inode	= kvm_gmem_free_inode,
968 };
969 
970 static int kvm_gmem_init_fs_context(struct fs_context *fc)
971 {
972 	struct pseudo_fs_context *ctx;
973 
974 	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
975 		return -ENOMEM;
976 
977 	fc->s_iflags |= SB_I_NOEXEC;
978 	fc->s_iflags |= SB_I_NODEV;
979 	ctx = fc->fs_private;
980 	ctx->ops = &kvm_gmem_super_operations;
981 
982 	return 0;
983 }
984 
985 static struct file_system_type kvm_gmem_fs = {
986 	.name		 = "guest_memfd",
987 	.init_fs_context = kvm_gmem_init_fs_context,
988 	.kill_sb	 = kill_anon_super,
989 };
990 
991 static int kvm_gmem_init_mount(void)
992 {
993 	kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
994 
995 	if (IS_ERR(kvm_gmem_mnt))
996 		return PTR_ERR(kvm_gmem_mnt);
997 
998 	kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
999 	return 0;
1000 }
1001 
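/*
 * One-time setup: create the gmem_inode slab cache and the kernel-internal
 * guest_memfd mount used to back all guest_memfd inodes.
 */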
1002 int kvm_gmem_init(struct module *module)
1003 {
1004 	struct kmem_cache_args args = {
1005 		.align = 0,
1006 		.ctor = kvm_gmem_init_inode_once,
1007 	};
1008 	int ret;
1009 
1010 	kvm_gmem_fops.owner = module;
1011 	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
1012 						  sizeof(struct gmem_inode),
1013 						  &args, SLAB_ACCOUNT);
1014 	if (!kvm_gmem_inode_cachep)
1015 		return -ENOMEM;
1016 
1017 	ret = kvm_gmem_init_mount();
1018 	if (ret) {
1019 		kmem_cache_destroy(kvm_gmem_inode_cachep);
1020 		return ret;
1021 	}
1022 	return 0;
1023 }
1024 
1025 void kvm_gmem_exit(void)
1026 {
1027 	kern_unmount(kvm_gmem_mnt);
1028 	kvm_gmem_mnt = NULL;
1029 	rcu_barrier();
1030 	kmem_cache_destroy(kvm_gmem_inode_cachep);
1031 }
1032