// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>

#include "kvm_mm.h"

static struct vfsmount *kvm_gmem_mnt;

/*
 * A guest_memfd instance can be associated with multiple VMs, each with its
 * own "view" of the underlying physical memory.
 *
 * The gmem's inode is effectively the raw underlying physical storage, and is
 * used to track properties of the physical memory, while each gmem file is
 * effectively a single VM's view of that storage, and is used to track assets
 * specific to its associated VM, e.g. memslots=>gmem bindings.
 */
struct gmem_file {
        struct kvm *kvm;
        struct xarray bindings;
        struct list_head entry;
};

struct gmem_inode {
        struct shared_policy policy;
        struct inode vfs_inode;
        struct list_head gmem_file_list;

        u64 flags;
};

static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
{
        return container_of(inode, struct gmem_inode, vfs_inode);
}

#define kvm_gmem_for_each_file(f, inode) \
        list_for_each_entry(f, &GMEM_I(inode)->gmem_file_list, entry)

/**
 * folio_file_pfn - like folio_file_page, but return a pfn.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Return: The pfn for this index.
 */
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
        return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}

static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
        return gfn - slot->base_gfn + slot->gmem.pgoff;
}
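
/*
 * Worked example of the gfn<->index math above (illustrative numbers, not
 * taken from this file): a memslot with base_gfn = 0x100, npages = 0x40, and
 * gmem.pgoff = 0x10 maps gfn 0x120 to file index 0x120 - 0x100 + 0x10 = 0x30,
 * i.e. byte offset 0x30 << PAGE_SHIFT within the guest_memfd instance.
 */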

static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                    pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
        kvm_pfn_t pfn = folio_file_pfn(folio, index);
        gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
        int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
        if (rc) {
                pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
                                    index, gfn, pfn, rc);
                return rc;
        }
#endif

        return 0;
}

/*
 * Process @folio, which contains @gfn, so that the guest can use it.
 * The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zero page so as to avoid
 * leaking host data and the up-to-date flag is set.
 */
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
                                  gfn_t gfn, struct folio *folio)
{
        pgoff_t index;

        /*
         * Preparing huge folios should always be safe, since it should
         * be possible to split them later if needed.
         *
         * Right now the folio order is always going to be zero, but the
         * code is ready for huge folios. The only assumption is that
         * the base pgoff of memslots is naturally aligned with the
         * requested page order, ensuring that huge folios can also use
         * huge page table entries for GPA->HPA mapping.
         *
         * The order will be passed when creating the guest_memfd, and
         * checked when creating memslots.
         */
        WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
        index = kvm_gmem_get_index(slot, gfn);
        index = ALIGN_DOWN(index, folio_nr_pages(folio));

        return __kvm_gmem_prepare_folio(kvm, slot, index, folio);
}
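
/*
 * Concrete example of the alignment rule above (illustrative, assumes a
 * hypothetical huge-folio configuration): for a 2MiB folio on x86,
 * folio_nr_pages() == 512, so slot->gmem.pgoff must be a multiple of 512;
 * ALIGN_DOWN(index, 512) then yields the index of the folio's first page and
 * the GPA->HPA mapping can use a 2MiB page table entry.
 */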

/*
 * Returns a locked folio on success. The caller is responsible for
 * setting the up-to-date flag before the memory is mapped into the guest.
 * There is no backing storage for the memory, so the folio will remain
 * up-to-date until it's removed.
 *
 * Ignore accessed, referenced, and dirty flags. The memory is
 * unevictable and there is no storage to write back to.
 */
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
        /* TODO: Support huge pages. */
        struct mempolicy *policy;
        struct folio *folio;

        /*
         * Fast-path: See if folio is already present in mapping to avoid
         * policy_lookup.
         */
        folio = __filemap_get_folio(inode->i_mapping, index,
                                    FGP_LOCK | FGP_ACCESSED, 0);
        if (!IS_ERR(folio))
                return folio;

        policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
        folio = __filemap_get_folio_mpol(inode->i_mapping, index,
                                         FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                         mapping_gfp_mask(inode->i_mapping), policy);
        mpol_cond_put(policy);

        /*
         * External interfaces like kvm_gmem_get_pfn() support dealing
         * with hugepages to a degree, but internally, guest_memfd currently
         * assumes that all folios are order-0 and handling would need
         * to be updated for anything otherwise (e.g. page-clearing
         * operations).
         */
        WARN_ON_ONCE(!IS_ERR(folio) && folio_order(folio));

        return folio;
}

static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
        if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
                return KVM_FILTER_SHARED;

        return KVM_FILTER_PRIVATE;
}

static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
                                        pgoff_t end,
                                        enum kvm_gfn_range_filter attr_filter)
{
        bool flush = false, found_memslot = false;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = f->kvm;
        unsigned long index;

        xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
                pgoff_t pgoff = slot->gmem.pgoff;

                struct kvm_gfn_range gfn_range = {
                        .start = slot->base_gfn + max(pgoff, start) - pgoff,
                        .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
                        .slot = slot,
                        .may_block = true,
                        .attr_filter = attr_filter,
                };

                if (!found_memslot) {
                        found_memslot = true;

                        KVM_MMU_LOCK(kvm);
                        kvm_mmu_invalidate_begin(kvm);
                }

                flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
        }

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}
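
/*
 * Example of the range clamping above (illustrative numbers): for a binding
 * with pgoff = 0x10, base_gfn = 0x100 and npages = 0x40, invalidating file
 * range [0x00, 0x20) intersects the binding at file indices [0x10, 0x20),
 * which yields gfn_range [0x100, 0x110).
 */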

static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
                                      pgoff_t end)
{
        enum kvm_gfn_range_filter attr_filter;
        struct gmem_file *f;

        attr_filter = kvm_gmem_get_invalidate_filter(inode);

        kvm_gmem_for_each_file(f, inode)
                __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}

static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
                                      pgoff_t end)
{
        struct kvm *kvm = f->kvm;

        if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
                KVM_MMU_LOCK(kvm);
                kvm_mmu_invalidate_end(kvm);
                KVM_MMU_UNLOCK(kvm);
        }
}

static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
                                    pgoff_t end)
{
        struct gmem_file *f;

        kvm_gmem_for_each_file(f, inode)
                __kvm_gmem_invalidate_end(f, start, end);
}

static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        pgoff_t start = offset >> PAGE_SHIFT;
        pgoff_t end = (offset + len) >> PAGE_SHIFT;

        /*
         * Bindings must be stable across invalidation to ensure the start+end
         * are balanced.
         */
        filemap_invalidate_lock(inode->i_mapping);

        kvm_gmem_invalidate_begin(inode, start, end);

        truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);

        kvm_gmem_invalidate_end(inode, start, end);

        filemap_invalidate_unlock(inode->i_mapping);

        return 0;
}
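
/*
 * The begin/truncate/end bracketing above mirrors the mmu_notifier contract:
 * while an invalidation is in progress, concurrent fault handlers resolving
 * PFNs through kvm_gmem_get_pfn() can detect the in-progress invalidation
 * and retry, which is also why the bindings must not change in between
 * (hence filemap_invalidate_lock()).
 */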

static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t start, index, end;
        int r;

        /* The guest_memfd size is immutable; disallow growing the file. */
        if (offset + len > i_size_read(inode))
                return -EINVAL;

        filemap_invalidate_lock_shared(mapping);

        start = offset >> PAGE_SHIFT;
        end = (offset + len) >> PAGE_SHIFT;

        r = 0;
        for (index = start; index < end; ) {
                struct folio *folio;

                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                folio = kvm_gmem_get_folio(inode, index);
                if (IS_ERR(folio)) {
                        r = PTR_ERR(folio);
                        break;
                }

                index = folio_next_index(folio);

                folio_unlock(folio);
                folio_put(folio);

                /* 64-bit only, wrapping the index should be impossible. */
                if (WARN_ON_ONCE(!index))
                        break;

                cond_resched();
        }

        filemap_invalidate_unlock_shared(mapping);

        return r;
}

static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
                               loff_t len)
{
        int ret;

        if (!(mode & FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
                return -EINVAL;

        if (mode & FALLOC_FL_PUNCH_HOLE)
                ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
        else
                ret = kvm_gmem_allocate(file_inode(file), offset, len);

        if (!ret)
                file_modified(file);
        return ret;
}
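
/*
 * Illustrative userspace usage of the fallocate() support above (a sketch,
 * not part of this file); per the checks in kvm_gmem_fallocate(), offset and
 * len must be page-aligned and FALLOC_FL_KEEP_SIZE is always required:
 *
 *      // Preallocate 2MiB of guest memory at offset 0.
 *      fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, 0, 2 * 1024 * 1024);
 *
 *      // Discard the same range, zapping it from the guest first.
 *      fallocate(gmem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                0, 2 * 1024 * 1024);
 */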

static int kvm_gmem_release(struct inode *inode, struct file *file)
{
        struct gmem_file *f = file->private_data;
        struct kvm_memory_slot *slot;
        struct kvm *kvm = f->kvm;
        unsigned long index;

        /*
         * Prevent concurrent attempts to *unbind* a memslot. This is the last
         * reference to the file and thus no new bindings can be created, but
         * dereferencing the slot for existing bindings needs to be protected
         * against memslot updates, specifically so that unbind doesn't race
         * and free the memslot (kvm_gmem_get_file() will return NULL).
         *
         * Since .release is called only when the reference count is zero,
         * after which file_ref_get() and get_file_active() fail,
         * kvm_gmem_get_pfn() cannot be using the file concurrently.
         * file_ref_put() provides a full barrier, and get_file_active() the
         * matching acquire barrier.
         */
        mutex_lock(&kvm->slots_lock);

        filemap_invalidate_lock(inode->i_mapping);

        xa_for_each(&f->bindings, index, slot)
                WRITE_ONCE(slot->gmem.file, NULL);

        /*
         * All in-flight operations are gone and no new bindings can be
         * created. Zap all SPTEs pointed at by this file. Do not free the
         * backing memory, as its lifetime is associated with the inode, not
         * the file.
         */
        __kvm_gmem_invalidate_begin(f, 0, -1ul,
                                    kvm_gmem_get_invalidate_filter(inode));
        __kvm_gmem_invalidate_end(f, 0, -1ul);

        list_del(&f->entry);

        filemap_invalidate_unlock(inode->i_mapping);

        mutex_unlock(&kvm->slots_lock);

        xa_destroy(&f->bindings);
        kfree(f);

        kvm_put_kvm(kvm);

        return 0;
}

static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
        /*
         * Do not return slot->gmem.file if it has already been closed;
         * there might be some time between the last fput() and when
         * kvm_gmem_release() clears slot->gmem.file.
         */
        return get_file_active(&slot->gmem.file);
}

DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
             kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);

static bool kvm_gmem_supports_mmap(struct inode *inode)
{
        return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}

static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        struct folio *folio;
        vm_fault_t ret = VM_FAULT_LOCKED;

        if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
                return VM_FAULT_SIGBUS;

        if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
                return VM_FAULT_SIGBUS;

        folio = kvm_gmem_get_folio(inode, vmf->pgoff);
        if (IS_ERR(folio)) {
                if (PTR_ERR(folio) == -EAGAIN)
                        return VM_FAULT_RETRY;

                return vmf_error(PTR_ERR(folio));
        }

        if (WARN_ON_ONCE(folio_test_large(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_folio;
        }

        if (!folio_test_uptodate(folio)) {
                clear_highpage(folio_page(folio, 0));
                folio_mark_uptodate(folio);
        }

        vmf->page = folio_file_page(folio, vmf->pgoff);

out_folio:
        if (ret != VM_FAULT_LOCKED) {
                folio_unlock(folio);
                folio_put(folio);
        }

        return ret;
}

#ifdef CONFIG_NUMA
static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
        struct inode *inode = file_inode(vma->vm_file);

        return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
                                             unsigned long addr, pgoff_t *pgoff)
{
        struct inode *inode = file_inode(vma->vm_file);

        *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

        /*
         * Return the memory policy for this index, or NULL if none is set.
         *
         * Returning NULL, e.g. instead of the current task's memory policy, is
         * important for the .get_policy kernel ABI: it indicates that no
         * explicit policy has been set via mbind() for this memory. The caller
         * can then replace NULL with the default memory policy instead of the
         * current task's memory policy.
         */
        return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */
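
/*
 * Illustrative sketch (not part of this file) of how the shared policy above
 * is reached from userspace: mbind() on a region mmap()ed from a guest_memfd
 * invokes vma->vm_ops->set_policy, and later allocations in
 * kvm_gmem_get_folio() honor the stored policy via
 * mpol_shared_policy_lookup():
 *
 *      void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *                     gmem_fd, 0);
 *      mbind(p, size, MPOL_BIND, &nodemask, maxnode, 0);
 */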

static const struct vm_operations_struct kvm_gmem_vm_ops = {
        .fault = kvm_gmem_fault_user_mapping,
#ifdef CONFIG_NUMA
        .get_policy = kvm_gmem_get_policy,
        .set_policy = kvm_gmem_set_policy,
#endif
};

static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (!kvm_gmem_supports_mmap(file_inode(file)))
                return -ENODEV;

        if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
            (VM_SHARED | VM_MAYSHARE)) {
                return -EINVAL;
        }

        vma->vm_ops = &kvm_gmem_vm_ops;

        return 0;
}

static struct file_operations kvm_gmem_fops = {
        .mmap = kvm_gmem_mmap,
        .open = generic_file_open,
        .release = kvm_gmem_release,
        .fallocate = kvm_gmem_fallocate,
};

static int kvm_gmem_migrate_folio(struct address_space *mapping,
                                  struct folio *dst, struct folio *src,
                                  enum migrate_mode mode)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}

static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
        pgoff_t start, end;

        filemap_invalidate_lock_shared(mapping);

        start = folio->index;
        end = start + folio_nr_pages(folio);

        kvm_gmem_invalidate_begin(mapping->host, start, end);

        /*
         * Do not truncate the range, what action is taken in response to the
         * error is userspace's decision (assuming the architecture supports
         * gracefully handling memory errors). If/when the guest attempts to
         * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
         * at which point KVM can either terminate the VM or propagate the
         * error to userspace.
         */

        kvm_gmem_invalidate_end(mapping->host, start, end);

        filemap_invalidate_unlock_shared(mapping);

        return MF_DELAYED;
}

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
        struct page *page = folio_page(folio, 0);
        kvm_pfn_t pfn = page_to_pfn(page);
        int order = folio_order(folio);

        kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif

static const struct address_space_operations kvm_gmem_aops = {
        .dirty_folio = noop_dirty_folio,
        .migrate_folio = kvm_gmem_migrate_folio,
        .error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
        .free_folio = kvm_gmem_free_folio,
#endif
};

static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                            struct iattr *attr)
{
        return -EINVAL;
}

static const struct inode_operations kvm_gmem_iops = {
        .setattr = kvm_gmem_setattr,
};

bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
        return true;
}

static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
        static const char *name = "[kvm-gmem]";
        struct gmem_file *f;
        struct inode *inode;
        struct file *file;
        int fd, err;

        fd = get_unused_fd_flags(0);
        if (fd < 0)
                return fd;

        f = kzalloc_obj(*f);
        if (!f) {
                err = -ENOMEM;
                goto err_fd;
        }

        /* __fput() will take care of fops_put(). */
        if (!fops_get(&kvm_gmem_fops)) {
                err = -ENOENT;
                goto err_gmem;
        }

        inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto err_fops;
        }

        inode->i_op = &kvm_gmem_iops;
        inode->i_mapping->a_ops = &kvm_gmem_aops;
        inode->i_mode |= S_IFREG;
        inode->i_size = size;
        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_inaccessible(inode->i_mapping);
        /* Unmovable mappings are supposed to be marked unevictable as well. */
        WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));

        GMEM_I(inode)->flags = flags;

        file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                goto err_inode;
        }

        file->f_flags |= O_LARGEFILE;
        file->private_data = f;

        kvm_get_kvm(kvm);
        f->kvm = kvm;
        xa_init(&f->bindings);
        list_add(&f->entry, &GMEM_I(inode)->gmem_file_list);

        fd_install(fd, file);
        return fd;

err_inode:
        iput(inode);
err_fops:
        fops_put(&kvm_gmem_fops);
err_gmem:
        kfree(f);
err_fd:
        put_unused_fd(fd);
        return err;
}

int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
        loff_t size = args->size;
        u64 flags = args->flags;

        if (flags & ~kvm_gmem_get_supported_flags(kvm))
                return -EINVAL;

        if (size <= 0 || !PAGE_ALIGNED(size))
                return -EINVAL;

        return __kvm_gmem_create(kvm, size, flags);
}
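
/*
 * Illustrative userspace sketch for the ioctl handled above (assumes the
 * KVM_CREATE_GUEST_MEMFD ioctl and struct kvm_create_guest_memfd from
 * <linux/kvm.h>, and that the kernel advertises the flags used):
 *
 *      struct kvm_create_guest_memfd args = {
 *              .size  = 2 * 1024 * 1024,       // must be > 0 and page-aligned
 *              .flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED,
 *      };
 *      int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
 */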

int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset)
{
        loff_t size = slot->npages << PAGE_SHIFT;
        unsigned long start, end;
        struct gmem_file *f;
        struct inode *inode;
        struct file *file;
        int r = -EINVAL;

        BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));

        file = fget(fd);
        if (!file)
                return -EBADF;

        if (file->f_op != &kvm_gmem_fops)
                goto err;

        f = file->private_data;
        if (f->kvm != kvm)
                goto err;

        inode = file_inode(file);

        if (offset < 0 || !PAGE_ALIGNED(offset) ||
            offset + size > i_size_read(inode))
                goto err;

        filemap_invalidate_lock(inode->i_mapping);

        start = offset >> PAGE_SHIFT;
        end = start + slot->npages;

        if (!xa_empty(&f->bindings) &&
            xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
                filemap_invalidate_unlock(inode->i_mapping);
                goto err;
        }

        /*
         * Memslots with the KVM_MEM_GUEST_MEMFD flag are immutable once
         * created, so kvm_gmem_bind() must occur on a new memslot. Because
         * the memslot is not visible yet, kvm_gmem_get_pfn() is guaranteed
         * to see the file.
         */
        WRITE_ONCE(slot->gmem.file, file);
        slot->gmem.pgoff = start;
        if (kvm_gmem_supports_mmap(inode))
                slot->flags |= KVM_MEMSLOT_GMEM_ONLY;

        xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
        filemap_invalidate_unlock(inode->i_mapping);

        /*
         * Drop the reference to the file, even on success. The file pins KVM,
         * not the other way 'round. Active bindings are invalidated if the
         * file is closed before memslots are destroyed.
         */
        r = 0;
err:
        fput(file);
        return r;
}
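
/*
 * Illustrative sketch of the userspace side of binding (assumes
 * KVM_SET_USER_MEMORY_REGION2 and struct kvm_userspace_memory_region2 from
 * <linux/kvm.h>):
 *
 *      struct kvm_userspace_memory_region2 region = {
 *              .slot = 0,
 *              .flags = KVM_MEM_GUEST_MEMFD,
 *              .guest_phys_addr = 0x100000,
 *              .memory_size = 2 * 1024 * 1024,
 *              .guest_memfd = gmem_fd,
 *              .guest_memfd_offset = 0,  // page-aligned; stored (in pages) as slot->gmem.pgoff
 *      };
 *      ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
 */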

static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
        unsigned long start = slot->gmem.pgoff;
        unsigned long end = start + slot->npages;

        xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);

        /*
         * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
         * cannot see this memslot.
         */
        WRITE_ONCE(slot->gmem.file, NULL);
}

void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        /*
         * Nothing to do if the underlying file was _already_ closed, as
         * kvm_gmem_release() invalidates and nullifies all bindings.
         */
        if (!slot->gmem.file)
                return;

        CLASS(gmem_get_file, file)(slot);

        /*
         * However, if the file is _being_ closed, then the bindings need to be
         * removed as kvm_gmem_release() might not run until after the memslot
         * is freed. Note, modifying the bindings is safe even though the file
         * is dying as kvm_gmem_release() nullifies slot->gmem.file under
         * slots_lock, and only puts its reference to KVM after destroying all
         * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
         * yet destroyed the bindings or freed the gmem_file, and can't do so
         * until the caller drops slots_lock.
         */
        if (!file) {
                __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
                return;
        }

        filemap_invalidate_lock(file->f_mapping);
        __kvm_gmem_unbind(slot, file->private_data);
        filemap_invalidate_unlock(file->f_mapping);
}

/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
                                        struct kvm_memory_slot *slot,
                                        pgoff_t index, kvm_pfn_t *pfn,
                                        int *max_order)
{
        struct file *slot_file = READ_ONCE(slot->gmem.file);
        struct gmem_file *f = file->private_data;
        struct folio *folio;

        if (file != slot_file) {
                WARN_ON_ONCE(slot_file);
                return ERR_PTR(-EFAULT);
        }

        if (xa_load(&f->bindings, index) != slot) {
                WARN_ON_ONCE(xa_load(&f->bindings, index));
                return ERR_PTR(-EIO);
        }

        folio = kvm_gmem_get_folio(file_inode(file), index);
        if (IS_ERR(folio))
                return folio;

        if (folio_test_hwpoison(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return ERR_PTR(-EHWPOISON);
        }

        *pfn = folio_file_pfn(folio, index);
        if (max_order)
                *max_order = 0;

        return folio;
}

int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
                     int *max_order)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct folio *folio;
        int r = 0;

        CLASS(gmem_get_file, file)(slot);
        if (!file)
                return -EFAULT;

        folio = __kvm_gmem_get_pfn(file, slot, index, pfn, max_order);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        if (!folio_test_uptodate(folio)) {
                clear_highpage(folio_page(folio, 0));
                folio_mark_uptodate(folio);
        }

        r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);

        folio_unlock(folio);

        if (!r)
                *page = folio_file_page(folio, index);
        else
                folio_put(folio);

        return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
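
/*
 * Sketch of the expected caller pattern for kvm_gmem_get_pfn() (illustrative,
 * mirroring how arch fault handlers consume it): on success, the folio's
 * reference is handed to the caller via @page, to be released once the PFN
 * has been installed in the stage-2 page tables, e.g. via
 * kvm_release_faultin_page().
 */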

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE

static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
                                struct file *file, gfn_t gfn, struct page *src_page,
                                kvm_gmem_populate_cb post_populate, void *opaque)
{
        pgoff_t index = kvm_gmem_get_index(slot, gfn);
        struct folio *folio;
        kvm_pfn_t pfn;
        int ret;

        filemap_invalidate_lock(file->f_mapping);

        folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, NULL);
        if (IS_ERR(folio)) {
                ret = PTR_ERR(folio);
                goto out_unlock;
        }

        folio_unlock(folio);

        if (!kvm_range_has_memory_attributes(kvm, gfn, gfn + 1,
                                             KVM_MEMORY_ATTRIBUTE_PRIVATE,
                                             KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
                ret = -EINVAL;
                goto out_put_folio;
        }

        ret = post_populate(kvm, gfn, pfn, src_page, opaque);
        if (!ret)
                folio_mark_uptodate(folio);

out_put_folio:
        folio_put(folio);
out_unlock:
        filemap_invalidate_unlock(file->f_mapping);
        return ret;
}

long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
                       kvm_gmem_populate_cb post_populate, void *opaque)
{
        struct kvm_memory_slot *slot;
        int ret = 0;
        long i;

        lockdep_assert_held(&kvm->slots_lock);

        if (WARN_ON_ONCE(npages <= 0))
                return -EINVAL;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(src)))
                return -EINVAL;

        slot = gfn_to_memslot(kvm, start_gfn);
        if (!kvm_slot_has_gmem(slot))
                return -EINVAL;

        CLASS(gmem_get_file, file)(slot);
        if (!file)
                return -EFAULT;

        npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
        for (i = 0; i < npages; i++) {
                struct page *src_page = NULL;

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                if (src) {
                        unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;

                        ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
                        if (ret < 0)
                                break;
                        if (ret != 1) {
                                ret = -ENOMEM;
                                break;
                        }
                }

                ret = __kvm_gmem_populate(kvm, slot, file, start_gfn + i, src_page,
                                          post_populate, opaque);

                if (src_page)
                        put_page(src_page);

                if (ret)
                        break;
        }

        return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
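
/*
 * Sketch of a post_populate callback (illustrative; loosely modeled on how an
 * arch like SEV-SNP copies and converts the initial payload, and all names
 * below are hypothetical). The signature matches the call site in
 * __kvm_gmem_populate() above:
 *
 *      static int example_post_populate(struct kvm *kvm, gfn_t gfn,
 *                                       kvm_pfn_t pfn, struct page *src_page,
 *                                       void *opaque)
 *      {
 *              // Copy @src_page into @pfn, then convert the page to
 *              // guest-private via firmware/hardware.
 *              return 0;
 *      }
 *
 *      kvm_gmem_populate(kvm, gfn, src_uaddr, 1, example_post_populate, NULL);
 */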
#endif

static struct kmem_cache *kvm_gmem_inode_cachep;

static void kvm_gmem_init_inode_once(void *__gi)
{
        struct gmem_inode *gi = __gi;

        /*
         * Note! Don't initialize the inode with anything specific to the
         * guest_memfd instance, or that might be specific to how the inode is
         * used (from the VFS-layer's perspective). This hook is called only
         * during the initial slab allocation, i.e. only fields/state that are
         * idempotent across _all_ use of the inode _object_ can be initialized
         * at this time!
         */
        inode_init_once(&gi->vfs_inode);
}

static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
{
        struct gmem_inode *gi;

        gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
        if (!gi)
                return NULL;

        mpol_shared_policy_init(&gi->policy, NULL);

        gi->flags = 0;
        INIT_LIST_HEAD(&gi->gmem_file_list);
        return &gi->vfs_inode;
}

static void kvm_gmem_destroy_inode(struct inode *inode)
{
        mpol_free_shared_policy(&GMEM_I(inode)->policy);
}

static void kvm_gmem_free_inode(struct inode *inode)
{
        kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
}

static const struct super_operations kvm_gmem_super_operations = {
        .statfs = simple_statfs,
        .alloc_inode = kvm_gmem_alloc_inode,
        .destroy_inode = kvm_gmem_destroy_inode,
        .free_inode = kvm_gmem_free_inode,
};

static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx;

        if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
                return -ENOMEM;

        fc->s_iflags |= SB_I_NOEXEC;
        fc->s_iflags |= SB_I_NODEV;
        ctx = fc->fs_private;
        ctx->ops = &kvm_gmem_super_operations;

        return 0;
}

static struct file_system_type kvm_gmem_fs = {
        .name = "guest_memfd",
        .init_fs_context = kvm_gmem_init_fs_context,
        .kill_sb = kill_anon_super,
};

static int kvm_gmem_init_mount(void)
{
        kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);

        if (IS_ERR(kvm_gmem_mnt))
                return PTR_ERR(kvm_gmem_mnt);

        kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
        return 0;
}

int kvm_gmem_init(struct module *module)
{
        struct kmem_cache_args args = {
                .align = 0,
                .ctor = kvm_gmem_init_inode_once,
        };
        int ret;

        kvm_gmem_fops.owner = module;
        kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
                                                  sizeof(struct gmem_inode),
                                                  &args, SLAB_ACCOUNT);
        if (!kvm_gmem_inode_cachep)
                return -ENOMEM;

        ret = kvm_gmem_init_mount();
        if (ret) {
                kmem_cache_destroy(kvm_gmem_inode_cachep);
                return ret;
        }
        return 0;
}

void kvm_gmem_exit(void)
{
        kern_unmount(kvm_gmem_mnt);
        kvm_gmem_mnt = NULL;
        rcu_barrier();
        kmem_cache_destroy(kvm_gmem_inode_cachep);
}