xref: /linux/arch/x86/kernel/kvm.c (revision ac633ba77c84fa5be1ec081967be081d6e25577e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * KVM paravirt_ops implementation
4  *
5  * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6  * Copyright IBM Corporation, 2007
7  *   Authors: Anthony Liguori <aliguori@us.ibm.com>
8  */
9 
10 #define pr_fmt(fmt) "kvm-guest: " fmt
11 
12 #include <linux/context_tracking.h>
13 #include <linux/init.h>
14 #include <linux/irq.h>
15 #include <linux/kernel.h>
16 #include <linux/kvm_para.h>
17 #include <linux/cpu.h>
18 #include <linux/mm.h>
19 #include <linux/highmem.h>
20 #include <linux/hardirq.h>
21 #include <linux/notifier.h>
22 #include <linux/reboot.h>
23 #include <linux/hash.h>
24 #include <linux/sched.h>
25 #include <linux/slab.h>
26 #include <linux/kprobes.h>
27 #include <linux/nmi.h>
28 #include <linux/swait.h>
29 #include <linux/syscore_ops.h>
30 #include <linux/cc_platform.h>
31 #include <linux/efi.h>
32 #include <linux/kvm_types.h>
33 #include <linux/sched/cputime.h>
34 #include <asm/timer.h>
35 #include <asm/cpu.h>
36 #include <asm/traps.h>
37 #include <asm/desc.h>
38 #include <asm/tlbflush.h>
39 #include <asm/apic.h>
40 #include <asm/apicdef.h>
41 #include <asm/hypervisor.h>
42 #include <asm/mtrr.h>
43 #include <asm/tlb.h>
44 #include <asm/cpuidle_haltpoll.h>
45 #include <asm/msr.h>
46 #include <asm/ptrace.h>
47 #include <asm/reboot.h>
48 #include <asm/svm.h>
49 #include <asm/e820/api.h>
50 
51 DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
52 
53 static int kvmapf = 1;
54 
parse_no_kvmapf(char * arg)55 static int __init parse_no_kvmapf(char *arg)
56 {
57         kvmapf = 0;
58         return 0;
59 }
60 
61 early_param("no-kvmapf", parse_no_kvmapf);
62 
63 static int steal_acc = 1;
parse_no_stealacc(char * arg)64 static int __init parse_no_stealacc(char *arg)
65 {
66         steal_acc = 0;
67         return 0;
68 }
69 
70 early_param("no-steal-acc", parse_no_stealacc);
71 
72 static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
73 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
74 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
75 static int has_steal_clock = 0;
76 
77 static int has_guest_poll = 0;
78 
79 #define KVM_TASK_SLEEP_HASHBITS 8
80 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
81 
82 struct kvm_task_sleep_node {
83 	struct hlist_node link;
84 	struct swait_queue_head wq;
85 	u32 token;
86 	int cpu;
87 	bool dummy;
88 };
89 
90 static struct kvm_task_sleep_head {
91 	raw_spinlock_t lock;
92 	struct hlist_head list;
93 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
94 
/*
 * Look up the sleep node carrying @token in bucket @b.
 *
 * The function does no locking of its own; every caller in this file holds
 * b->lock around the call.
 *
 * Returns the matching node, or NULL when the bucket has no such token.
 */
static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct kvm_task_sleep_node *node;

	hlist_for_each_entry(node, &b->list, link) {
		if (node->token == token)
			return node;
	}

	return NULL;
}
109 
kvm_async_pf_queue_task(u32 token,struct kvm_task_sleep_node * n)110 static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
111 {
112 	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
113 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
114 	struct kvm_task_sleep_node *e;
115 
116 	raw_spin_lock(&b->lock);
117 	e = _find_apf_task(b, token);
118 	if (e) {
119 		struct kvm_task_sleep_node *dummy = NULL;
120 
121 		/*
122 		 * The entry can either be a 'dummy' entry (which is put on the
123 		 * list when wake-up happens ahead of APF handling completion)
124 		 * or a token from another task which should not be touched.
125 		 */
126 		if (e->dummy) {
127 			hlist_del(&e->link);
128 			dummy = e;
129 		}
130 
131 		raw_spin_unlock(&b->lock);
132 		kfree(dummy);
133 		return false;
134 	}
135 
136 	n->token = token;
137 	n->cpu = smp_processor_id();
138 	n->dummy = false;
139 	init_swait_queue_head(&n->wq);
140 	hlist_add_head(&n->link, &b->list);
141 	raw_spin_unlock(&b->lock);
142 	return true;
143 }
144 
145 /*
146  * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
147  * @token:	Token to identify the sleep node entry
148  *
149  * Invoked from the async pagefault handling code or from the VM exit page
150  * fault handler. In both cases RCU is watching.
151  */
kvm_async_pf_task_wait_schedule(u32 token)152 void kvm_async_pf_task_wait_schedule(u32 token)
153 {
154 	struct kvm_task_sleep_node n;
155 	DECLARE_SWAITQUEUE(wait);
156 
157 	lockdep_assert_irqs_disabled();
158 
159 	if (!kvm_async_pf_queue_task(token, &n))
160 		return;
161 
162 	for (;;) {
163 		prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
164 		if (hlist_unhashed(&n.link))
165 			break;
166 
167 		local_irq_enable();
168 		schedule();
169 		local_irq_disable();
170 	}
171 	finish_swait(&n.wq, &wait);
172 }
173 EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
174 
apf_task_wake_one(struct kvm_task_sleep_node * n)175 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
176 {
177 	hlist_del_init(&n->link);
178 	if (swq_has_sleeper(&n->wq))
179 		swake_up_one(&n->wq);
180 }
181 
apf_task_wake_all(void)182 static void apf_task_wake_all(void)
183 {
184 	int i;
185 
186 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
187 		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
188 		struct kvm_task_sleep_node *n;
189 		struct hlist_node *p, *next;
190 
191 		raw_spin_lock(&b->lock);
192 		hlist_for_each_safe(p, next, &b->list) {
193 			n = hlist_entry(p, typeof(*n), link);
194 			if (n->cpu == smp_processor_id())
195 				apf_task_wake_one(n);
196 		}
197 		raw_spin_unlock(&b->lock);
198 	}
199 }
200 
/*
 * Wake the task waiting on @token.
 *
 * A token of ~0 wakes every sleeper queued by this CPU.  If no sleep node
 * exists yet for @token, a 'dummy' node is inserted so that the later
 * kvm_async_pf_queue_task() call finds it and does not go to sleep.
 */
static void kvm_async_pf_task_wake(u32 token)
{
	struct kvm_task_sleep_head *bucket;
	struct kvm_task_sleep_node *node, *dummy = NULL;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

	bucket = &async_pf_sleepers[hash_32(token, KVM_TASK_SLEEP_HASHBITS)];

	for (;;) {
		raw_spin_lock(&bucket->lock);
		node = _find_apf_task(bucket, token);
		if (node) {
			apf_task_wake_one(node);
			break;
		}

		if (dummy) {
			/* Async #PF not yet handled: park a dummy entry. */
			dummy->token = token;
			dummy->cpu = smp_processor_id();
			dummy->dummy = true;
			init_swait_queue_head(&dummy->wq);
			hlist_add_head(&dummy->link, &bucket->list);
			dummy = NULL;
			break;
		}

		/*
		 * Allocating the dummy must be done outside of the raw lock
		 * as the allocator is preemptible on PREEMPT_RT kernels.
		 */
		raw_spin_unlock(&bucket->lock);
		dummy = kzalloc_obj(*dummy, GFP_ATOMIC);

		/*
		 * Keep looping on allocation failure, eventually the async
		 * #PF will be handled and a new node becomes unnecessary.
		 * On success the lookup is repeated as well, so that a node
		 * queued in the meantime does not get a duplicate entry.
		 */
		if (!dummy)
			cpu_relax();
	}
	raw_spin_unlock(&bucket->lock);

	/* A dummy token might be allocated and ultimately not used.  */
	kfree(dummy);
}
253 
kvm_read_and_reset_apf_flags(void)254 noinstr u32 kvm_read_and_reset_apf_flags(void)
255 {
256 	u32 flags = 0;
257 
258 	if (__this_cpu_read(async_pf_enabled)) {
259 		flags = __this_cpu_read(apf_reason.flags);
260 		__this_cpu_write(apf_reason.flags, 0);
261 	}
262 
263 	return flags;
264 }
265 EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
266 
__kvm_handle_async_pf(struct pt_regs * regs,u32 token)267 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
268 {
269 	u32 flags = kvm_read_and_reset_apf_flags();
270 	irqentry_state_t state;
271 
272 	if (!flags)
273 		return false;
274 
275 	state = irqentry_enter(regs);
276 	instrumentation_begin();
277 
278 	/*
279 	 * If the host managed to inject an async #PF into an interrupt
280 	 * disabled region, then die hard as this is not going to end well
281 	 * and the host side is seriously broken.
282 	 */
283 	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
284 		panic("Host injected async #PF in interrupt disabled region\n");
285 
286 	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
287 		if (unlikely(!(user_mode(regs))))
288 			panic("Host injected async #PF in kernel mode\n");
289 		/* Page is swapped out by the host. */
290 		kvm_async_pf_task_wait_schedule(token);
291 	} else {
292 		WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
293 	}
294 
295 	instrumentation_end();
296 	irqentry_exit(regs, state);
297 	return true;
298 }
299 
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)300 DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
301 {
302 	struct pt_regs *old_regs = set_irq_regs(regs);
303 	u32 token;
304 
305 	apic_eoi();
306 
307 	inc_irq_stat(irq_hv_callback_count);
308 
309 	if (__this_cpu_read(async_pf_enabled)) {
310 		token = __this_cpu_read(apf_reason.token);
311 		kvm_async_pf_task_wake(token);
312 		__this_cpu_write(apf_reason.token, 0);
313 		wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1);
314 	}
315 
316 	set_irq_regs(old_regs);
317 }
318 
paravirt_ops_setup(void)319 static void __init paravirt_ops_setup(void)
320 {
321 	pv_info.name = "KVM";
322 
323 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
324 		pv_info.io_delay = false;
325 
326 #ifdef CONFIG_X86_IO_APIC
327 	no_timer_check = 1;
328 #endif
329 }
330 
kvm_register_steal_time(void)331 static void kvm_register_steal_time(void)
332 {
333 	int cpu = smp_processor_id();
334 	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
335 
336 	if (!has_steal_clock)
337 		return;
338 
339 	wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
340 	pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
341 		(unsigned long long) slow_virt_to_phys(st));
342 }
343 
344 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
345 
kvm_guest_apic_eoi_write(void)346 static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
347 {
348 	/**
349 	 * This relies on __test_and_clear_bit to modify the memory
350 	 * in a way that is atomic with respect to the local CPU.
351 	 * The hypervisor only accesses this memory from the local CPU so
352 	 * there's no need for lock or memory barriers.
353 	 * An optimization barrier is implied in apic write.
354 	 */
355 	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
356 		return;
357 	apic_native_eoi();
358 }
359 
kvm_guest_cpu_init(void)360 static void kvm_guest_cpu_init(void)
361 {
362 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
363 		u64 pa;
364 
365 		WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
366 
367 		pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
368 		pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
369 
370 		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
371 			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
372 
373 		wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
374 
375 		wrmsrq(MSR_KVM_ASYNC_PF_EN, pa);
376 		__this_cpu_write(async_pf_enabled, true);
377 		pr_debug("setup async PF for cpu %d\n", smp_processor_id());
378 	}
379 
380 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
381 		unsigned long pa;
382 
383 		/* Size alignment is implied but just to make it explicit. */
384 		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
385 		__this_cpu_write(kvm_apic_eoi, 0);
386 		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
387 			| KVM_MSR_ENABLED;
388 		wrmsrq(MSR_KVM_PV_EOI_EN, pa);
389 	}
390 
391 	if (has_steal_clock)
392 		kvm_register_steal_time();
393 }
394 
kvm_pv_disable_apf(void)395 static void kvm_pv_disable_apf(void)
396 {
397 	if (!__this_cpu_read(async_pf_enabled))
398 		return;
399 
400 	wrmsrq(MSR_KVM_ASYNC_PF_EN, 0);
401 	__this_cpu_write(async_pf_enabled, false);
402 
403 	pr_debug("disable async PF for cpu %d\n", smp_processor_id());
404 }
405 
kvm_disable_steal_time(void)406 static void kvm_disable_steal_time(void)
407 {
408 	if (!has_steal_clock)
409 		return;
410 
411 	wrmsrq(MSR_KVM_STEAL_TIME, 0);
412 }
413 
kvm_steal_clock(int cpu)414 static u64 kvm_steal_clock(int cpu)
415 {
416 	u64 steal;
417 	struct kvm_steal_time *src;
418 	int version;
419 
420 	src = &per_cpu(steal_time, cpu);
421 	do {
422 		version = src->version;
423 		virt_rmb();
424 		steal = src->steal;
425 		virt_rmb();
426 	} while ((version & 1) || (version != src->version));
427 
428 	return steal;
429 }
430 
__set_percpu_decrypted(void * ptr,unsigned long size)431 static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
432 {
433 	early_set_memory_decrypted((unsigned long) ptr, size);
434 }
435 
436 /*
437  * Iterate through all possible CPUs and map the memory region pointed
438  * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
439  *
440  * Note: we iterate through all possible CPUs to ensure that CPUs
441  * hotplugged will have their per-cpu variable already mapped as
442  * decrypted.
443  */
sev_map_percpu_data(void)444 static void __init sev_map_percpu_data(void)
445 {
446 	int cpu;
447 
448 	if (cc_vendor != CC_VENDOR_AMD ||
449 	    !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
450 		return;
451 
452 	for_each_possible_cpu(cpu) {
453 		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
454 		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
455 		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
456 	}
457 }
458 
/*
 * Undo the per-CPU paravirt setup on the calling CPU.
 * @shutdown: when true, tasks sleeping on async page faults are not woken.
 */
static void kvm_guest_cpu_offline(bool shutdown)
{
	kvm_disable_steal_time();

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		wrmsrq(MSR_KVM_PV_EOI_EN, 0);
	}
	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
		wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);
	}

	kvm_pv_disable_apf();
	if (!shutdown) {
		apf_task_wake_all();
	}

	kvmclock_disable();
}
471 
/*
 * Run kvm_guest_cpu_init() on the calling CPU with interrupts disabled.
 * @cpu: not used by the body.
 *
 * Returns 0 unconditionally.
 */
static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	kvm_guest_cpu_init();
	local_irq_restore(irqflags);

	return 0;
}
481 
482 #ifdef CONFIG_SMP
483 
484 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
485 
pv_tlb_flush_supported(void)486 static bool pv_tlb_flush_supported(void)
487 {
488 	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
489 		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
490 		kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
491 		!boot_cpu_has(X86_FEATURE_MWAIT) &&
492 		(num_possible_cpus() != 1));
493 }
494 
pv_ipi_supported(void)495 static bool pv_ipi_supported(void)
496 {
497 	return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
498 	       (num_possible_cpus() != 1));
499 }
500 
pv_sched_yield_supported(void)501 static bool pv_sched_yield_supported(void)
502 {
503 	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
504 		!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
505 	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
506 	    !boot_cpu_has(X86_FEATURE_MWAIT) &&
507 	    (num_possible_cpus() != 1));
508 }
509 
510 #define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)
511 
__send_ipi_mask(const struct cpumask * mask,int vector)512 static void __send_ipi_mask(const struct cpumask *mask, int vector)
513 {
514 	unsigned long flags;
515 	int cpu, min = 0, max = 0;
516 #ifdef CONFIG_X86_64
517 	__uint128_t ipi_bitmap = 0;
518 #else
519 	u64 ipi_bitmap = 0;
520 #endif
521 	u32 apic_id, icr;
522 	long ret;
523 
524 	if (cpumask_empty(mask))
525 		return;
526 
527 	local_irq_save(flags);
528 
529 	switch (vector) {
530 	default:
531 		icr = APIC_DM_FIXED | vector;
532 		break;
533 	case NMI_VECTOR:
534 		icr = APIC_DM_NMI;
535 		break;
536 	}
537 
538 	for_each_cpu(cpu, mask) {
539 		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
540 		if (!ipi_bitmap) {
541 			min = max = apic_id;
542 		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
543 			ipi_bitmap <<= min - apic_id;
544 			min = apic_id;
545 		} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
546 			max = apic_id < max ? max : apic_id;
547 		} else {
548 			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
549 				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
550 			WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
551 				  ret);
552 			min = max = apic_id;
553 			ipi_bitmap = 0;
554 		}
555 		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
556 	}
557 
558 	if (ipi_bitmap) {
559 		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
560 			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
561 		WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
562 			  ret);
563 	}
564 
565 	local_irq_restore(flags);
566 }
567 
/* send_IPI_mask callback: forwards @mask and @vector to __send_ipi_mask(). */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}
572 
kvm_send_ipi_mask_allbutself(const struct cpumask * mask,int vector)573 static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
574 {
575 	unsigned int this_cpu = smp_processor_id();
576 	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
577 	const struct cpumask *local_mask;
578 
579 	cpumask_copy(new_mask, mask);
580 	cpumask_clear_cpu(this_cpu, new_mask);
581 	local_mask = new_mask;
582 	__send_ipi_mask(local_mask, vector);
583 }
584 
setup_efi_kvm_sev_migration(void)585 static int __init setup_efi_kvm_sev_migration(void)
586 {
587 	efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
588 	efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
589 	efi_status_t status;
590 	unsigned long size;
591 	bool enabled;
592 
593 	if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
594 	    !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
595 		return 0;
596 
597 	if (!efi_enabled(EFI_BOOT))
598 		return 0;
599 
600 	if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
601 		pr_info("%s : EFI runtime services are not enabled\n", __func__);
602 		return 0;
603 	}
604 
605 	size = sizeof(enabled);
606 
607 	/* Get variable contents into buffer */
608 	status = efi.get_variable(efi_sev_live_migration_enabled,
609 				  &efi_variable_guid, NULL, &size, &enabled);
610 
611 	if (status == EFI_NOT_FOUND) {
612 		pr_info("%s : EFI live migration variable not found\n", __func__);
613 		return 0;
614 	}
615 
616 	if (status != EFI_SUCCESS) {
617 		pr_info("%s : EFI variable retrieval failed\n", __func__);
618 		return 0;
619 	}
620 
621 	if (enabled == 0) {
622 		pr_info("%s: live migration disabled in EFI\n", __func__);
623 		return 0;
624 	}
625 
626 	pr_info("%s : live migration enabled in EFI\n", __func__);
627 	wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
628 
629 	return 1;
630 }
631 
632 late_initcall(setup_efi_kvm_sev_migration);
633 
634 /*
635  * Set the IPI entry points
636  */
kvm_setup_pv_ipi(void)637 static __init void kvm_setup_pv_ipi(void)
638 {
639 	apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
640 	apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
641 	pr_info("setup PV IPIs\n");
642 }
643 
kvm_smp_send_call_func_ipi(const struct cpumask * mask)644 static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
645 {
646 	int cpu;
647 
648 	native_send_call_func_ipi(mask);
649 
650 	/* Make sure other vCPUs get a chance to run if they need to. */
651 	for_each_cpu(cpu, mask) {
652 		if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
653 			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
654 			break;
655 		}
656 	}
657 }
658 
kvm_flush_tlb_multi(const struct cpumask * cpumask,const struct flush_tlb_info * info)659 static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
660 			const struct flush_tlb_info *info)
661 {
662 	u8 state;
663 	int cpu;
664 	struct kvm_steal_time *src;
665 	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
666 
667 	cpumask_copy(flushmask, cpumask);
668 	/*
669 	 * We have to call flush only on online vCPUs. And
670 	 * queue flush_on_enter for pre-empted vCPUs
671 	 */
672 	for_each_cpu(cpu, flushmask) {
673 		/*
674 		 * The local vCPU is never preempted, so we do not explicitly
675 		 * skip check for local vCPU - it will never be cleared from
676 		 * flushmask.
677 		 */
678 		src = &per_cpu(steal_time, cpu);
679 		state = READ_ONCE(src->preempted);
680 		if ((state & KVM_VCPU_PREEMPTED)) {
681 			if (try_cmpxchg(&src->preempted, &state,
682 					state | KVM_VCPU_FLUSH_TLB))
683 				__cpumask_clear_cpu(cpu, flushmask);
684 		}
685 	}
686 
687 	native_flush_tlb_multi(flushmask, info);
688 }
689 
kvm_alloc_cpumask(void)690 static __init int kvm_alloc_cpumask(void)
691 {
692 	int cpu;
693 
694 	if (!kvm_para_available() || nopv)
695 		return 0;
696 
697 	if (pv_tlb_flush_supported() || pv_ipi_supported())
698 		for_each_possible_cpu(cpu) {
699 			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
700 				GFP_KERNEL, cpu_to_node(cpu));
701 		}
702 
703 	return 0;
704 }
705 arch_initcall(kvm_alloc_cpumask);
706 
kvm_smp_prepare_boot_cpu(void)707 static void __init kvm_smp_prepare_boot_cpu(void)
708 {
709 	/*
710 	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
711 	 * shares the guest physical address with the hypervisor.
712 	 */
713 	sev_map_percpu_data();
714 
715 	kvm_guest_cpu_init();
716 	native_smp_prepare_boot_cpu();
717 	kvm_spinlock_init();
718 }
719 
kvm_cpu_down_prepare(unsigned int cpu)720 static int kvm_cpu_down_prepare(unsigned int cpu)
721 {
722 	unsigned long flags;
723 
724 	local_irq_save(flags);
725 	kvm_guest_cpu_offline(false);
726 	local_irq_restore(flags);
727 	return 0;
728 }
729 
730 #endif
731 
kvm_suspend(void * data)732 static int kvm_suspend(void *data)
733 {
734 	u64 val = 0;
735 
736 	kvm_guest_cpu_offline(false);
737 
738 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
739 	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
740 		rdmsrq(MSR_KVM_POLL_CONTROL, val);
741 	has_guest_poll = !(val & 1);
742 #endif
743 	return 0;
744 }
745 
/*
 * Syscore resume callback: bring the PV features back up on this CPU and,
 * on haltpoll builds, write 0 to MSR_KVM_POLL_CONTROL again if
 * kvm_suspend() recorded has_guest_poll.
 * @data: not used by the body.
 */
static void kvm_resume(void *data)
{
	kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) {
		wrmsrq(MSR_KVM_POLL_CONTROL, 0);
	}
#endif
}
755 
756 static const struct syscore_ops kvm_syscore_ops = {
757 	.suspend	= kvm_suspend,
758 	.resume		= kvm_resume,
759 };
760 
761 static struct syscore kvm_syscore = {
762 	.ops = &kvm_syscore_ops,
763 };
764 
kvm_pv_guest_cpu_reboot(void * unused)765 static void kvm_pv_guest_cpu_reboot(void *unused)
766 {
767 	kvm_guest_cpu_offline(true);
768 }
769 
kvm_pv_reboot_notify(struct notifier_block * nb,unsigned long code,void * unused)770 static int kvm_pv_reboot_notify(struct notifier_block *nb,
771 				unsigned long code, void *unused)
772 {
773 	if (code == SYS_RESTART)
774 		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
775 	return NOTIFY_DONE;
776 }
777 
778 static struct notifier_block kvm_pv_reboot_nb = {
779 	.notifier_call = kvm_pv_reboot_notify,
780 };
781 
/*
 * Once a PV feature is registered, the host keeps writing to the memory
 * location that was handed to it. If the guest shuts down, that memory is
 * no longer valid. In cases like kexec, where a new kernel is installed,
 * this means some random memory location would keep being written.
 */
#ifdef CONFIG_CRASH_DUMP
/* crash_shutdown hook: take the PV features down, then do the native work. */
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);

	native_machine_crash_shutdown(regs);
}
#endif
795 
796 #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
797 bool __kvm_vcpu_is_preempted(long cpu);
798 
__kvm_vcpu_is_preempted(long cpu)799 __visible bool __kvm_vcpu_is_preempted(long cpu)
800 {
801 	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
802 
803 	return !!(src->preempted & KVM_VCPU_PREEMPTED);
804 }
805 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
806 
807 #else
808 
809 #include <asm/asm-offsets.h>
810 
811 extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
812 
813 /*
814  * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
815  * restoring to/from the stack.
816  */
817 #define PV_VCPU_PREEMPTED_ASM						     \
818  "movq   __per_cpu_offset(,%rdi,8), %rax\n\t"				     \
819  "cmpb   $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
820  "setne  %al\n\t"
821 
822 DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
823 		PV_VCPU_PREEMPTED_ASM, .text);
824 #endif
825 
kvm_guest_init(void)826 static void __init kvm_guest_init(void)
827 {
828 	int i;
829 
830 	paravirt_ops_setup();
831 	register_reboot_notifier(&kvm_pv_reboot_nb);
832 	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
833 		raw_spin_lock_init(&async_pf_sleepers[i].lock);
834 
835 	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
836 		has_steal_clock = 1;
837 		static_call_update(pv_steal_clock, kvm_steal_clock);
838 
839 #ifdef CONFIG_PARAVIRT_SPINLOCKS
840 		pv_ops_lock.vcpu_is_preempted =
841 			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
842 #endif
843 	}
844 
845 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
846 		apic_update_callback(eoi, kvm_guest_apic_eoi_write);
847 
848 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
849 		static_branch_enable(&kvm_async_pf_enabled);
850 		sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
851 	}
852 
853 #ifdef CONFIG_SMP
854 	if (pv_tlb_flush_supported()) {
855 		pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
856 		pr_info("KVM setup pv remote TLB flush\n");
857 	}
858 
859 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
860 	if (pv_sched_yield_supported()) {
861 		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
862 		pr_info("setup PV sched yield\n");
863 	}
864 	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
865 				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
866 		pr_err("failed to install cpu hotplug callbacks\n");
867 #else
868 	sev_map_percpu_data();
869 	kvm_guest_cpu_init();
870 #endif
871 
872 #ifdef CONFIG_CRASH_DUMP
873 	machine_ops.crash_shutdown = kvm_crash_shutdown;
874 #endif
875 
876 	register_syscore(&kvm_syscore);
877 
878 	/*
879 	 * Hard lockup detection is enabled by default. Disable it, as guests
880 	 * can get false positives too easily, for example if the host is
881 	 * overcommitted.
882 	 */
883 	hardlockup_detector_disable();
884 }
885 
__kvm_cpuid_base(void)886 static noinline uint32_t __kvm_cpuid_base(void)
887 {
888 	if (boot_cpu_data.cpuid_level < 0)
889 		return 0;	/* So we don't blow up on old processors */
890 
891 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
892 		return cpuid_base_hypervisor(KVM_SIGNATURE, 0);
893 
894 	return 0;
895 }
896 
kvm_cpuid_base(void)897 static inline uint32_t kvm_cpuid_base(void)
898 {
899 	static int kvm_cpuid_base = -1;
900 
901 	if (kvm_cpuid_base == -1)
902 		kvm_cpuid_base = __kvm_cpuid_base();
903 
904 	return kvm_cpuid_base;
905 }
906 
kvm_para_available(void)907 bool kvm_para_available(void)
908 {
909 	return kvm_cpuid_base() != 0;
910 }
911 EXPORT_SYMBOL_GPL(kvm_para_available);
912 
kvm_arch_para_features(void)913 unsigned int kvm_arch_para_features(void)
914 {
915 	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
916 }
917 
kvm_arch_para_hints(void)918 unsigned int kvm_arch_para_hints(void)
919 {
920 	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
921 }
922 EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
923 
kvm_detect(void)924 static uint32_t __init kvm_detect(void)
925 {
926 	return kvm_cpuid_base();
927 }
928 
kvm_apic_init(void)929 static void __init kvm_apic_init(void)
930 {
931 #ifdef CONFIG_SMP
932 	if (pv_ipi_supported())
933 		kvm_setup_pv_ipi();
934 #endif
935 }
936 
kvm_msi_ext_dest_id(void)937 static bool __init kvm_msi_ext_dest_id(void)
938 {
939 	return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
940 }
941 
/*
 * notify_page_enc_status_changed hook: report the encryption status @enc
 * of @npages 4K pages starting at page frame @pfn via KVM_HC_MAP_GPA_RANGE.
 */
static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
{
	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE,
			   pfn << PAGE_SHIFT,
			   npages,
			   KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
}
947 
kvm_init_platform(void)948 static void __init kvm_init_platform(void)
949 {
950 	u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
951 	/*
952 	 * Note, hardware requires variable MTRR ranges to be power-of-2 sized
953 	 * and naturally aligned.  But when forcing guest MTRR state, Linux
954 	 * doesn't program the forced ranges into hardware.  Don't bother doing
955 	 * the math to generate a technically-legal range.
956 	 */
957 	struct mtrr_var_range pci_hole = {
958 		.base_lo = tolud | X86_MEMTYPE_UC,
959 		.mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
960 		.mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
961 	};
962 
963 	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
964 	    kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
965 		unsigned long nr_pages;
966 		int i;
967 
968 		pv_ops.mmu.notify_page_enc_status_changed =
969 			kvm_sev_hc_page_enc_status;
970 
971 		/*
972 		 * Reset the host's shared pages list related to kernel
973 		 * specific page encryption status settings before we load a
974 		 * new kernel by kexec. Reset the page encryption status
975 		 * during early boot instead of just before kexec to avoid SMP
976 		 * races during kvm_pv_guest_cpu_reboot().
977 		 * NOTE: We cannot reset the complete shared pages list
978 		 * here as we need to retain the UEFI/OVMF firmware
979 		 * specific settings.
980 		 */
981 
982 		for (i = 0; i < e820_table->nr_entries; i++) {
983 			struct e820_entry *entry = &e820_table->entries[i];
984 
985 			if (entry->type != E820_TYPE_RAM)
986 				continue;
987 
988 			nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
989 
990 			kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
991 				       nr_pages,
992 				       KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
993 		}
994 
995 		/*
996 		 * Ensure that _bss_decrypted section is marked as decrypted in the
997 		 * shared pages list.
998 		 */
999 		early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
1000 						__end_bss_decrypted - __start_bss_decrypted, 0);
1001 
1002 		/*
1003 		 * If not booted using EFI, enable Live migration support.
1004 		 */
1005 		if (!efi_enabled(EFI_BOOT))
1006 			wrmsrq(MSR_KVM_MIGRATION_CONTROL,
1007 			       KVM_MIGRATION_READY);
1008 	}
1009 	kvmclock_init();
1010 	x86_platform.apic_post_init = kvm_apic_init;
1011 
1012 	/*
1013 	 * Set WB as the default cache mode for SEV-SNP and TDX, with a single
1014 	 * UC range for the legacy PCI hole, e.g. so that devices that expect
1015 	 * to get UC/WC mappings don't get surprised with WB.
1016 	 */
1017 	guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
1018 }
1019 
1020 #if defined(CONFIG_AMD_MEM_ENCRYPT)
kvm_sev_es_hcall_prepare(struct ghcb * ghcb,struct pt_regs * regs)1021 static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
1022 {
1023 	/* RAX and CPL are already in the GHCB */
1024 	ghcb_set_rbx(ghcb, regs->bx);
1025 	ghcb_set_rcx(ghcb, regs->cx);
1026 	ghcb_set_rdx(ghcb, regs->dx);
1027 	ghcb_set_rsi(ghcb, regs->si);
1028 }
1029 
kvm_sev_es_hcall_finish(struct ghcb * ghcb,struct pt_regs * regs)1030 static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
1031 {
1032 	/* No checking of the return state needed */
1033 	return true;
1034 }
1035 #endif
1036 
1037 const __initconst struct hypervisor_x86 x86_hyper_kvm = {
1038 	.name				= "KVM",
1039 	.detect				= kvm_detect,
1040 	.type				= X86_HYPER_KVM,
1041 	.init.guest_late_init		= kvm_guest_init,
1042 	.init.x2apic_available		= kvm_para_available,
1043 	.init.msi_ext_dest_id		= kvm_msi_ext_dest_id,
1044 	.init.init_platform		= kvm_init_platform,
1045 #if defined(CONFIG_AMD_MEM_ENCRYPT)
1046 	.runtime.sev_es_hcall_prepare	= kvm_sev_es_hcall_prepare,
1047 	.runtime.sev_es_hcall_finish	= kvm_sev_es_hcall_finish,
1048 #endif
1049 };
1050 
activate_jump_labels(void)1051 static __init int activate_jump_labels(void)
1052 {
1053 	if (has_steal_clock) {
1054 		static_key_slow_inc(&paravirt_steal_enabled);
1055 		if (steal_acc)
1056 			static_key_slow_inc(&paravirt_steal_rq_enabled);
1057 	}
1058 
1059 	return 0;
1060 }
1061 arch_initcall(activate_jump_labels);
1062 
1063 #ifdef CONFIG_PARAVIRT_SPINLOCKS
1064 
1065 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
kvm_kick_cpu(int cpu)1066 static void kvm_kick_cpu(int cpu)
1067 {
1068 	unsigned long flags = 0;
1069 	u32 apicid;
1070 
1071 	apicid = per_cpu(x86_cpu_to_apicid, cpu);
1072 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1073 }
1074 
1075 #include <asm/qspinlock.h>
1076 
/*
 * PV spinlock wait callback (installed as pv_ops_lock.wait by
 * kvm_spinlock_init()): halt the vCPU while *@ptr still holds @val.
 *
 * Does nothing when called from NMI context.
 */
static void kvm_wait(u8 *ptr, u8 val)
{
	if (in_nmi())
		return;

	/*
	 * Halt until it is our turn and we are kicked.
	 *
	 * Caller already has interrupts off: a plain halt is used and the
	 * interrupt state is left untouched.
	 */
	if (irqs_disabled()) {
		if (READ_ONCE(*ptr) == val)
			halt();
		return;
	}

	/*
	 * Interrupts were enabled: use a safe halt to avoid a hang when the
	 * lock info is overwritten in the irq spinlock slowpath and no
	 * spurious interrupt occurs to save us.  The value is re-checked
	 * only after interrupts have been disabled.
	 */
	local_irq_disable();

	if (READ_ONCE(*ptr) != val) {
		/* Value already changed; just restore the interrupt state. */
		local_irq_enable();
		return;
	}

	/* safe_halt() will enable IRQ */
	safe_halt();
}
1100 
1101 /*
1102  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
1103  */
kvm_spinlock_init(void)1104 void __init kvm_spinlock_init(void)
1105 {
1106 	/*
1107 	 * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
1108 	 * are available.
1109 	 */
1110 	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
1111 		pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1112 		goto out;
1113 	}
1114 
1115 	if (num_possible_cpus() == 1) {
1116 		pr_info("PV spinlocks disabled, single CPU\n");
1117 		goto out;
1118 	}
1119 
1120 	if (nopvspin) {
1121 		pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1122 		goto out;
1123 	}
1124 
1125 	/*
1126 	 * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
1127 	 * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
1128 	 * preferred over native qspinlock when vCPU is preempted.
1129 	 */
1130 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
1131 		pr_info("PV spinlocks disabled, no host support\n");
1132 		return;
1133 	}
1134 
1135 	pr_info("PV spinlocks enabled\n");
1136 
1137 	__pv_init_lock_hash();
1138 	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
1139 	pv_ops_lock.queued_spin_unlock =
1140 		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
1141 	pv_ops_lock.wait = kvm_wait;
1142 	pv_ops_lock.kick = kvm_kick_cpu;
1143 
1144 	/*
1145 	 * When PV spinlock is enabled which is preferred over
1146 	 * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
1147 	 * Just disable it anyway.
1148 	 */
1149 out:
1150 	static_branch_disable(&virt_spin_lock_key);
1151 }
1152 
1153 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
1154 
1155 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
1156 
kvm_disable_host_haltpoll(void * i)1157 static void kvm_disable_host_haltpoll(void *i)
1158 {
1159 	wrmsrq(MSR_KVM_POLL_CONTROL, 0);
1160 }
1161 
kvm_enable_host_haltpoll(void * i)1162 static void kvm_enable_host_haltpoll(void *i)
1163 {
1164 	wrmsrq(MSR_KVM_POLL_CONTROL, 1);
1165 }
1166 
arch_haltpoll_enable(unsigned int cpu)1167 void arch_haltpoll_enable(unsigned int cpu)
1168 {
1169 	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
1170 		pr_err_once("host does not support poll control\n");
1171 		pr_err_once("host upgrade recommended\n");
1172 		return;
1173 	}
1174 
1175 	/* Enable guest halt poll disables host halt poll */
1176 	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
1177 }
1178 EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
1179 
arch_haltpoll_disable(unsigned int cpu)1180 void arch_haltpoll_disable(unsigned int cpu)
1181 {
1182 	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
1183 		return;
1184 
1185 	/* Disable guest halt poll enables host halt poll */
1186 	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
1187 }
1188 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
1189 #endif
1190