1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * KVM paravirt_ops implementation
4 *
5 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Copyright IBM Corporation, 2007
7 * Authors: Anthony Liguori <aliguori@us.ibm.com>
8 */
9
10 #define pr_fmt(fmt) "kvm-guest: " fmt
11
12 #include <linux/context_tracking.h>
13 #include <linux/init.h>
14 #include <linux/irq.h>
15 #include <linux/kernel.h>
16 #include <linux/kvm_para.h>
17 #include <linux/cpu.h>
18 #include <linux/mm.h>
19 #include <linux/highmem.h>
20 #include <linux/hardirq.h>
21 #include <linux/notifier.h>
22 #include <linux/reboot.h>
23 #include <linux/hash.h>
24 #include <linux/sched.h>
25 #include <linux/slab.h>
26 #include <linux/kprobes.h>
27 #include <linux/nmi.h>
28 #include <linux/swait.h>
29 #include <linux/syscore_ops.h>
30 #include <linux/cc_platform.h>
31 #include <linux/efi.h>
32 #include <linux/kvm_types.h>
33 #include <linux/sched/cputime.h>
34 #include <asm/timer.h>
35 #include <asm/cpu.h>
36 #include <asm/traps.h>
37 #include <asm/desc.h>
38 #include <asm/tlbflush.h>
39 #include <asm/apic.h>
40 #include <asm/apicdef.h>
41 #include <asm/hypervisor.h>
42 #include <asm/mtrr.h>
43 #include <asm/tlb.h>
44 #include <asm/cpuidle_haltpoll.h>
45 #include <asm/msr.h>
46 #include <asm/ptrace.h>
47 #include <asm/reboot.h>
48 #include <asm/svm.h>
49 #include <asm/e820/api.h>
50
51 DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
52
53 static int kvmapf = 1;
54
parse_no_kvmapf(char * arg)55 static int __init parse_no_kvmapf(char *arg)
56 {
57 kvmapf = 0;
58 return 0;
59 }
60
61 early_param("no-kvmapf", parse_no_kvmapf);
62
63 static int steal_acc = 1;
parse_no_stealacc(char * arg)64 static int __init parse_no_stealacc(char *arg)
65 {
66 steal_acc = 0;
67 return 0;
68 }
69
70 early_param("no-steal-acc", parse_no_stealacc);
71
72 static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
73 static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
74 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
75 static int has_steal_clock = 0;
76
77 static int has_guest_poll = 0;
78
79 #define KVM_TASK_SLEEP_HASHBITS 8
80 #define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
81
82 struct kvm_task_sleep_node {
83 struct hlist_node link;
84 struct swait_queue_head wq;
85 u32 token;
86 int cpu;
87 bool dummy;
88 };
89
90 static struct kvm_task_sleep_head {
91 raw_spinlock_t lock;
92 struct hlist_head list;
93 } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
94
/*
 * Search bucket @b for the sleep node carrying @token.
 *
 * Returns the first matching node, or NULL when the bucket has none.
 * Every caller in this file invokes it with b->lock held.
 */
static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct kvm_task_sleep_node *node;

	hlist_for_each_entry(node, &b->list, link) {
		if (node->token == token)
			return node;
	}

	return NULL;
}
109
kvm_async_pf_queue_task(u32 token,struct kvm_task_sleep_node * n)110 static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
111 {
112 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
113 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
114 struct kvm_task_sleep_node *e;
115
116 raw_spin_lock(&b->lock);
117 e = _find_apf_task(b, token);
118 if (e) {
119 struct kvm_task_sleep_node *dummy = NULL;
120
121 /*
122 * The entry can either be a 'dummy' entry (which is put on the
123 * list when wake-up happens ahead of APF handling completion)
124 * or a token from another task which should not be touched.
125 */
126 if (e->dummy) {
127 hlist_del(&e->link);
128 dummy = e;
129 }
130
131 raw_spin_unlock(&b->lock);
132 kfree(dummy);
133 return false;
134 }
135
136 n->token = token;
137 n->cpu = smp_processor_id();
138 n->dummy = false;
139 init_swait_queue_head(&n->wq);
140 hlist_add_head(&n->link, &b->list);
141 raw_spin_unlock(&b->lock);
142 return true;
143 }
144
145 /*
146 * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
147 * @token: Token to identify the sleep node entry
148 *
149 * Invoked from the async pagefault handling code or from the VM exit page
150 * fault handler. In both cases RCU is watching.
151 */
kvm_async_pf_task_wait_schedule(u32 token)152 void kvm_async_pf_task_wait_schedule(u32 token)
153 {
154 struct kvm_task_sleep_node n;
155 DECLARE_SWAITQUEUE(wait);
156
157 lockdep_assert_irqs_disabled();
158
159 if (!kvm_async_pf_queue_task(token, &n))
160 return;
161
162 for (;;) {
163 prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
164 if (hlist_unhashed(&n.link))
165 break;
166
167 local_irq_enable();
168 schedule();
169 local_irq_disable();
170 }
171 finish_swait(&n.wq, &wait);
172 }
173 EXPORT_SYMBOL_FOR_KVM(kvm_async_pf_task_wait_schedule);
174
apf_task_wake_one(struct kvm_task_sleep_node * n)175 static void apf_task_wake_one(struct kvm_task_sleep_node *n)
176 {
177 hlist_del_init(&n->link);
178 if (swq_has_sleeper(&n->wq))
179 swake_up_one(&n->wq);
180 }
181
apf_task_wake_all(void)182 static void apf_task_wake_all(void)
183 {
184 int i;
185
186 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
187 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
188 struct kvm_task_sleep_node *n;
189 struct hlist_node *p, *next;
190
191 raw_spin_lock(&b->lock);
192 hlist_for_each_safe(p, next, &b->list) {
193 n = hlist_entry(p, typeof(*n), link);
194 if (n->cpu == smp_processor_id())
195 apf_task_wake_one(n);
196 }
197 raw_spin_unlock(&b->lock);
198 }
199 }
200
/*
 * Wake the task waiting for @token.
 *
 * A @token of ~0 wakes every sleeper queued by the current CPU. When no
 * sleeper exists yet for @token, a dummy node is queued so that the later
 * kvm_async_pf_queue_task() call finds it and does not go to sleep.
 */
static void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n, *dummy = NULL;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

	for (;;) {
		raw_spin_lock(&b->lock);
		n = _find_apf_task(b, token);
		if (n) {
			apf_task_wake_one(n);
			break;
		}

		if (dummy) {
			/* Async #PF not yet handled, add the dummy entry. */
			dummy->token = token;
			dummy->cpu = smp_processor_id();
			dummy->dummy = true;
			init_swait_queue_head(&dummy->wq);
			hlist_add_head(&dummy->link, &b->list);
			dummy = NULL;
			break;
		}

		/*
		 * Allocating the dummy must be done outside of the raw lock
		 * as the allocator is preemptible on PREEMPT_RT kernels.
		 */
		raw_spin_unlock(&b->lock);
		dummy = kzalloc_obj(*dummy, GFP_ATOMIC);

		/*
		 * Keep looping on allocation failure: eventually the async
		 * #PF will be handled and no new node will be needed.
		 */
		if (!dummy)
			cpu_relax();

		/*
		 * Loop around to recheck for async #PF completion before
		 * enqueueing the dummy, which avoids duplicate list entries.
		 */
	}
	raw_spin_unlock(&b->lock);

	/* A dummy token might be allocated and ultimately not used. */
	kfree(dummy);
}
253
kvm_read_and_reset_apf_flags(void)254 noinstr u32 kvm_read_and_reset_apf_flags(void)
255 {
256 u32 flags = 0;
257
258 if (__this_cpu_read(async_pf_enabled)) {
259 flags = __this_cpu_read(apf_reason.flags);
260 __this_cpu_write(apf_reason.flags, 0);
261 }
262
263 return flags;
264 }
265 EXPORT_SYMBOL_FOR_KVM(kvm_read_and_reset_apf_flags);
266
__kvm_handle_async_pf(struct pt_regs * regs,u32 token)267 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
268 {
269 u32 flags = kvm_read_and_reset_apf_flags();
270 irqentry_state_t state;
271
272 if (!flags)
273 return false;
274
275 state = irqentry_enter(regs);
276 instrumentation_begin();
277
278 /*
279 * If the host managed to inject an async #PF into an interrupt
280 * disabled region, then die hard as this is not going to end well
281 * and the host side is seriously broken.
282 */
283 if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
284 panic("Host injected async #PF in interrupt disabled region\n");
285
286 if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
287 if (unlikely(!(user_mode(regs))))
288 panic("Host injected async #PF in kernel mode\n");
289 /* Page is swapped out by the host. */
290 kvm_async_pf_task_wait_schedule(token);
291 } else {
292 WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
293 }
294
295 instrumentation_end();
296 irqentry_exit(regs, state);
297 return true;
298 }
299
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)300 DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
301 {
302 struct pt_regs *old_regs = set_irq_regs(regs);
303 u32 token;
304
305 apic_eoi();
306
307 inc_irq_stat(irq_hv_callback_count);
308
309 if (__this_cpu_read(async_pf_enabled)) {
310 token = __this_cpu_read(apf_reason.token);
311 kvm_async_pf_task_wake(token);
312 __this_cpu_write(apf_reason.token, 0);
313 wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1);
314 }
315
316 set_irq_regs(old_regs);
317 }
318
paravirt_ops_setup(void)319 static void __init paravirt_ops_setup(void)
320 {
321 pv_info.name = "KVM";
322
323 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
324 pv_info.io_delay = false;
325
326 #ifdef CONFIG_X86_IO_APIC
327 no_timer_check = 1;
328 #endif
329 }
330
kvm_register_steal_time(void)331 static void kvm_register_steal_time(void)
332 {
333 int cpu = smp_processor_id();
334 struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
335
336 if (!has_steal_clock)
337 return;
338
339 wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
340 pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
341 (unsigned long long) slow_virt_to_phys(st));
342 }
343
344 static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
345
kvm_guest_apic_eoi_write(void)346 static notrace __maybe_unused void kvm_guest_apic_eoi_write(void)
347 {
348 /**
349 * This relies on __test_and_clear_bit to modify the memory
350 * in a way that is atomic with respect to the local CPU.
351 * The hypervisor only accesses this memory from the local CPU so
352 * there's no need for lock or memory barriers.
353 * An optimization barrier is implied in apic write.
354 */
355 if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
356 return;
357 apic_native_eoi();
358 }
359
kvm_guest_cpu_init(void)360 static void kvm_guest_cpu_init(void)
361 {
362 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
363 u64 pa;
364
365 WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
366
367 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
368 pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
369
370 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
371 pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
372
373 wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
374
375 wrmsrq(MSR_KVM_ASYNC_PF_EN, pa);
376 __this_cpu_write(async_pf_enabled, true);
377 pr_debug("setup async PF for cpu %d\n", smp_processor_id());
378 }
379
380 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
381 unsigned long pa;
382
383 /* Size alignment is implied but just to make it explicit. */
384 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
385 __this_cpu_write(kvm_apic_eoi, 0);
386 pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
387 | KVM_MSR_ENABLED;
388 wrmsrq(MSR_KVM_PV_EOI_EN, pa);
389 }
390
391 if (has_steal_clock)
392 kvm_register_steal_time();
393 }
394
kvm_pv_disable_apf(void)395 static void kvm_pv_disable_apf(void)
396 {
397 if (!__this_cpu_read(async_pf_enabled))
398 return;
399
400 wrmsrq(MSR_KVM_ASYNC_PF_EN, 0);
401 __this_cpu_write(async_pf_enabled, false);
402
403 pr_debug("disable async PF for cpu %d\n", smp_processor_id());
404 }
405
kvm_disable_steal_time(void)406 static void kvm_disable_steal_time(void)
407 {
408 if (!has_steal_clock)
409 return;
410
411 wrmsrq(MSR_KVM_STEAL_TIME, 0);
412 }
413
kvm_steal_clock(int cpu)414 static u64 kvm_steal_clock(int cpu)
415 {
416 u64 steal;
417 struct kvm_steal_time *src;
418 int version;
419
420 src = &per_cpu(steal_time, cpu);
421 do {
422 version = src->version;
423 virt_rmb();
424 steal = src->steal;
425 virt_rmb();
426 } while ((version & 1) || (version != src->version));
427
428 return steal;
429 }
430
__set_percpu_decrypted(void * ptr,unsigned long size)431 static inline __init void __set_percpu_decrypted(void *ptr, unsigned long size)
432 {
433 early_set_memory_decrypted((unsigned long) ptr, size);
434 }
435
436 /*
437 * Iterate through all possible CPUs and map the memory region pointed
438 * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
439 *
440 * Note: we iterate through all possible CPUs to ensure that CPUs
441 * hotplugged will have their per-cpu variable already mapped as
442 * decrypted.
443 */
sev_map_percpu_data(void)444 static void __init sev_map_percpu_data(void)
445 {
446 int cpu;
447
448 if (cc_vendor != CC_VENDOR_AMD ||
449 !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
450 return;
451
452 for_each_possible_cpu(cpu) {
453 __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
454 __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
455 __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
456 }
457 }
458
/*
 * Undo the per-CPU paravirt registrations on the current CPU.
 * @shutdown: when false, tasks this CPU queued for async #PF are woken
 *            after async #PF has been disabled; when true they are not.
 */
static void kvm_guest_cpu_offline(bool shutdown)
{
	kvm_disable_steal_time();

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrq(MSR_KVM_PV_EOI_EN, 0);

	if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
		wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0);

	kvm_pv_disable_apf();
	if (!shutdown)
		apf_task_wake_all();

	kvmclock_disable();
}
471
/*
 * Online callback, also used on resume: runs kvm_guest_cpu_init() with
 * local interrupts disabled.
 * @cpu: CPU number, unused.
 *
 * Returns 0 unconditionally.
 */
static int kvm_cpu_online(unsigned int cpu)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	kvm_guest_cpu_init();
	local_irq_restore(irqflags);

	return 0;
}
481
482 #ifdef CONFIG_SMP
483
484 static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
485
pv_tlb_flush_supported(void)486 static bool pv_tlb_flush_supported(void)
487 {
488 return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
489 !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
490 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
491 !boot_cpu_has(X86_FEATURE_MWAIT) &&
492 (num_possible_cpus() != 1));
493 }
494
pv_ipi_supported(void)495 static bool pv_ipi_supported(void)
496 {
497 return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
498 (num_possible_cpus() != 1));
499 }
500
pv_sched_yield_supported(void)501 static bool pv_sched_yield_supported(void)
502 {
503 return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
504 !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
505 kvm_para_has_feature(KVM_FEATURE_STEAL_TIME) &&
506 !boot_cpu_has(X86_FEATURE_MWAIT) &&
507 (num_possible_cpus() != 1));
508 }
509
510 #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
511
__send_ipi_mask(const struct cpumask * mask,int vector)512 static void __send_ipi_mask(const struct cpumask *mask, int vector)
513 {
514 unsigned long flags;
515 int cpu, min = 0, max = 0;
516 #ifdef CONFIG_X86_64
517 __uint128_t ipi_bitmap = 0;
518 #else
519 u64 ipi_bitmap = 0;
520 #endif
521 u32 apic_id, icr;
522 long ret;
523
524 if (cpumask_empty(mask))
525 return;
526
527 local_irq_save(flags);
528
529 switch (vector) {
530 default:
531 icr = APIC_DM_FIXED | vector;
532 break;
533 case NMI_VECTOR:
534 icr = APIC_DM_NMI;
535 break;
536 }
537
538 for_each_cpu(cpu, mask) {
539 apic_id = per_cpu(x86_cpu_to_apicid, cpu);
540 if (!ipi_bitmap) {
541 min = max = apic_id;
542 } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
543 ipi_bitmap <<= min - apic_id;
544 min = apic_id;
545 } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
546 max = apic_id < max ? max : apic_id;
547 } else {
548 ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
549 (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
550 WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
551 ret);
552 min = max = apic_id;
553 ipi_bitmap = 0;
554 }
555 __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
556 }
557
558 if (ipi_bitmap) {
559 ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
560 (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
561 WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
562 ret);
563 }
564
565 local_irq_restore(flags);
566 }
567
/* send_IPI_mask callback: forwards @mask and @vector to __send_ipi_mask(). */
static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}
572
kvm_send_ipi_mask_allbutself(const struct cpumask * mask,int vector)573 static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
574 {
575 unsigned int this_cpu = smp_processor_id();
576 struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
577 const struct cpumask *local_mask;
578
579 cpumask_copy(new_mask, mask);
580 cpumask_clear_cpu(this_cpu, new_mask);
581 local_mask = new_mask;
582 __send_ipi_mask(local_mask, vector);
583 }
584
setup_efi_kvm_sev_migration(void)585 static int __init setup_efi_kvm_sev_migration(void)
586 {
587 efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled";
588 efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID;
589 efi_status_t status;
590 unsigned long size;
591 bool enabled;
592
593 if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) ||
594 !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL))
595 return 0;
596
597 if (!efi_enabled(EFI_BOOT))
598 return 0;
599
600 if (!efi_enabled(EFI_RUNTIME_SERVICES)) {
601 pr_info("%s : EFI runtime services are not enabled\n", __func__);
602 return 0;
603 }
604
605 size = sizeof(enabled);
606
607 /* Get variable contents into buffer */
608 status = efi.get_variable(efi_sev_live_migration_enabled,
609 &efi_variable_guid, NULL, &size, &enabled);
610
611 if (status == EFI_NOT_FOUND) {
612 pr_info("%s : EFI live migration variable not found\n", __func__);
613 return 0;
614 }
615
616 if (status != EFI_SUCCESS) {
617 pr_info("%s : EFI variable retrieval failed\n", __func__);
618 return 0;
619 }
620
621 if (enabled == 0) {
622 pr_info("%s: live migration disabled in EFI\n", __func__);
623 return 0;
624 }
625
626 pr_info("%s : live migration enabled in EFI\n", __func__);
627 wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY);
628
629 return 1;
630 }
631
632 late_initcall(setup_efi_kvm_sev_migration);
633
634 /*
635 * Set the IPI entry points
636 */
kvm_setup_pv_ipi(void)637 static __init void kvm_setup_pv_ipi(void)
638 {
639 apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
640 apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
641 pr_info("setup PV IPIs\n");
642 }
643
kvm_smp_send_call_func_ipi(const struct cpumask * mask)644 static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
645 {
646 int cpu;
647
648 native_send_call_func_ipi(mask);
649
650 /* Make sure other vCPUs get a chance to run if they need to. */
651 for_each_cpu(cpu, mask) {
652 if (!idle_cpu(cpu) && vcpu_is_preempted(cpu)) {
653 kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
654 break;
655 }
656 }
657 }
658
kvm_flush_tlb_multi(const struct cpumask * cpumask,const struct flush_tlb_info * info)659 static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
660 const struct flush_tlb_info *info)
661 {
662 u8 state;
663 int cpu;
664 struct kvm_steal_time *src;
665 struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
666
667 cpumask_copy(flushmask, cpumask);
668 /*
669 * We have to call flush only on online vCPUs. And
670 * queue flush_on_enter for pre-empted vCPUs
671 */
672 for_each_cpu(cpu, flushmask) {
673 /*
674 * The local vCPU is never preempted, so we do not explicitly
675 * skip check for local vCPU - it will never be cleared from
676 * flushmask.
677 */
678 src = &per_cpu(steal_time, cpu);
679 state = READ_ONCE(src->preempted);
680 if ((state & KVM_VCPU_PREEMPTED)) {
681 if (try_cmpxchg(&src->preempted, &state,
682 state | KVM_VCPU_FLUSH_TLB))
683 __cpumask_clear_cpu(cpu, flushmask);
684 }
685 }
686
687 native_flush_tlb_multi(flushmask, info);
688 }
689
kvm_alloc_cpumask(void)690 static __init int kvm_alloc_cpumask(void)
691 {
692 int cpu;
693
694 if (!kvm_para_available() || nopv)
695 return 0;
696
697 if (pv_tlb_flush_supported() || pv_ipi_supported())
698 for_each_possible_cpu(cpu) {
699 zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
700 GFP_KERNEL, cpu_to_node(cpu));
701 }
702
703 return 0;
704 }
705 arch_initcall(kvm_alloc_cpumask);
706
kvm_smp_prepare_boot_cpu(void)707 static void __init kvm_smp_prepare_boot_cpu(void)
708 {
709 /*
710 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
711 * shares the guest physical address with the hypervisor.
712 */
713 sev_map_percpu_data();
714
715 kvm_guest_cpu_init();
716 native_smp_prepare_boot_cpu();
717 kvm_spinlock_init();
718 }
719
kvm_cpu_down_prepare(unsigned int cpu)720 static int kvm_cpu_down_prepare(unsigned int cpu)
721 {
722 unsigned long flags;
723
724 local_irq_save(flags);
725 kvm_guest_cpu_offline(false);
726 local_irq_restore(flags);
727 return 0;
728 }
729
730 #endif
731
kvm_suspend(void * data)732 static int kvm_suspend(void *data)
733 {
734 u64 val = 0;
735
736 kvm_guest_cpu_offline(false);
737
738 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
739 if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
740 rdmsrq(MSR_KVM_POLL_CONTROL, val);
741 has_guest_poll = !(val & 1);
742 #endif
743 return 0;
744 }
745
/*
 * Syscore resume callback: re-enables the PV features on the current CPU
 * and, with haltpoll support built in, writes 0 to MSR_KVM_POLL_CONTROL
 * again if has_guest_poll was recorded at suspend.
 * @data: unused.
 */
static void kvm_resume(void *data)
{
	kvm_cpu_online(raw_smp_processor_id());

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	if (has_guest_poll && kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		wrmsrq(MSR_KVM_POLL_CONTROL, 0);
#endif
}
755
756 static const struct syscore_ops kvm_syscore_ops = {
757 .suspend = kvm_suspend,
758 .resume = kvm_resume,
759 };
760
761 static struct syscore kvm_syscore = {
762 .ops = &kvm_syscore_ops,
763 };
764
kvm_pv_guest_cpu_reboot(void * unused)765 static void kvm_pv_guest_cpu_reboot(void *unused)
766 {
767 kvm_guest_cpu_offline(true);
768 }
769
kvm_pv_reboot_notify(struct notifier_block * nb,unsigned long code,void * unused)770 static int kvm_pv_reboot_notify(struct notifier_block *nb,
771 unsigned long code, void *unused)
772 {
773 if (code == SYS_RESTART)
774 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
775 return NOTIFY_DONE;
776 }
777
778 static struct notifier_block kvm_pv_reboot_nb = {
779 .notifier_call = kvm_pv_reboot_notify,
780 };
781
/*
 * Once a PV feature is registered, the host keeps writing to the
 * registered memory location. If the guest shuts down, that memory is no
 * longer valid. In cases like kexec, where a new kernel is installed, this
 * would mean a random memory location keeps being written to.
 */
#ifdef CONFIG_CRASH_DUMP
/* crash_shutdown hook: unregister the PV features, then do the native work. */
static void kvm_crash_shutdown(struct pt_regs *regs)
{
	kvm_guest_cpu_offline(true);

	native_machine_crash_shutdown(regs);
}
#endif
795
796 #if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
797 bool __kvm_vcpu_is_preempted(long cpu);
798
__kvm_vcpu_is_preempted(long cpu)799 __visible bool __kvm_vcpu_is_preempted(long cpu)
800 {
801 struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
802
803 return !!(src->preempted & KVM_VCPU_PREEMPTED);
804 }
805 PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
806
807 #else
808
809 #include <asm/asm-offsets.h>
810
811 extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
812
813 /*
814 * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
815 * restoring to/from the stack.
816 */
817 #define PV_VCPU_PREEMPTED_ASM \
818 "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
819 "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
820 "setne %al\n\t"
821
822 DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
823 PV_VCPU_PREEMPTED_ASM, .text);
824 #endif
825
kvm_guest_init(void)826 static void __init kvm_guest_init(void)
827 {
828 int i;
829
830 paravirt_ops_setup();
831 register_reboot_notifier(&kvm_pv_reboot_nb);
832 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
833 raw_spin_lock_init(&async_pf_sleepers[i].lock);
834
835 if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
836 has_steal_clock = 1;
837 static_call_update(pv_steal_clock, kvm_steal_clock);
838
839 #ifdef CONFIG_PARAVIRT_SPINLOCKS
840 pv_ops_lock.vcpu_is_preempted =
841 PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
842 #endif
843 }
844
845 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
846 apic_update_callback(eoi, kvm_guest_apic_eoi_write);
847
848 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
849 static_branch_enable(&kvm_async_pf_enabled);
850 sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
851 }
852
853 #ifdef CONFIG_SMP
854 if (pv_tlb_flush_supported()) {
855 pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
856 pr_info("KVM setup pv remote TLB flush\n");
857 }
858
859 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
860 if (pv_sched_yield_supported()) {
861 smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
862 pr_info("setup PV sched yield\n");
863 }
864 if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
865 kvm_cpu_online, kvm_cpu_down_prepare) < 0)
866 pr_err("failed to install cpu hotplug callbacks\n");
867 #else
868 sev_map_percpu_data();
869 kvm_guest_cpu_init();
870 #endif
871
872 #ifdef CONFIG_CRASH_DUMP
873 machine_ops.crash_shutdown = kvm_crash_shutdown;
874 #endif
875
876 register_syscore(&kvm_syscore);
877
878 /*
879 * Hard lockup detection is enabled by default. Disable it, as guests
880 * can get false positives too easily, for example if the host is
881 * overcommitted.
882 */
883 hardlockup_detector_disable();
884 }
885
__kvm_cpuid_base(void)886 static noinline uint32_t __kvm_cpuid_base(void)
887 {
888 if (boot_cpu_data.cpuid_level < 0)
889 return 0; /* So we don't blow up on old processors */
890
891 if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
892 return cpuid_base_hypervisor(KVM_SIGNATURE, 0);
893
894 return 0;
895 }
896
kvm_cpuid_base(void)897 static inline uint32_t kvm_cpuid_base(void)
898 {
899 static int kvm_cpuid_base = -1;
900
901 if (kvm_cpuid_base == -1)
902 kvm_cpuid_base = __kvm_cpuid_base();
903
904 return kvm_cpuid_base;
905 }
906
kvm_para_available(void)907 bool kvm_para_available(void)
908 {
909 return kvm_cpuid_base() != 0;
910 }
911 EXPORT_SYMBOL_GPL(kvm_para_available);
912
kvm_arch_para_features(void)913 unsigned int kvm_arch_para_features(void)
914 {
915 return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
916 }
917
kvm_arch_para_hints(void)918 unsigned int kvm_arch_para_hints(void)
919 {
920 return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
921 }
922 EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
923
kvm_detect(void)924 static uint32_t __init kvm_detect(void)
925 {
926 return kvm_cpuid_base();
927 }
928
kvm_apic_init(void)929 static void __init kvm_apic_init(void)
930 {
931 #ifdef CONFIG_SMP
932 if (pv_ipi_supported())
933 kvm_setup_pv_ipi();
934 #endif
935 }
936
kvm_msi_ext_dest_id(void)937 static bool __init kvm_msi_ext_dest_id(void)
938 {
939 return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
940 }
941
/*
 * Report an encryption status change to the host via KVM_HC_MAP_GPA_RANGE.
 * @pfn:    first page frame of the range
 * @npages: number of 4K pages in the range
 * @enc:    new encryption status, passed through KVM_MAP_GPA_RANGE_ENC_STAT()
 */
static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc)
{
	unsigned long attrs = KVM_MAP_GPA_RANGE_ENC_STAT(enc) |
			      KVM_MAP_GPA_RANGE_PAGE_SZ_4K;

	kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages,
			   attrs);
}
947
kvm_init_platform(void)948 static void __init kvm_init_platform(void)
949 {
950 u64 tolud = PFN_PHYS(e820__end_of_low_ram_pfn());
951 /*
952 * Note, hardware requires variable MTRR ranges to be power-of-2 sized
953 * and naturally aligned. But when forcing guest MTRR state, Linux
954 * doesn't program the forced ranges into hardware. Don't bother doing
955 * the math to generate a technically-legal range.
956 */
957 struct mtrr_var_range pci_hole = {
958 .base_lo = tolud | X86_MEMTYPE_UC,
959 .mask_lo = (u32)(~(SZ_4G - tolud - 1)) | MTRR_PHYSMASK_V,
960 .mask_hi = (BIT_ULL(boot_cpu_data.x86_phys_bits) - 1) >> 32,
961 };
962
963 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
964 kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) {
965 unsigned long nr_pages;
966 int i;
967
968 pv_ops.mmu.notify_page_enc_status_changed =
969 kvm_sev_hc_page_enc_status;
970
971 /*
972 * Reset the host's shared pages list related to kernel
973 * specific page encryption status settings before we load a
974 * new kernel by kexec. Reset the page encryption status
975 * during early boot instead of just before kexec to avoid SMP
976 * races during kvm_pv_guest_cpu_reboot().
977 * NOTE: We cannot reset the complete shared pages list
978 * here as we need to retain the UEFI/OVMF firmware
979 * specific settings.
980 */
981
982 for (i = 0; i < e820_table->nr_entries; i++) {
983 struct e820_entry *entry = &e820_table->entries[i];
984
985 if (entry->type != E820_TYPE_RAM)
986 continue;
987
988 nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE);
989
990 kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr,
991 nr_pages,
992 KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K);
993 }
994
995 /*
996 * Ensure that _bss_decrypted section is marked as decrypted in the
997 * shared pages list.
998 */
999 early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted,
1000 __end_bss_decrypted - __start_bss_decrypted, 0);
1001
1002 /*
1003 * If not booted using EFI, enable Live migration support.
1004 */
1005 if (!efi_enabled(EFI_BOOT))
1006 wrmsrq(MSR_KVM_MIGRATION_CONTROL,
1007 KVM_MIGRATION_READY);
1008 }
1009 kvmclock_init();
1010 x86_platform.apic_post_init = kvm_apic_init;
1011
1012 /*
1013 * Set WB as the default cache mode for SEV-SNP and TDX, with a single
1014 * UC range for the legacy PCI hole, e.g. so that devices that expect
1015 * to get UC/WC mappings don't get surprised with WB.
1016 */
1017 guest_force_mtrr_state(&pci_hole, 1, MTRR_TYPE_WRBACK);
1018 }
1019
1020 #if defined(CONFIG_AMD_MEM_ENCRYPT)
kvm_sev_es_hcall_prepare(struct ghcb * ghcb,struct pt_regs * regs)1021 static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
1022 {
1023 /* RAX and CPL are already in the GHCB */
1024 ghcb_set_rbx(ghcb, regs->bx);
1025 ghcb_set_rcx(ghcb, regs->cx);
1026 ghcb_set_rdx(ghcb, regs->dx);
1027 ghcb_set_rsi(ghcb, regs->si);
1028 }
1029
kvm_sev_es_hcall_finish(struct ghcb * ghcb,struct pt_regs * regs)1030 static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
1031 {
1032 /* No checking of the return state needed */
1033 return true;
1034 }
1035 #endif
1036
1037 const __initconst struct hypervisor_x86 x86_hyper_kvm = {
1038 .name = "KVM",
1039 .detect = kvm_detect,
1040 .type = X86_HYPER_KVM,
1041 .init.guest_late_init = kvm_guest_init,
1042 .init.x2apic_available = kvm_para_available,
1043 .init.msi_ext_dest_id = kvm_msi_ext_dest_id,
1044 .init.init_platform = kvm_init_platform,
1045 #if defined(CONFIG_AMD_MEM_ENCRYPT)
1046 .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
1047 .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
1048 #endif
1049 };
1050
activate_jump_labels(void)1051 static __init int activate_jump_labels(void)
1052 {
1053 if (has_steal_clock) {
1054 static_key_slow_inc(¶virt_steal_enabled);
1055 if (steal_acc)
1056 static_key_slow_inc(¶virt_steal_rq_enabled);
1057 }
1058
1059 return 0;
1060 }
1061 arch_initcall(activate_jump_labels);
1062
1063 #ifdef CONFIG_PARAVIRT_SPINLOCKS
1064
1065 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
kvm_kick_cpu(int cpu)1066 static void kvm_kick_cpu(int cpu)
1067 {
1068 unsigned long flags = 0;
1069 u32 apicid;
1070
1071 apicid = per_cpu(x86_cpu_to_apicid, cpu);
1072 kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
1073 }
1074
1075 #include <asm/qspinlock.h>
1076
/*
 * PV spinlock wait callback: halt the vCPU while *@ptr still equals @val.
 * Returns immediately when called from NMI context.
 */
static void kvm_wait(u8 *ptr, u8 val)
{
	if (in_nmi())
		return;

	/*
	 * Halt until it's our turn and we get kicked. A safe halt is used
	 * for the irq enabled case to avoid a hang when lock info is
	 * overwritten in the irq spinlock slowpath and no spurious interrupt
	 * occurs to save us.
	 */
	if (irqs_disabled()) {
		if (READ_ONCE(*ptr) == val)
			halt();
		return;
	}

	local_irq_disable();

	/* safe_halt() will enable IRQ */
	if (READ_ONCE(*ptr) == val)
		safe_halt();
	else
		local_irq_enable();
}
1100
1101 /*
1102 * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
1103 */
kvm_spinlock_init(void)1104 void __init kvm_spinlock_init(void)
1105 {
1106 /*
1107 * Disable PV spinlocks and use native qspinlock when dedicated pCPUs
1108 * are available.
1109 */
1110 if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
1111 pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
1112 goto out;
1113 }
1114
1115 if (num_possible_cpus() == 1) {
1116 pr_info("PV spinlocks disabled, single CPU\n");
1117 goto out;
1118 }
1119
1120 if (nopvspin) {
1121 pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
1122 goto out;
1123 }
1124
1125 /*
1126 * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
1127 * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
1128 * preferred over native qspinlock when vCPU is preempted.
1129 */
1130 if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
1131 pr_info("PV spinlocks disabled, no host support\n");
1132 return;
1133 }
1134
1135 pr_info("PV spinlocks enabled\n");
1136
1137 __pv_init_lock_hash();
1138 pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
1139 pv_ops_lock.queued_spin_unlock =
1140 PV_CALLEE_SAVE(__pv_queued_spin_unlock);
1141 pv_ops_lock.wait = kvm_wait;
1142 pv_ops_lock.kick = kvm_kick_cpu;
1143
1144 /*
1145 * When PV spinlock is enabled which is preferred over
1146 * virt_spin_lock(), virt_spin_lock_key's value is meaningless.
1147 * Just disable it anyway.
1148 */
1149 out:
1150 static_branch_disable(&virt_spin_lock_key);
1151 }
1152
1153 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
1154
1155 #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
1156
kvm_disable_host_haltpoll(void * i)1157 static void kvm_disable_host_haltpoll(void *i)
1158 {
1159 wrmsrq(MSR_KVM_POLL_CONTROL, 0);
1160 }
1161
kvm_enable_host_haltpoll(void * i)1162 static void kvm_enable_host_haltpoll(void *i)
1163 {
1164 wrmsrq(MSR_KVM_POLL_CONTROL, 1);
1165 }
1166
arch_haltpoll_enable(unsigned int cpu)1167 void arch_haltpoll_enable(unsigned int cpu)
1168 {
1169 if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
1170 pr_err_once("host does not support poll control\n");
1171 pr_err_once("host upgrade recommended\n");
1172 return;
1173 }
1174
1175 /* Enable guest halt poll disables host halt poll */
1176 smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
1177 }
1178 EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
1179
arch_haltpoll_disable(unsigned int cpu)1180 void arch_haltpoll_disable(unsigned int cpu)
1181 {
1182 if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
1183 return;
1184
1185 /* Disable guest halt poll enables host halt poll */
1186 smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
1187 }
1188 EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
1189 #endif
1190