// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
#include <linux/random.h>
#include <linux/clockchips.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
#include <linux/set_memory.h>
#include "hyperv_vmbus.h"

/* The one and only */
struct hv_context hv_context;

/*
 * hv_init - Main initialization routine.
 *
 * This routine must be called before any other routines in here are called
 */
int hv_init(void)
{
	hv_context.cpu_context = alloc_percpu(struct hv_per_cpu_context);
	if (!hv_context.cpu_context)
		return -ENOMEM;
	return 0;
}

/*
 * hv_post_message - Post a message using the hypervisor message IPC.
 *
 * This involves a hypercall.
 */
int hv_post_message(union hv_connection_id connection_id,
		    enum hv_message_type message_type,
		    void *payload, size_t payload_size)
{
	struct hv_input_post_message *aligned_msg;
	unsigned long flags;
	u64 status;

	if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
		return -EMSGSIZE;

	local_irq_save(flags);

	/*
	 * A TDX VM with the paravisor must use the decrypted post_msg_page: see
	 * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor
	 * can use the encrypted hyperv_pcpu_input_arg because it copies the
	 * input into the GHCB page, which has been decrypted by the paravisor.
	 */
	if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present)
		aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page;
	else
		aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);

	aligned_msg->connectionid = connection_id;
	aligned_msg->reserved = 0;
	aligned_msg->message_type = message_type;
	aligned_msg->payload_size = payload_size;
	memcpy((void *)aligned_msg->payload, payload, payload_size);

	if (ms_hyperv.paravisor_present) {
		if (hv_isolation_type_tdx())
			status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
						  virt_to_phys(aligned_msg), 0);
		else if (hv_isolation_type_snp())
			status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
						   aligned_msg, NULL,
						   sizeof(*aligned_msg));
		else
			status = HV_STATUS_INVALID_PARAMETER;
	} else {
		u64 control = HVCALL_POST_MESSAGE;

		control |= hv_nested ? HV_HYPERCALL_NESTED : 0;
		status = hv_do_hypercall(control, aligned_msg, NULL);
	}

	local_irq_restore(flags);

	return hv_result(status);
}

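/*
 * hv_synic_alloc - Allocate the per-CPU state used by the SynIC.
 *
 * For each present CPU, set up the message-handling tasklet and, when the
 * pages are not supplied by a paravisor or the hypervisor (root partition),
 * allocate the SynIC message and event pages, decrypting them in SNP/TDX
 * isolation VMs without a paravisor. A TDX VM with a paravisor additionally
 * gets a decrypted page for posting messages. Also allocates the per-node
 * CPU map (hv_context.hv_numa_map). Returns 0 on success or a negative
 * errno; on failure the caller is expected to clean up via hv_synic_free().
 */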
int hv_synic_alloc(void)
{
	int cpu, ret = -ENOMEM;
	struct hv_per_cpu_context *hv_cpu;

	/*
	 * First, zero all per-cpu memory areas so hv_synic_free() can
	 * detect what memory has been allocated and cleanup properly
	 * after any failures.
	 */
	for_each_present_cpu(cpu) {
		hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);
		memset(hv_cpu, 0, sizeof(*hv_cpu));
	}

	hv_context.hv_numa_map = kcalloc(nr_node_ids, sizeof(struct cpumask),
					 GFP_KERNEL);
	if (!hv_context.hv_numa_map) {
		pr_err("Unable to allocate NUMA map\n");
		goto err;
	}

	for_each_present_cpu(cpu) {
		hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu);

		tasklet_init(&hv_cpu->msg_dpc,
			     vmbus_on_msg_dpc, (unsigned long)hv_cpu);

		if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
			hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
			if (!hv_cpu->post_msg_page) {
				pr_err("Unable to allocate post msg page\n");
				goto err;
			}

			ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
			if (ret) {
				pr_err("Failed to decrypt post msg page: %d\n", ret);
				/* Just leak the page, as it's unsafe to free the page. */
				hv_cpu->post_msg_page = NULL;
				goto err;
			}

			memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
		}

		/*
		 * Synic message and event pages are allocated by the paravisor.
		 * Skip allocating these pages here.
		 */
		if (!ms_hyperv.paravisor_present && !hv_root_partition()) {
			hv_cpu->synic_message_page =
				(void *)get_zeroed_page(GFP_ATOMIC);
			if (!hv_cpu->synic_message_page) {
				pr_err("Unable to allocate SYNIC message page\n");
				goto err;
			}

			hv_cpu->synic_event_page =
				(void *)get_zeroed_page(GFP_ATOMIC);
			if (!hv_cpu->synic_event_page) {
				pr_err("Unable to allocate SYNIC event page\n");

				free_page((unsigned long)hv_cpu->synic_message_page);
				hv_cpu->synic_message_page = NULL;
				goto err;
			}
		}

		if (!ms_hyperv.paravisor_present &&
		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
			ret = set_memory_decrypted((unsigned long)
				hv_cpu->synic_message_page, 1);
			if (ret) {
				pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
				hv_cpu->synic_message_page = NULL;

				/*
				 * Free the event page here so that hv_synic_free()
				 * won't later try to re-encrypt it.
				 */
				free_page((unsigned long)hv_cpu->synic_event_page);
				hv_cpu->synic_event_page = NULL;
				goto err;
			}

			ret = set_memory_decrypted((unsigned long)
				hv_cpu->synic_event_page, 1);
			if (ret) {
				pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
				hv_cpu->synic_event_page = NULL;
				goto err;
			}

			memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
			memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
		}
	}

	return 0;

err:
	/*
	 * Any memory allocations that succeeded will be freed when
	 * the caller cleans up by calling hv_synic_free()
	 */
	return ret;
}

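/*
 * hv_synic_free - Free the per-CPU state allocated by hv_synic_alloc().
 *
 * Pages that were decrypted in an isolation VM are re-encrypted first; if
 * re-encryption fails, the page is deliberately leaked rather than returned
 * to the allocator in a decrypted state. Also frees the NUMA map.
 */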
void hv_synic_free(void)
{
	int cpu, ret;

	for_each_present_cpu(cpu) {
		struct hv_per_cpu_context *hv_cpu =
			per_cpu_ptr(hv_context.cpu_context, cpu);

		/* It's better to leak the page if the encryption fails. */
		if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
			if (hv_cpu->post_msg_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->post_msg_page, 1);
				if (ret) {
					pr_err("Failed to encrypt post msg page: %d\n", ret);
					hv_cpu->post_msg_page = NULL;
				}
			}
		}

		if (!ms_hyperv.paravisor_present &&
		    (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
			if (hv_cpu->synic_message_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->synic_message_page, 1);
				if (ret) {
					pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
					hv_cpu->synic_message_page = NULL;
				}
			}

			if (hv_cpu->synic_event_page) {
				ret = set_memory_encrypted((unsigned long)
					hv_cpu->synic_event_page, 1);
				if (ret) {
					pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
					hv_cpu->synic_event_page = NULL;
				}
			}
		}

		free_page((unsigned long)hv_cpu->post_msg_page);
		free_page((unsigned long)hv_cpu->synic_event_page);
		free_page((unsigned long)hv_cpu->synic_message_page);
	}

	kfree(hv_context.hv_numa_map);
}

/*
 * hv_synic_enable_regs - Program the SynIC registers for the specified CPU.
 *
 * If the message and event pages are provided by another entity (e.g. the
 * paravisor, or the hypervisor when running in the root partition), map and
 * use those pages. Otherwise, point the hypervisor at the message and event
 * pages that were allocated in hv_synic_alloc().
 */
void hv_synic_enable_regs(unsigned int cpu)
{
	struct hv_per_cpu_context *hv_cpu =
		per_cpu_ptr(hv_context.cpu_context, cpu);
	union hv_synic_simp simp;
	union hv_synic_siefp siefp;
	union hv_synic_sint shared_sint;
	union hv_synic_scontrol sctrl;

	/* Setup the Synic's message page */
	simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
	simp.simp_enabled = 1;

	if (ms_hyperv.paravisor_present || hv_root_partition()) {
		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
		u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
				~ms_hyperv.shared_gpa_boundary;
		hv_cpu->synic_message_page =
			(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
		if (!hv_cpu->synic_message_page)
			pr_err("Fail to map synic message page.\n");
	} else {
		simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
			>> HV_HYP_PAGE_SHIFT;
	}

	hv_set_msr(HV_MSR_SIMP, simp.as_uint64);

	/* Setup the Synic's event page */
	siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
	siefp.siefp_enabled = 1;

	if (ms_hyperv.paravisor_present || hv_root_partition()) {
		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
		u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
				~ms_hyperv.shared_gpa_boundary;
		hv_cpu->synic_event_page =
			(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
		if (!hv_cpu->synic_event_page)
			pr_err("Fail to map synic event page.\n");
	} else {
		siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
			>> HV_HYP_PAGE_SHIFT;
	}

	hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);

	/* Setup the shared SINT. */
	if (vmbus_irq != -1)
		enable_percpu_irq(vmbus_irq, 0);
	shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);

	shared_sint.vector = vmbus_interrupt;
	shared_sint.masked = false;
	shared_sint.auto_eoi = hv_recommend_using_aeoi();
	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);

	/* Enable the global synic bit */
	sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
	sctrl.enable = 1;

	hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
}

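/*
 * hv_synic_init - Initialize the SynIC on the specified CPU.
 *
 * Enables the SynIC registers and sets up the legacy Hyper-V stimer for
 * this CPU. Always returns 0.
 */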
int hv_synic_init(unsigned int cpu)
{
	hv_synic_enable_regs(cpu);

	hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);

	return 0;
}

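/*
 * hv_synic_disable_regs - Undo hv_synic_enable_regs() for the specified CPU.
 *
 * Masks the VMBus SINT, disables the message and event pages (unmapping them
 * when they were mapped from the paravisor/hypervisor, but keeping their
 * addresses so a kdump kernel can reuse them), and clears the global SynIC
 * enable bit.
 */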
void hv_synic_disable_regs(unsigned int cpu)
{
	struct hv_per_cpu_context *hv_cpu =
		per_cpu_ptr(hv_context.cpu_context, cpu);
	union hv_synic_sint shared_sint;
	union hv_synic_simp simp;
	union hv_synic_siefp siefp;
	union hv_synic_scontrol sctrl;

	shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);

	shared_sint.masked = 1;

	/* Need to correctly cleanup in the case of SMP!!! */
	/* Disable the interrupt */
	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);

	simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
	/*
	 * In an isolation VM, the SIMP and SIEFP pages are allocated by the
	 * paravisor. These pages will also be used by the kdump kernel, so
	 * just reset the enable bit here and keep the page addresses.
	 */
	simp.simp_enabled = 0;
	if (ms_hyperv.paravisor_present || hv_root_partition()) {
		iounmap(hv_cpu->synic_message_page);
		hv_cpu->synic_message_page = NULL;
	} else {
		simp.base_simp_gpa = 0;
	}

	hv_set_msr(HV_MSR_SIMP, simp.as_uint64);

	siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
	siefp.siefp_enabled = 0;

	if (ms_hyperv.paravisor_present || hv_root_partition()) {
		iounmap(hv_cpu->synic_event_page);
		hv_cpu->synic_event_page = NULL;
	} else {
		siefp.base_siefp_gpa = 0;
	}

	hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);

	/* Disable the global synic bit */
	sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
	sctrl.enable = 0;
	hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);

	if (vmbus_irq != -1)
		disable_percpu_irq(vmbus_irq);
}

#define HV_MAX_TRIES 3
/*
 * Scan the event flags page of 'this' CPU looking for any bit that is set. If we find one
 * bit set, then wait for a few milliseconds. Repeat these steps up to HV_MAX_TRIES times.
 * Return 'true' if any bit is still set after this operation; 'false' otherwise.
 *
 * If a bit is set, that means there is a pending channel interrupt. The expectation is
 * that the normal interrupt handling mechanism will find and process the channel interrupt
 * "very soon", and in the process clear the bit.
 */
static bool hv_synic_event_pending(void)
{
	struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context);
	union hv_synic_event_flags *event =
		(union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT;
	unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */
	bool pending;
	u32 relid;
	int tries = 0;

retry:
	pending = false;
	for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) {
		/* Special case - VMBus channel protocol messages */
		if (relid == 0)
			continue;
		pending = true;
		break;
	}
	if (pending && tries++ < HV_MAX_TRIES) {
		usleep_range(10000, 20000);
		goto retry;
	}
	return pending;
}

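/*
 * hv_pick_new_cpu - Rebind a channel away from the CPU that is going offline.
 *
 * Try each online CPU, starting from a random one, other than the channel's
 * current target and VMBUS_CONNECT_CPU; if none can be set, fall back to
 * VMBUS_CONNECT_CPU. Requires a VMBus protocol version that supports
 * changing a channel's target CPU (WIN10_V5_3 or later).
 */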
static int hv_pick_new_cpu(struct vmbus_channel *channel)
{
	int ret = -EBUSY;
	int start;
	int cpu;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&vmbus_connection.channel_mutex);

	/*
	 * We can't assume that the relevant interrupts will be sent before
	 * the cpu is offlined on older versions of hyperv.
	 */
	if (vmbus_proto_version < VERSION_WIN10_V5_3)
		return -EBUSY;

	start = get_random_u32_below(nr_cpu_ids);

	for_each_cpu_wrap(cpu, cpu_online_mask, start) {
		if (channel->target_cpu == cpu ||
		    channel->target_cpu == VMBUS_CONNECT_CPU)
			continue;

		ret = vmbus_channel_set_cpu(channel, cpu);
		if (!ret)
			break;
	}

	if (ret)
		ret = vmbus_channel_set_cpu(channel, VMBUS_CONNECT_CPU);

	return ret;
}

/*
 * hv_synic_cleanup - Cleanup routine for hv_synic_init().
 */
int hv_synic_cleanup(unsigned int cpu)
{
	struct vmbus_channel *channel, *sc;
	int ret = 0;

	if (vmbus_connection.conn_state != CONNECTED)
		goto always_cleanup;

	/*
	 * Hyper-V does not provide a way to change the connect CPU once
	 * it is set; we must prevent the connect CPU from going offline
	 * while the VM is running normally. But in the panic or kexec()
	 * path where the vmbus is already disconnected, the CPU must be
	 * allowed to shut down.
	 */
	if (cpu == VMBUS_CONNECT_CPU)
		return -EBUSY;

	/*
	 * Search for channels which are bound to the CPU we're about to
	 * clean up.
	 */
	mutex_lock(&vmbus_connection.channel_mutex);
	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
		if (channel->target_cpu == cpu) {
			ret = hv_pick_new_cpu(channel);
			if (ret) {
				mutex_unlock(&vmbus_connection.channel_mutex);
				return ret;
			}
		}
		list_for_each_entry(sc, &channel->sc_list, sc_list) {
			if (sc->target_cpu == cpu) {
				ret = hv_pick_new_cpu(sc);
				if (ret) {
					mutex_unlock(&vmbus_connection.channel_mutex);
					return ret;
				}
			}
		}
	}
	mutex_unlock(&vmbus_connection.channel_mutex);

	/*
	 * Scan the event flags page looking for bits that are set and waiting
	 * with a timeout for vmbus_chan_sched() to process such bits. If bits
	 * are still set after this operation and VMBus is connected, fail the
	 * CPU offlining operation.
	 */
	if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending())
		return -EBUSY;

always_cleanup:
	hv_stimer_legacy_cleanup(cpu);

	hv_synic_disable_regs(cpu);

	return ret;
}