#include "kvm/kvm.h"
#include "kvm/boot-protocol.h"
#include "kvm/cpufeature.h"
#include "kvm/interrupt.h"
#include "kvm/mptable.h"
#include "kvm/util.h"
#include "kvm/8250-serial.h"
#include "kvm/virtio-console.h"

#include <asm/bootparam.h>
#include <linux/kvm.h>

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>

/* KVM capabilities the host kernel must support for us to run */
struct kvm_ext kvm_req_ext[] = {
	{ DEFINE_KVM_EXT(KVM_CAP_COALESCED_MMIO) },
	{ DEFINE_KVM_EXT(KVM_CAP_SET_TSS_ADDR) },
	{ DEFINE_KVM_EXT(KVM_CAP_PIT2) },
	{ DEFINE_KVM_EXT(KVM_CAP_USER_MEMORY) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_ROUTING) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQCHIP) },
	{ DEFINE_KVM_EXT(KVM_CAP_HLT) },
	{ DEFINE_KVM_EXT(KVM_CAP_IRQ_INJECT_STATUS) },
	{ DEFINE_KVM_EXT(KVM_CAP_EXT_CPUID) },
	{ 0, 0 }
};

/* Check CPUID for hardware virtualization support: VMX on Intel, SVM on AMD */
bool kvm__arch_cpu_supports_vm(void)
{
	struct cpuid_regs regs;
	u32 eax_base;
	int feature;

	regs = (struct cpuid_regs) {
		.eax = 0x00,
	};
	host_cpuid(&regs);

	switch (regs.ebx) {
	case CPUID_VENDOR_INTEL_1:
		eax_base = 0x00;
		feature = KVM__X86_FEATURE_VMX;
		break;

	case CPUID_VENDOR_AMD_1:
		eax_base = 0x80000000;
		feature = KVM__X86_FEATURE_SVM;
		break;

	default:
		return false;
	}

	regs = (struct cpuid_regs) {
		.eax = eax_base,
	};
	host_cpuid(&regs);

	if (regs.eax < eax_base + 0x01)
		return false;

	regs = (struct cpuid_regs) {
		.eax = eax_base + 0x01
	};
	host_cpuid(&regs);

	return regs.ecx & (1 << feature);
}

/*
 * Allocating RAM size bigger than 4GB requires us to leave a gap
 * in the RAM which is used for PCI MMIO, hotplug, and unconfigured
 * devices (see documentation of e820_setup_gap() for details).
 *
 * If we're required to initialize RAM bigger than 4GB, we will create
 * a gap between 0xe0000000 and 0x100000000 in the guest physical address space.
 */

void kvm__init_ram(struct kvm *kvm)
{
	u64 phys_start, phys_size;
	void *host_mem;

	if (kvm->ram_size < KVM_32BIT_GAP_START) {
		/* Use a single block of RAM for 32bit RAM */

		phys_start = 0;
		phys_size = kvm->ram_size;
		host_mem = kvm->ram_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
	} else {
		/* First RAM range from zero to the PCI gap: */

		phys_start = 0;
		phys_size = KVM_32BIT_GAP_START;
		host_mem = kvm->ram_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);

		/* Second RAM range from 4GB to the end of RAM: */

		phys_start = KVM_32BIT_MAX_MEM_SIZE;
		phys_size = kvm->ram_size - phys_start;
		host_mem = kvm->ram_start + phys_start;

		kvm__register_mem(kvm, phys_start, phys_size, host_mem);
	}
}

/* Arch-specific command line setup */
void kvm__arch_set_cmdline(char *cmdline, bool video)
{
	strcpy(cmdline, "noapic noacpi pci=conf1 reboot=k panic=1 i8042.direct=1 "
			"i8042.dumbkbd=1 i8042.nopnp=1");
	if (video)
		strcat(cmdline, " video=vesafb console=tty0");
	else
		strcat(cmdline, " console=ttyS0 earlyprintk=serial i8042.noaux=1");
}

/* This function wraps the decision between hugetlbfs map (if requested) or normal mmap */
static void *mmap_anon_or_hugetlbfs(const char *hugetlbfs_path, u64 size)
{
	if (hugetlbfs_path)
		/*
		 * We don't /need/ to map guest RAM from hugetlbfs, but we do so
		 * if the user specifies a hugetlbfs path.
		 */
		return mmap_hugetlbfs(hugetlbfs_path, size);
	else
		return mmap(NULL, size, PROT_RW, MAP_ANON_NORESERVE, -1, 0);
}

/* Architecture-specific KVM init */
void kvm__arch_init(struct kvm *kvm, const char *hugetlbfs_path, u64 ram_size)
{
	struct kvm_pit_config pit_config = { .flags = 0, };
	int ret;

	ret = ioctl(kvm->vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000);
	if (ret < 0)
		die_perror("KVM_SET_TSS_ADDR ioctl");

	ret = ioctl(kvm->vm_fd, KVM_CREATE_PIT2, &pit_config);
	if (ret < 0)
		die_perror("KVM_CREATE_PIT2 ioctl");

	if (ram_size < KVM_32BIT_GAP_START) {
		kvm->ram_size = ram_size;
		kvm->ram_start = mmap_anon_or_hugetlbfs(hugetlbfs_path, ram_size);
	} else {
		kvm->ram_start = mmap_anon_or_hugetlbfs(hugetlbfs_path, ram_size + KVM_32BIT_GAP_SIZE);
		kvm->ram_size = ram_size + KVM_32BIT_GAP_SIZE;
		if (kvm->ram_start != MAP_FAILED)
			/*
			 * We mprotect the gap (see kvm__init_ram() for details) PROT_NONE
			 * so that if we accidentally write to it, we will know.
			 */
			mprotect(kvm->ram_start + KVM_32BIT_GAP_START, KVM_32BIT_GAP_SIZE, PROT_NONE);
	}
	if (kvm->ram_start == MAP_FAILED)
		die("out of memory");

	madvise(kvm->ram_start, kvm->ram_size, MADV_MERGEABLE);

	ret = ioctl(kvm->vm_fd, KVM_CREATE_IRQCHIP);
	if (ret < 0)
		die_perror("KVM_CREATE_IRQCHIP ioctl");
}

void kvm__arch_delete_ram(struct kvm *kvm)
{
	if (kvm->ram_size < KVM_32BIT_GAP_START)
		munmap(kvm->ram_start, kvm->ram_size);
	else
		munmap(kvm->ram_start, kvm->ram_size + KVM_32BIT_GAP_SIZE);
}

void kvm__irq_line(struct kvm *kvm, int irq, int level)
{
	struct kvm_irq_level irq_level;

	irq_level = (struct kvm_irq_level) {
		{
			.irq = irq,
		},
		.level = level,
	};

	if (ioctl(kvm->vm_fd, KVM_IRQ_LINE, &irq_level) < 0)
		die_perror("KVM_IRQ_LINE failed");
}

void kvm__irq_trigger(struct kvm *kvm, int irq)
{
	kvm__irq_line(kvm, irq, 1);
	kvm__irq_line(kvm, irq, 0);
}

#define BOOT_LOADER_SELECTOR	0x1000
#define BOOT_LOADER_IP		0x0000
#define BOOT_LOADER_SP		0x8000
#define BOOT_CMDLINE_OFFSET	0x20000

#define BOOT_PROTOCOL_REQUIRED	0x206
#define LOAD_HIGH		0x01

int load_flat_binary(struct kvm *kvm, int fd_kernel, int fd_initrd, const char *kernel_cmdline)
{
	void *p;
	int nr;

	/*
	 * Some architectures may support loading an initrd alongside the flat kernel,
	 * but we do not.
	 */
	if (fd_initrd != -1)
		pr_warning("Loading initrd with flat binary not supported.");

	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
		die_perror("lseek");

	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);

	while ((nr = read(fd_kernel, p, 65536)) > 0)
		p += nr;

	kvm->boot_selector = BOOT_LOADER_SELECTOR;
	kvm->boot_ip = BOOT_LOADER_IP;
	kvm->boot_sp = BOOT_LOADER_SP;

	return true;
}

static const char *BZIMAGE_MAGIC = "HdrS";

bool load_bzimage(struct kvm *kvm, int fd_kernel,
		  int fd_initrd, const char *kernel_cmdline, u16 vidmode)
{
	struct boot_params *kern_boot;
	unsigned long setup_sects;
	struct boot_params boot;
	size_t cmdline_size;
	ssize_t setup_size;
	void *p;
	int nr;

	/*
	 * See Documentation/x86/boot.txt for details of the bzImage on-disk and
	 * memory layout.
	 */

	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
		die_perror("lseek");

	if (read(fd_kernel, &boot, sizeof(boot)) != sizeof(boot))
		return false;

	if (memcmp(&boot.hdr.header, BZIMAGE_MAGIC, strlen(BZIMAGE_MAGIC)))
		return false;

	if (boot.hdr.version < BOOT_PROTOCOL_REQUIRED)
		die("Too old kernel");

	if (lseek(fd_kernel, 0, SEEK_SET) < 0)
		die_perror("lseek");

	if (!boot.hdr.setup_sects)
		boot.hdr.setup_sects = BZ_DEFAULT_SETUP_SECTS;
	setup_sects = boot.hdr.setup_sects + 1;

	setup_size = setup_sects << 9;
	p = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, BOOT_LOADER_IP);

	/* copy setup.bin to guest memory */
	if (read(fd_kernel, p, setup_size) != setup_size)
		die_perror("read");

	/* copy vmlinux.bin to BZ_KERNEL_START */
	p = guest_flat_to_host(kvm, BZ_KERNEL_START);

	while ((nr = read(fd_kernel, p, 65536)) > 0)
		p += nr;

	p = guest_flat_to_host(kvm, BOOT_CMDLINE_OFFSET);
	if (kernel_cmdline) {
		cmdline_size = strlen(kernel_cmdline) + 1;
		if (cmdline_size > boot.hdr.cmdline_size)
			cmdline_size = boot.hdr.cmdline_size;

		memset(p, 0, boot.hdr.cmdline_size);
		memcpy(p, kernel_cmdline, cmdline_size - 1);
	}

	kern_boot = guest_real_to_host(kvm, BOOT_LOADER_SELECTOR, 0x00);

	kern_boot->hdr.cmd_line_ptr = BOOT_CMDLINE_OFFSET;
	kern_boot->hdr.type_of_loader = 0xff;
	kern_boot->hdr.heap_end_ptr = 0xfe00;
	kern_boot->hdr.loadflags |= CAN_USE_HEAP;
	kern_boot->hdr.vid_mode = vidmode;

	/*
	 * Read initrd image into guest memory
	 */
	if (fd_initrd >= 0) {
		struct stat initrd_stat;
		unsigned long addr;

		if (fstat(fd_initrd, &initrd_stat))
			die_perror("fstat");

		/*
		 * Find the highest 1MB-aligned address, at or below
		 * initrd_addr_max, where the initrd still fits below the
		 * end of guest RAM.
		 */
		addr = boot.hdr.initrd_addr_max & ~0xfffff;
		for (;;) {
			if (addr < BZ_KERNEL_START)
				die("Not enough memory for initrd");
			else if (addr < (kvm->ram_size - initrd_stat.st_size))
				break;
			addr -= 0x100000;
		}

		p = guest_flat_to_host(kvm, addr);
		nr = read(fd_initrd, p, initrd_stat.st_size);
		if (nr != initrd_stat.st_size)
die("Failed to read initrd"); 336af7b0868SMatt Evans 337af7b0868SMatt Evans kern_boot->hdr.ramdisk_image = addr; 338af7b0868SMatt Evans kern_boot->hdr.ramdisk_size = initrd_stat.st_size; 339af7b0868SMatt Evans } 340af7b0868SMatt Evans 341af7b0868SMatt Evans kvm->boot_selector = BOOT_LOADER_SELECTOR; 342af7b0868SMatt Evans /* 343af7b0868SMatt Evans * The real-mode setup code starts at offset 0x200 of a bzImage. See 344af7b0868SMatt Evans * Documentation/x86/boot.txt for details. 345af7b0868SMatt Evans */ 346af7b0868SMatt Evans kvm->boot_ip = BOOT_LOADER_IP + 0x200; 347af7b0868SMatt Evans kvm->boot_sp = BOOT_LOADER_SP; 348af7b0868SMatt Evans 349af7b0868SMatt Evans return true; 350af7b0868SMatt Evans } 351af7b0868SMatt Evans 352af7b0868SMatt Evans /** 353af7b0868SMatt Evans * kvm__arch_setup_firmware - inject BIOS into guest system memory 354af7b0868SMatt Evans * @kvm - guest system descriptor 355af7b0868SMatt Evans * 356af7b0868SMatt Evans * This function is a main routine where we poke guest memory 357af7b0868SMatt Evans * and install BIOS there. 358af7b0868SMatt Evans */ 359f7f9d02bSCyrill Gorcunov int kvm__arch_setup_firmware(struct kvm *kvm) 360af7b0868SMatt Evans { 361af7b0868SMatt Evans /* standart minimal configuration */ 362af7b0868SMatt Evans setup_bios(kvm); 363af7b0868SMatt Evans 364af7b0868SMatt Evans /* FIXME: SMP, ACPI and friends here */ 365af7b0868SMatt Evans 366af7b0868SMatt Evans /* MP table */ 367f7f9d02bSCyrill Gorcunov return mptable_setup(kvm, kvm->nrcpus); 368af7b0868SMatt Evans } 3690b69bdefSMatt Evans 3700b69bdefSMatt Evans void kvm__arch_periodic_poll(struct kvm *kvm) 3710b69bdefSMatt Evans { 372f6b8ccc1SThomas Gleixner serial8250__update_consoles(kvm); 3730b69bdefSMatt Evans virtio_console__inject_interrupt(kvm); 3740b69bdefSMatt Evans } 375