/*
 * VFIO regions
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include "hw/vfio/vfio-region.h"
#include "hw/vfio/vfio-device.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "monitor/monitor.h"
#include "vfio-helpers.h"

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

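/*
 * Read handler counterpart to vfio_region_write(): a failed pread() is
 * reported with error_report() and the access returns all ones (-1).
 */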
uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

static const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};

static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        if (sparse->areas[i].size) {
            trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                                sparse->areas[i].offset +
                                                sparse->areas[i].size - 1);
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}

int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    g_autofree struct vfio_region_info *info = NULL;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}

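/*
 * Tear down one sparse mmap subregion: remove it from the containing
 * MemoryRegion, unmap the host mapping and unparent the MemoryRegion
 * object backing it.
 */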
static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

int vfio_region_mmap(VFIORegion *region)
{
    int i, ret, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
        void *map_base, *map_align;

        /*
         * Align the mmap for more efficient mapping in the kernel.  Ideally
         * we'd know the PMD and PUD mapping sizes to use as discrete alignment
         * intervals, but we don't.  As of Linux v6.12, the largest PUD size
         * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
         * on x86_64).  Align by power-of-two size, capped at 1GiB.
         *
         * NB. qemu_memalign() and friends actually allocate memory, whereas
         * the region size here can exceed host memory, therefore we manually
         * create an oversized anonymous mapping and clean it up for alignment.
         */
        map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map_base == MAP_FAILED) {
            ret = -errno;
            goto no_mmap;
        }

        map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
        munmap(map_base, map_align - map_base);
        munmap(map_align + region->mmaps[i].size,
               align - (map_align - map_base));

        region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
                                     MAP_SHARED | MAP_FIXED,
                                     region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            ret = -errno;
            goto no_mmap;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;

no_mmap:
    trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                 region->fd_offset + region->mmaps[i].offset,
                                 region->fd_offset + region->mmaps[i].offset +
                                 region->mmaps[i].size - 1, ret);

    region->mmaps[i].mmap = NULL;

    for (i--; i >= 0; i--) {
        vfio_subregion_unmap(region, i);
    }

    return ret;
}

void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

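/*
 * Detach the mmap subregions from the container on device exit.  The host
 * mappings themselves stay in place until vfio_region_finalize().
 */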
void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}
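
/*
 * Illustrative lifecycle sketch (not a function of this file): a device
 * backend would typically drive a region roughly as follows; error handling
 * is omitted, and mapping region.mem into the device's address space is
 * left to the caller:
 *
 *     vfio_region_setup(obj, vbasedev, &region, index, "region-name");
 *     vfio_region_mmap(&region);
 *     vfio_region_mmaps_set_enabled(&region, true);
 *     ...
 *     vfio_region_exit(&region);
 *     vfio_region_finalize(&region);
 */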