xref: /qemu/hw/vfio-user/pci.c (revision aec6836c73403cffa56b9a4c5556451ee16071fe)
1 /*
2  * vfio PCI device over a UNIX socket.
3  *
4  * Copyright © 2018, 2021 Oracle and/or its affiliates.
5  *
6  * SPDX-License-Identifier: GPL-2.0-or-later
7  */
8 
9 #include <sys/ioctl.h>
10 #include "qemu/osdep.h"
11 #include "qapi-visit-sockets.h"
12 #include "qemu/error-report.h"
13 
14 #include "hw/qdev-properties.h"
15 #include "hw/vfio/pci.h"
16 #include "hw/vfio-user/device.h"
17 #include "hw/vfio-user/proxy.h"
18 
19 #define TYPE_VFIO_USER_PCI "vfio-user-pci"
20 OBJECT_DECLARE_SIMPLE_TYPE(VFIOUserPCIDevice, VFIO_USER_PCI)
21 
/* State of one vfio-user PCI device proxied over a UNIX socket. */
struct VFIOUserPCIDevice {
    VFIOPCIDevice device;
    SocketAddress *socket;  /* server's UNIX socket (see "socket" property) */
    bool send_queued;   /* all sends are queued */
    uint32_t wait_time; /* timeout for message replies, in ms (default 5000) */
    bool no_post;       /* all region writes are sync */
};
29 
30 /*
31  * The server maintains the device's pending interrupts,
32  * via its MSIX table and PBA, so we treat these accesses
33  * like PCI config space and forward them.
34  */
vfio_user_pba_read(void * opaque,hwaddr addr,unsigned size)35 static uint64_t vfio_user_pba_read(void *opaque, hwaddr addr,
36                                    unsigned size)
37 {
38     VFIOPCIDevice *vdev = opaque;
39     VFIORegion *region = &vdev->bars[vdev->msix->pba_bar].region;
40     uint64_t data;
41 
42     /* server copy is what matters */
43     data = vfio_region_read(region, addr + vdev->msix->pba_offset, size);
44     return data;
45 }
46 
/*
 * Guest writes to the PBA are intentionally discarded: the server
 * maintains the real pending-bit state.
 */
static void vfio_user_pba_write(void *opaque, hwaddr addr,
                                  uint64_t data, unsigned size)
{
    /* dropped */
}
52 
/* MMIO ops overlaying the MSIX PBA region (see vfio_user_msix_setup). */
static const MemoryRegionOps vfio_user_pba_ops = {
    .read = vfio_user_pba_read,
    .write = vfio_user_pba_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
};
58 
/*
 * Overlay an I/O region on top of the PBA so guest accesses are
 * forwarded to the server instead of using the local MSIX emulation.
 */
static void vfio_user_msix_setup(VFIOPCIDevice *vdev)
{
    MemoryRegion *bar_mr = vdev->bars[vdev->msix->pba_bar].mr;
    MemoryRegion *msix_mr = &vdev->pdev.msix_pba_mmio;
    MemoryRegion *pba_mr = g_new0(MemoryRegion, 1);

    vdev->msix->pba_region = pba_mr;
    memory_region_init_io(pba_mr, OBJECT(vdev), &vfio_user_pba_ops, vdev,
                          "VFIO MSIX PBA", int128_get64(msix_mr->size));
    /* priority 1 so this subregion wins over the underlying BAR mapping */
    memory_region_add_subregion_overlap(bar_mr, vdev->msix->pba_offset,
                                        pba_mr, 1);
}
73 
/* Undo vfio_user_msix_setup: unmap and free the PBA overlay region. */
static void vfio_user_msix_teardown(VFIOPCIDevice *vdev)
{
    MemoryRegion *bar_mr = vdev->bars[vdev->msix->pba_bar].mr;

    memory_region_del_subregion(bar_mr, vdev->msix->pba_region);
    g_free(vdev->msix->pba_region);
    vdev->msix->pba_region = NULL;
}
85 
/*
 * Handle a VFIO_USER_DMA_READ request: the server asks us to read guest
 * memory on its behalf.  On success the data is sent back in the reply;
 * on failure an errno-style error reply is sent instead.
 *
 * @msg is consumed: it is either freed here after being copied into a
 * larger reply buffer, or (presumably) handed off to the send machinery
 * via vfio_user_send_reply/vfio_user_send_error.
 */
static void vfio_user_dma_read(VFIOPCIDevice *vdev, VFIOUserDMARW *msg)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOUserProxy *proxy = vdev->vbasedev.proxy;
    VFIOUserDMARW *res;
    MemTxResult r;
    size_t size;

    /* the request must at least contain a complete DMA message */
    if (msg->hdr.size < sizeof(*msg)) {
        vfio_user_send_error(proxy, &msg->hdr, EINVAL);
        return;
    }
    /* refuse reads larger than the negotiated transfer size */
    if (msg->count > proxy->max_xfer_size) {
        vfio_user_send_error(proxy, &msg->hdr, E2BIG);
        return;
    }

    /* switch to our own message buffer, sized to hold the reply data */
    size = msg->count + sizeof(VFIOUserDMARW);
    res = g_malloc0(size);
    memcpy(res, msg, sizeof(*res));
    g_free(msg);

    r = pci_dma_read(pdev, res->offset, &res->data, res->count);

    switch (r) {
    case MEMTX_OK:
        if (res->hdr.flags & VFIO_USER_NO_REPLY) {
            g_free(res);
            return;
        }
        vfio_user_send_reply(proxy, &res->hdr, size);
        break;
    case MEMTX_ERROR:
        vfio_user_send_error(proxy, &res->hdr, EFAULT);
        break;
    case MEMTX_DECODE_ERROR:
        vfio_user_send_error(proxy, &res->hdr, ENODEV);
        break;
    case MEMTX_ACCESS_ERROR:
        vfio_user_send_error(proxy, &res->hdr, EPERM);
        break;
    default:
        error_printf("vfio_user_dma_read unknown error %d\n", r);
        /* use the cached proxy like every other branch */
        vfio_user_send_error(proxy, &res->hdr, EINVAL);
    }
}
133 
/*
 * Handle a VFIO_USER_DMA_WRITE request: the server asks us to write the
 * message's payload into guest memory.  A bare-header ack is sent on
 * success (unless the server asked for no reply), an errno-style error
 * reply on failure.
 *
 * @msg is consumed: freed explicitly in the no-reply case, otherwise
 * (presumably) handed off via vfio_user_send_reply/vfio_user_send_error.
 */
static void vfio_user_dma_write(VFIOPCIDevice *vdev, VFIOUserDMARW *msg)
{
    PCIDevice *pdev = &vdev->pdev;
    VFIOUserProxy *proxy = vdev->vbasedev.proxy;
    MemTxResult r;

    /* the request must at least contain a complete DMA message */
    if (msg->hdr.size < sizeof(*msg)) {
        vfio_user_send_error(proxy, &msg->hdr, EINVAL);
        return;
    }
    /* make sure transfer count isn't larger than the message data */
    if (msg->count > msg->hdr.size - sizeof(*msg)) {
        vfio_user_send_error(proxy, &msg->hdr, E2BIG);
        return;
    }

    r = pci_dma_write(pdev, msg->offset, &msg->data, msg->count);

    switch (r) {
    case MEMTX_OK:
        if ((msg->hdr.flags & VFIO_USER_NO_REPLY) == 0) {
            vfio_user_send_reply(proxy, &msg->hdr, sizeof(msg->hdr));
        } else {
            g_free(msg);
        }
        break;
    case MEMTX_ERROR:
        vfio_user_send_error(proxy, &msg->hdr, EFAULT);
        break;
    case MEMTX_DECODE_ERROR:
        vfio_user_send_error(proxy, &msg->hdr, ENODEV);
        break;
    case MEMTX_ACCESS_ERROR:
        vfio_user_send_error(proxy, &msg->hdr, EPERM);
        break;
    default:
        error_printf("vfio_user_dma_write unknown error %d\n", r);
        /* use the cached proxy like every other branch */
        vfio_user_send_error(proxy, &msg->hdr, EINVAL);
    }
}
174 
175 /*
176  * Incoming request message callback.
177  *
178  * Runs off main loop, so BQL held.
179  */
vfio_user_pci_process_req(void * opaque,VFIOUserMsg * msg)180 static void vfio_user_pci_process_req(void *opaque, VFIOUserMsg *msg)
181 {
182     VFIOPCIDevice *vdev = opaque;
183     VFIOUserHdr *hdr = msg->hdr;
184 
185     /* no incoming PCI requests pass FDs */
186     if (msg->fds != NULL) {
187         vfio_user_send_error(vdev->vbasedev.proxy, hdr, EINVAL);
188         vfio_user_putfds(msg);
189         return;
190     }
191 
192     switch (hdr->command) {
193     case VFIO_USER_DMA_READ:
194         vfio_user_dma_read(vdev, (VFIOUserDMARW *)hdr);
195         break;
196     case VFIO_USER_DMA_WRITE:
197         vfio_user_dma_write(vdev, (VFIOUserDMARW *)hdr);
198         break;
199     default:
200         error_printf("vfio_user_pci_process_req unknown cmd %d\n",
201                      hdr->command);
202         vfio_user_send_error(vdev->vbasedev.proxy, hdr, ENOSYS);
203     }
204 }
205 
/*
 * Emulated devices don't use host hot reset; the device is reset via the
 * proxy instead (see vfio_user_pci_reset).
 */
static void vfio_user_compute_needs_reset(VFIODevice *vbasedev)
{
    vbasedev->needs_reset = false;
}
213 
vfio_user_pci_get_object(VFIODevice * vbasedev)214 static Object *vfio_user_pci_get_object(VFIODevice *vbasedev)
215 {
216     VFIOUserPCIDevice *vdev = container_of(vbasedev, VFIOUserPCIDevice,
217                                            device.vbasedev);
218 
219     return OBJECT(vdev);
220 }
221 
/* VFIODevice callbacks for the vfio-user transport. */
static VFIODeviceOps vfio_user_pci_ops = {
    .vfio_compute_needs_reset = vfio_user_compute_needs_reset,
    .vfio_eoi = vfio_pci_intx_eoi,
    .vfio_get_object = vfio_user_pci_get_object,
    /* No live migration support yet. */
    .vfio_save_config = NULL,
    .vfio_load_config = NULL,
};
230 
vfio_user_pci_realize(PCIDevice * pdev,Error ** errp)231 static void vfio_user_pci_realize(PCIDevice *pdev, Error **errp)
232 {
233     ERRP_GUARD();
234     VFIOUserPCIDevice *udev = VFIO_USER_PCI(pdev);
235     VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev);
236     VFIODevice *vbasedev = &vdev->vbasedev;
237     const char *sock_name;
238     AddressSpace *as;
239     SocketAddress addr;
240     VFIOUserProxy *proxy;
241 
242     if (!udev->socket) {
243         error_setg(errp, "No socket specified");
244         error_append_hint(errp, "e.g. -device '{"
245             "\"driver\":\"vfio-user-pci\", "
246             "\"socket\": {\"path\": \"/tmp/vfio-user.sock\", "
247             "\"type\": \"unix\"}'"
248             "}'\n");
249         return;
250     }
251 
252     sock_name = udev->socket->u.q_unix.path;
253 
254     vbasedev->name = g_strdup_printf("vfio-user:%s", sock_name);
255 
256     memset(&addr, 0, sizeof(addr));
257     addr.type = SOCKET_ADDRESS_TYPE_UNIX;
258     addr.u.q_unix.path = (char *)sock_name;
259     proxy = vfio_user_connect_dev(&addr, errp);
260     if (!proxy) {
261         return;
262     }
263     vbasedev->proxy = proxy;
264     vfio_user_set_handler(vbasedev, vfio_user_pci_process_req, vdev);
265 
266     vbasedev->name = g_strdup_printf("vfio-user:%s", sock_name);
267 
268     if (udev->send_queued) {
269         proxy->flags |= VFIO_PROXY_FORCE_QUEUED;
270     }
271 
272     if (udev->no_post) {
273         proxy->flags |= VFIO_PROXY_NO_POST;
274     }
275 
276     /* user specified or 5 sec default */
277     proxy->wait_time = udev->wait_time;
278 
279     if (!vfio_user_validate_version(proxy, errp)) {
280         goto error;
281     }
282 
283     /*
284      * Use socket-based device I/O instead of vfio kernel driver.
285      */
286     vbasedev->io_ops = &vfio_user_device_io_ops_sock;
287 
288     /*
289      * vfio-user devices are effectively mdevs (don't use a host iommu).
290      */
291     vbasedev->mdev = true;
292 
293     /*
294      * Enable per-region fds.
295      */
296     vbasedev->use_region_fds = true;
297 
298     as = pci_device_iommu_address_space(pdev);
299     if (!vfio_device_attach_by_iommu_type(TYPE_VFIO_IOMMU_USER,
300                                           vbasedev->name, vbasedev,
301                                           as, errp)) {
302         goto error;
303     }
304 
305     if (!vfio_pci_populate_device(vdev, errp)) {
306         goto error;
307     }
308 
309     if (!vfio_pci_config_setup(vdev, errp)) {
310         goto error;
311     }
312 
313     /*
314      * vfio_pci_config_setup will have registered the device's BARs
315      * and setup any MSIX BARs, so errors after it succeeds must
316      * use out_teardown
317      */
318 
319     if (!vfio_pci_add_capabilities(vdev, errp)) {
320         goto out_teardown;
321     }
322 
323     if (vdev->msix != NULL) {
324         vfio_user_msix_setup(vdev);
325     }
326 
327     if (!vfio_pci_interrupt_setup(vdev, errp)) {
328         goto out_teardown;
329     }
330 
331     vfio_pci_register_err_notifier(vdev);
332     vfio_pci_register_req_notifier(vdev);
333 
334     return;
335 
336 out_teardown:
337     vfio_pci_teardown_msi(vdev);
338     vfio_pci_bars_exit(vdev);
339 error:
340     error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
341     vfio_pci_put_device(vdev);
342 }
343 
/* QOM instance init: per-device defaults that don't need the socket. */
static void vfio_user_instance_init(Object *obj)
{
    PCIDevice *pdev = PCI_DEVICE(obj);
    VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
    VFIODevice *basedev = &vdev->vbasedev;

    device_add_bootindex_property(obj, &vdev->bootindex,
                                  "bootindex", NULL,
                                  &pdev->qdev);

    /* there is no host PCI address for a socket-backed device */
    vdev->host.domain = ~0U;
    vdev->host.bus = ~0U;
    vdev->host.slot = ~0U;
    vdev->host.function = ~0U;

    vfio_device_init(basedev, VFIO_DEVICE_TYPE_PCI, &vfio_user_pci_ops,
                     DEVICE(vdev), false);

    vdev->nv_gpudirect_clique = 0xFF;

    /*
     * QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
     * line, therefore, no need to wait to realize like other devices.
     */
    pdev->cap_present |= QEMU_PCI_CAP_EXPRESS;
}
369 
/* QOM instance finalize: tear down MSIX overlay, device, and proxy. */
static void vfio_user_instance_finalize(Object *obj)
{
    VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj);
    VFIODevice *vbasedev = &vdev->vbasedev;

    if (vdev->msix) {
        vfio_user_msix_teardown(vdev);
    }

    vfio_pci_put_device(vdev);

    /* proxy may be NULL if realize failed before connecting */
    if (vbasedev->proxy) {
        vfio_user_disconnect(vbasedev->proxy);
    }
}
385 
/* Device reset: bracket a server-side reset with the common pre/post. */
static void vfio_user_pci_reset(DeviceState *dev)
{
    VFIOPCIDevice *vdev = VFIO_PCI_BASE(dev);
    VFIODevice *vbasedev = &vdev->vbasedev;

    vfio_pci_pre_reset(vdev);
    if (vbasedev->reset_works) {
        /* ask the server to reset the device */
        vfio_user_device_reset(vbasedev->proxy);
    }
    vfio_pci_post_reset(vdev);
}
399 
static const Property vfio_user_pci_dev_properties[] = {
    /* PCI ID overrides, PCI_ANY_ID means "use the server's value" */
    DEFINE_PROP_UINT32("x-pci-vendor-id", VFIOPCIDevice,
                       vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-device-id", VFIOPCIDevice,
                       device_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-vendor-id", VFIOPCIDevice,
                       sub_vendor_id, PCI_ANY_ID),
    DEFINE_PROP_UINT32("x-pci-sub-device-id", VFIOPCIDevice,
                       sub_device_id, PCI_ANY_ID),
    /* force all sends through the queue (VFIO_PROXY_FORCE_QUEUED) */
    DEFINE_PROP_BOOL("x-send-queued", VFIOUserPCIDevice, send_queued, false),
    /* reply timeout in ms; copied to proxy->wait_time at realize */
    DEFINE_PROP_UINT32("x-msg-timeout", VFIOUserPCIDevice, wait_time, 5000),
    /* make all region writes synchronous (VFIO_PROXY_NO_POST) */
    DEFINE_PROP_BOOL("x-no-posted-writes", VFIOUserPCIDevice, no_post, false),
};
413 
/*
 * Setter for the "socket" property.  Only UNIX sockets are accepted,
 * and the address may not be changed once the proxy is connected.
 */
static void vfio_user_pci_set_socket(Object *obj, Visitor *v, const char *name,
                                     void *opaque, Error **errp)
{
    VFIOUserPCIDevice *udev = VFIO_USER_PCI(obj);

    if (udev->device.vbasedev.proxy) {
        error_setg(errp, "Proxy is connected");
        return;
    }

    /* drop any previously-set address before parsing the new one */
    qapi_free_SocketAddress(udev->socket);
    udev->socket = NULL;

    if (!visit_type_SocketAddress(v, name, &udev->socket, errp)) {
        return;
    }

    if (udev->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
        error_setg(errp, "Unsupported socket type %s",
                   SocketAddressType_str(udev->socket->type));
        qapi_free_SocketAddress(udev->socket);
        udev->socket = NULL;
        return;
    }
}
443 
/* Class init: wire up reset, properties, realize, and the socket prop. */
static void vfio_user_pci_dev_class_init(ObjectClass *klass, const void *data)
{
    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->desc = "VFIO over socket PCI device assignment";
    device_class_set_legacy_reset(dc, vfio_user_pci_reset);
    device_class_set_props(dc, vfio_user_pci_dev_properties);

    /* "socket" is write-only; it can't be changed once connected */
    object_class_property_add(klass, "socket", "SocketAddress", NULL,
                              vfio_user_pci_set_socket, NULL, NULL);
    object_class_property_set_description(klass, "socket",
                                          "SocketAddress (UNIX sockets only)");

    pdc->realize = vfio_user_pci_realize;
}
460 
/* QOM type registration for "vfio-user-pci". */
static const TypeInfo vfio_user_pci_dev_info = {
    .name = TYPE_VFIO_USER_PCI,
    .parent = TYPE_VFIO_PCI_BASE,
    .instance_size = sizeof(VFIOUserPCIDevice),
    .class_init = vfio_user_pci_dev_class_init,
    .instance_init = vfio_user_instance_init,
    .instance_finalize = vfio_user_instance_finalize,
};
469 
/* Register the vfio-user-pci type at module load. */
static void register_vfio_user_dev_type(void)
{
    type_register_static(&vfio_user_pci_dev_info);
}

type_init(register_vfio_user_dev_type)
476