xref: /qemu/hw/virtio/vhost-vdpa.c (revision 273e0003f0005cc17292dedae01e5edb0064b69c)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
26 #include "cpu.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 
30 /*
31  * Return one past the end of the end of section. Be careful with uint64_t
32  * conversions!
33  */
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
35 {
36     Int128 llend = int128_make64(section->offset_within_address_space);
37     llend = int128_add(llend, section->size);
38     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
39 
40     return llend;
41 }
42 
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
44                                                 uint64_t iova_min,
45                                                 uint64_t iova_max)
46 {
47     Int128 llend;
48 
49     if ((!memory_region_is_ram(section->mr) &&
50          !memory_region_is_iommu(section->mr)) ||
51         memory_region_is_protected(section->mr) ||
52         /* vhost-vDPA doesn't allow MMIO to be mapped  */
53         memory_region_is_ram_device(section->mr)) {
54         return true;
55     }
56 
57     if (section->offset_within_address_space < iova_min) {
58         error_report("RAM section out of device range (min=0x%" PRIx64
59                      ", addr=0x%" HWADDR_PRIx ")",
60                      iova_min, section->offset_within_address_space);
61         return true;
62     }
63 
64     llend = vhost_vdpa_section_end(section);
65     if (int128_gt(llend, int128_make64(iova_max))) {
66         error_report("RAM section out of device range (max=0x%" PRIx64
67                      ", end addr=0x%" PRIx64 ")",
68                      iova_max, int128_get64(llend));
69         return true;
70     }
71 
72     return false;
73 }
74 
75 int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
76                        void *vaddr, bool readonly)
77 {
78     struct vhost_msg_v2 msg = {};
79     int fd = v->device_fd;
80     int ret = 0;
81 
82     msg.type = v->msg_type;
83     msg.iotlb.iova = iova;
84     msg.iotlb.size = size;
85     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
86     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
87     msg.iotlb.type = VHOST_IOTLB_UPDATE;
88 
89    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
90                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
91 
92     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
93         error_report("failed to write, fd=%d, errno=%d (%s)",
94             fd, errno, strerror(errno));
95         return -EIO ;
96     }
97 
98     return ret;
99 }
100 
101 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
102 {
103     struct vhost_msg_v2 msg = {};
104     int fd = v->device_fd;
105     int ret = 0;
106 
107     msg.type = v->msg_type;
108     msg.iotlb.iova = iova;
109     msg.iotlb.size = size;
110     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
111 
112     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113                                msg.iotlb.size, msg.iotlb.type);
114 
115     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116         error_report("failed to write, fd=%d, errno=%d (%s)",
117             fd, errno, strerror(errno));
118         return -EIO ;
119     }
120 
121     return ret;
122 }
123 
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
125 {
126     int fd = v->device_fd;
127     struct vhost_msg_v2 msg = {
128         .type = v->msg_type,
129         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
130     };
131 
132     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
133     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
134         error_report("failed to write, fd=%d, errno=%d (%s)",
135                      fd, errno, strerror(errno));
136     }
137 }
138 
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
140 {
141     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
142         !v->iotlb_batch_begin_sent) {
143         vhost_vdpa_listener_begin_batch(v);
144     }
145 
146     v->iotlb_batch_begin_sent = true;
147 }
148 
149 static void vhost_vdpa_listener_commit(MemoryListener *listener)
150 {
151     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
152     struct vhost_dev *dev = v->dev;
153     struct vhost_msg_v2 msg = {};
154     int fd = v->device_fd;
155 
156     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
157         return;
158     }
159 
160     if (!v->iotlb_batch_begin_sent) {
161         return;
162     }
163 
164     msg.type = v->msg_type;
165     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
166 
167     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
168     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
169         error_report("failed to write, fd=%d, errno=%d (%s)",
170                      fd, errno, strerror(errno));
171     }
172 
173     v->iotlb_batch_begin_sent = false;
174 }
175 
176 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
177                                            MemoryRegionSection *section)
178 {
179     DMAMap mem_region = {};
180     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
181     hwaddr iova;
182     Int128 llend, llsize;
183     void *vaddr;
184     int ret;
185 
186     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
187                                             v->iova_range.last)) {
188         return;
189     }
190 
191     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
192                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
193         error_report("%s received unaligned region", __func__);
194         return;
195     }
196 
197     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
198     llend = vhost_vdpa_section_end(section);
199     if (int128_ge(int128_make64(iova), llend)) {
200         return;
201     }
202 
203     memory_region_ref(section->mr);
204 
205     /* Here we assume that memory_region_is_ram(section->mr)==true */
206 
207     vaddr = memory_region_get_ram_ptr(section->mr) +
208             section->offset_within_region +
209             (iova - section->offset_within_address_space);
210 
211     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
212                                          vaddr, section->readonly);
213 
214     llsize = int128_sub(llend, int128_make64(iova));
215     if (v->shadow_vqs_enabled) {
216         int r;
217 
218         mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
219         mem_region.size = int128_get64(llsize) - 1,
220         mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly),
221 
222         r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
223         if (unlikely(r != IOVA_OK)) {
224             error_report("Can't allocate a mapping (%d)", r);
225             goto fail;
226         }
227 
228         iova = mem_region.iova;
229     }
230 
231     vhost_vdpa_iotlb_batch_begin_once(v);
232     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
233                              vaddr, section->readonly);
234     if (ret) {
235         error_report("vhost vdpa map fail!");
236         goto fail_map;
237     }
238 
239     return;
240 
241 fail_map:
242     if (v->shadow_vqs_enabled) {
243         vhost_iova_tree_remove(v->iova_tree, mem_region);
244     }
245 
246 fail:
247     /*
248      * On the initfn path, store the first error in the container so we
249      * can gracefully fail.  Runtime, there's not much we can do other
250      * than throw a hardware error.
251      */
252     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
253     return;
254 
255 }
256 
257 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
258                                            MemoryRegionSection *section)
259 {
260     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
261     hwaddr iova;
262     Int128 llend, llsize;
263     int ret;
264 
265     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
266                                             v->iova_range.last)) {
267         return;
268     }
269 
270     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
271                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
272         error_report("%s received unaligned region", __func__);
273         return;
274     }
275 
276     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
277     llend = vhost_vdpa_section_end(section);
278 
279     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
280 
281     if (int128_ge(int128_make64(iova), llend)) {
282         return;
283     }
284 
285     llsize = int128_sub(llend, int128_make64(iova));
286 
287     if (v->shadow_vqs_enabled) {
288         const DMAMap *result;
289         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
290             section->offset_within_region +
291             (iova - section->offset_within_address_space);
292         DMAMap mem_region = {
293             .translated_addr = (hwaddr)(uintptr_t)vaddr,
294             .size = int128_get64(llsize) - 1,
295         };
296 
297         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
298         if (!result) {
299             /* The memory listener map wasn't mapped */
300             return;
301         }
302         iova = result->iova;
303         vhost_iova_tree_remove(v->iova_tree, *result);
304     }
305     vhost_vdpa_iotlb_batch_begin_once(v);
306     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
307     if (ret) {
308         error_report("vhost_vdpa dma unmap error!");
309     }
310 
311     memory_region_unref(section->mr);
312 }
313 /*
314  * IOTLB API is used by vhost-vdpa which requires incremental updating
315  * of the mapping. So we can not use generic vhost memory listener which
316  * depends on the addnop().
317  */
318 static const MemoryListener vhost_vdpa_memory_listener = {
319     .name = "vhost-vdpa",
320     .commit = vhost_vdpa_listener_commit,
321     .region_add = vhost_vdpa_listener_region_add,
322     .region_del = vhost_vdpa_listener_region_del,
323 };
324 
325 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
326                              void *arg)
327 {
328     struct vhost_vdpa *v = dev->opaque;
329     int fd = v->device_fd;
330     int ret;
331 
332     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
333 
334     ret = ioctl(fd, request, arg);
335     return ret < 0 ? -errno : ret;
336 }
337 
338 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
339 {
340     uint8_t s;
341     int ret;
342 
343     trace_vhost_vdpa_add_status(dev, status);
344     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
345     if (ret < 0) {
346         return ret;
347     }
348 
349     s |= status;
350 
351     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
352     if (ret < 0) {
353         return ret;
354     }
355 
356     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
357     if (ret < 0) {
358         return ret;
359     }
360 
361     if (!(s & status)) {
362         return -EIO;
363     }
364 
365     return 0;
366 }
367 
368 /*
369  * The use of this function is for requests that only need to be
370  * applied once. Typically such request occurs at the beginning
371  * of operation, and before setting up queues. It should not be
372  * used for request that performs operation until all queues are
373  * set, which would need to check dev->vq_index_end instead.
374  */
375 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
376 {
377     struct vhost_vdpa *v = dev->opaque;
378 
379     return v->index == 0;
380 }
381 
382 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
383                                        uint64_t *features)
384 {
385     int ret;
386 
387     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
388     trace_vhost_vdpa_get_features(dev, *features);
389     return ret;
390 }
391 
392 static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
393 {
394     g_autoptr(GPtrArray) shadow_vqs = NULL;
395 
396     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
397     for (unsigned n = 0; n < hdev->nvqs; ++n) {
398         VhostShadowVirtqueue *svq;
399 
400         svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
401         g_ptr_array_add(shadow_vqs, svq);
402     }
403 
404     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
405 }
406 
407 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
408 {
409     struct vhost_vdpa *v;
410     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
411     trace_vhost_vdpa_init(dev, opaque);
412     int ret;
413 
414     /*
415      * Similar to VFIO, we end up pinning all guest memory and have to
416      * disable discarding of RAM.
417      */
418     ret = ram_block_discard_disable(true);
419     if (ret) {
420         error_report("Cannot set discarding of RAM broken");
421         return ret;
422     }
423 
424     v = opaque;
425     v->dev = dev;
426     dev->opaque =  opaque ;
427     v->listener = vhost_vdpa_memory_listener;
428     v->msg_type = VHOST_IOTLB_MSG_V2;
429     vhost_vdpa_init_svq(dev, v);
430 
431     if (!vhost_vdpa_first_dev(dev)) {
432         return 0;
433     }
434 
435     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
436                                VIRTIO_CONFIG_S_DRIVER);
437 
438     return 0;
439 }
440 
441 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
442                                             int queue_index)
443 {
444     size_t page_size = qemu_real_host_page_size();
445     struct vhost_vdpa *v = dev->opaque;
446     VirtIODevice *vdev = dev->vdev;
447     VhostVDPAHostNotifier *n;
448 
449     n = &v->notifier[queue_index];
450 
451     if (n->addr) {
452         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
453         object_unparent(OBJECT(&n->mr));
454         munmap(n->addr, page_size);
455         n->addr = NULL;
456     }
457 }
458 
459 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
460 {
461     size_t page_size = qemu_real_host_page_size();
462     struct vhost_vdpa *v = dev->opaque;
463     VirtIODevice *vdev = dev->vdev;
464     VhostVDPAHostNotifier *n;
465     int fd = v->device_fd;
466     void *addr;
467     char *name;
468 
469     vhost_vdpa_host_notifier_uninit(dev, queue_index);
470 
471     n = &v->notifier[queue_index];
472 
473     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
474                 queue_index * page_size);
475     if (addr == MAP_FAILED) {
476         goto err;
477     }
478 
479     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
480                            v, queue_index);
481     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
482                                       page_size, addr);
483     g_free(name);
484 
485     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
486         object_unparent(OBJECT(&n->mr));
487         munmap(addr, page_size);
488         goto err;
489     }
490     n->addr = addr;
491 
492     return 0;
493 
494 err:
495     return -1;
496 }
497 
498 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
499 {
500     int i;
501 
502     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
503         vhost_vdpa_host_notifier_uninit(dev, i);
504     }
505 }
506 
507 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
508 {
509     struct vhost_vdpa *v = dev->opaque;
510     int i;
511 
512     if (v->shadow_vqs_enabled) {
513         /* FIXME SVQ is not compatible with host notifiers mr */
514         return;
515     }
516 
517     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
518         if (vhost_vdpa_host_notifier_init(dev, i)) {
519             goto err;
520         }
521     }
522 
523     return;
524 
525 err:
526     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
527     return;
528 }
529 
530 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
531 {
532     struct vhost_vdpa *v = dev->opaque;
533     size_t idx;
534 
535     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
536         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
537     }
538     g_ptr_array_free(v->shadow_vqs, true);
539 }
540 
541 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
542 {
543     struct vhost_vdpa *v;
544     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
545     v = dev->opaque;
546     trace_vhost_vdpa_cleanup(dev, v);
547     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
548     memory_listener_unregister(&v->listener);
549     vhost_vdpa_svq_cleanup(dev);
550 
551     dev->opaque = NULL;
552     ram_block_discard_disable(false);
553 
554     return 0;
555 }
556 
557 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
558 {
559     trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
560     return INT_MAX;
561 }
562 
563 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
564                                     struct vhost_memory *mem)
565 {
566     if (!vhost_vdpa_first_dev(dev)) {
567         return 0;
568     }
569 
570     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
571     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
572         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
573         int i;
574         for (i = 0; i < mem->nregions; i++) {
575             trace_vhost_vdpa_dump_regions(dev, i,
576                                           mem->regions[i].guest_phys_addr,
577                                           mem->regions[i].memory_size,
578                                           mem->regions[i].userspace_addr,
579                                           mem->regions[i].flags_padding);
580         }
581     }
582     if (mem->padding) {
583         return -EINVAL;
584     }
585 
586     return 0;
587 }
588 
589 static int vhost_vdpa_set_features(struct vhost_dev *dev,
590                                    uint64_t features)
591 {
592     struct vhost_vdpa *v = dev->opaque;
593     int ret;
594 
595     if (!vhost_vdpa_first_dev(dev)) {
596         return 0;
597     }
598 
599     if (v->shadow_vqs_enabled) {
600         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
601             /*
602              * QEMU is just trying to enable or disable logging. SVQ handles
603              * this sepparately, so no need to forward this.
604              */
605             v->acked_features = features;
606             return 0;
607         }
608 
609         v->acked_features = features;
610 
611         /* We must not ack _F_LOG if SVQ is enabled */
612         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
613     }
614 
615     trace_vhost_vdpa_set_features(dev, features);
616     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
617     if (ret) {
618         return ret;
619     }
620 
621     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
622 }
623 
624 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
625 {
626     uint64_t features;
627     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
628         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
629     int r;
630 
631     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
632         return -EFAULT;
633     }
634 
635     features &= f;
636 
637     if (vhost_vdpa_first_dev(dev)) {
638         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
639         if (r) {
640             return -EFAULT;
641         }
642     }
643 
644     dev->backend_cap = features;
645 
646     return 0;
647 }
648 
649 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
650                                     uint32_t *device_id)
651 {
652     int ret;
653     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
654     trace_vhost_vdpa_get_device_id(dev, *device_id);
655     return ret;
656 }
657 
658 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
659 {
660     if (!v->shadow_vqs_enabled) {
661         return;
662     }
663 
664     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
665         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
666         vhost_svq_stop(svq);
667     }
668 }
669 
670 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
671 {
672     struct vhost_vdpa *v = dev->opaque;
673     int ret;
674     uint8_t status = 0;
675 
676     vhost_vdpa_reset_svq(v);
677 
678     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
679     trace_vhost_vdpa_reset_device(dev, status);
680     return ret;
681 }
682 
683 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
684 {
685     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
686 
687     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
688     return idx;
689 }
690 
691 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
692 {
693     int i;
694     trace_vhost_vdpa_set_vring_ready(dev);
695     for (i = 0; i < dev->nvqs; ++i) {
696         struct vhost_vring_state state = {
697             .index = dev->vq_index + i,
698             .num = 1,
699         };
700         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
701     }
702     return 0;
703 }
704 
705 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
706                                    uint32_t config_len)
707 {
708     int b, len;
709     char line[QEMU_HEXDUMP_LINE_LEN];
710 
711     for (b = 0; b < config_len; b += 16) {
712         len = config_len - b;
713         qemu_hexdump_line(line, b, config, len, false);
714         trace_vhost_vdpa_dump_config(dev, line);
715     }
716 }
717 
718 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
719                                    uint32_t offset, uint32_t size,
720                                    uint32_t flags)
721 {
722     struct vhost_vdpa_config *config;
723     int ret;
724     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
725 
726     trace_vhost_vdpa_set_config(dev, offset, size, flags);
727     config = g_malloc(size + config_size);
728     config->off = offset;
729     config->len = size;
730     memcpy(config->buf, data, size);
731     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
732         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
733         vhost_vdpa_dump_config(dev, data, size);
734     }
735     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
736     g_free(config);
737     return ret;
738 }
739 
740 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
741                                    uint32_t config_len, Error **errp)
742 {
743     struct vhost_vdpa_config *v_config;
744     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
745     int ret;
746 
747     trace_vhost_vdpa_get_config(dev, config, config_len);
748     v_config = g_malloc(config_len + config_size);
749     v_config->len = config_len;
750     v_config->off = 0;
751     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
752     memcpy(config, v_config->buf, config_len);
753     g_free(v_config);
754     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
755         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
756         vhost_vdpa_dump_config(dev, config, config_len);
757     }
758     return ret;
759  }
760 
761 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
762                                          struct vhost_vring_state *ring)
763 {
764     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
765     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
766 }
767 
768 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
769                                          struct vhost_vring_file *file)
770 {
771     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
772     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
773 }
774 
775 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
776                                          struct vhost_vring_file *file)
777 {
778     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
779     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
780 }
781 
782 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
783                                          struct vhost_vring_addr *addr)
784 {
785     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
786                                 addr->desc_user_addr, addr->used_user_addr,
787                                 addr->avail_user_addr,
788                                 addr->log_guest_addr);
789 
790     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
791 
792 }
793 
794 /**
795  * Set the shadow virtqueue descriptors to the device
796  *
797  * @dev: The vhost device model
798  * @svq: The shadow virtqueue
799  * @idx: The index of the virtqueue in the vhost device
800  * @errp: Error
801  *
802  * Note that this function does not rewind kick file descriptor if cannot set
803  * call one.
804  */
805 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
806                                   VhostShadowVirtqueue *svq, unsigned idx,
807                                   Error **errp)
808 {
809     struct vhost_vring_file file = {
810         .index = dev->vq_index + idx,
811     };
812     const EventNotifier *event_notifier = &svq->hdev_kick;
813     int r;
814 
815     r = event_notifier_init(&svq->hdev_kick, 0);
816     if (r != 0) {
817         error_setg_errno(errp, -r, "Couldn't create kick event notifier");
818         goto err_init_hdev_kick;
819     }
820 
821     r = event_notifier_init(&svq->hdev_call, 0);
822     if (r != 0) {
823         error_setg_errno(errp, -r, "Couldn't create call event notifier");
824         goto err_init_hdev_call;
825     }
826 
827     file.fd = event_notifier_get_fd(event_notifier);
828     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
829     if (unlikely(r != 0)) {
830         error_setg_errno(errp, -r, "Can't set device kick fd");
831         goto err_init_set_dev_fd;
832     }
833 
834     event_notifier = &svq->hdev_call;
835     file.fd = event_notifier_get_fd(event_notifier);
836     r = vhost_vdpa_set_vring_dev_call(dev, &file);
837     if (unlikely(r != 0)) {
838         error_setg_errno(errp, -r, "Can't set device call fd");
839         goto err_init_set_dev_fd;
840     }
841 
842     return 0;
843 
844 err_init_set_dev_fd:
845     event_notifier_set_handler(&svq->hdev_call, NULL);
846 
847 err_init_hdev_call:
848     event_notifier_cleanup(&svq->hdev_kick);
849 
850 err_init_hdev_kick:
851     return r;
852 }
853 
854 /**
855  * Unmap a SVQ area in the device
856  */
857 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
858 {
859     const DMAMap needle = {
860         .translated_addr = addr,
861     };
862     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
863     hwaddr size;
864     int r;
865 
866     if (unlikely(!result)) {
867         error_report("Unable to find SVQ address to unmap");
868         return;
869     }
870 
871     size = ROUND_UP(result->size, qemu_real_host_page_size());
872     r = vhost_vdpa_dma_unmap(v, result->iova, size);
873     if (unlikely(r < 0)) {
874         error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
875         return;
876     }
877 
878     vhost_iova_tree_remove(v->iova_tree, *result);
879 }
880 
881 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
882                                        const VhostShadowVirtqueue *svq)
883 {
884     struct vhost_vdpa *v = dev->opaque;
885     struct vhost_vring_addr svq_addr;
886 
887     vhost_svq_get_vring_addr(svq, &svq_addr);
888 
889     vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
890 
891     vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
892 }
893 
894 /**
895  * Map the SVQ area in the device
896  *
897  * @v: Vhost-vdpa device
898  * @needle: The area to search iova
899  * @errorp: Error pointer
900  */
901 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
902                                     Error **errp)
903 {
904     int r;
905 
906     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
907     if (unlikely(r != IOVA_OK)) {
908         error_setg(errp, "Cannot allocate iova (%d)", r);
909         return false;
910     }
911 
912     r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
913                            (void *)(uintptr_t)needle->translated_addr,
914                            needle->perm == IOMMU_RO);
915     if (unlikely(r != 0)) {
916         error_setg_errno(errp, -r, "Cannot map region to device");
917         vhost_iova_tree_remove(v->iova_tree, *needle);
918     }
919 
920     return r == 0;
921 }
922 
923 /**
924  * Map the shadow virtqueue rings in the device
925  *
926  * @dev: The vhost device
927  * @svq: The shadow virtqueue
928  * @addr: Assigned IOVA addresses
929  * @errp: Error pointer
930  */
931 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
932                                      const VhostShadowVirtqueue *svq,
933                                      struct vhost_vring_addr *addr,
934                                      Error **errp)
935 {
936     ERRP_GUARD();
937     DMAMap device_region, driver_region;
938     struct vhost_vring_addr svq_addr;
939     struct vhost_vdpa *v = dev->opaque;
940     size_t device_size = vhost_svq_device_area_size(svq);
941     size_t driver_size = vhost_svq_driver_area_size(svq);
942     size_t avail_offset;
943     bool ok;
944 
945     vhost_svq_get_vring_addr(svq, &svq_addr);
946 
947     driver_region = (DMAMap) {
948         .translated_addr = svq_addr.desc_user_addr,
949         .size = driver_size - 1,
950         .perm = IOMMU_RO,
951     };
952     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
953     if (unlikely(!ok)) {
954         error_prepend(errp, "Cannot create vq driver region: ");
955         return false;
956     }
957     addr->desc_user_addr = driver_region.iova;
958     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
959     addr->avail_user_addr = driver_region.iova + avail_offset;
960 
961     device_region = (DMAMap) {
962         .translated_addr = svq_addr.used_user_addr,
963         .size = device_size - 1,
964         .perm = IOMMU_RW,
965     };
966     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
967     if (unlikely(!ok)) {
968         error_prepend(errp, "Cannot create vq device region: ");
969         vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
970     }
971     addr->used_user_addr = device_region.iova;
972 
973     return ok;
974 }
975 
976 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
977                                  VhostShadowVirtqueue *svq, unsigned idx,
978                                  Error **errp)
979 {
980     uint16_t vq_index = dev->vq_index + idx;
981     struct vhost_vring_state s = {
982         .index = vq_index,
983     };
984     int r;
985 
986     r = vhost_vdpa_set_dev_vring_base(dev, &s);
987     if (unlikely(r)) {
988         error_setg_errno(errp, -r, "Cannot set vring base");
989         return false;
990     }
991 
992     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
993     return r == 0;
994 }
995 
996 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
997 {
998     struct vhost_vdpa *v = dev->opaque;
999     Error *err = NULL;
1000     unsigned i;
1001 
1002     if (!v->shadow_vqs_enabled) {
1003         return true;
1004     }
1005 
1006     for (i = 0; i < v->shadow_vqs->len; ++i) {
1007         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1008         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1009         struct vhost_vring_addr addr = {
1010             .index = dev->vq_index + i,
1011         };
1012         int r;
1013         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1014         if (unlikely(!ok)) {
1015             goto err;
1016         }
1017 
1018         vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
1019         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1020         if (unlikely(!ok)) {
1021             goto err_map;
1022         }
1023 
1024         /* Override vring GPA set by vhost subsystem */
1025         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1026         if (unlikely(r != 0)) {
1027             error_setg_errno(&err, -r, "Cannot set device address");
1028             goto err_set_addr;
1029         }
1030     }
1031 
1032     return true;
1033 
1034 err_set_addr:
1035     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1036 
1037 err_map:
1038     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1039 
1040 err:
1041     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1042     for (unsigned j = 0; j < i; ++j) {
1043         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1044         vhost_vdpa_svq_unmap_rings(dev, svq);
1045         vhost_svq_stop(svq);
1046     }
1047 
1048     return false;
1049 }
1050 
1051 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1052 {
1053     struct vhost_vdpa *v = dev->opaque;
1054 
1055     if (!v->shadow_vqs_enabled) {
1056         return;
1057     }
1058 
1059     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1060         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1061         vhost_vdpa_svq_unmap_rings(dev, svq);
1062 
1063         event_notifier_cleanup(&svq->hdev_kick);
1064         event_notifier_cleanup(&svq->hdev_call);
1065     }
1066 }
1067 
1068 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1069 {
1070     struct vhost_vdpa *v = dev->opaque;
1071     bool ok;
1072     trace_vhost_vdpa_dev_start(dev, started);
1073 
1074     if (started) {
1075         vhost_vdpa_host_notifiers_init(dev);
1076         ok = vhost_vdpa_svqs_start(dev);
1077         if (unlikely(!ok)) {
1078             return -1;
1079         }
1080         vhost_vdpa_set_vring_ready(dev);
1081     } else {
1082         vhost_vdpa_svqs_stop(dev);
1083         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1084     }
1085 
1086     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1087         return 0;
1088     }
1089 
1090     if (started) {
1091         memory_listener_register(&v->listener, &address_space_memory);
1092         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1093     } else {
1094         vhost_vdpa_reset_device(dev);
1095         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1096                                    VIRTIO_CONFIG_S_DRIVER);
1097         memory_listener_unregister(&v->listener);
1098 
1099         return 0;
1100     }
1101 }
1102 
1103 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1104                                      struct vhost_log *log)
1105 {
1106     struct vhost_vdpa *v = dev->opaque;
1107     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1108         return 0;
1109     }
1110 
1111     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1112                                   log->log);
1113     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1114 }
1115 
1116 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1117                                        struct vhost_vring_addr *addr)
1118 {
1119     struct vhost_vdpa *v = dev->opaque;
1120 
1121     if (v->shadow_vqs_enabled) {
1122         /*
1123          * Device vring addr was set at device start. SVQ base is handled by
1124          * VirtQueue code.
1125          */
1126         return 0;
1127     }
1128 
1129     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1130 }
1131 
1132 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1133                                       struct vhost_vring_state *ring)
1134 {
1135     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1136     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1137 }
1138 
1139 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1140                                        struct vhost_vring_state *ring)
1141 {
1142     struct vhost_vdpa *v = dev->opaque;
1143     VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
1144 
1145     /*
1146      * vhost-vdpa devices does not support in-flight requests. Set all of them
1147      * as available.
1148      *
1149      * TODO: This is ok for networking, but other kinds of devices might
1150      * have problems with these retransmissions.
1151      */
1152     while (virtqueue_rewind(vq, 1)) {
1153         continue;
1154     }
1155     if (v->shadow_vqs_enabled) {
1156         /*
1157          * Device vring base was set at device start. SVQ base is handled by
1158          * VirtQueue code.
1159          */
1160         return 0;
1161     }
1162 
1163     return vhost_vdpa_set_dev_vring_base(dev, ring);
1164 }
1165 
1166 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1167                                        struct vhost_vring_state *ring)
1168 {
1169     struct vhost_vdpa *v = dev->opaque;
1170     int ret;
1171 
1172     if (v->shadow_vqs_enabled) {
1173         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1174         return 0;
1175     }
1176 
1177     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1178     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1179     return ret;
1180 }
1181 
1182 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1183                                        struct vhost_vring_file *file)
1184 {
1185     struct vhost_vdpa *v = dev->opaque;
1186     int vdpa_idx = file->index - dev->vq_index;
1187 
1188     if (v->shadow_vqs_enabled) {
1189         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1190         vhost_svq_set_svq_kick_fd(svq, file->fd);
1191         return 0;
1192     } else {
1193         return vhost_vdpa_set_vring_dev_kick(dev, file);
1194     }
1195 }
1196 
1197 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1198                                        struct vhost_vring_file *file)
1199 {
1200     struct vhost_vdpa *v = dev->opaque;
1201 
1202     if (v->shadow_vqs_enabled) {
1203         int vdpa_idx = file->index - dev->vq_index;
1204         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1205 
1206         vhost_svq_set_svq_call_fd(svq, file->fd);
1207         return 0;
1208     } else {
1209         return vhost_vdpa_set_vring_dev_call(dev, file);
1210     }
1211 }
1212 
1213 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1214                                      uint64_t *features)
1215 {
1216     struct vhost_vdpa *v = dev->opaque;
1217     int ret = vhost_vdpa_get_dev_features(dev, features);
1218 
1219     if (ret == 0 && v->shadow_vqs_enabled) {
1220         /* Add SVQ logging capabilities */
1221         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1222     }
1223 
1224     return ret;
1225 }
1226 
1227 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1228 {
1229     if (!vhost_vdpa_first_dev(dev)) {
1230         return 0;
1231     }
1232 
1233     trace_vhost_vdpa_set_owner(dev);
1234     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1235 }
1236 
1237 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1238                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1239 {
1240     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1241     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1242     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1243     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1244     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1245                                  addr->avail_user_addr, addr->used_user_addr);
1246     return 0;
1247 }
1248 
/*
 * vhost_force_iommu backend hook: the vDPA backend answers true
 * unconditionally, whatever @dev is.
 */
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
1253 
1254 const VhostOps vdpa_ops = {
1255         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1256         .vhost_backend_init = vhost_vdpa_init,
1257         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1258         .vhost_set_log_base = vhost_vdpa_set_log_base,
1259         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1260         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1261         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1262         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1263         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1264         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1265         .vhost_get_features = vhost_vdpa_get_features,
1266         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1267         .vhost_set_owner = vhost_vdpa_set_owner,
1268         .vhost_set_vring_endian = NULL,
1269         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1270         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1271         .vhost_set_features = vhost_vdpa_set_features,
1272         .vhost_reset_device = vhost_vdpa_reset_device,
1273         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1274         .vhost_get_config  = vhost_vdpa_get_config,
1275         .vhost_set_config = vhost_vdpa_set_config,
1276         .vhost_requires_shm_log = NULL,
1277         .vhost_migration_done = NULL,
1278         .vhost_backend_can_merge = NULL,
1279         .vhost_net_set_mtu = NULL,
1280         .vhost_set_iotlb_callback = NULL,
1281         .vhost_send_device_iotlb_msg = NULL,
1282         .vhost_dev_start = vhost_vdpa_dev_start,
1283         .vhost_get_device_id = vhost_vdpa_get_device_id,
1284         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1285         .vhost_force_iommu = vhost_vdpa_force_iommu,
1286 };
1287