xref: /qemu/hw/virtio/vhost-vdpa.c (revision 258a03941fd23108a322d09abc9c55341e09688d)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
26 #include "cpu.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 
30 /*
31  * Return one past the end of the section. Be careful with uint64_t
32  * conversions!
33  */
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
35 {
36     Int128 llend = int128_make64(section->offset_within_address_space);
37     llend = int128_add(llend, section->size);
38     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
39 
40     return llend;
41 }
42 
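/*
 * Return true if the memory listener must skip this section: the region is
 * not RAM (or IOMMU backed), is protected, is a RAM device (MMIO cannot be
 * mapped through vhost-vdpa), or falls outside [iova_min, iova_max].
 */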
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
44                                                 uint64_t iova_min,
45                                                 uint64_t iova_max)
46 {
47     Int128 llend;
48 
49     if ((!memory_region_is_ram(section->mr) &&
50          !memory_region_is_iommu(section->mr)) ||
51         memory_region_is_protected(section->mr) ||
52         /* vhost-vDPA doesn't allow MMIO to be mapped  */
53         memory_region_is_ram_device(section->mr)) {
54         return true;
55     }
56 
57     if (section->offset_within_address_space < iova_min) {
58         error_report("RAM section out of device range (min=0x%" PRIx64
59                      ", addr=0x%" HWADDR_PRIx ")",
60                      iova_min, section->offset_within_address_space);
61         return true;
62     }
63 
64     llend = vhost_vdpa_section_end(section);
65     if (int128_gt(llend, int128_make64(iova_max))) {
66         error_report("RAM section out of device range (max=0x%" PRIx64
67                      ", end addr=0x%" PRIx64 ")",
68                      iova_max, int128_get64(llend));
69         return true;
70     }
71 
72     return false;
73 }
74 
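/*
 * Map [iova, iova + size) to the host virtual address vaddr by writing a
 * VHOST_IOTLB_UPDATE message to the vhost-vdpa device fd.
 */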
75 int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
76                        void *vaddr, bool readonly)
77 {
78     struct vhost_msg_v2 msg = {};
79     int fd = v->device_fd;
80     int ret = 0;
81 
82     msg.type = v->msg_type;
83     msg.iotlb.iova = iova;
84     msg.iotlb.size = size;
85     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
86     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
87     msg.iotlb.type = VHOST_IOTLB_UPDATE;
88 
89     trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
90                              msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
91 
92     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
93         error_report("failed to write, fd=%d, errno=%d (%s)",
94             fd, errno, strerror(errno));
95         return -EIO;
96     }
97 
98     return ret;
99 }
100 
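/*
 * Remove the mapping of [iova, iova + size) by writing a
 * VHOST_IOTLB_INVALIDATE message to the vhost-vdpa device fd.
 */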
101 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
102 {
103     struct vhost_msg_v2 msg = {};
104     int fd = v->device_fd;
105     int ret = 0;
106 
107     msg.type = v->msg_type;
108     msg.iotlb.iova = iova;
109     msg.iotlb.size = size;
110     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
111 
112     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113                                msg.iotlb.size, msg.iotlb.type);
114 
115     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116         error_report("failed to write, fd=%d, errno=%d (%s)",
117             fd, errno, strerror(errno));
118         return -EIO;
119     }
120 
121     return ret;
122 }
123 
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
125 {
126     int fd = v->device_fd;
127     struct vhost_msg_v2 msg = {
128         .type = v->msg_type,
129         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
130     };
131 
132     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
133     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
134         error_report("failed to write, fd=%d, errno=%d (%s)",
135                      fd, errno, strerror(errno));
136     }
137 }
138 
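/*
 * Send VHOST_IOTLB_BATCH_BEGIN at most once per batch, and only when the
 * backend advertises VHOST_BACKEND_F_IOTLB_BATCH.
 */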
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
140 {
141     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
142         !v->iotlb_batch_begin_sent) {
143         vhost_vdpa_listener_begin_batch(v);
144     }
145 
146     v->iotlb_batch_begin_sent = true;
147 }
148 
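/*
 * Close the current IOTLB batch with VHOST_IOTLB_BATCH_END if one was
 * opened by vhost_vdpa_iotlb_batch_begin_once().
 */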
149 static void vhost_vdpa_listener_commit(MemoryListener *listener)
150 {
151     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
152     struct vhost_dev *dev = v->dev;
153     struct vhost_msg_v2 msg = {};
154     int fd = v->device_fd;
155 
156     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
157         return;
158     }
159 
160     if (!v->iotlb_batch_begin_sent) {
161         return;
162     }
163 
164     msg.type = v->msg_type;
165     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
166 
167     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
168     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
169         error_report("failed to write, fd=%d, errno=%d (%s)",
170                      fd, errno, strerror(errno));
171     }
172 
173     v->iotlb_batch_begin_sent = false;
174 }
175 
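/*
 * Map a new memory section into the device. When shadow virtqueues are
 * enabled, the IOVA is allocated from the IOVA tree instead of reusing the
 * section's guest physical address.
 */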
176 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
177                                            MemoryRegionSection *section)
178 {
179     DMAMap mem_region = {};
180     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
181     hwaddr iova;
182     Int128 llend, llsize;
183     void *vaddr;
184     int ret;
185 
186     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
187                                             v->iova_range.last)) {
188         return;
189     }
190 
191     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
192                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
193         error_report("%s received unaligned region", __func__);
194         return;
195     }
196 
197     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
198     llend = vhost_vdpa_section_end(section);
199     if (int128_ge(int128_make64(iova), llend)) {
200         return;
201     }
202 
203     memory_region_ref(section->mr);
204 
205     /* Here we assume that memory_region_is_ram(section->mr)==true */
206 
207     vaddr = memory_region_get_ram_ptr(section->mr) +
208             section->offset_within_region +
209             (iova - section->offset_within_address_space);
210 
211     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
212                                          vaddr, section->readonly);
213 
214     llsize = int128_sub(llend, int128_make64(iova));
215     if (v->shadow_vqs_enabled) {
216         int r;
217 
218         mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr;
219         mem_region.size = int128_get64(llsize) - 1;
220         mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);
221 
222         r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
223         if (unlikely(r != IOVA_OK)) {
224             error_report("Can't allocate a mapping (%d)", r);
225             goto fail;
226         }
227 
228         iova = mem_region.iova;
229     }
230 
231     vhost_vdpa_iotlb_batch_begin_once(v);
232     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
233                              vaddr, section->readonly);
234     if (ret) {
235         error_report("vhost vdpa map fail!");
236         goto fail_map;
237     }
238 
239     return;
240 
241 fail_map:
242     if (v->shadow_vqs_enabled) {
243         vhost_iova_tree_remove(v->iova_tree, mem_region);
244     }
245 
246 fail:
247     /*
248      * There is not much we can do if the DMA mapping fails at this point:
249      * report the error and continue; the device simply will not be able
250      * to access this region.
251      */
252     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
253     return;
254 
255 }
256 
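/*
 * Unmap a memory section from the device, translating its address through
 * the IOVA tree when shadow virtqueues are enabled.
 */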
257 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
258                                            MemoryRegionSection *section)
259 {
260     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
261     hwaddr iova;
262     Int128 llend, llsize;
263     int ret;
264 
265     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
266                                             v->iova_range.last)) {
267         return;
268     }
269 
270     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
271                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
272         error_report("%s received unaligned region", __func__);
273         return;
274     }
275 
276     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
277     llend = vhost_vdpa_section_end(section);
278 
279     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
280 
281     if (int128_ge(int128_make64(iova), llend)) {
282         return;
283     }
284 
285     llsize = int128_sub(llend, int128_make64(iova));
286 
287     if (v->shadow_vqs_enabled) {
288         const DMAMap *result;
289         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
290             section->offset_within_region +
291             (iova - section->offset_within_address_space);
292         DMAMap mem_region = {
293             .translated_addr = (hwaddr)(uintptr_t)vaddr,
294             .size = int128_get64(llsize) - 1,
295         };
296 
297         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
298         if (!result) {
299             /* The region was never mapped by this listener, nothing to do */
300             return;
301         }
302         iova = result->iova;
303         vhost_iova_tree_remove(v->iova_tree, *result);
304     }
305     vhost_vdpa_iotlb_batch_begin_once(v);
306     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
307     if (ret) {
308         error_report("vhost_vdpa dma unmap error!");
309     }
310 
311     memory_region_unref(section->mr);
312 }
313 /*
314  * vhost-vdpa uses the IOTLB API, which requires incremental updates of
315  * the mapping, so we cannot use the generic vhost memory listener, which
316  * depends on addnop().
317  */
318 static const MemoryListener vhost_vdpa_memory_listener = {
319     .name = "vhost-vdpa",
320     .commit = vhost_vdpa_listener_commit,
321     .region_add = vhost_vdpa_listener_region_add,
322     .region_del = vhost_vdpa_listener_region_del,
323 };
324 
325 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
326                              void *arg)
327 {
328     struct vhost_vdpa *v = dev->opaque;
329     int fd = v->device_fd;
330     int ret;
331 
332     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
333 
334     ret = ioctl(fd, request, arg);
335     return ret < 0 ? -errno : ret;
336 }
337 
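/*
 * OR the given bits into the device status and check that the device
 * actually accepted them.
 */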
338 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
339 {
340     uint8_t s;
341     int ret;
342 
343     trace_vhost_vdpa_add_status(dev, status);
344     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
345     if (ret < 0) {
346         return ret;
347     }
348 
349     s |= status;
350 
351     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
352     if (ret < 0) {
353         return ret;
354     }
355 
356     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
357     if (ret < 0) {
358         return ret;
359     }
360 
361     if (!(s & status)) {
362         return -EIO;
363     }
364 
365     return 0;
366 }
367 
368 /*
369  * The use of this function is for requests that only need to be
370  * applied once. Typically such a request occurs at the beginning
371  * of operation, before setting up the queues. It should not be
372  * used for requests that must wait until all queues are set up,
373  * which would need to check dev->vq_index_end instead.
374  */
375 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
376 {
377     struct vhost_vdpa *v = dev->opaque;
378 
379     return v->index == 0;
380 }
381 
382 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
383                                        uint64_t *features)
384 {
385     int ret;
386 
387     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
388     trace_vhost_vdpa_get_features(dev, *features);
389     return ret;
390 }
391 
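/* Allocate one shadow virtqueue per virtqueue of the vhost device. */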
392 static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
393 {
394     g_autoptr(GPtrArray) shadow_vqs = NULL;
395 
396     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
397     for (unsigned n = 0; n < hdev->nvqs; ++n) {
398         VhostShadowVirtqueue *svq;
399 
400         svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
401         g_ptr_array_add(shadow_vqs, svq);
402     }
403 
404     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
405 }
406 
407 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
408 {
409     struct vhost_vdpa *v;
410     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
411     trace_vhost_vdpa_init(dev, opaque);
412     int ret;
413 
414     /*
415      * Similar to VFIO, we end up pinning all guest memory and have to
416      * disable discarding of RAM.
417      */
418     ret = ram_block_discard_disable(true);
419     if (ret) {
420         error_report("Cannot set discarding of RAM broken");
421         return ret;
422     }
423 
424     v = opaque;
425     v->dev = dev;
426     dev->opaque = opaque;
427     v->listener = vhost_vdpa_memory_listener;
428     v->msg_type = VHOST_IOTLB_MSG_V2;
429     vhost_vdpa_init_svq(dev, v);
430 
431     if (!vhost_vdpa_first_dev(dev)) {
432         return 0;
433     }
434 
435     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
436                                VIRTIO_CONFIG_S_DRIVER);
437 
438     return 0;
439 }
440 
441 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
442                                             int queue_index)
443 {
444     size_t page_size = qemu_real_host_page_size();
445     struct vhost_vdpa *v = dev->opaque;
446     VirtIODevice *vdev = dev->vdev;
447     VhostVDPAHostNotifier *n;
448 
449     n = &v->notifier[queue_index];
450 
451     if (n->addr) {
452         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
453         object_unparent(OBJECT(&n->mr));
454         munmap(n->addr, page_size);
455         n->addr = NULL;
456     }
457 }
458 
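/*
 * mmap the queue's notification area from the vhost-vdpa device fd
 * (typically the doorbell page) and expose it to the guest as a host
 * notifier memory region.
 */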
459 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
460 {
461     size_t page_size = qemu_real_host_page_size();
462     struct vhost_vdpa *v = dev->opaque;
463     VirtIODevice *vdev = dev->vdev;
464     VhostVDPAHostNotifier *n;
465     int fd = v->device_fd;
466     void *addr;
467     char *name;
468 
469     vhost_vdpa_host_notifier_uninit(dev, queue_index);
470 
471     n = &v->notifier[queue_index];
472 
473     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
474                 queue_index * page_size);
475     if (addr == MAP_FAILED) {
476         goto err;
477     }
478 
479     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
480                            v, queue_index);
481     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
482                                       page_size, addr);
483     g_free(name);
484 
485     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
486         object_unparent(OBJECT(&n->mr));
487         munmap(addr, page_size);
488         goto err;
489     }
490     n->addr = addr;
491 
492     return 0;
493 
494 err:
495     return -1;
496 }
497 
498 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
499 {
500     int i;
501 
502     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
503         vhost_vdpa_host_notifier_uninit(dev, i);
504     }
505 }
506 
507 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
508 {
509     struct vhost_vdpa *v = dev->opaque;
510     int i;
511 
512     if (v->shadow_vqs_enabled) {
513         /* FIXME SVQ is not compatible with host notifiers mr */
514         return;
515     }
516 
517     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
518         if (vhost_vdpa_host_notifier_init(dev, i)) {
519             goto err;
520         }
521     }
522 
523     return;
524 
525 err:
526     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
527     return;
528 }
529 
530 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
531 {
532     struct vhost_vdpa *v = dev->opaque;
533     size_t idx;
534 
535     if (!v->shadow_vqs) {
536         return;
537     }
538 
539     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
540         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
541     }
542     g_ptr_array_free(v->shadow_vqs, true);
543 }
544 
545 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
546 {
547     struct vhost_vdpa *v;
548     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
549     v = dev->opaque;
550     trace_vhost_vdpa_cleanup(dev, v);
551     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
552     memory_listener_unregister(&v->listener);
553     vhost_vdpa_svq_cleanup(dev);
554 
555     dev->opaque = NULL;
556     ram_block_discard_disable(false);
557 
558     return 0;
559 }
560 
561 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
562 {
563     trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
564     return INT_MAX;
565 }
566 
567 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
568                                     struct vhost_memory *mem)
569 {
570     if (!vhost_vdpa_first_dev(dev)) {
571         return 0;
572     }
573 
574     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
575     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
576         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
577         int i;
578         for (i = 0; i < mem->nregions; i++) {
579             trace_vhost_vdpa_dump_regions(dev, i,
580                                           mem->regions[i].guest_phys_addr,
581                                           mem->regions[i].memory_size,
582                                           mem->regions[i].userspace_addr,
583                                           mem->regions[i].flags_padding);
584         }
585     }
586     if (mem->padding) {
587         return -EINVAL;
588     }
589 
590     return 0;
591 }
592 
593 static int vhost_vdpa_set_features(struct vhost_dev *dev,
594                                    uint64_t features)
595 {
596     struct vhost_vdpa *v = dev->opaque;
597     int ret;
598 
599     if (!vhost_vdpa_first_dev(dev)) {
600         return 0;
601     }
602 
603     if (v->shadow_vqs_enabled) {
604         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
605             /*
606              * QEMU is just trying to enable or disable logging. SVQ handles
607              * this separately, so no need to forward this.
608              */
609             v->acked_features = features;
610             return 0;
611         }
612 
613         v->acked_features = features;
614 
615         /* We must not ack _F_LOG if SVQ is enabled */
616         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
617     }
618 
619     trace_vhost_vdpa_set_features(dev, features);
620     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
621     if (ret) {
622         return ret;
623     }
624 
625     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
626 }
627 
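/*
 * Negotiate backend features: only IOTLB message v2 and IOTLB batching are
 * acked, and only the first device of the group sets them on the backend.
 */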
628 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
629 {
630     uint64_t features;
631     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
632         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
633     int r;
634 
635     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
636         return -EFAULT;
637     }
638 
639     features &= f;
640 
641     if (vhost_vdpa_first_dev(dev)) {
642         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
643         if (r) {
644             return -EFAULT;
645         }
646     }
647 
648     dev->backend_cap = features;
649 
650     return 0;
651 }
652 
653 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
654                                     uint32_t *device_id)
655 {
656     int ret;
657     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
658     trace_vhost_vdpa_get_device_id(dev, *device_id);
659     return ret;
660 }
661 
662 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
663 {
664     if (!v->shadow_vqs_enabled) {
665         return;
666     }
667 
668     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
669         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
670         vhost_svq_stop(svq);
671     }
672 }
673 
674 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
675 {
676     struct vhost_vdpa *v = dev->opaque;
677     int ret;
678     uint8_t status = 0;
679 
680     vhost_vdpa_reset_svq(v);
681 
682     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
683     trace_vhost_vdpa_reset_device(dev, status);
684     return ret;
685 }
686 
687 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
688 {
689     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
690 
691     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
692     return idx;
693 }
694 
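/* Enable every vring of the device with VHOST_VDPA_SET_VRING_ENABLE. */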
695 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
696 {
697     int i;
698     trace_vhost_vdpa_set_vring_ready(dev);
699     for (i = 0; i < dev->nvqs; ++i) {
700         struct vhost_vring_state state = {
701             .index = dev->vq_index + i,
702             .num = 1,
703         };
704         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
705     }
706     return 0;
707 }
708 
709 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
710                                    uint32_t config_len)
711 {
712     int b, len;
713     char line[QEMU_HEXDUMP_LINE_LEN];
714 
715     for (b = 0; b < config_len; b += 16) {
716         len = config_len - b;
717         qemu_hexdump_line(line, b, config, len, false);
718         trace_vhost_vdpa_dump_config(dev, line);
719     }
720 }
721 
722 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
723                                    uint32_t offset, uint32_t size,
724                                    uint32_t flags)
725 {
726     struct vhost_vdpa_config *config;
727     int ret;
728     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
729 
730     trace_vhost_vdpa_set_config(dev, offset, size, flags);
731     config = g_malloc(size + config_size);
732     config->off = offset;
733     config->len = size;
734     memcpy(config->buf, data, size);
735     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
736         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
737         vhost_vdpa_dump_config(dev, data, size);
738     }
739     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
740     g_free(config);
741     return ret;
742 }
743 
744 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
745                                    uint32_t config_len, Error **errp)
746 {
747     struct vhost_vdpa_config *v_config;
748     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
749     int ret;
750 
751     trace_vhost_vdpa_get_config(dev, config, config_len);
752     v_config = g_malloc(config_len + config_size);
753     v_config->len = config_len;
754     v_config->off = 0;
755     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
756     memcpy(config, v_config->buf, config_len);
757     g_free(v_config);
758     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
759         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
760         vhost_vdpa_dump_config(dev, config, config_len);
761     }
762     return ret;
763 }
764 
765 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
766                                          struct vhost_vring_state *ring)
767 {
768     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
769     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
770 }
771 
772 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
773                                          struct vhost_vring_file *file)
774 {
775     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
776     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
777 }
778 
779 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
780                                          struct vhost_vring_file *file)
781 {
782     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
783     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
784 }
785 
786 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
787                                          struct vhost_vring_addr *addr)
788 {
789     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
790                                 addr->desc_user_addr, addr->used_user_addr,
791                                 addr->avail_user_addr,
792                                 addr->log_guest_addr);
793 
794     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
795 
796 }
797 
798 /**
799  * Set the shadow virtqueue descriptors to the device
800  *
801  * @dev: The vhost device model
802  * @svq: The shadow virtqueue
803  * @idx: The index of the virtqueue in the vhost device
804  * @errp: Error
805  *
806  * Note that this function does not rewind the kick file descriptor if it
807  * cannot set the call one.
808  */
809 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
810                                   VhostShadowVirtqueue *svq, unsigned idx,
811                                   Error **errp)
812 {
813     struct vhost_vring_file file = {
814         .index = dev->vq_index + idx,
815     };
816     const EventNotifier *event_notifier = &svq->hdev_kick;
817     int r;
818 
819     r = event_notifier_init(&svq->hdev_kick, 0);
820     if (r != 0) {
821         error_setg_errno(errp, -r, "Couldn't create kick event notifier");
822         goto err_init_hdev_kick;
823     }
824 
825     r = event_notifier_init(&svq->hdev_call, 0);
826     if (r != 0) {
827         error_setg_errno(errp, -r, "Couldn't create call event notifier");
828         goto err_init_hdev_call;
829     }
830 
831     file.fd = event_notifier_get_fd(event_notifier);
832     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
833     if (unlikely(r != 0)) {
834         error_setg_errno(errp, -r, "Can't set device kick fd");
835         goto err_init_set_dev_fd;
836     }
837 
838     event_notifier = &svq->hdev_call;
839     file.fd = event_notifier_get_fd(event_notifier);
840     r = vhost_vdpa_set_vring_dev_call(dev, &file);
841     if (unlikely(r != 0)) {
842         error_setg_errno(errp, -r, "Can't set device call fd");
843         goto err_init_set_dev_fd;
844     }
845 
846     return 0;
847 
848 err_init_set_dev_fd:
849     event_notifier_set_handler(&svq->hdev_call, NULL);
850 
851 err_init_hdev_call:
852     event_notifier_cleanup(&svq->hdev_kick);
853 
854 err_init_hdev_kick:
855     return r;
856 }
857 
858 /**
859  * Unmap a SVQ area in the device
860  */
861 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
862 {
863     const DMAMap needle = {
864         .translated_addr = addr,
865     };
866     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
867     hwaddr size;
868     int r;
869 
870     if (unlikely(!result)) {
871         error_report("Unable to find SVQ address to unmap");
872         return;
873     }
874 
875     size = ROUND_UP(result->size, qemu_real_host_page_size());
876     r = vhost_vdpa_dma_unmap(v, result->iova, size);
877     if (unlikely(r < 0)) {
878         error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
879         return;
880     }
881 
882     vhost_iova_tree_remove(v->iova_tree, *result);
883 }
884 
885 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
886                                        const VhostShadowVirtqueue *svq)
887 {
888     struct vhost_vdpa *v = dev->opaque;
889     struct vhost_vring_addr svq_addr;
890 
891     vhost_svq_get_vring_addr(svq, &svq_addr);
892 
893     vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
894 
895     vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
896 }
897 
898 /**
899  * Map the SVQ area in the device
900  *
901  * @v: Vhost-vdpa device
902  * @needle: The area for which to allocate an IOVA
903  * @errp: Error pointer
904  */
905 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
906                                     Error **errp)
907 {
908     int r;
909 
910     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
911     if (unlikely(r != IOVA_OK)) {
912         error_setg(errp, "Cannot allocate iova (%d)", r);
913         return false;
914     }
915 
916     r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
917                            (void *)(uintptr_t)needle->translated_addr,
918                            needle->perm == IOMMU_RO);
919     if (unlikely(r != 0)) {
920         error_setg_errno(errp, -r, "Cannot map region to device");
921         vhost_iova_tree_remove(v->iova_tree, *needle);
922     }
923 
924     return r == 0;
925 }
926 
927 /**
928  * Map the shadow virtqueue rings in the device
929  *
930  * @dev: The vhost device
931  * @svq: The shadow virtqueue
932  * @addr: Assigned IOVA addresses
933  * @errp: Error pointer
934  */
935 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
936                                      const VhostShadowVirtqueue *svq,
937                                      struct vhost_vring_addr *addr,
938                                      Error **errp)
939 {
940     ERRP_GUARD();
941     DMAMap device_region, driver_region;
942     struct vhost_vring_addr svq_addr;
943     struct vhost_vdpa *v = dev->opaque;
944     size_t device_size = vhost_svq_device_area_size(svq);
945     size_t driver_size = vhost_svq_driver_area_size(svq);
946     size_t avail_offset;
947     bool ok;
948 
949     vhost_svq_get_vring_addr(svq, &svq_addr);
950 
951     driver_region = (DMAMap) {
952         .translated_addr = svq_addr.desc_user_addr,
953         .size = driver_size - 1,
954         .perm = IOMMU_RO,
955     };
956     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
957     if (unlikely(!ok)) {
958         error_prepend(errp, "Cannot create vq driver region: ");
959         return false;
960     }
961     addr->desc_user_addr = driver_region.iova;
962     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
963     addr->avail_user_addr = driver_region.iova + avail_offset;
964 
965     device_region = (DMAMap) {
966         .translated_addr = svq_addr.used_user_addr,
967         .size = device_size - 1,
968         .perm = IOMMU_RW,
969     };
970     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
971     if (unlikely(!ok)) {
972         error_prepend(errp, "Cannot create vq device region: ");
973         vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
974     }
975     addr->used_user_addr = device_region.iova;
976 
977     return ok;
978 }
979 
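/*
 * Set the device vring base and the shadow virtqueue kick/call file
 * descriptors for the virtqueue at index idx.
 */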
980 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
981                                  VhostShadowVirtqueue *svq, unsigned idx,
982                                  Error **errp)
983 {
984     uint16_t vq_index = dev->vq_index + idx;
985     struct vhost_vring_state s = {
986         .index = vq_index,
987     };
988     int r;
989 
990     r = vhost_vdpa_set_dev_vring_base(dev, &s);
991     if (unlikely(r)) {
992         error_setg_errno(errp, -r, "Cannot set vring base");
993         return false;
994     }
995 
996     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
997     return r == 0;
998 }
999 
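/*
 * Start all shadow virtqueues: map their rings into the device IOVA space
 * and point the device vrings at the shadow copies, rolling back on failure.
 */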
1000 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1001 {
1002     struct vhost_vdpa *v = dev->opaque;
1003     Error *err = NULL;
1004     unsigned i;
1005 
1006     if (!v->shadow_vqs_enabled) {
1007         return true;
1008     }
1009 
1010     for (i = 0; i < v->shadow_vqs->len; ++i) {
1011         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1012         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1013         struct vhost_vring_addr addr = {
1014             .index = dev->vq_index + i,
1015         };
1016         int r;
1017         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1018         if (unlikely(!ok)) {
1019             goto err;
1020         }
1021 
1022         vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
1023         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1024         if (unlikely(!ok)) {
1025             goto err_map;
1026         }
1027 
1028         /* Override vring GPA set by vhost subsystem */
1029         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1030         if (unlikely(r != 0)) {
1031             error_setg_errno(&err, -r, "Cannot set device address");
1032             goto err_set_addr;
1033         }
1034     }
1035 
1036     return true;
1037 
1038 err_set_addr:
1039     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1040 
1041 err_map:
1042     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1043 
1044 err:
1045     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1046     for (unsigned j = 0; j < i; ++j) {
1047         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1048         vhost_vdpa_svq_unmap_rings(dev, svq);
1049         vhost_svq_stop(svq);
1050     }
1051 
1052     return false;
1053 }
1054 
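/* Unmap the shadow virtqueue rings and clean up their event notifiers. */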
1055 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1056 {
1057     struct vhost_vdpa *v = dev->opaque;
1058 
1059     if (!v->shadow_vqs_enabled) {
1060         return;
1061     }
1062 
1063     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1064         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1065         vhost_vdpa_svq_unmap_rings(dev, svq);
1066 
1067         event_notifier_cleanup(&svq->hdev_kick);
1068         event_notifier_cleanup(&svq->hdev_call);
1069     }
1070 }
1071 
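/*
 * Start or stop the device. Host notifiers and shadow virtqueues are handled
 * per vhost device, while DRIVER_OK / reset and the memory listener are only
 * handled once the last virtqueue group (dev->vq_index_end) is reached.
 */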
1072 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1073 {
1074     struct vhost_vdpa *v = dev->opaque;
1075     bool ok;
1076     trace_vhost_vdpa_dev_start(dev, started);
1077 
1078     if (started) {
1079         vhost_vdpa_host_notifiers_init(dev);
1080         ok = vhost_vdpa_svqs_start(dev);
1081         if (unlikely(!ok)) {
1082             return -1;
1083         }
1084         vhost_vdpa_set_vring_ready(dev);
1085     } else {
1086         vhost_vdpa_svqs_stop(dev);
1087         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1088     }
1089 
1090     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1091         return 0;
1092     }
1093 
1094     if (started) {
1095         memory_listener_register(&v->listener, &address_space_memory);
1096         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1097     } else {
1098         vhost_vdpa_reset_device(dev);
1099         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1100                                    VIRTIO_CONFIG_S_DRIVER);
1101         memory_listener_unregister(&v->listener);
1102 
1103         return 0;
1104     }
1105 }
1106 
1107 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1108                                      struct vhost_log *log)
1109 {
1110     struct vhost_vdpa *v = dev->opaque;
1111     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1112         return 0;
1113     }
1114 
1115     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1116                                   log->log);
1117     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1118 }
1119 
1120 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1121                                        struct vhost_vring_addr *addr)
1122 {
1123     struct vhost_vdpa *v = dev->opaque;
1124 
1125     if (v->shadow_vqs_enabled) {
1126         /*
1127          * Device vring addr was set at device start. SVQ base is handled by
1128          * VirtQueue code.
1129          */
1130         return 0;
1131     }
1132 
1133     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1134 }
1135 
1136 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1137                                       struct vhost_vring_state *ring)
1138 {
1139     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1140     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1141 }
1142 
1143 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1144                                        struct vhost_vring_state *ring)
1145 {
1146     struct vhost_vdpa *v = dev->opaque;
1147     VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
1148 
1149     /*
1150      * vhost-vdpa devices do not support in-flight requests. Set all of them
1151      * as available.
1152      *
1153      * TODO: This is ok for networking, but other kinds of devices might
1154      * have problems with these retransmissions.
1155      */
1156     while (virtqueue_rewind(vq, 1)) {
1157         continue;
1158     }
1159     if (v->shadow_vqs_enabled) {
1160         /*
1161          * Device vring base was set at device start. SVQ base is handled by
1162          * VirtQueue code.
1163          */
1164         return 0;
1165     }
1166 
1167     return vhost_vdpa_set_dev_vring_base(dev, ring);
1168 }
1169 
1170 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1171                                        struct vhost_vring_state *ring)
1172 {
1173     struct vhost_vdpa *v = dev->opaque;
1174     int ret;
1175 
1176     if (v->shadow_vqs_enabled) {
1177         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1178         return 0;
1179     }
1180 
1181     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1182     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1183     return ret;
1184 }
1185 
1186 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1187                                        struct vhost_vring_file *file)
1188 {
1189     struct vhost_vdpa *v = dev->opaque;
1190     int vdpa_idx = file->index - dev->vq_index;
1191 
1192     if (v->shadow_vqs_enabled) {
1193         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1194         vhost_svq_set_svq_kick_fd(svq, file->fd);
1195         return 0;
1196     } else {
1197         return vhost_vdpa_set_vring_dev_kick(dev, file);
1198     }
1199 }
1200 
1201 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1202                                        struct vhost_vring_file *file)
1203 {
1204     struct vhost_vdpa *v = dev->opaque;
1205 
1206     if (v->shadow_vqs_enabled) {
1207         int vdpa_idx = file->index - dev->vq_index;
1208         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1209 
1210         vhost_svq_set_svq_call_fd(svq, file->fd);
1211         return 0;
1212     } else {
1213         return vhost_vdpa_set_vring_dev_call(dev, file);
1214     }
1215 }
1216 
1217 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1218                                      uint64_t *features)
1219 {
1220     struct vhost_vdpa *v = dev->opaque;
1221     int ret = vhost_vdpa_get_dev_features(dev, features);
1222 
1223     if (ret == 0 && v->shadow_vqs_enabled) {
1224         /* Add SVQ logging capabilities */
1225         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1226     }
1227 
1228     return ret;
1229 }
1230 
1231 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1232 {
1233     if (!vhost_vdpa_first_dev(dev)) {
1234         return 0;
1235     }
1236 
1237     trace_vhost_vdpa_set_owner(dev);
1238     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1239 }
1240 
1241 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1242                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1243 {
1244     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1245     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1246     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1247     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1248     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1249                                  addr->avail_user_addr, addr->used_user_addr);
1250     return 0;
1251 }
1252 
1253 static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
1254 {
1255     return true;
1256 }
1257 
1258 const VhostOps vdpa_ops = {
1259         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1260         .vhost_backend_init = vhost_vdpa_init,
1261         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1262         .vhost_set_log_base = vhost_vdpa_set_log_base,
1263         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1264         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1265         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1266         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1267         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1268         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1269         .vhost_get_features = vhost_vdpa_get_features,
1270         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1271         .vhost_set_owner = vhost_vdpa_set_owner,
1272         .vhost_set_vring_endian = NULL,
1273         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1274         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1275         .vhost_set_features = vhost_vdpa_set_features,
1276         .vhost_reset_device = vhost_vdpa_reset_device,
1277         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1278         .vhost_get_config = vhost_vdpa_get_config,
1279         .vhost_set_config = vhost_vdpa_set_config,
1280         .vhost_requires_shm_log = NULL,
1281         .vhost_migration_done = NULL,
1282         .vhost_backend_can_merge = NULL,
1283         .vhost_net_set_mtu = NULL,
1284         .vhost_set_iotlb_callback = NULL,
1285         .vhost_send_device_iotlb_msg = NULL,
1286         .vhost_dev_start = vhost_vdpa_dev_start,
1287         .vhost_get_device_id = vhost_vdpa_get_device_id,
1288         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1289         .vhost_force_iommu = vhost_vdpa_force_iommu,
1290 };
1291