xref: /qemu/hw/virtio/vhost-vdpa.c (revision bd907ae4b00ebedad5e586af05ea3d6490318d45)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "qemu/cutils.h"
24 #include "qemu/main-loop.h"
25 #include "cpu.h"
26 #include "trace.h"
27 #include "qapi/error.h"
28 
29 /*
30  * Return one past the end of the section. Be careful with uint64_t
31  * conversions!
32  */
33 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
34 {
35     Int128 llend = int128_make64(section->offset_within_address_space);
36     llend = int128_add(llend, section->size);
37     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
38 
39     return llend;
40 }
41 
42 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
43                                                 uint64_t iova_min,
44                                                 uint64_t iova_max)
45 {
46     Int128 llend;
47 
48     if ((!memory_region_is_ram(section->mr) &&
49          !memory_region_is_iommu(section->mr)) ||
50         memory_region_is_protected(section->mr) ||
51         /* vhost-vDPA doesn't allow MMIO to be mapped */
52         memory_region_is_ram_device(section->mr)) {
53         return true;
54     }
55 
56     if (section->offset_within_address_space < iova_min) {
57         error_report("RAM section out of device range (min=0x%" PRIx64
58                      ", addr=0x%" HWADDR_PRIx ")",
59                      iova_min, section->offset_within_address_space);
60         return true;
61     }
62 
63     llend = vhost_vdpa_section_end(section);
64     if (int128_gt(llend, int128_make64(iova_max))) {
65         error_report("RAM section out of device range (max=0x%" PRIx64
66                      ", end addr=0x%" PRIx64 ")",
67                      iova_max, int128_get64(llend));
68         return true;
69     }
70 
71     return false;
72 }
73 
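/*
 * Map [iova, iova + size) in the device IOTLB to the process virtual address
 * vaddr by writing a VHOST_IOTLB_UPDATE message to the device fd.
 * Returns 0 on success, or -EIO if the write fails.
 */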
74 int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
75                        void *vaddr, bool readonly)
76 {
77     struct vhost_msg_v2 msg = {};
78     int fd = v->device_fd;
79     int ret = 0;
80 
81     msg.type = v->msg_type;
82     msg.iotlb.iova = iova;
83     msg.iotlb.size = size;
84     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
85     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
86     msg.iotlb.type = VHOST_IOTLB_UPDATE;
87 
88     trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
89                              msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
90 
91     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
92         error_report("failed to write, fd=%d, errno=%d (%s)",
93             fd, errno, strerror(errno));
94         return -EIO;
95     }
96 
97     return ret;
98 }
99 
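/*
 * Drop the IOTLB entries covering [iova, iova + size) by writing a
 * VHOST_IOTLB_INVALIDATE message to the device fd.
 * Returns 0 on success, or -EIO if the write fails.
 */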
100 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova, hwaddr size)
101 {
102     struct vhost_msg_v2 msg = {};
103     int fd = v->device_fd;
104     int ret = 0;
105 
106     msg.type = v->msg_type;
107     msg.iotlb.iova = iova;
108     msg.iotlb.size = size;
109     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
110 
111     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
112                                msg.iotlb.size, msg.iotlb.type);
113 
114     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
115         error_report("failed to write, fd=%d, errno=%d (%s)",
116             fd, errno, strerror(errno));
117         return -EIO;
118     }
119 
120     return ret;
121 }
122 
123 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
124 {
125     int fd = v->device_fd;
126     struct vhost_msg_v2 msg = {
127         .type = v->msg_type,
128         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
129     };
130 
131     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
132     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
133         error_report("failed to write, fd=%d, errno=%d (%s)",
134                      fd, errno, strerror(errno));
135     }
136 }
137 
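/*
 * Send VHOST_IOTLB_BATCH_BEGIN once at the start of a batch, provided the
 * backend advertises VHOST_BACKEND_F_IOTLB_BATCH.
 */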
138 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
139 {
140     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
141         !v->iotlb_batch_begin_sent) {
142         vhost_vdpa_listener_begin_batch(v);
143     }
144 
145     v->iotlb_batch_begin_sent = true;
146 }
147 
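/*
 * Memory listener commit callback: if the backend supports IOTLB batching
 * and a batch is open, close it with a VHOST_IOTLB_BATCH_END message.
 */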
148 static void vhost_vdpa_listener_commit(MemoryListener *listener)
149 {
150     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
151     struct vhost_dev *dev = v->dev;
152     struct vhost_msg_v2 msg = {};
153     int fd = v->device_fd;
154 
155     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
156         return;
157     }
158 
159     if (!v->iotlb_batch_begin_sent) {
160         return;
161     }
162 
163     msg.type = v->msg_type;
164     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
165 
166     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
167     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
168         error_report("failed to write, fd=%d, errno=%d (%s)",
169                      fd, errno, strerror(errno));
170     }
171 
172     v->iotlb_batch_begin_sent = false;
173 }
174 
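/*
 * Memory listener region_add callback: map a new RAM section into the device
 * IOTLB. With shadow virtqueues enabled, the IOVA is allocated from the IOVA
 * tree instead of reusing the section's guest physical address.
 */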
175 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
176                                            MemoryRegionSection *section)
177 {
178     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
179     hwaddr iova;
180     Int128 llend, llsize;
181     void *vaddr;
182     int ret;
183 
184     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
185                                             v->iova_range.last)) {
186         return;
187     }
188 
189     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
190                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
191         error_report("%s received unaligned region", __func__);
192         return;
193     }
194 
195     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
196     llend = vhost_vdpa_section_end(section);
197     if (int128_ge(int128_make64(iova), llend)) {
198         return;
199     }
200 
201     memory_region_ref(section->mr);
202 
203     /* Here we assume that memory_region_is_ram(section->mr)==true */
204 
205     vaddr = memory_region_get_ram_ptr(section->mr) +
206             section->offset_within_region +
207             (iova - section->offset_within_address_space);
208 
209     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
210                                          vaddr, section->readonly);
211 
212     llsize = int128_sub(llend, int128_make64(iova));
213     if (v->shadow_vqs_enabled) {
214         DMAMap mem_region = {
215             .translated_addr = (hwaddr)(uintptr_t)vaddr,
216             .size = int128_get64(llsize) - 1,
217             .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
218         };
219 
220         int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
221         if (unlikely(r != IOVA_OK)) {
222             error_report("Can't allocate a mapping (%d)", r);
223             goto fail;
224         }
225 
226         iova = mem_region.iova;
227     }
228 
229     vhost_vdpa_iotlb_batch_begin_once(v);
230     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
231                              vaddr, section->readonly);
232     if (ret) {
233         error_report("vhost vdpa map fail!");
234         goto fail;
235     }
236 
237     return;
238 
239 fail:
240     /*
241      * On the initfn path, store the first error in the container so we
242      * can gracefully fail.  At runtime, there's not much we can do other
243      * than throw a hardware error.
244      */
245     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
246     return;
247 
248 }
249 
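/*
 * Memory listener region_del callback: remove the IOTLB entries of a RAM
 * section that is being removed. With shadow virtqueues enabled, the IOVA is
 * looked up in, and then removed from, the IOVA tree.
 */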
250 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
251                                            MemoryRegionSection *section)
252 {
253     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
254     hwaddr iova;
255     Int128 llend, llsize;
256     int ret;
257 
258     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
259                                             v->iova_range.last)) {
260         return;
261     }
262 
263     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
264                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
265         error_report("%s received unaligned region", __func__);
266         return;
267     }
268 
269     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
270     llend = vhost_vdpa_section_end(section);
271 
272     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
273 
274     if (int128_ge(int128_make64(iova), llend)) {
275         return;
276     }
277 
278     llsize = int128_sub(llend, int128_make64(iova));
279 
280     if (v->shadow_vqs_enabled) {
281         const DMAMap *result;
282         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
283             section->offset_within_region +
284             (iova - section->offset_within_address_space);
285         DMAMap mem_region = {
286             .translated_addr = (hwaddr)(uintptr_t)vaddr,
287             .size = int128_get64(llsize) - 1,
288         };
289 
290         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
291         iova = result->iova;
292         vhost_iova_tree_remove(v->iova_tree, &mem_region);
293     }
294     vhost_vdpa_iotlb_batch_begin_once(v);
295     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
296     if (ret) {
297         error_report("vhost_vdpa dma unmap error!");
298     }
299 
300     memory_region_unref(section->mr);
301 }
302 /*
303  * The IOTLB API is used by vhost-vdpa, which requires incremental updating
304  * of the mapping. So we cannot use the generic vhost memory listener, which
305  * depends on addnop().
306  */
307 static const MemoryListener vhost_vdpa_memory_listener = {
308     .name = "vhost-vdpa",
309     .commit = vhost_vdpa_listener_commit,
310     .region_add = vhost_vdpa_listener_region_add,
311     .region_del = vhost_vdpa_listener_region_del,
312 };
313 
314 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
315                              void *arg)
316 {
317     struct vhost_vdpa *v = dev->opaque;
318     int fd = v->device_fd;
319     int ret;
320 
321     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
322 
323     ret = ioctl(fd, request, arg);
324     return ret < 0 ? -errno : ret;
325 }
326 
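/*
 * OR the given bits into the device status and read the status back to check
 * that the device accepted them. Returns 0 on success, a negative errno if an
 * ioctl fails, or -EIO if the bits were not set.
 */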
327 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
328 {
329     uint8_t s;
330     int ret;
331 
332     trace_vhost_vdpa_add_status(dev, status);
333     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
334     if (ret < 0) {
335         return ret;
336     }
337 
338     s |= status;
339 
340     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
341     if (ret < 0) {
342         return ret;
343     }
344 
345     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
346     if (ret < 0) {
347         return ret;
348     }
349 
350     if (!(s & status)) {
351         return -EIO;
352     }
353 
354     return 0;
355 }
356 
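/*
 * Query the usable IOVA range with VHOST_VDPA_GET_IOVA_RANGE, falling back to
 * the full 64-bit range if the ioctl fails.
 */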
357 static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
358 {
359     int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
360                               &v->iova_range);
361     if (ret != 0) {
362         v->iova_range.first = 0;
363         v->iova_range.last = UINT64_MAX;
364     }
365 
366     trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
367                                     v->iova_range.last);
368 }
369 
370 /*
371  * The use of this function is for requests that only need to be
372  * applied once. Typically such requests occur at the beginning
373  * of operation, before setting up the queues. It should not be
374  * used for requests that must wait until all queues have been
375  * set up; those need to check dev->vq_index_end instead.
376  */
377 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
378 {
379     struct vhost_vdpa *v = dev->opaque;
380 
381     return v->index == 0;
382 }
383 
384 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
385                                        uint64_t *features)
386 {
387     int ret;
388 
389     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
390     trace_vhost_vdpa_get_features(dev, *features);
391     return ret;
392 }
393 
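/*
 * Allocate one shadow virtqueue per vhost queue when SVQ is enabled, after
 * validating the device features against what SVQ supports.
 * Returns 0 on success (or when SVQ is disabled), a negative value on error.
 */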
394 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
395                                Error **errp)
396 {
397     g_autoptr(GPtrArray) shadow_vqs = NULL;
398     uint64_t dev_features, svq_features;
399     int r;
400     bool ok;
401 
402     if (!v->shadow_vqs_enabled) {
403         return 0;
404     }
405 
406     r = vhost_vdpa_get_dev_features(hdev, &dev_features);
407     if (r != 0) {
408         error_setg_errno(errp, -r, "Can't get vdpa device features");
409         return r;
410     }
411 
412     svq_features = dev_features;
413     ok = vhost_svq_valid_features(svq_features, errp);
414     if (unlikely(!ok)) {
415         return -1;
416     }
417 
418     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
419     for (unsigned n = 0; n < hdev->nvqs; ++n) {
420         g_autoptr(VhostShadowVirtqueue) svq;
421 
422         svq = vhost_svq_new(v->iova_tree, v->shadow_vq_ops,
423                             v->shadow_vq_ops_opaque);
424         if (unlikely(!svq)) {
425             error_setg(errp, "Cannot create svq %u", n);
426             return -1;
427         }
428         g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
429     }
430 
431     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
432     return 0;
433 }
434 
435 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
436 {
437     struct vhost_vdpa *v;
438     int ret;
439     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
440     trace_vhost_vdpa_init(dev, opaque);
441 
442     /*
443      * Similar to VFIO, we end up pinning all guest memory and have to
444      * disable discarding of RAM.
445      */
446     ret = ram_block_discard_disable(true);
447     if (ret) {
448         error_report("Cannot set discarding of RAM broken");
449         return ret;
450     }
451 
452     v = opaque;
453     v->dev = dev;
454     dev->opaque = opaque;
455     v->listener = vhost_vdpa_memory_listener;
456     v->msg_type = VHOST_IOTLB_MSG_V2;
457     ret = vhost_vdpa_init_svq(dev, v, errp);
458     if (ret) {
459         goto err;
460     }
461 
462     vhost_vdpa_get_iova_range(v);
463 
464     if (!vhost_vdpa_first_dev(dev)) {
465         return 0;
466     }
467 
468     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
469                                VIRTIO_CONFIG_S_DRIVER);
470 
471     return 0;
472 
473 err:
474     ram_block_discard_disable(false);
475     return ret;
476 }
477 
478 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
479                                             int queue_index)
480 {
481     size_t page_size = qemu_real_host_page_size();
482     struct vhost_vdpa *v = dev->opaque;
483     VirtIODevice *vdev = dev->vdev;
484     VhostVDPAHostNotifier *n;
485 
486     n = &v->notifier[queue_index];
487 
488     if (n->addr) {
489         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
490         object_unparent(OBJECT(&n->mr));
491         munmap(n->addr, page_size);
492         n->addr = NULL;
493     }
494 }
495 
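/*
 * mmap() the host notifier page of the given queue from the device fd and
 * expose it to the guest as a ram-device memory region.
 * Returns 0 on success, -1 on failure.
 */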
496 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
497 {
498     size_t page_size = qemu_real_host_page_size();
499     struct vhost_vdpa *v = dev->opaque;
500     VirtIODevice *vdev = dev->vdev;
501     VhostVDPAHostNotifier *n;
502     int fd = v->device_fd;
503     void *addr;
504     char *name;
505 
506     vhost_vdpa_host_notifier_uninit(dev, queue_index);
507 
508     n = &v->notifier[queue_index];
509 
510     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
511                 queue_index * page_size);
512     if (addr == MAP_FAILED) {
513         goto err;
514     }
515 
516     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
517                            v, queue_index);
518     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
519                                       page_size, addr);
520     g_free(name);
521 
522     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
523         object_unparent(OBJECT(&n->mr));
524         munmap(addr, page_size);
525         goto err;
526     }
527     n->addr = addr;
528 
529     return 0;
530 
531 err:
532     return -1;
533 }
534 
535 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
536 {
537     int i;
538 
539     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
540         vhost_vdpa_host_notifier_uninit(dev, i);
541     }
542 }
543 
544 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
545 {
546     struct vhost_vdpa *v = dev->opaque;
547     int i;
548 
549     if (v->shadow_vqs_enabled) {
550         /* FIXME SVQ is not compatible with host notifiers mr */
551         return;
552     }
553 
554     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
555         if (vhost_vdpa_host_notifier_init(dev, i)) {
556             goto err;
557         }
558     }
559 
560     return;
561 
562 err:
563     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
564     return;
565 }
566 
567 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
568 {
569     struct vhost_vdpa *v = dev->opaque;
570     size_t idx;
571 
572     if (!v->shadow_vqs) {
573         return;
574     }
575 
576     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
577         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
578     }
579     g_ptr_array_free(v->shadow_vqs, true);
580 }
581 
582 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
583 {
584     struct vhost_vdpa *v;
585     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
586     v = dev->opaque;
587     trace_vhost_vdpa_cleanup(dev, v);
588     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
589     memory_listener_unregister(&v->listener);
590     vhost_vdpa_svq_cleanup(dev);
591 
592     dev->opaque = NULL;
593     ram_block_discard_disable(false);
594 
595     return 0;
596 }
597 
598 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
599 {
600     trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
601     return INT_MAX;
602 }
603 
604 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
605                                     struct vhost_memory *mem)
606 {
607     if (!vhost_vdpa_first_dev(dev)) {
608         return 0;
609     }
610 
611     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
612     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
613         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
614         int i;
615         for (i = 0; i < mem->nregions; i++) {
616             trace_vhost_vdpa_dump_regions(dev, i,
617                                           mem->regions[i].guest_phys_addr,
618                                           mem->regions[i].memory_size,
619                                           mem->regions[i].userspace_addr,
620                                           mem->regions[i].flags_padding);
621         }
622     }
623     if (mem->padding) {
624         return -EINVAL;
625     }
626 
627     return 0;
628 }
629 
630 static int vhost_vdpa_set_features(struct vhost_dev *dev,
631                                    uint64_t features)
632 {
633     struct vhost_vdpa *v = dev->opaque;
634     int ret;
635 
636     if (!vhost_vdpa_first_dev(dev)) {
637         return 0;
638     }
639 
640     if (v->shadow_vqs_enabled) {
641         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
642             /*
643              * QEMU is just trying to enable or disable logging. SVQ handles
644              * this separately, so there is no need to forward it.
645              */
646             v->acked_features = features;
647             return 0;
648         }
649 
650         v->acked_features = features;
651 
652         /* We must not ack _F_LOG if SVQ is enabled */
653         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
654     }
655 
656     trace_vhost_vdpa_set_features(dev, features);
657     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
658     if (ret) {
659         return ret;
660     }
661 
662     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
663 }
664 
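/*
 * Negotiate the backend features this code relies on (IOTLB message v2 and
 * IOTLB batching) and store the result in dev->backend_cap. Only the first
 * vhost-vdpa device actually sets the features on the backend.
 */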
665 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
666 {
667     uint64_t features;
668     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
669         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
670     int r;
671 
672     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
673         return -EFAULT;
674     }
675 
676     features &= f;
677 
678     if (vhost_vdpa_first_dev(dev)) {
679         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
680         if (r) {
681             return -EFAULT;
682         }
683     }
684 
685     dev->backend_cap = features;
686 
687     return 0;
688 }
689 
690 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
691                                     uint32_t *device_id)
692 {
693     int ret;
694     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
695     trace_vhost_vdpa_get_device_id(dev, *device_id);
696     return ret;
697 }
698 
699 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
700 {
701     if (!v->shadow_vqs_enabled) {
702         return;
703     }
704 
705     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
706         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
707         vhost_svq_stop(svq);
708     }
709 }
710 
711 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
712 {
713     struct vhost_vdpa *v = dev->opaque;
714     int ret;
715     uint8_t status = 0;
716 
717     vhost_vdpa_reset_svq(v);
718 
719     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
720     trace_vhost_vdpa_reset_device(dev, status);
721     return ret;
722 }
723 
724 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
725 {
726     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
727 
728     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
729     return idx;
730 }
731 
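/* Enable every vring of this device with VHOST_VDPA_SET_VRING_ENABLE. */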
732 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
733 {
734     int i;
735     trace_vhost_vdpa_set_vring_ready(dev);
736     for (i = 0; i < dev->nvqs; ++i) {
737         struct vhost_vring_state state = {
738             .index = dev->vq_index + i,
739             .num = 1,
740         };
741         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
742     }
743     return 0;
744 }
745 
746 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
747                                    uint32_t config_len)
748 {
749     int b, len;
750     char line[QEMU_HEXDUMP_LINE_LEN];
751 
752     for (b = 0; b < config_len; b += 16) {
753         len = config_len - b;
754         qemu_hexdump_line(line, b, config, len, false);
755         trace_vhost_vdpa_dump_config(dev, line);
756     }
757 }
758 
759 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
760                                    uint32_t offset, uint32_t size,
761                                    uint32_t flags)
762 {
763     struct vhost_vdpa_config *config;
764     int ret;
765     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
766 
767     trace_vhost_vdpa_set_config(dev, offset, size, flags);
768     config = g_malloc(size + config_size);
769     config->off = offset;
770     config->len = size;
771     memcpy(config->buf, data, size);
772     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
773         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
774         vhost_vdpa_dump_config(dev, data, size);
775     }
776     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
777     g_free(config);
778     return ret;
779 }
780 
781 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
782                                    uint32_t config_len, Error **errp)
783 {
784     struct vhost_vdpa_config *v_config;
785     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
786     int ret;
787 
788     trace_vhost_vdpa_get_config(dev, config, config_len);
789     v_config = g_malloc(config_len + config_size);
790     v_config->len = config_len;
791     v_config->off = 0;
792     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
793     memcpy(config, v_config->buf, config_len);
794     g_free(v_config);
795     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
796         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
797         vhost_vdpa_dump_config(dev, config, config_len);
798     }
799     return ret;
800 }
801 
802 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
803                                          struct vhost_vring_state *ring)
804 {
805     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
806     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
807 }
808 
809 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
810                                          struct vhost_vring_file *file)
811 {
812     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
813     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
814 }
815 
816 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
817                                          struct vhost_vring_file *file)
818 {
819     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
820     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
821 }
822 
823 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
824                                          struct vhost_vring_addr *addr)
825 {
826     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
827                                 addr->desc_user_addr, addr->used_user_addr,
828                                 addr->avail_user_addr,
829                                 addr->log_guest_addr);
830 
831     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
832 
833 }
834 
835 /**
836  * Set the shadow virtqueue descriptors to the device
837  *
838  * @dev: The vhost device model
839  * @svq: The shadow virtqueue
840  * @idx: The index of the virtqueue in the vhost device
841  * @errp: Error
842  *
843  * Note that this function does not rewind the kick file descriptor if it
844  * cannot set the call one.
845  */
846 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
847                                   VhostShadowVirtqueue *svq, unsigned idx,
848                                   Error **errp)
849 {
850     struct vhost_vring_file file = {
851         .index = dev->vq_index + idx,
852     };
853     const EventNotifier *event_notifier = &svq->hdev_kick;
854     int r;
855 
856     file.fd = event_notifier_get_fd(event_notifier);
857     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
858     if (unlikely(r != 0)) {
859         error_setg_errno(errp, -r, "Can't set device kick fd");
860         return r;
861     }
862 
863     event_notifier = &svq->hdev_call;
864     file.fd = event_notifier_get_fd(event_notifier);
865     r = vhost_vdpa_set_vring_dev_call(dev, &file);
866     if (unlikely(r != 0)) {
867         error_setg_errno(errp, -r, "Can't set device call fd");
868     }
869 
870     return r;
871 }
872 
873 /**
874  * Unmap an SVQ area in the device
875  */
876 static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
877                                       const DMAMap *needle)
878 {
879     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
880     hwaddr size;
881     int r;
882 
883     if (unlikely(!result)) {
884         error_report("Unable to find SVQ address to unmap");
885         return false;
886     }
887 
888     size = ROUND_UP(result->size, qemu_real_host_page_size());
889     r = vhost_vdpa_dma_unmap(v, result->iova, size);
890     return r == 0;
891 }
892 
893 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
894                                        const VhostShadowVirtqueue *svq)
895 {
896     DMAMap needle = {};
897     struct vhost_vdpa *v = dev->opaque;
898     struct vhost_vring_addr svq_addr;
899     bool ok;
900 
901     vhost_svq_get_vring_addr(svq, &svq_addr);
902 
903     needle.translated_addr = svq_addr.desc_user_addr;
904     ok = vhost_vdpa_svq_unmap_ring(v, &needle);
905     if (unlikely(!ok)) {
906         return false;
907     }
908 
909     needle.translated_addr = svq_addr.used_user_addr;
910     return vhost_vdpa_svq_unmap_ring(v, &needle);
911 }
912 
913 /**
914  * Map the SVQ area in the device
915  *
916  * @v: Vhost-vdpa device
917  * @needle: The area to search an iova for
918  * @errp: Error pointer
919  */
920 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
921                                     Error **errp)
922 {
923     int r;
924 
925     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
926     if (unlikely(r != IOVA_OK)) {
927         error_setg(errp, "Cannot allocate iova (%d)", r);
928         return false;
929     }
930 
931     r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
932                            (void *)(uintptr_t)needle->translated_addr,
933                            needle->perm == IOMMU_RO);
934     if (unlikely(r != 0)) {
935         error_setg_errno(errp, -r, "Cannot map region to device");
936         vhost_iova_tree_remove(v->iova_tree, needle);
937     }
938 
939     return r == 0;
940 }
941 
942 /**
943  * Map the shadow virtqueue rings in the device
944  *
945  * @dev: The vhost device
946  * @svq: The shadow virtqueue
947  * @addr: Assigned IOVA addresses
948  * @errp: Error pointer
949  */
950 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
951                                      const VhostShadowVirtqueue *svq,
952                                      struct vhost_vring_addr *addr,
953                                      Error **errp)
954 {
955     DMAMap device_region, driver_region;
956     struct vhost_vring_addr svq_addr;
957     struct vhost_vdpa *v = dev->opaque;
958     size_t device_size = vhost_svq_device_area_size(svq);
959     size_t driver_size = vhost_svq_driver_area_size(svq);
960     size_t avail_offset;
961     bool ok;
962 
963     ERRP_GUARD();
964     vhost_svq_get_vring_addr(svq, &svq_addr);
965 
966     driver_region = (DMAMap) {
967         .translated_addr = svq_addr.desc_user_addr,
968         .size = driver_size - 1,
969         .perm = IOMMU_RO,
970     };
971     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
972     if (unlikely(!ok)) {
973         error_prepend(errp, "Cannot create vq driver region: ");
974         return false;
975     }
976     addr->desc_user_addr = driver_region.iova;
977     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
978     addr->avail_user_addr = driver_region.iova + avail_offset;
979 
980     device_region = (DMAMap) {
981         .translated_addr = svq_addr.used_user_addr,
982         .size = device_size - 1,
983         .perm = IOMMU_RW,
984     };
985     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
986     if (unlikely(!ok)) {
987         error_prepend(errp, "Cannot create vq device region: ");
988         vhost_vdpa_svq_unmap_ring(v, &driver_region);
989     }
990     addr->used_user_addr = device_region.iova;
991 
992     return ok;
993 }
994 
995 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
996                                  VhostShadowVirtqueue *svq, unsigned idx,
997                                  Error **errp)
998 {
999     uint16_t vq_index = dev->vq_index + idx;
1000     struct vhost_vring_state s = {
1001         .index = vq_index,
1002     };
1003     int r;
1004 
1005     r = vhost_vdpa_set_dev_vring_base(dev, &s);
1006     if (unlikely(r)) {
1007         error_setg_errno(errp, -r, "Cannot set vring base");
1008         return false;
1009     }
1010 
1011     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1012     return r == 0;
1013 }
1014 
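/*
 * Start all shadow virtqueues: set the vring base and file descriptors, map
 * the SVQ rings into the device, and override the vring addresses with the
 * shadow ones. On failure, the queues started so far are unmapped and stopped.
 */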
1015 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1016 {
1017     struct vhost_vdpa *v = dev->opaque;
1018     Error *err = NULL;
1019     unsigned i;
1020 
1021     if (!v->shadow_vqs) {
1022         return true;
1023     }
1024 
1025     for (i = 0; i < v->shadow_vqs->len; ++i) {
1026         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1027         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1028         struct vhost_vring_addr addr = {
1029             .index = dev->vq_index + i,
1030         };
1031         int r;
1032         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1033         if (unlikely(!ok)) {
1034             goto err;
1035         }
1036 
1037         vhost_svq_start(svq, dev->vdev, vq);
1038         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1039         if (unlikely(!ok)) {
1040             goto err_map;
1041         }
1042 
1043         /* Override vring GPA set by vhost subsystem */
1044         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1045         if (unlikely(r != 0)) {
1046             error_setg_errno(&err, -r, "Cannot set device address");
1047             goto err_set_addr;
1048         }
1049     }
1050 
1051     return true;
1052 
1053 err_set_addr:
1054     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1055 
1056 err_map:
1057     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1058 
1059 err:
1060     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1061     for (unsigned j = 0; j < i; ++j) {
1062         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1063         vhost_vdpa_svq_unmap_rings(dev, svq);
1064         vhost_svq_stop(svq);
1065     }
1066 
1067     return false;
1068 }
1069 
1070 static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1071 {
1072     struct vhost_vdpa *v = dev->opaque;
1073 
1074     if (!v->shadow_vqs) {
1075         return true;
1076     }
1077 
1078     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1079         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1080         bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
1081         if (unlikely(!ok)) {
1082             return false;
1083         }
1084     }
1085 
1086     return true;
1087 }
1088 
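/*
 * Start or stop the device. Host notifiers and shadow virtqueues are handled
 * per vhost device; the memory listener, DRIVER_OK and device reset are only
 * handled once the last queue of the group (vq_index_end) is reached.
 */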
1089 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1090 {
1091     struct vhost_vdpa *v = dev->opaque;
1092     bool ok;
1093     trace_vhost_vdpa_dev_start(dev, started);
1094 
1095     if (started) {
1096         vhost_vdpa_host_notifiers_init(dev);
1097         ok = vhost_vdpa_svqs_start(dev);
1098         if (unlikely(!ok)) {
1099             return -1;
1100         }
1101         vhost_vdpa_set_vring_ready(dev);
1102     } else {
1103         ok = vhost_vdpa_svqs_stop(dev);
1104         if (unlikely(!ok)) {
1105             return -1;
1106         }
1107         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1108     }
1109 
1110     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1111         return 0;
1112     }
1113 
1114     if (started) {
1115         memory_listener_register(&v->listener, &address_space_memory);
1116         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1117     } else {
1118         vhost_vdpa_reset_device(dev);
1119         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1120                                    VIRTIO_CONFIG_S_DRIVER);
1121         memory_listener_unregister(&v->listener);
1122 
1123         return 0;
1124     }
1125 }
1126 
1127 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1128                                      struct vhost_log *log)
1129 {
1130     struct vhost_vdpa *v = dev->opaque;
1131     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1132         return 0;
1133     }
1134 
1135     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1136                                   log->log);
1137     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1138 }
1139 
1140 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1141                                        struct vhost_vring_addr *addr)
1142 {
1143     struct vhost_vdpa *v = dev->opaque;
1144 
1145     if (v->shadow_vqs_enabled) {
1146         /*
1147          * Device vring addr was set at device start. SVQ base is handled by
1148          * VirtQueue code.
1149          */
1150         return 0;
1151     }
1152 
1153     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1154 }
1155 
1156 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1157                                       struct vhost_vring_state *ring)
1158 {
1159     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1160     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1161 }
1162 
1163 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1164                                        struct vhost_vring_state *ring)
1165 {
1166     struct vhost_vdpa *v = dev->opaque;
1167 
1168     if (v->shadow_vqs_enabled) {
1169         /*
1170          * Device vring base was set at device start. SVQ base is handled by
1171          * VirtQueue code.
1172          */
1173         return 0;
1174     }
1175 
1176     return vhost_vdpa_set_dev_vring_base(dev, ring);
1177 }
1178 
1179 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1180                                        struct vhost_vring_state *ring)
1181 {
1182     struct vhost_vdpa *v = dev->opaque;
1183     int vdpa_idx = ring->index - dev->vq_index;
1184     int ret;
1185 
1186     if (v->shadow_vqs_enabled) {
1187         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1188 
1189         /*
1190          * Set the base to the last used idx, so the destination will see as
1191          * available all the entries that the device did not use, including
1192          * the ones still in flight.
1193          *
1194          * TODO: This is ok for networking, but other kinds of devices might
1195          * have problems with these retransmissions.
1196          */
1197         ring->num = svq->last_used_idx;
1198         return 0;
1199     }
1200 
1201     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1202     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1203     return ret;
1204 }
1205 
1206 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1207                                        struct vhost_vring_file *file)
1208 {
1209     struct vhost_vdpa *v = dev->opaque;
1210     int vdpa_idx = file->index - dev->vq_index;
1211 
1212     if (v->shadow_vqs_enabled) {
1213         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1214         vhost_svq_set_svq_kick_fd(svq, file->fd);
1215         return 0;
1216     } else {
1217         return vhost_vdpa_set_vring_dev_kick(dev, file);
1218     }
1219 }
1220 
1221 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1222                                        struct vhost_vring_file *file)
1223 {
1224     struct vhost_vdpa *v = dev->opaque;
1225 
1226     if (v->shadow_vqs_enabled) {
1227         int vdpa_idx = file->index - dev->vq_index;
1228         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1229 
1230         vhost_svq_set_svq_call_fd(svq, file->fd);
1231         return 0;
1232     } else {
1233         return vhost_vdpa_set_vring_dev_call(dev, file);
1234     }
1235 }
1236 
1237 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1238                                      uint64_t *features)
1239 {
1240     struct vhost_vdpa *v = dev->opaque;
1241     int ret = vhost_vdpa_get_dev_features(dev, features);
1242 
1243     if (ret == 0 && v->shadow_vqs_enabled) {
1244         /* Add SVQ logging capabilities */
1245         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1246     }
1247 
1248     return ret;
1249 }
1250 
1251 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1252 {
1253     if (!vhost_vdpa_first_dev(dev)) {
1254         return 0;
1255     }
1256 
1257     trace_vhost_vdpa_set_owner(dev);
1258     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1259 }
1260 
1261 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1262                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1263 {
1264     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1265     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1266     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1267     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1268     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1269                                  addr->avail_user_addr, addr->used_user_addr);
1270     return 0;
1271 }
1272 
1273 static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
1274 {
1275     return true;
1276 }
1277 
1278 const VhostOps vdpa_ops = {
1279         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1280         .vhost_backend_init = vhost_vdpa_init,
1281         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1282         .vhost_set_log_base = vhost_vdpa_set_log_base,
1283         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1284         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1285         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1286         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1287         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1288         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1289         .vhost_get_features = vhost_vdpa_get_features,
1290         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1291         .vhost_set_owner = vhost_vdpa_set_owner,
1292         .vhost_set_vring_endian = NULL,
1293         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1294         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1295         .vhost_set_features = vhost_vdpa_set_features,
1296         .vhost_reset_device = vhost_vdpa_reset_device,
1297         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1298         .vhost_get_config  = vhost_vdpa_get_config,
1299         .vhost_set_config = vhost_vdpa_set_config,
1300         .vhost_requires_shm_log = NULL,
1301         .vhost_migration_done = NULL,
1302         .vhost_backend_can_merge = NULL,
1303         .vhost_net_set_mtu = NULL,
1304         .vhost_set_iotlb_callback = NULL,
1305         .vhost_send_device_iotlb_msg = NULL,
1306         .vhost_dev_start = vhost_vdpa_dev_start,
1307         .vhost_get_device_id = vhost_vdpa_get_device_id,
1308         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1309         .vhost_force_iommu = vhost_vdpa_force_iommu,
1310 };
1311