xref: /qemu/hw/virtio/vhost-vdpa.c (revision cd831ed5c4add8ed6ee980c3645b241cbef5130f)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
26 #include "cpu.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 
30 /*
31  * Return one past the end of the end of section. Be careful with uint64_t
32  * conversions!
33  */
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
35 {
36     Int128 llend = int128_make64(section->offset_within_address_space);
37     llend = int128_add(llend, section->size);
38     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
39 
40     return llend;
41 }
42 
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
44                                                 uint64_t iova_min,
45                                                 uint64_t iova_max)
46 {
47     Int128 llend;
48 
49     if ((!memory_region_is_ram(section->mr) &&
50          !memory_region_is_iommu(section->mr)) ||
51         memory_region_is_protected(section->mr) ||
52         /* vhost-vDPA doesn't allow MMIO to be mapped  */
53         memory_region_is_ram_device(section->mr)) {
54         return true;
55     }
56 
57     if (section->offset_within_address_space < iova_min) {
58         error_report("RAM section out of device range (min=0x%" PRIx64
59                      ", addr=0x%" HWADDR_PRIx ")",
60                      iova_min, section->offset_within_address_space);
61         return true;
62     }
63 
64     llend = vhost_vdpa_section_end(section);
65     if (int128_gt(llend, int128_make64(iova_max))) {
66         error_report("RAM section out of device range (max=0x%" PRIx64
67                      ", end addr=0x%" PRIx64 ")",
68                      iova_max, int128_get64(llend));
69         return true;
70     }
71 
72     return false;
73 }
74 
75 /*
76  * The caller must set asid = 0 if the device does not support asid.
77  * This is not an ABI break since it is set to 0 by the initializer anyway.
78  */
79 int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
80                        hwaddr size, void *vaddr, bool readonly)
81 {
82     struct vhost_msg_v2 msg = {};
83     int fd = v->device_fd;
84     int ret = 0;
85 
86     msg.type = v->msg_type;
87     msg.asid = asid;
88     msg.iotlb.iova = iova;
89     msg.iotlb.size = size;
90     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
91     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
92     msg.iotlb.type = VHOST_IOTLB_UPDATE;
93 
94     trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.asid, msg.iotlb.iova,
95                              msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
96                              msg.iotlb.type);
97 
98     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
99         error_report("failed to write, fd=%d, errno=%d (%s)",
100             fd, errno, strerror(errno));
101         return -EIO ;
102     }
103 
104     return ret;
105 }
106 
107 /*
108  * The caller must set asid = 0 if the device does not support asid.
109  * This is not an ABI break since it is set to 0 by the initializer anyway.
110  */
111 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
112                          hwaddr size)
113 {
114     struct vhost_msg_v2 msg = {};
115     int fd = v->device_fd;
116     int ret = 0;
117 
118     msg.type = v->msg_type;
119     msg.asid = asid;
120     msg.iotlb.iova = iova;
121     msg.iotlb.size = size;
122     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
123 
124     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.asid, msg.iotlb.iova,
125                                msg.iotlb.size, msg.iotlb.type);
126 
127     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
128         error_report("failed to write, fd=%d, errno=%d (%s)",
129             fd, errno, strerror(errno));
130         return -EIO ;
131     }
132 
133     return ret;
134 }
135 
136 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
137 {
138     int fd = v->device_fd;
139     struct vhost_msg_v2 msg = {
140         .type = v->msg_type,
141         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
142     };
143 
144     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
145     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
146         error_report("failed to write, fd=%d, errno=%d (%s)",
147                      fd, errno, strerror(errno));
148     }
149 }
150 
151 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
152 {
153     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
154         !v->iotlb_batch_begin_sent) {
155         vhost_vdpa_listener_begin_batch(v);
156     }
157 
158     v->iotlb_batch_begin_sent = true;
159 }
160 
161 static void vhost_vdpa_listener_commit(MemoryListener *listener)
162 {
163     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
164     struct vhost_dev *dev = v->dev;
165     struct vhost_msg_v2 msg = {};
166     int fd = v->device_fd;
167 
168     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
169         return;
170     }
171 
172     if (!v->iotlb_batch_begin_sent) {
173         return;
174     }
175 
176     msg.type = v->msg_type;
177     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
178 
179     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
180     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
181         error_report("failed to write, fd=%d, errno=%d (%s)",
182                      fd, errno, strerror(errno));
183     }
184 
185     v->iotlb_batch_begin_sent = false;
186 }
187 
188 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
189                                            MemoryRegionSection *section)
190 {
191     DMAMap mem_region = {};
192     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
193     hwaddr iova;
194     Int128 llend, llsize;
195     void *vaddr;
196     int ret;
197 
198     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
199                                             v->iova_range.last)) {
200         return;
201     }
202 
203     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
204                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
205         error_report("%s received unaligned region", __func__);
206         return;
207     }
208 
209     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
210     llend = vhost_vdpa_section_end(section);
211     if (int128_ge(int128_make64(iova), llend)) {
212         return;
213     }
214 
215     memory_region_ref(section->mr);
216 
217     /* Here we assume that memory_region_is_ram(section->mr)==true */
218 
219     vaddr = memory_region_get_ram_ptr(section->mr) +
220             section->offset_within_region +
221             (iova - section->offset_within_address_space);
222 
223     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
224                                          vaddr, section->readonly);
225 
226     llsize = int128_sub(llend, int128_make64(iova));
227     if (v->shadow_vqs_enabled) {
228         int r;
229 
230         mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
231         mem_region.size = int128_get64(llsize) - 1,
232         mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly),
233 
234         r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
235         if (unlikely(r != IOVA_OK)) {
236             error_report("Can't allocate a mapping (%d)", r);
237             goto fail;
238         }
239 
240         iova = mem_region.iova;
241     }
242 
243     vhost_vdpa_iotlb_batch_begin_once(v);
244     ret = vhost_vdpa_dma_map(v, VHOST_VDPA_GUEST_PA_ASID, iova,
245                              int128_get64(llsize), vaddr, section->readonly);
246     if (ret) {
247         error_report("vhost vdpa map fail!");
248         goto fail_map;
249     }
250 
251     return;
252 
253 fail_map:
254     if (v->shadow_vqs_enabled) {
255         vhost_iova_tree_remove(v->iova_tree, mem_region);
256     }
257 
258 fail:
259     /*
260      * On the initfn path, store the first error in the container so we
261      * can gracefully fail.  Runtime, there's not much we can do other
262      * than throw a hardware error.
263      */
264     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
265     return;
266 
267 }
268 
269 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
270                                            MemoryRegionSection *section)
271 {
272     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
273     hwaddr iova;
274     Int128 llend, llsize;
275     int ret;
276 
277     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
278                                             v->iova_range.last)) {
279         return;
280     }
281 
282     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
283                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
284         error_report("%s received unaligned region", __func__);
285         return;
286     }
287 
288     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
289     llend = vhost_vdpa_section_end(section);
290 
291     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
292 
293     if (int128_ge(int128_make64(iova), llend)) {
294         return;
295     }
296 
297     llsize = int128_sub(llend, int128_make64(iova));
298 
299     if (v->shadow_vqs_enabled) {
300         const DMAMap *result;
301         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
302             section->offset_within_region +
303             (iova - section->offset_within_address_space);
304         DMAMap mem_region = {
305             .translated_addr = (hwaddr)(uintptr_t)vaddr,
306             .size = int128_get64(llsize) - 1,
307         };
308 
309         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
310         if (!result) {
311             /* The memory listener map wasn't mapped */
312             return;
313         }
314         iova = result->iova;
315         vhost_iova_tree_remove(v->iova_tree, *result);
316     }
317     vhost_vdpa_iotlb_batch_begin_once(v);
318     ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
319                                int128_get64(llsize));
320     if (ret) {
321         error_report("vhost_vdpa dma unmap error!");
322     }
323 
324     memory_region_unref(section->mr);
325 }
326 /*
327  * IOTLB API is used by vhost-vdpa which requires incremental updating
328  * of the mapping. So we can not use generic vhost memory listener which
329  * depends on the addnop().
330  */
331 static const MemoryListener vhost_vdpa_memory_listener = {
332     .name = "vhost-vdpa",
333     .commit = vhost_vdpa_listener_commit,
334     .region_add = vhost_vdpa_listener_region_add,
335     .region_del = vhost_vdpa_listener_region_del,
336 };
337 
338 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
339                              void *arg)
340 {
341     struct vhost_vdpa *v = dev->opaque;
342     int fd = v->device_fd;
343     int ret;
344 
345     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
346 
347     ret = ioctl(fd, request, arg);
348     return ret < 0 ? -errno : ret;
349 }
350 
351 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
352 {
353     uint8_t s;
354     int ret;
355 
356     trace_vhost_vdpa_add_status(dev, status);
357     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
358     if (ret < 0) {
359         return ret;
360     }
361 
362     s |= status;
363 
364     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
365     if (ret < 0) {
366         return ret;
367     }
368 
369     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
370     if (ret < 0) {
371         return ret;
372     }
373 
374     if (!(s & status)) {
375         return -EIO;
376     }
377 
378     return 0;
379 }
380 
381 /*
382  * The use of this function is for requests that only need to be
383  * applied once. Typically such request occurs at the beginning
384  * of operation, and before setting up queues. It should not be
385  * used for request that performs operation until all queues are
386  * set, which would need to check dev->vq_index_end instead.
387  */
388 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
389 {
390     struct vhost_vdpa *v = dev->opaque;
391 
392     return v->index == 0;
393 }
394 
395 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
396                                        uint64_t *features)
397 {
398     int ret;
399 
400     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
401     trace_vhost_vdpa_get_features(dev, *features);
402     return ret;
403 }
404 
405 static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
406 {
407     g_autoptr(GPtrArray) shadow_vqs = NULL;
408 
409     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
410     for (unsigned n = 0; n < hdev->nvqs; ++n) {
411         VhostShadowVirtqueue *svq;
412 
413         svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
414         g_ptr_array_add(shadow_vqs, svq);
415     }
416 
417     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
418 }
419 
420 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
421 {
422     struct vhost_vdpa *v;
423     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
424     trace_vhost_vdpa_init(dev, opaque);
425     int ret;
426 
427     /*
428      * Similar to VFIO, we end up pinning all guest memory and have to
429      * disable discarding of RAM.
430      */
431     ret = ram_block_discard_disable(true);
432     if (ret) {
433         error_report("Cannot set discarding of RAM broken");
434         return ret;
435     }
436 
437     v = opaque;
438     v->dev = dev;
439     dev->opaque =  opaque ;
440     v->listener = vhost_vdpa_memory_listener;
441     v->msg_type = VHOST_IOTLB_MSG_V2;
442     vhost_vdpa_init_svq(dev, v);
443 
444     if (!vhost_vdpa_first_dev(dev)) {
445         return 0;
446     }
447 
448     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
449                                VIRTIO_CONFIG_S_DRIVER);
450 
451     return 0;
452 }
453 
454 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
455                                             int queue_index)
456 {
457     size_t page_size = qemu_real_host_page_size();
458     struct vhost_vdpa *v = dev->opaque;
459     VirtIODevice *vdev = dev->vdev;
460     VhostVDPAHostNotifier *n;
461 
462     n = &v->notifier[queue_index];
463 
464     if (n->addr) {
465         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
466         object_unparent(OBJECT(&n->mr));
467         munmap(n->addr, page_size);
468         n->addr = NULL;
469     }
470 }
471 
472 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
473 {
474     size_t page_size = qemu_real_host_page_size();
475     struct vhost_vdpa *v = dev->opaque;
476     VirtIODevice *vdev = dev->vdev;
477     VhostVDPAHostNotifier *n;
478     int fd = v->device_fd;
479     void *addr;
480     char *name;
481 
482     vhost_vdpa_host_notifier_uninit(dev, queue_index);
483 
484     n = &v->notifier[queue_index];
485 
486     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
487                 queue_index * page_size);
488     if (addr == MAP_FAILED) {
489         goto err;
490     }
491 
492     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
493                            v, queue_index);
494     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
495                                       page_size, addr);
496     g_free(name);
497 
498     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
499         object_unparent(OBJECT(&n->mr));
500         munmap(addr, page_size);
501         goto err;
502     }
503     n->addr = addr;
504 
505     return 0;
506 
507 err:
508     return -1;
509 }
510 
511 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
512 {
513     int i;
514 
515     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
516         vhost_vdpa_host_notifier_uninit(dev, i);
517     }
518 }
519 
520 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
521 {
522     struct vhost_vdpa *v = dev->opaque;
523     int i;
524 
525     if (v->shadow_vqs_enabled) {
526         /* FIXME SVQ is not compatible with host notifiers mr */
527         return;
528     }
529 
530     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
531         if (vhost_vdpa_host_notifier_init(dev, i)) {
532             goto err;
533         }
534     }
535 
536     return;
537 
538 err:
539     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
540     return;
541 }
542 
543 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
544 {
545     struct vhost_vdpa *v = dev->opaque;
546     size_t idx;
547 
548     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
549         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
550     }
551     g_ptr_array_free(v->shadow_vqs, true);
552 }
553 
554 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
555 {
556     struct vhost_vdpa *v;
557     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
558     v = dev->opaque;
559     trace_vhost_vdpa_cleanup(dev, v);
560     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
561     memory_listener_unregister(&v->listener);
562     vhost_vdpa_svq_cleanup(dev);
563 
564     dev->opaque = NULL;
565     ram_block_discard_disable(false);
566 
567     return 0;
568 }
569 
/* Report the memory slot limit of the backend, which is INT_MAX. */
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    const int limit = INT_MAX;

    trace_vhost_vdpa_memslots_limit(dev, limit);

    return limit;
}
575 
576 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
577                                     struct vhost_memory *mem)
578 {
579     if (!vhost_vdpa_first_dev(dev)) {
580         return 0;
581     }
582 
583     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
584     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
585         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
586         int i;
587         for (i = 0; i < mem->nregions; i++) {
588             trace_vhost_vdpa_dump_regions(dev, i,
589                                           mem->regions[i].guest_phys_addr,
590                                           mem->regions[i].memory_size,
591                                           mem->regions[i].userspace_addr,
592                                           mem->regions[i].flags_padding);
593         }
594     }
595     if (mem->padding) {
596         return -EINVAL;
597     }
598 
599     return 0;
600 }
601 
602 static int vhost_vdpa_set_features(struct vhost_dev *dev,
603                                    uint64_t features)
604 {
605     struct vhost_vdpa *v = dev->opaque;
606     int ret;
607 
608     if (!vhost_vdpa_first_dev(dev)) {
609         return 0;
610     }
611 
612     if (v->shadow_vqs_enabled) {
613         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
614             /*
615              * QEMU is just trying to enable or disable logging. SVQ handles
616              * this sepparately, so no need to forward this.
617              */
618             v->acked_features = features;
619             return 0;
620         }
621 
622         v->acked_features = features;
623 
624         /* We must not ack _F_LOG if SVQ is enabled */
625         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
626     }
627 
628     trace_vhost_vdpa_set_features(dev, features);
629     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
630     if (ret) {
631         return ret;
632     }
633 
634     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
635 }
636 
637 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
638 {
639     uint64_t features;
640     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
641         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
642     int r;
643 
644     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
645         return -EFAULT;
646     }
647 
648     features &= f;
649 
650     if (vhost_vdpa_first_dev(dev)) {
651         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
652         if (r) {
653             return -EFAULT;
654         }
655     }
656 
657     dev->backend_cap = features;
658 
659     return 0;
660 }
661 
662 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
663                                     uint32_t *device_id)
664 {
665     int ret;
666     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
667     trace_vhost_vdpa_get_device_id(dev, *device_id);
668     return ret;
669 }
670 
671 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
672 {
673     if (!v->shadow_vqs_enabled) {
674         return;
675     }
676 
677     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
678         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
679         vhost_svq_stop(svq);
680     }
681 }
682 
683 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
684 {
685     struct vhost_vdpa *v = dev->opaque;
686     int ret;
687     uint8_t status = 0;
688 
689     vhost_vdpa_reset_svq(v);
690 
691     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
692     trace_vhost_vdpa_reset_device(dev, status);
693     return ret;
694 }
695 
/*
 * Translate a virtqueue index for the backend. For vhost-vdpa the index is
 * returned unchanged; it is only asserted to lie within the queue range
 * owned by @dev, [vq_index, vq_index + nvqs).
 */
static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);

    trace_vhost_vdpa_get_vq_index(dev, idx, idx);
    return idx;
}
703 
704 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
705 {
706     int i;
707     trace_vhost_vdpa_set_vring_ready(dev);
708     for (i = 0; i < dev->nvqs; ++i) {
709         struct vhost_vring_state state = {
710             .index = dev->vq_index + i,
711             .num = 1,
712         };
713         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
714     }
715     return 0;
716 }
717 
718 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
719                                    uint32_t config_len)
720 {
721     int b, len;
722     char line[QEMU_HEXDUMP_LINE_LEN];
723 
724     for (b = 0; b < config_len; b += 16) {
725         len = config_len - b;
726         qemu_hexdump_line(line, b, config, len, false);
727         trace_vhost_vdpa_dump_config(dev, line);
728     }
729 }
730 
731 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
732                                    uint32_t offset, uint32_t size,
733                                    uint32_t flags)
734 {
735     struct vhost_vdpa_config *config;
736     int ret;
737     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
738 
739     trace_vhost_vdpa_set_config(dev, offset, size, flags);
740     config = g_malloc(size + config_size);
741     config->off = offset;
742     config->len = size;
743     memcpy(config->buf, data, size);
744     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
745         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
746         vhost_vdpa_dump_config(dev, data, size);
747     }
748     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
749     g_free(config);
750     return ret;
751 }
752 
753 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
754                                    uint32_t config_len, Error **errp)
755 {
756     struct vhost_vdpa_config *v_config;
757     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
758     int ret;
759 
760     trace_vhost_vdpa_get_config(dev, config, config_len);
761     v_config = g_malloc(config_len + config_size);
762     v_config->len = config_len;
763     v_config->off = 0;
764     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
765     memcpy(config, v_config->buf, config_len);
766     g_free(v_config);
767     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
768         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
769         vhost_vdpa_dump_config(dev, config, config_len);
770     }
771     return ret;
772  }
773 
774 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
775                                          struct vhost_vring_state *ring)
776 {
777     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
778     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
779 }
780 
781 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
782                                          struct vhost_vring_file *file)
783 {
784     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
785     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
786 }
787 
788 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
789                                          struct vhost_vring_file *file)
790 {
791     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
792     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
793 }
794 
795 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
796                                          struct vhost_vring_addr *addr)
797 {
798     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
799                                 addr->desc_user_addr, addr->used_user_addr,
800                                 addr->avail_user_addr,
801                                 addr->log_guest_addr);
802 
803     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
804 
805 }
806 
807 /**
808  * Set the shadow virtqueue descriptors to the device
809  *
810  * @dev: The vhost device model
811  * @svq: The shadow virtqueue
812  * @idx: The index of the virtqueue in the vhost device
813  * @errp: Error
814  *
815  * Note that this function does not rewind kick file descriptor if cannot set
816  * call one.
817  */
818 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
819                                   VhostShadowVirtqueue *svq, unsigned idx,
820                                   Error **errp)
821 {
822     struct vhost_vring_file file = {
823         .index = dev->vq_index + idx,
824     };
825     const EventNotifier *event_notifier = &svq->hdev_kick;
826     int r;
827 
828     r = event_notifier_init(&svq->hdev_kick, 0);
829     if (r != 0) {
830         error_setg_errno(errp, -r, "Couldn't create kick event notifier");
831         goto err_init_hdev_kick;
832     }
833 
834     r = event_notifier_init(&svq->hdev_call, 0);
835     if (r != 0) {
836         error_setg_errno(errp, -r, "Couldn't create call event notifier");
837         goto err_init_hdev_call;
838     }
839 
840     file.fd = event_notifier_get_fd(event_notifier);
841     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
842     if (unlikely(r != 0)) {
843         error_setg_errno(errp, -r, "Can't set device kick fd");
844         goto err_init_set_dev_fd;
845     }
846 
847     event_notifier = &svq->hdev_call;
848     file.fd = event_notifier_get_fd(event_notifier);
849     r = vhost_vdpa_set_vring_dev_call(dev, &file);
850     if (unlikely(r != 0)) {
851         error_setg_errno(errp, -r, "Can't set device call fd");
852         goto err_init_set_dev_fd;
853     }
854 
855     return 0;
856 
857 err_init_set_dev_fd:
858     event_notifier_set_handler(&svq->hdev_call, NULL);
859 
860 err_init_hdev_call:
861     event_notifier_cleanup(&svq->hdev_kick);
862 
863 err_init_hdev_kick:
864     return r;
865 }
866 
867 /**
868  * Unmap a SVQ area in the device
869  */
870 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
871 {
872     const DMAMap needle = {
873         .translated_addr = addr,
874     };
875     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
876     hwaddr size;
877     int r;
878 
879     if (unlikely(!result)) {
880         error_report("Unable to find SVQ address to unmap");
881         return;
882     }
883 
884     size = ROUND_UP(result->size, qemu_real_host_page_size());
885     r = vhost_vdpa_dma_unmap(v, v->address_space_id, result->iova, size);
886     if (unlikely(r < 0)) {
887         error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
888         return;
889     }
890 
891     vhost_iova_tree_remove(v->iova_tree, *result);
892 }
893 
894 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
895                                        const VhostShadowVirtqueue *svq)
896 {
897     struct vhost_vdpa *v = dev->opaque;
898     struct vhost_vring_addr svq_addr;
899 
900     vhost_svq_get_vring_addr(svq, &svq_addr);
901 
902     vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
903 
904     vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
905 }
906 
907 /**
908  * Map the SVQ area in the device
909  *
910  * @v: Vhost-vdpa device
911  * @needle: The area to search iova
912  * @errorp: Error pointer
913  */
914 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
915                                     Error **errp)
916 {
917     int r;
918 
919     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
920     if (unlikely(r != IOVA_OK)) {
921         error_setg(errp, "Cannot allocate iova (%d)", r);
922         return false;
923     }
924 
925     r = vhost_vdpa_dma_map(v, v->address_space_id, needle->iova,
926                            needle->size + 1,
927                            (void *)(uintptr_t)needle->translated_addr,
928                            needle->perm == IOMMU_RO);
929     if (unlikely(r != 0)) {
930         error_setg_errno(errp, -r, "Cannot map region to device");
931         vhost_iova_tree_remove(v->iova_tree, *needle);
932     }
933 
934     return r == 0;
935 }
936 
937 /**
938  * Map the shadow virtqueue rings in the device
939  *
940  * @dev: The vhost device
941  * @svq: The shadow virtqueue
942  * @addr: Assigned IOVA addresses
943  * @errp: Error pointer
944  */
945 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
946                                      const VhostShadowVirtqueue *svq,
947                                      struct vhost_vring_addr *addr,
948                                      Error **errp)
949 {
950     ERRP_GUARD();
951     DMAMap device_region, driver_region;
952     struct vhost_vring_addr svq_addr;
953     struct vhost_vdpa *v = dev->opaque;
954     size_t device_size = vhost_svq_device_area_size(svq);
955     size_t driver_size = vhost_svq_driver_area_size(svq);
956     size_t avail_offset;
957     bool ok;
958 
959     vhost_svq_get_vring_addr(svq, &svq_addr);
960 
961     driver_region = (DMAMap) {
962         .translated_addr = svq_addr.desc_user_addr,
963         .size = driver_size - 1,
964         .perm = IOMMU_RO,
965     };
966     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
967     if (unlikely(!ok)) {
968         error_prepend(errp, "Cannot create vq driver region: ");
969         return false;
970     }
971     addr->desc_user_addr = driver_region.iova;
972     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
973     addr->avail_user_addr = driver_region.iova + avail_offset;
974 
975     device_region = (DMAMap) {
976         .translated_addr = svq_addr.used_user_addr,
977         .size = device_size - 1,
978         .perm = IOMMU_RW,
979     };
980     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
981     if (unlikely(!ok)) {
982         error_prepend(errp, "Cannot create vq device region: ");
983         vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
984     }
985     addr->used_user_addr = device_region.iova;
986 
987     return ok;
988 }
989 
990 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
991                                  VhostShadowVirtqueue *svq, unsigned idx,
992                                  Error **errp)
993 {
994     uint16_t vq_index = dev->vq_index + idx;
995     struct vhost_vring_state s = {
996         .index = vq_index,
997     };
998     int r;
999 
1000     r = vhost_vdpa_set_dev_vring_base(dev, &s);
1001     if (unlikely(r)) {
1002         error_setg_errno(errp, -r, "Cannot set vring base");
1003         return false;
1004     }
1005 
1006     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1007     return r == 0;
1008 }
1009 
1010 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1011 {
1012     struct vhost_vdpa *v = dev->opaque;
1013     Error *err = NULL;
1014     unsigned i;
1015 
1016     if (!v->shadow_vqs_enabled) {
1017         return true;
1018     }
1019 
1020     for (i = 0; i < v->shadow_vqs->len; ++i) {
1021         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1022         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1023         struct vhost_vring_addr addr = {
1024             .index = dev->vq_index + i,
1025         };
1026         int r;
1027         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1028         if (unlikely(!ok)) {
1029             goto err;
1030         }
1031 
1032         vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
1033         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1034         if (unlikely(!ok)) {
1035             goto err_map;
1036         }
1037 
1038         /* Override vring GPA set by vhost subsystem */
1039         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1040         if (unlikely(r != 0)) {
1041             error_setg_errno(&err, -r, "Cannot set device address");
1042             goto err_set_addr;
1043         }
1044     }
1045 
1046     return true;
1047 
1048 err_set_addr:
1049     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1050 
1051 err_map:
1052     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1053 
1054 err:
1055     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1056     for (unsigned j = 0; j < i; ++j) {
1057         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1058         vhost_vdpa_svq_unmap_rings(dev, svq);
1059         vhost_svq_stop(svq);
1060     }
1061 
1062     return false;
1063 }
1064 
1065 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1066 {
1067     struct vhost_vdpa *v = dev->opaque;
1068 
1069     if (!v->shadow_vqs_enabled) {
1070         return;
1071     }
1072 
1073     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1074         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1075         vhost_vdpa_svq_unmap_rings(dev, svq);
1076 
1077         event_notifier_cleanup(&svq->hdev_kick);
1078         event_notifier_cleanup(&svq->hdev_call);
1079     }
1080 }
1081 
1082 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1083 {
1084     struct vhost_vdpa *v = dev->opaque;
1085     bool ok;
1086     trace_vhost_vdpa_dev_start(dev, started);
1087 
1088     if (started) {
1089         vhost_vdpa_host_notifiers_init(dev);
1090         ok = vhost_vdpa_svqs_start(dev);
1091         if (unlikely(!ok)) {
1092             return -1;
1093         }
1094         vhost_vdpa_set_vring_ready(dev);
1095     } else {
1096         vhost_vdpa_svqs_stop(dev);
1097         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1098     }
1099 
1100     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1101         return 0;
1102     }
1103 
1104     if (started) {
1105         memory_listener_register(&v->listener, &address_space_memory);
1106         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1107     } else {
1108         vhost_vdpa_reset_device(dev);
1109         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1110                                    VIRTIO_CONFIG_S_DRIVER);
1111         memory_listener_unregister(&v->listener);
1112 
1113         return 0;
1114     }
1115 }
1116 
1117 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1118                                      struct vhost_log *log)
1119 {
1120     struct vhost_vdpa *v = dev->opaque;
1121     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1122         return 0;
1123     }
1124 
1125     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1126                                   log->log);
1127     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1128 }
1129 
1130 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1131                                        struct vhost_vring_addr *addr)
1132 {
1133     struct vhost_vdpa *v = dev->opaque;
1134 
1135     if (v->shadow_vqs_enabled) {
1136         /*
1137          * Device vring addr was set at device start. SVQ base is handled by
1138          * VirtQueue code.
1139          */
1140         return 0;
1141     }
1142 
1143     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1144 }
1145 
1146 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1147                                       struct vhost_vring_state *ring)
1148 {
1149     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1150     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1151 }
1152 
1153 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1154                                        struct vhost_vring_state *ring)
1155 {
1156     struct vhost_vdpa *v = dev->opaque;
1157     VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
1158 
1159     /*
1160      * vhost-vdpa devices does not support in-flight requests. Set all of them
1161      * as available.
1162      *
1163      * TODO: This is ok for networking, but other kinds of devices might
1164      * have problems with these retransmissions.
1165      */
1166     while (virtqueue_rewind(vq, 1)) {
1167         continue;
1168     }
1169     if (v->shadow_vqs_enabled) {
1170         /*
1171          * Device vring base was set at device start. SVQ base is handled by
1172          * VirtQueue code.
1173          */
1174         return 0;
1175     }
1176 
1177     return vhost_vdpa_set_dev_vring_base(dev, ring);
1178 }
1179 
1180 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1181                                        struct vhost_vring_state *ring)
1182 {
1183     struct vhost_vdpa *v = dev->opaque;
1184     int ret;
1185 
1186     if (v->shadow_vqs_enabled) {
1187         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1188         return 0;
1189     }
1190 
1191     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1192     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1193     return ret;
1194 }
1195 
1196 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1197                                        struct vhost_vring_file *file)
1198 {
1199     struct vhost_vdpa *v = dev->opaque;
1200     int vdpa_idx = file->index - dev->vq_index;
1201 
1202     if (v->shadow_vqs_enabled) {
1203         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1204         vhost_svq_set_svq_kick_fd(svq, file->fd);
1205         return 0;
1206     } else {
1207         return vhost_vdpa_set_vring_dev_kick(dev, file);
1208     }
1209 }
1210 
1211 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1212                                        struct vhost_vring_file *file)
1213 {
1214     struct vhost_vdpa *v = dev->opaque;
1215 
1216     if (v->shadow_vqs_enabled) {
1217         int vdpa_idx = file->index - dev->vq_index;
1218         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1219 
1220         vhost_svq_set_svq_call_fd(svq, file->fd);
1221         return 0;
1222     } else {
1223         return vhost_vdpa_set_vring_dev_call(dev, file);
1224     }
1225 }
1226 
1227 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1228                                      uint64_t *features)
1229 {
1230     struct vhost_vdpa *v = dev->opaque;
1231     int ret = vhost_vdpa_get_dev_features(dev, features);
1232 
1233     if (ret == 0 && v->shadow_vqs_enabled) {
1234         /* Add SVQ logging capabilities */
1235         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1236     }
1237 
1238     return ret;
1239 }
1240 
1241 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1242 {
1243     if (!vhost_vdpa_first_dev(dev)) {
1244         return 0;
1245     }
1246 
1247     trace_vhost_vdpa_set_owner(dev);
1248     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1249 }
1250 
1251 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1252                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1253 {
1254     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1255     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1256     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1257     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1258     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1259                                  addr->avail_user_addr, addr->used_user_addr);
1260     return 0;
1261 }
1262 
/* vhost_force_iommu backend operation: unconditionally true for vhost-vdpa. */
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    const bool force = true;

    return force;
}
1267 
1268 const VhostOps vdpa_ops = {
1269         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1270         .vhost_backend_init = vhost_vdpa_init,
1271         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1272         .vhost_set_log_base = vhost_vdpa_set_log_base,
1273         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1274         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1275         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1276         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1277         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1278         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1279         .vhost_get_features = vhost_vdpa_get_features,
1280         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1281         .vhost_set_owner = vhost_vdpa_set_owner,
1282         .vhost_set_vring_endian = NULL,
1283         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1284         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1285         .vhost_set_features = vhost_vdpa_set_features,
1286         .vhost_reset_device = vhost_vdpa_reset_device,
1287         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1288         .vhost_get_config  = vhost_vdpa_get_config,
1289         .vhost_set_config = vhost_vdpa_set_config,
1290         .vhost_requires_shm_log = NULL,
1291         .vhost_migration_done = NULL,
1292         .vhost_backend_can_merge = NULL,
1293         .vhost_net_set_mtu = NULL,
1294         .vhost_set_iotlb_callback = NULL,
1295         .vhost_send_device_iotlb_msg = NULL,
1296         .vhost_dev_start = vhost_vdpa_dev_start,
1297         .vhost_get_device_id = vhost_vdpa_get_device_id,
1298         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1299         .vhost_force_iommu = vhost_vdpa_force_iommu,
1300 };
1301