xref: /qemu/hw/virtio/vhost-vdpa.c (revision 2a53c4f5c534a1ab825ba03e0d3ec45a7c2b90d8)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "exec/target_page.h"
18 #include "hw/virtio/vhost.h"
19 #include "hw/virtio/vhost-backend.h"
20 #include "hw/virtio/virtio-net.h"
21 #include "hw/virtio/vhost-shadow-virtqueue.h"
22 #include "hw/virtio/vhost-vdpa.h"
23 #include "system/address-spaces.h"
24 #include "migration/blocker.h"
25 #include "qemu/cutils.h"
26 #include "qemu/main-loop.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 
30 /*
31  * Return one past the end of the section. Be careful with uint64_t
32  * conversions!
33  */
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section,
35                                      int page_mask)
36 {
37     Int128 llend = int128_make64(section->offset_within_address_space);
38     llend = int128_add(llend, section->size);
39     llend = int128_and(llend, int128_exts64(page_mask));
40 
41     return llend;
42 }
43 
44 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
45                                                 uint64_t iova_min,
46                                                 uint64_t iova_max,
47                                                 int page_mask)
48 {
49     Int128 llend;
50     bool is_ram = memory_region_is_ram(section->mr);
51     bool is_iommu = memory_region_is_iommu(section->mr);
52     bool is_protected = memory_region_is_protected(section->mr);
53 
54     /* vhost-vDPA doesn't allow MMIO to be mapped */
55     bool is_ram_device = memory_region_is_ram_device(section->mr);
56 
57     if ((!is_ram && !is_iommu) || is_protected || is_ram_device) {
58         trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected,
59                                                 is_ram_device, iova_min,
60                                                 iova_max, page_mask);
61         return true;
62     }
63 
64     if (section->offset_within_address_space < iova_min) {
65         error_report("RAM section out of device range (min=0x%" PRIx64
66                      ", addr=0x%" HWADDR_PRIx ")",
67                      iova_min, section->offset_within_address_space);
68         return true;
69     }
70     /*
71      * With a vIOMMU, the section can be larger than iova_max while the
72      * memory that is actually mapped is smaller, so the check is deferred to
73      * vhost_vdpa_iommu_map_notify(), which uses the actual size that is
74      * mapped to the kernel.
75      */
76 
77     if (!is_iommu) {
78         llend = vhost_vdpa_section_end(section, page_mask);
79         if (int128_gt(llend, int128_make64(iova_max))) {
80             error_report("RAM section out of device range (max=0x%" PRIx64
81                          ", end addr=0x%" PRIx64 ")",
82                          iova_max, int128_get64(llend));
83             return true;
84         }
85     }
86 
87     return false;
88 }
89 
90 /*
91  * The caller must set asid = 0 if the device does not support asid.
92  * This is not an ABI break since it is set to 0 by the initializer anyway.
93  */
94 int vhost_vdpa_dma_map(VhostVDPAShared *s, uint32_t asid, hwaddr iova,
95                        hwaddr size, void *vaddr, bool readonly)
96 {
97     struct vhost_msg_v2 msg = {};
98     int fd = s->device_fd;
99     int ret = 0;
100 
101     msg.type = VHOST_IOTLB_MSG_V2;
102     msg.asid = asid;
103     msg.iotlb.iova = iova;
104     msg.iotlb.size = size;
105     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
106     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
107     msg.iotlb.type = VHOST_IOTLB_UPDATE;
108 
109     trace_vhost_vdpa_dma_map(s, fd, msg.type, msg.asid, msg.iotlb.iova,
110                              msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
111                              msg.iotlb.type);
112 
113     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
114         error_report("failed to write, fd=%d, errno=%d (%s)",
115             fd, errno, strerror(errno));
116         return -EIO;
117     }
118 
119     return ret;
120 }
121 
122 /*
123  * The caller must set asid = 0 if the device does not support asid.
124  * This is not an ABI break since it is set to 0 by the initializer anyway.
125  */
126 int vhost_vdpa_dma_unmap(VhostVDPAShared *s, uint32_t asid, hwaddr iova,
127                          hwaddr size)
128 {
129     struct vhost_msg_v2 msg = {};
130     int fd = s->device_fd;
131     int ret = 0;
132 
133     msg.type = VHOST_IOTLB_MSG_V2;
134     msg.asid = asid;
135     msg.iotlb.iova = iova;
136     msg.iotlb.size = size;
137     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
138 
139     trace_vhost_vdpa_dma_unmap(s, fd, msg.type, msg.asid, msg.iotlb.iova,
140                                msg.iotlb.size, msg.iotlb.type);
141 
142     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
143         error_report("failed to write, fd=%d, errno=%d (%s)",
144             fd, errno, strerror(errno));
145         return -EIO;
146     }
147 
148     return ret;
149 }
150 
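/*
 * Open an IOTLB batch: send VHOST_IOTLB_BATCH_BEGIN so the kernel can
 * coalesce the following map/unmap updates until VHOST_IOTLB_BATCH_END
 * is sent from vhost_vdpa_listener_commit().
 */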
151 static void vhost_vdpa_listener_begin_batch(VhostVDPAShared *s)
152 {
153     int fd = s->device_fd;
154     struct vhost_msg_v2 msg = {
155         .type = VHOST_IOTLB_MSG_V2,
156         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
157     };
158 
159     trace_vhost_vdpa_listener_begin_batch(s, fd, msg.type, msg.iotlb.type);
160     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
161         error_report("failed to write, fd=%d, errno=%d (%s)",
162                      fd, errno, strerror(errno));
163     }
164 }
165 
166 static void vhost_vdpa_iotlb_batch_begin_once(VhostVDPAShared *s)
167 {
168     if (s->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
169         !s->iotlb_batch_begin_sent) {
170         vhost_vdpa_listener_begin_batch(s);
171     }
172 
173     s->iotlb_batch_begin_sent = true;
174 }
175 
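/*
 * Memory listener commit callback: if a batch was opened by
 * vhost_vdpa_iotlb_batch_begin_once(), close it with VHOST_IOTLB_BATCH_END.
 */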
176 static void vhost_vdpa_listener_commit(MemoryListener *listener)
177 {
178     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
179     struct vhost_msg_v2 msg = {};
180     int fd = s->device_fd;
181 
182     if (!(s->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
183         return;
184     }
185 
186     if (!s->iotlb_batch_begin_sent) {
187         return;
188     }
189 
190     msg.type = VHOST_IOTLB_MSG_V2;
191     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
192 
193     trace_vhost_vdpa_listener_commit(s, fd, msg.type, msg.iotlb.type);
194     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
195         error_report("failed to write, fd=%d, errno=%d (%s)",
196                      fd, errno, strerror(errno));
197     }
198 
199     s->iotlb_batch_begin_sent = false;
200 }
201 
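/*
 * vIOMMU notifier: called when a guest IOTLB entry changes.  Translates the
 * entry and maps or unmaps the corresponding range in the vDPA device,
 * depending on the permissions of the new entry.
 */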
202 static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
203 {
204     struct vdpa_iommu *iommu = container_of(n, struct vdpa_iommu, n);
205 
206     hwaddr iova = iotlb->iova + iommu->iommu_offset;
207     VhostVDPAShared *s = iommu->dev_shared;
208     void *vaddr;
209     int ret;
210     Int128 llend;
211     Error *local_err = NULL;
212     MemoryRegion *mr;
213     hwaddr xlat;
214 
215     if (iotlb->target_as != &address_space_memory) {
216         error_report("Wrong target AS \"%s\", only system memory is allowed",
217                      iotlb->target_as->name ? iotlb->target_as->name : "none");
218         return;
219     }
220     RCU_READ_LOCK_GUARD();
221     /* check if the RAM section is out of the device range */
222     llend = int128_add(int128_makes64(iotlb->addr_mask), int128_makes64(iova));
223     if (int128_gt(llend, int128_make64(s->iova_range.last))) {
224         error_report("RAM section out of device range (max=0x%" PRIx64
225                      ", end addr=0x%" PRIx64 ")",
226                      s->iova_range.last, int128_get64(llend));
227         return;
228     }
229 
230     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
231         bool read_only;
232 
233         mr = memory_translate_iotlb(iotlb, &xlat, &local_err);
234         if (!mr) {
235             error_report_err(local_err);
236             return;
237         }
238         vaddr = memory_region_get_ram_ptr(mr) + xlat;
239         read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly;
240 
241         ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
242                                  iotlb->addr_mask + 1, vaddr, read_only);
243         if (ret) {
244             error_report("vhost_vdpa_dma_map(%p, 0x%" HWADDR_PRIx ", "
245                          "0x%" HWADDR_PRIx ", %p) = %d (%m)",
246                          s, iova, iotlb->addr_mask + 1, vaddr, ret);
247         }
248     } else {
249         ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova,
250                                    iotlb->addr_mask + 1);
251         if (ret) {
252             error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
253                          "0x%" HWADDR_PRIx ") = %d (%m)",
254                          s, iova, iotlb->addr_mask + 1, ret);
255         }
256     }
257 }
258 
259 static void vhost_vdpa_iommu_region_add(MemoryListener *listener,
260                                         MemoryRegionSection *section)
261 {
262     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
263 
264     struct vdpa_iommu *iommu;
265     Int128 end;
266     int iommu_idx;
267     IOMMUMemoryRegion *iommu_mr;
268     int ret;
269 
270     iommu_mr = IOMMU_MEMORY_REGION(section->mr);
271 
272     iommu = g_malloc0(sizeof(*iommu));
273     end = int128_add(int128_make64(section->offset_within_region),
274                      section->size);
275     end = int128_sub(end, int128_one());
276     iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
277                                                    MEMTXATTRS_UNSPECIFIED);
278     iommu->iommu_mr = iommu_mr;
279     iommu_notifier_init(&iommu->n, vhost_vdpa_iommu_map_notify,
280                         IOMMU_NOTIFIER_IOTLB_EVENTS,
281                         section->offset_within_region,
282                         int128_get64(end),
283                         iommu_idx);
284     iommu->iommu_offset = section->offset_within_address_space -
285                           section->offset_within_region;
286     iommu->dev_shared = s;
287 
288     ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
289     if (ret) {
290         g_free(iommu);
291         return;
292     }
293 
294     QLIST_INSERT_HEAD(&s->iommu_list, iommu, iommu_next);
295     memory_region_iommu_replay(iommu->iommu_mr, &iommu->n);
296 }
297 
298 static void vhost_vdpa_iommu_region_del(MemoryListener *listener,
299                                         MemoryRegionSection *section)
300 {
301     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
302 
303     struct vdpa_iommu *iommu;
304 
305     QLIST_FOREACH(iommu, &s->iommu_list, iommu_next)
306     {
307         if (MEMORY_REGION(iommu->iommu_mr) == section->mr &&
308             iommu->n.start == section->offset_within_region) {
309             memory_region_unregister_iommu_notifier(section->mr, &iommu->n);
310             QLIST_REMOVE(iommu, iommu_next);
311             g_free(iommu);
312             break;
313         }
314     }
315 }
316 
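/*
 * Memory listener region_add callback: map a new RAM section into the
 * device.  With shadow virtqueues (shadow_data), a device IOVA is first
 * allocated from the IOVA tree instead of using the guest physical address
 * directly.
 */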
317 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
318                                            MemoryRegionSection *section)
319 {
320     DMAMap mem_region = {};
321     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
322     hwaddr iova;
323     Int128 llend, llsize;
324     void *vaddr;
325     int ret;
326     int page_size = qemu_target_page_size();
327     int page_mask = -page_size;
328 
329     if (vhost_vdpa_listener_skipped_section(section, s->iova_range.first,
330                                             s->iova_range.last, page_mask)) {
331         return;
332     }
333     if (memory_region_is_iommu(section->mr)) {
334         vhost_vdpa_iommu_region_add(listener, section);
335         return;
336     }
337 
338     if (unlikely((section->offset_within_address_space & ~page_mask) !=
339                  (section->offset_within_region & ~page_mask))) {
340         trace_vhost_vdpa_listener_region_add_unaligned(s, section->mr->name,
341                        section->offset_within_address_space & ~page_mask,
342                        section->offset_within_region & ~page_mask);
343         return;
344     }
345 
346     iova = ROUND_UP(section->offset_within_address_space, page_size);
347     llend = vhost_vdpa_section_end(section, page_mask);
348     if (int128_ge(int128_make64(iova), llend)) {
349         return;
350     }
351 
352     memory_region_ref(section->mr);
353 
354     /* Here we assume that memory_region_is_ram(section->mr)==true */
355 
356     vaddr = memory_region_get_ram_ptr(section->mr) +
357             section->offset_within_region +
358             (iova - section->offset_within_address_space);
359 
360     trace_vhost_vdpa_listener_region_add(s, iova, int128_get64(llend),
361                                          vaddr, section->readonly);
362 
363     llsize = int128_sub(llend, int128_make64(iova));
364     if (s->shadow_data) {
365         int r;
366         hwaddr gpa = section->offset_within_address_space;
367 
368         mem_region.size = int128_get64(llsize) - 1,
369         mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly),
370 
371         r = vhost_iova_tree_map_alloc_gpa(s->iova_tree, &mem_region, gpa);
372         if (unlikely(r != IOVA_OK)) {
373             error_report("Can't allocate a mapping (%d)", r);
374 
375             if (mem_region.translated_addr == gpa) {
376                 error_report("Insertion to GPA->IOVA tree failed");
377                 /* Remove the mapping from the IOVA-only tree */
378                 goto fail_map;
379             }
380             goto fail;
381         }
382 
383         iova = mem_region.iova;
384     }
385 
386     vhost_vdpa_iotlb_batch_begin_once(s);
387     ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
388                              int128_get64(llsize), vaddr, section->readonly);
389     if (ret) {
390         error_report("vhost vdpa map fail!");
391         goto fail_map;
392     }
393 
394     return;
395 
396 fail_map:
397     if (s->shadow_data) {
398         vhost_iova_tree_remove_gpa(s->iova_tree, mem_region);
399     }
400 
401 fail:
402     /*
403      * On the initfn path, store the first error in the container so we
404      * can gracefully fail.  At runtime, there's not much we can do other
405      * than throw a hardware error.
406      */
407     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
408     return;
409 
410 }
411 
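/*
 * Memory listener region_del callback: unmap a RAM section from the device,
 * looking up the IOVA in the GPA->IOVA tree first when shadow virtqueues
 * are in use.
 */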
412 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
413                                            MemoryRegionSection *section)
414 {
415     VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
416     hwaddr iova;
417     Int128 llend, llsize;
418     int ret;
419     int page_size = qemu_target_page_size();
420     int page_mask = -page_size;
421 
422     if (vhost_vdpa_listener_skipped_section(section, s->iova_range.first,
423                                             s->iova_range.last, page_mask)) {
424         return;
425     }
426     if (memory_region_is_iommu(section->mr)) {
427         vhost_vdpa_iommu_region_del(listener, section);
428     }
429 
430     if (unlikely((section->offset_within_address_space & ~page_mask) !=
431                  (section->offset_within_region & ~page_mask))) {
432         trace_vhost_vdpa_listener_region_del_unaligned(s, section->mr->name,
433                        section->offset_within_address_space & ~page_mask,
434                        section->offset_within_region & ~page_mask);
435         return;
436     }
437 
438     iova = ROUND_UP(section->offset_within_address_space, page_size);
439     llend = vhost_vdpa_section_end(section, page_mask);
440 
441     trace_vhost_vdpa_listener_region_del(s, iova,
442         int128_get64(int128_sub(llend, int128_one())));
443 
444     if (int128_ge(int128_make64(iova), llend)) {
445         return;
446     }
447 
448     llsize = int128_sub(llend, int128_make64(iova));
449 
450     if (s->shadow_data) {
451         const DMAMap *result;
452         DMAMap mem_region = {
453             .translated_addr = section->offset_within_address_space,
454             .size = int128_get64(llsize) - 1,
455         };
456 
457         result = vhost_iova_tree_find_gpa(s->iova_tree, &mem_region);
458         if (!result) {
459             /* This section was never mapped, nothing to unmap */
460             return;
461         }
462         iova = result->iova;
463         vhost_iova_tree_remove_gpa(s->iova_tree, *result);
464     }
465     vhost_vdpa_iotlb_batch_begin_once(s);
466     /*
467      * The unmap ioctl doesn't accept a full 64-bit size, so split it in two
468      */
469     if (int128_eq(llsize, int128_2_64())) {
470         llsize = int128_rshift(llsize, 1);
471         ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova,
472                                    int128_get64(llsize));
473 
474         if (ret) {
475             error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
476                          "0x%" HWADDR_PRIx ") = %d (%m)",
477                          s, iova, int128_get64(llsize), ret);
478         }
479         iova += int128_get64(llsize);
480     }
481     ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova,
482                                int128_get64(llsize));
483 
484     if (ret) {
485         error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
486                      "0x%" HWADDR_PRIx ") = %d (%m)",
487                      s, iova, int128_get64(llsize), ret);
488     }
489 
490     memory_region_unref(section->mr);
491 }
492 /*
493  * The IOTLB API used by vhost-vdpa requires incremental updating
494  * of the mapping, so we cannot use the generic vhost memory listener,
495  * which depends on addnop().
496  */
497 static const MemoryListener vhost_vdpa_memory_listener = {
498     .name = "vhost-vdpa",
499     .commit = vhost_vdpa_listener_commit,
500     .region_add = vhost_vdpa_listener_region_add,
501     .region_del = vhost_vdpa_listener_region_del,
502 };
503 
504 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
505                              void *arg)
506 {
507     struct vhost_vdpa *v = dev->opaque;
508     int fd = v->shared->device_fd;
509     int ret;
510 
511     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
512 
513     ret = ioctl(fd, request, arg);
514     return ret < 0 ? -errno : ret;
515 }
516 
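/*
 * OR the given bits into the device status and read it back to verify that
 * the device accepted them; returns -EIO if the bits did not stick.
 */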
517 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
518 {
519     uint8_t s;
520     int ret;
521 
522     trace_vhost_vdpa_add_status(dev, status);
523     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
524     if (ret < 0) {
525         return ret;
526     }
527     if ((s & status) == status) {
528         /* Don't set bits already set */
529         return 0;
530     }
531 
532     s |= status;
533 
534     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
535     if (ret < 0) {
536         return ret;
537     }
538 
539     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
540     if (ret < 0) {
541         return ret;
542     }
543 
544     if (!(s & status)) {
545         return -EIO;
546     }
547 
548     return 0;
549 }
550 
551 int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range)
552 {
553     int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
554 
555     return ret < 0 ? -errno : 0;
556 }
557 
558 /*
559  * The use of this function is for requests that only need to be
560  * applied once. Typically such a request occurs at the beginning
561  * of operation, and before setting up queues. It should not be
562  * used for requests that perform operations until all queues are
563  * set; those would need to check dev->vq_index_end instead.
564  */
565 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
566 {
567     struct vhost_vdpa *v = dev->opaque;
568 
569     return v->index == 0;
570 }
571 
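/* True for the vhost_dev that covers the last virtqueues of the device. */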
572 static bool vhost_vdpa_last_dev(struct vhost_dev *dev)
573 {
574     return dev->vq_index + dev->nvqs == dev->vq_index_end;
575 }
576 
577 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
578                                        uint64_t *features)
579 {
580     int ret;
581 
582     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
583     trace_vhost_vdpa_get_features(dev, *features);
584     return ret;
585 }
586 
587 static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
588 {
589     g_autoptr(GPtrArray) shadow_vqs = NULL;
590 
591     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
592     for (unsigned n = 0; n < hdev->nvqs; ++n) {
593         VhostShadowVirtqueue *svq;
594 
595         svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
596         g_ptr_array_add(shadow_vqs, svq);
597     }
598 
599     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
600 }
601 
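/*
 * Negotiate backend features (IOTLB msg v2, batching, ASID, suspend) with
 * the kernel.  Only the first vhost_dev of the device actually sets them;
 * every vhost_dev caches the negotiated set in backend_cap.
 */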
602 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
603 {
604     struct vhost_vdpa *v = dev->opaque;
605 
606     uint64_t features;
607     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
608         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
609         0x1ULL << VHOST_BACKEND_F_IOTLB_ASID |
610         0x1ULL << VHOST_BACKEND_F_SUSPEND;
611     int r;
612 
613     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
614         return -EFAULT;
615     }
616 
617     features &= f;
618 
619     if (vhost_vdpa_first_dev(dev)) {
620         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
621         if (r) {
622             return -EFAULT;
623         }
624     }
625 
626     dev->backend_cap = features;
627     v->shared->backend_cap = features;
628 
629     return 0;
630 }
631 
632 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
633 {
634     struct vhost_vdpa *v = opaque;
635     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
636     trace_vhost_vdpa_init(dev, v->shared, opaque);
637     int ret;
638 
639     v->dev = dev;
640     dev->opaque = opaque;
641 
642     ret = vhost_vdpa_set_backend_cap(dev);
643     if (unlikely(ret != 0)) {
644         return ret;
645     }
646 
647     vhost_vdpa_init_svq(dev, v);
648 
649     error_propagate(&dev->migration_blocker, v->migration_blocker);
650     if (!vhost_vdpa_first_dev(dev)) {
651         return 0;
652     }
653 
654     /*
655      * If dev->shadow_vqs_enabled is set at initialization, the device has
656      * been started with x-svq=on, so don't block migration.
657      */
658     if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) {
659         /* We don't have dev->features yet */
660         uint64_t features;
661         ret = vhost_vdpa_get_dev_features(dev, &features);
662         if (unlikely(ret)) {
663             error_setg_errno(errp, -ret, "Could not get device features");
664             return ret;
665         }
666         vhost_svq_valid_features(features, &dev->migration_blocker);
667     }
668 
669     /*
670      * Similar to VFIO, we end up pinning all guest memory and have to
671      * disable discarding of RAM.
672      */
673     ret = ram_block_discard_disable(true);
674     if (ret) {
675         error_report("Cannot disable discarding of RAM");
676         return ret;
677     }
678 
679     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
680                                VIRTIO_CONFIG_S_DRIVER);
681 
682     v->shared->listener = vhost_vdpa_memory_listener;
683     return 0;
684 }
685 
686 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
687                                             int queue_index)
688 {
689     size_t page_size = qemu_real_host_page_size();
690     struct vhost_vdpa *v = dev->opaque;
691     VirtIODevice *vdev = dev->vdev;
692     VhostVDPAHostNotifier *n;
693 
694     n = &v->notifier[queue_index];
695 
696     if (n->addr) {
697         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
698         object_unparent(OBJECT(&n->mr));
699         munmap(n->addr, page_size);
700         n->addr = NULL;
701     }
702 }
703 
704 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
705 {
706     size_t page_size = qemu_real_host_page_size();
707     struct vhost_vdpa *v = dev->opaque;
708     VirtIODevice *vdev = dev->vdev;
709     VhostVDPAHostNotifier *n;
710     int fd = v->shared->device_fd;
711     void *addr;
712     char *name;
713 
714     vhost_vdpa_host_notifier_uninit(dev, queue_index);
715 
716     n = &v->notifier[queue_index];
717 
718     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
719                 queue_index * page_size);
720     if (addr == MAP_FAILED) {
721         goto err;
722     }
723 
724     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
725                            v, queue_index);
726     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
727                                       page_size, addr);
728     g_free(name);
729 
730     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
731         object_unparent(OBJECT(&n->mr));
732         munmap(addr, page_size);
733         goto err;
734     }
735     n->addr = addr;
736 
737     return 0;
738 
739 err:
740     return -1;
741 }
742 
743 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
744 {
745     int i;
746 
747     /*
748      * Pack all the changes to the memory regions in a single
749      * transaction to avoid multiple updates of the address space
750      * topology.
751      */
752     memory_region_transaction_begin();
753 
754     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
755         vhost_vdpa_host_notifier_uninit(dev, i);
756     }
757 
758     memory_region_transaction_commit();
759 }
760 
761 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
762 {
763     struct vhost_vdpa *v = dev->opaque;
764     int i;
765 
766     if (v->shadow_vqs_enabled) {
767         /* FIXME SVQ is not compatible with host notifiers mr */
768         return;
769     }
770 
771     /*
772      * Pack all the changes to the memory regions in a single
773      * transaction to avoid multiple updates of the address space
774      * topology.
775      */
776     memory_region_transaction_begin();
777 
778     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
779         if (vhost_vdpa_host_notifier_init(dev, i)) {
780             vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
781             break;
782         }
783     }
784 
785     memory_region_transaction_commit();
786 }
787 
788 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
789 {
790     struct vhost_vdpa *v = dev->opaque;
791     size_t idx;
792 
793     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
794         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
795     }
796     g_ptr_array_free(v->shadow_vqs, true);
797 }
798 
799 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
800 {
801     struct vhost_vdpa *v;
802     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
803     v = dev->opaque;
804     trace_vhost_vdpa_cleanup(dev, v);
805     if (vhost_vdpa_first_dev(dev)) {
806         ram_block_discard_disable(false);
807         memory_listener_unregister(&v->shared->listener);
808     }
809 
810     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
811     vhost_vdpa_svq_cleanup(dev);
812 
813     dev->opaque = NULL;
814 
815     return 0;
816 }
817 
818 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
819 {
820     trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
821     return INT_MAX;
822 }
823 
824 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
825                                     struct vhost_memory *mem)
826 {
827     if (!vhost_vdpa_first_dev(dev)) {
828         return 0;
829     }
830 
831     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
832     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
833         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
834         int i;
835         for (i = 0; i < mem->nregions; i++) {
836             trace_vhost_vdpa_dump_regions(dev, i,
837                                           mem->regions[i].guest_phys_addr,
838                                           mem->regions[i].memory_size,
839                                           mem->regions[i].userspace_addr,
840                                           mem->regions[i].flags_padding);
841         }
842     }
843     if (mem->padding) {
844         return -EINVAL;
845     }
846 
847     return 0;
848 }
849 
850 static int vhost_vdpa_set_features(struct vhost_dev *dev,
851                                    uint64_t features)
852 {
853     struct vhost_vdpa *v = dev->opaque;
854     int ret;
855 
856     if (!vhost_vdpa_first_dev(dev)) {
857         return 0;
858     }
859 
860     if (v->shadow_vqs_enabled) {
861         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
862             /*
863              * QEMU is just trying to enable or disable logging. SVQ handles
864              * this separately, so there is no need to forward it.
865              */
866             v->acked_features = features;
867             return 0;
868         }
869 
870         v->acked_features = features;
871 
872         /* We must not ack _F_LOG if SVQ is enabled */
873         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
874     }
875 
876     trace_vhost_vdpa_set_features(dev, features);
877     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
878     if (ret) {
879         return ret;
880     }
881 
882     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
883 }
884 
885 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
886                                     uint32_t *device_id)
887 {
888     int ret;
889     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
890     trace_vhost_vdpa_get_device_id(dev, *device_id);
891     return ret;
892 }
893 
894 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
895 {
896     struct vhost_vdpa *v = dev->opaque;
897     int ret;
898     uint8_t status = 0;
899 
900     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
901     trace_vhost_vdpa_reset_device(dev);
902     if (ret) {
903         return ret;
904     }
905 
906     memory_listener_unregister(&v->shared->listener);
907     v->shared->listener_registered = false;
908     v->suspended = false;
909     return 0;
910 }
911 
912 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
913 {
914     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
915 
916     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
917     return idx;
918 }
919 
920 static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
921                                            int enable)
922 {
923     struct vhost_dev *dev = v->dev;
924     struct vhost_vring_state state = {
925         .index = idx,
926         .num = enable,
927     };
928     int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
929 
930     trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r);
931     return r;
932 }
933 
934 static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
935 {
936     struct vhost_vdpa *v = dev->opaque;
937     unsigned int i;
938     int ret;
939 
940     for (i = 0; i < dev->nvqs; ++i) {
941         ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
942         if (ret < 0) {
943             return ret;
944         }
945     }
946 
947     return 0;
948 }
949 
950 int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
951 {
952     return vhost_vdpa_set_vring_enable_one(v, idx, 1);
953 }
954 
955 static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
956                                        int fd)
957 {
958     trace_vhost_vdpa_set_config_call(dev, fd);
959     return vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG_CALL, &fd);
960 }
961 
962 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
963                                    uint32_t config_len)
964 {
965     g_autoptr(GString) str = g_string_sized_new(4 * 16);
966     size_t b, len;
967 
968     for (b = 0; b < config_len; b += len) {
969         len = MIN(config_len - b, 16);
970 
971         g_string_truncate(str, 0);
972         qemu_hexdump_line(str, config + b, len, 1, 4);
973         trace_vhost_vdpa_dump_config(dev, b, str->str);
974     }
975 }
976 
977 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
978                                    uint32_t offset, uint32_t size,
979                                    uint32_t flags)
980 {
981     struct vhost_vdpa_config *config;
982     int ret;
983     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
984 
985     trace_vhost_vdpa_set_config(dev, offset, size, flags);
986     config = g_malloc(size + config_size);
987     config->off = offset;
988     config->len = size;
989     memcpy(config->buf, data, size);
990     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
991         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
992         vhost_vdpa_dump_config(dev, data, size);
993     }
994     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
995     g_free(config);
996     return ret;
997 }
998 
999 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
1000                                    uint32_t config_len, Error **errp)
1001 {
1002     struct vhost_vdpa_config *v_config;
1003     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
1004     int ret;
1005 
1006     trace_vhost_vdpa_get_config(dev, config, config_len);
1007     v_config = g_malloc(config_len + config_size);
1008     v_config->len = config_len;
1009     v_config->off = 0;
1010     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
1011     memcpy(config, v_config->buf, config_len);
1012     g_free(v_config);
1013     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
1014         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
1015         vhost_vdpa_dump_config(dev, config, config_len);
1016     }
1017     return ret;
1018 }
1019 
1020 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
1021                                          struct vhost_vring_state *ring)
1022 {
1023     struct vhost_vdpa *v = dev->opaque;
1024 
1025     trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num,
1026                                         v->shadow_vqs_enabled);
1027     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
1028 }
1029 
1030 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
1031                                          struct vhost_vring_file *file)
1032 {
1033     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
1034     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
1035 }
1036 
1037 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
1038                                          struct vhost_vring_file *file)
1039 {
1040     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
1041     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
1042 }
1043 
1044 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
1045                                          struct vhost_vring_addr *addr)
1046 {
1047     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
1048                                 addr->desc_user_addr, addr->used_user_addr,
1049                                 addr->avail_user_addr,
1050                                 addr->log_guest_addr);
1051 
1052     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
1053 
1054 }
1055 
1056 /**
1057  * Set the shadow virtqueue descriptors to the device
1058  *
1059  * @dev: The vhost device model
1060  * @svq: The shadow virtqueue
1061  * @idx: The index of the virtqueue in the vhost device
1062  * @errp: Error
1063  *
1064  * Note that this function does not rewind the kick file descriptor if it
1065  * cannot set the call one.
1066  */
1067 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
1068                                   VhostShadowVirtqueue *svq, unsigned idx,
1069                                   Error **errp)
1070 {
1071     struct vhost_vring_file file = {
1072         .index = dev->vq_index + idx,
1073     };
1074     const EventNotifier *event_notifier = &svq->hdev_kick;
1075     int r;
1076 
1077     r = event_notifier_init(&svq->hdev_kick, 0);
1078     if (r != 0) {
1079         error_setg_errno(errp, -r, "Couldn't create kick event notifier");
1080         goto err_init_hdev_kick;
1081     }
1082 
1083     r = event_notifier_init(&svq->hdev_call, 0);
1084     if (r != 0) {
1085         error_setg_errno(errp, -r, "Couldn't create call event notifier");
1086         goto err_init_hdev_call;
1087     }
1088 
1089     file.fd = event_notifier_get_fd(event_notifier);
1090     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
1091     if (unlikely(r != 0)) {
1092         error_setg_errno(errp, -r, "Can't set device kick fd");
1093         goto err_init_set_dev_fd;
1094     }
1095 
1096     event_notifier = &svq->hdev_call;
1097     file.fd = event_notifier_get_fd(event_notifier);
1098     r = vhost_vdpa_set_vring_dev_call(dev, &file);
1099     if (unlikely(r != 0)) {
1100         error_setg_errno(errp, -r, "Can't set device call fd");
1101         goto err_init_set_dev_fd;
1102     }
1103 
1104     return 0;
1105 
1106 err_init_set_dev_fd:
1107     event_notifier_set_handler(&svq->hdev_call, NULL);
1108 
1109 err_init_hdev_call:
1110     event_notifier_cleanup(&svq->hdev_kick);
1111 
1112 err_init_hdev_kick:
1113     return r;
1114 }
1115 
1116 /**
1117  * Unmap a SVQ area in the device
1118  */
1119 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
1120 {
1121     const DMAMap needle = {
1122         .translated_addr = addr,
1123     };
1124     const DMAMap *result = vhost_iova_tree_find_iova(v->shared->iova_tree,
1125                                                      &needle);
1126     hwaddr size;
1127     int r;
1128 
1129     if (unlikely(!result)) {
1130         error_report("Unable to find SVQ address to unmap");
1131         return;
1132     }
1133 
1134     size = ROUND_UP(result->size, qemu_real_host_page_size());
1135     r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id, result->iova,
1136                              size);
1137     if (unlikely(r < 0)) {
1138         error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
1139         return;
1140     }
1141 
1142     vhost_iova_tree_remove(v->shared->iova_tree, *result);
1143 }
1144 
1145 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
1146                                        const VhostShadowVirtqueue *svq)
1147 {
1148     struct vhost_vdpa *v = dev->opaque;
1149     struct vhost_vring_addr svq_addr;
1150 
1151     vhost_svq_get_vring_addr(svq, &svq_addr);
1152 
1153     vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
1154 
1155     vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
1156 }
1157 
1158 /**
1159  * Map the SVQ area in the device
1160  *
1161  * @v: Vhost-vdpa device
1162  * @needle: The area for which to allocate an IOVA
1163  * @taddr: The translated address (HVA)
1164  * @errp: Error pointer
1165  */
1166 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
1167                                     hwaddr taddr, Error **errp)
1168 {
1169     int r;
1170 
1171     r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle, taddr);
1172     if (unlikely(r != IOVA_OK)) {
1173         error_setg(errp, "Cannot allocate iova (%d)", r);
1174 
1175         if (needle->translated_addr == taddr) {
1176             error_append_hint(errp, "Insertion to IOVA->HVA tree failed");
1177             /* Remove the mapping from the IOVA-only tree */
1178             vhost_iova_tree_remove(v->shared->iova_tree, *needle);
1179         }
1180         return false;
1181     }
1182 
1183     r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova,
1184                            needle->size + 1,
1185                            (void *)(uintptr_t)needle->translated_addr,
1186                            needle->perm == IOMMU_RO);
1187     if (unlikely(r != 0)) {
1188         error_setg_errno(errp, -r, "Cannot map region to device");
1189         vhost_iova_tree_remove(v->shared->iova_tree, *needle);
1190     }
1191 
1192     return r == 0;
1193 }
1194 
1195 /**
1196  * Map the shadow virtqueue rings in the device
1197  *
1198  * @dev: The vhost device
1199  * @svq: The shadow virtqueue
1200  * @addr: Assigned IOVA addresses
1201  * @errp: Error pointer
1202  */
1203 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
1204                                      const VhostShadowVirtqueue *svq,
1205                                      struct vhost_vring_addr *addr,
1206                                      Error **errp)
1207 {
1208     ERRP_GUARD();
1209     DMAMap device_region, driver_region;
1210     struct vhost_vring_addr svq_addr;
1211     struct vhost_vdpa *v = dev->opaque;
1212     size_t device_size = vhost_svq_device_area_size(svq);
1213     size_t driver_size = vhost_svq_driver_area_size(svq);
1214     size_t avail_offset;
1215     bool ok;
1216 
1217     vhost_svq_get_vring_addr(svq, &svq_addr);
1218 
1219     driver_region = (DMAMap) {
1220         .size = driver_size - 1,
1221         .perm = IOMMU_RO,
1222     };
1223     ok = vhost_vdpa_svq_map_ring(v, &driver_region, svq_addr.desc_user_addr,
1224                                  errp);
1225     if (unlikely(!ok)) {
1226         error_prepend(errp, "Cannot create vq driver region: ");
1227         return false;
1228     }
1229     addr->desc_user_addr = driver_region.iova;
1230     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
1231     addr->avail_user_addr = driver_region.iova + avail_offset;
1232 
1233     device_region = (DMAMap) {
1234         .size = device_size - 1,
1235         .perm = IOMMU_RW,
1236     };
1237     ok = vhost_vdpa_svq_map_ring(v, &device_region, svq_addr.used_user_addr,
1238                                  errp);
1239     if (unlikely(!ok)) {
1240         error_prepend(errp, "Cannot create vq device region: ");
1241         vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
1242     }
1243     addr->used_user_addr = device_region.iova;
1244 
1245     return ok;
1246 }
1247 
1248 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
1249                                  VhostShadowVirtqueue *svq, unsigned idx,
1250                                  Error **errp)
1251 {
1252     uint16_t vq_index = dev->vq_index + idx;
1253     struct vhost_vring_state s = {
1254         .index = vq_index,
1255     };
1256     int r;
1257 
1258     r = vhost_vdpa_set_dev_vring_base(dev, &s);
1259     if (unlikely(r)) {
1260         error_setg_errno(errp, -r, "Cannot set vring base");
1261         return false;
1262     }
1263 
1264     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1265     return r == 0;
1266 }
1267 
1268 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1269 {
1270     struct vhost_vdpa *v = dev->opaque;
1271     Error *err = NULL;
1272     unsigned i;
1273 
1274     if (!v->shadow_vqs_enabled) {
1275         return true;
1276     }
1277 
1278     for (i = 0; i < v->shadow_vqs->len; ++i) {
1279         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1280         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1281         struct vhost_vring_addr addr = {
1282             .index = dev->vq_index + i,
1283         };
1284         int r;
1285         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1286         if (unlikely(!ok)) {
1287             goto err;
1288         }
1289 
1290         vhost_svq_start(svq, dev->vdev, vq, v->shared->iova_tree);
1291         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1292         if (unlikely(!ok)) {
1293             goto err_map;
1294         }
1295 
1296         /* Override vring GPA set by vhost subsystem */
1297         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1298         if (unlikely(r != 0)) {
1299             error_setg_errno(&err, -r, "Cannot set device address");
1300             goto err_set_addr;
1301         }
1302     }
1303 
1304     return true;
1305 
1306 err_set_addr:
1307     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1308 
1309 err_map:
1310     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1311 
1312 err:
1313     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1314     for (unsigned j = 0; j < i; ++j) {
1315         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1316         vhost_vdpa_svq_unmap_rings(dev, svq);
1317         vhost_svq_stop(svq);
1318     }
1319 
1320     return false;
1321 }
1322 
1323 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1324 {
1325     struct vhost_vdpa *v = dev->opaque;
1326 
1327     if (!v->shadow_vqs_enabled) {
1328         return;
1329     }
1330 
1331     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1332         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1333 
1334         vhost_svq_stop(svq);
1335         vhost_vdpa_svq_unmap_rings(dev, svq);
1336 
1337         event_notifier_cleanup(&svq->hdev_kick);
1338         event_notifier_cleanup(&svq->hdev_call);
1339     }
1340 }
1341 
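/*
 * Stop device operation while keeping its state: suspend the device if it
 * advertises VHOST_BACKEND_F_SUSPEND, otherwise fall back to resetting it.
 */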
1342 static void vhost_vdpa_suspend(struct vhost_dev *dev)
1343 {
1344     struct vhost_vdpa *v = dev->opaque;
1345     int r;
1346 
1347     if (!vhost_vdpa_first_dev(dev)) {
1348         return;
1349     }
1350 
1351     if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) {
1352         trace_vhost_vdpa_suspend(dev);
1353         r = ioctl(v->shared->device_fd, VHOST_VDPA_SUSPEND);
1354         if (unlikely(r)) {
1355             error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
1356         } else {
1357             v->suspended = true;
1358             return;
1359         }
1360     }
1361 
1362     vhost_vdpa_reset_device(dev);
1363 }
1364 
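/*
 * Per-vhost_dev start/stop.  On start, host notifiers and shadow virtqueues
 * are set up; the vhost_dev covering the last virtqueues also (re)registers
 * the memory listener and sets DRIVER_OK.  On stop, the device is suspended
 * before the shadow virtqueues and host notifiers are torn down.
 */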
1365 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1366 {
1367     struct vhost_vdpa *v = dev->opaque;
1368     bool ok;
1369     trace_vhost_vdpa_dev_start(dev, started);
1370 
1371     if (started) {
1372         vhost_vdpa_host_notifiers_init(dev);
1373         ok = vhost_vdpa_svqs_start(dev);
1374         if (unlikely(!ok)) {
1375             return -1;
1376         }
1377     } else {
1378         vhost_vdpa_suspend(dev);
1379         vhost_vdpa_svqs_stop(dev);
1380         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1381     }
1382 
1383     if (!vhost_vdpa_last_dev(dev)) {
1384         return 0;
1385     }
1386 
1387     if (started) {
1388         if (vhost_dev_has_iommu(dev) && (v->shadow_vqs_enabled)) {
1389             error_report("SVQ cannot work while the IOMMU is enabled, "
1390                          "please disable the IOMMU and try again");
1391             return -1;
1392         }
1393         if (v->shared->listener_registered &&
1394             dev->vdev->dma_as != v->shared->listener.address_space) {
1395             memory_listener_unregister(&v->shared->listener);
1396             v->shared->listener_registered = false;
1397         }
1398         if (!v->shared->listener_registered) {
1399             memory_listener_register(&v->shared->listener, dev->vdev->dma_as);
1400             v->shared->listener_registered = true;
1401         }
1402 
1403         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1404     }
1405 
1406     return 0;
1407 }
1408 
1409 static void vhost_vdpa_reset_status(struct vhost_dev *dev)
1410 {
1411     if (!vhost_vdpa_last_dev(dev)) {
1412         return;
1413     }
1414 
1415     vhost_vdpa_reset_device(dev);
1416     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1417                                VIRTIO_CONFIG_S_DRIVER);
1418 }
1419 
1420 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1421                                      struct vhost_log *log)
1422 {
1423     struct vhost_vdpa *v = dev->opaque;
1424     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1425         return 0;
1426     }
1427 
1428     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1429                                   log->log);
1430     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1431 }
1432 
1433 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1434                                        struct vhost_vring_addr *addr)
1435 {
1436     struct vhost_vdpa *v = dev->opaque;
1437 
1438     if (v->shadow_vqs_enabled) {
1439         /*
1440          * Device vring addr was set at device start. SVQ base is handled by
1441          * VirtQueue code.
1442          */
1443         return 0;
1444     }
1445 
1446     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1447 }
1448 
1449 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1450                                       struct vhost_vring_state *ring)
1451 {
1452     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1453     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1454 }
1455 
1456 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1457                                        struct vhost_vring_state *ring)
1458 {
1459     struct vhost_vdpa *v = dev->opaque;
1460 
1461     if (v->shadow_vqs_enabled) {
1462         /*
1463          * Device vring base was set at device start. SVQ base is handled by
1464          * VirtQueue code.
1465          */
1466         return 0;
1467     }
1468 
1469     return vhost_vdpa_set_dev_vring_base(dev, ring);
1470 }
1471 
1472 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1473                                        struct vhost_vring_state *ring)
1474 {
1475     struct vhost_vdpa *v = dev->opaque;
1476     int ret;
1477 
1478     if (v->shadow_vqs_enabled) {
1479         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1480         trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true);
1481         return 0;
1482     }
1483 
1484     if (!v->suspended) {
1485         /*
1486          * Cannot trust the value returned by the device; let vhost recover
1487          * the used idx from the guest.
1488          */
1489         return -1;
1490     }
1491 
1492     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1493     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false);
1494     return ret;
1495 }
1496 
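/*
 * With shadow virtqueues enabled the guest kick fd is wired to the SVQ;
 * otherwise it is forwarded to the device as usual.
 */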
1497 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1498                                        struct vhost_vring_file *file)
1499 {
1500     struct vhost_vdpa *v = dev->opaque;
1501     int vdpa_idx = file->index - dev->vq_index;
1502 
1503     if (v->shadow_vqs_enabled) {
1504         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1505         vhost_svq_set_svq_kick_fd(svq, file->fd);
1506         return 0;
1507     } else {
1508         return vhost_vdpa_set_vring_dev_kick(dev, file);
1509     }
1510 }
1511 
1512 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1513                                        struct vhost_vring_file *file)
1514 {
1515     struct vhost_vdpa *v = dev->opaque;
1516     int vdpa_idx = file->index - dev->vq_index;
1517     VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1518 
1519     /* Remember last call fd because we can switch to SVQ anytime. */
1520     vhost_svq_set_svq_call_fd(svq, file->fd);
1521     /*
1522      * When SVQ is transitioning to off, shadow_vqs_enabled has
1523      * not been set back to false yet, but the underlying call fd
1524      * will have to switch back to the guest notifier to signal the
1525      * passthrough virtqueues. In other situations, SVQ's own call
1526      * fd shall be used to signal the device model.
1527      */
1528     if (v->shadow_vqs_enabled &&
1529         v->shared->svq_switching != SVQ_TSTATE_DISABLING) {
1530         return 0;
1531     }
1532 
1533     return vhost_vdpa_set_vring_dev_call(dev, file);
1534 }
1535 
1536 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1537                                      uint64_t *features)
1538 {
1539     int ret = vhost_vdpa_get_dev_features(dev, features);
1540 
1541     if (ret == 0) {
1542         /* Add SVQ logging capabilities */
1543         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1544     }
1545 
1546     return ret;
1547 }
1548 
1549 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1550 {
1551     int r;
1552     struct vhost_vdpa *v;
1553 
1554     if (!vhost_vdpa_first_dev(dev)) {
1555         return 0;
1556     }
1557 
1558     trace_vhost_vdpa_set_owner(dev);
1559     r = vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1560     if (unlikely(r < 0)) {
1561         return r;
1562     }
1563 
1564     /*
1565      * Be optimistic and listen to the system memory address space. If the
1566      * device uses a vIOMMU, this is changed at vhost_vdpa_dev_start.
1567      */
1568     v = dev->opaque;
1569     memory_listener_register(&v->shared->listener, &address_space_memory);
1570     v->shared->listener_registered = true;
1571     return 0;
1572 }
1573 
1574 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1575                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1576 {
1577     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1578     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1579     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1580     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1581     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1582                                  addr->avail_user_addr, addr->used_user_addr);
1583     return 0;
1584 }
1585 
1586 static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
1587 {
1588     return true;
1589 }
1590 
1591 const VhostOps vdpa_ops = {
1592         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1593         .vhost_backend_init = vhost_vdpa_init,
1594         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1595         .vhost_set_log_base = vhost_vdpa_set_log_base,
1596         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1597         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1598         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1599         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1600         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1601         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1602         .vhost_get_features = vhost_vdpa_get_features,
1603         .vhost_set_owner = vhost_vdpa_set_owner,
1604         .vhost_set_vring_endian = NULL,
1605         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1606         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1607         .vhost_set_features = vhost_vdpa_set_features,
1608         .vhost_reset_device = vhost_vdpa_reset_device,
1609         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1610         .vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
1611         .vhost_get_config  = vhost_vdpa_get_config,
1612         .vhost_set_config = vhost_vdpa_set_config,
1613         .vhost_requires_shm_log = NULL,
1614         .vhost_migration_done = NULL,
1615         .vhost_net_set_mtu = NULL,
1616         .vhost_set_iotlb_callback = NULL,
1617         .vhost_send_device_iotlb_msg = NULL,
1618         .vhost_dev_start = vhost_vdpa_dev_start,
1619         .vhost_get_device_id = vhost_vdpa_get_device_id,
1620         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1621         .vhost_force_iommu = vhost_vdpa_force_iommu,
1622         .vhost_set_config_call = vhost_vdpa_set_config_call,
1623         .vhost_reset_status = vhost_vdpa_reset_status,
1624 };
1625