1 /*
2 * vhost-vdpa
3 *
4 * Copyright(c) 2017-2018 Intel Corporation.
5 * Copyright(c) 2020 Red Hat, Inc.
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
8 * See the COPYING file in the top-level directory.
9 *
10 */
11
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "exec/target_page.h"
18 #include "hw/virtio/vhost.h"
19 #include "hw/virtio/vhost-backend.h"
20 #include "hw/virtio/virtio-net.h"
21 #include "hw/virtio/vhost-shadow-virtqueue.h"
22 #include "hw/virtio/vhost-vdpa.h"
23 #include "system/address-spaces.h"
24 #include "migration/blocker.h"
25 #include "qemu/cutils.h"
26 #include "qemu/main-loop.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29
/*
 * Return one past the end of the section. Be careful with uint64_t
 * conversions!
 */
static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section,
                                     int page_mask)
36 {
37 Int128 llend = int128_make64(section->offset_within_address_space);
38 llend = int128_add(llend, section->size);
39 llend = int128_and(llend, int128_exts64(page_mask));
40
41 return llend;
42 }
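
/*
 * Illustrative example (hypothetical values): with a 4 KiB target page
 * size, page_mask is -4096. A section at address-space offset 0x1000 with
 * size 0x1800 yields (0x1000 + 0x1800) & page_mask = 0x2000, i.e. the
 * partial trailing page is not included in the mapped range.
 */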
43
static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
                                                uint64_t iova_min,
                                                uint64_t iova_max,
                                                int page_mask)
48 {
49 Int128 llend;
50 bool is_ram = memory_region_is_ram(section->mr);
51 bool is_iommu = memory_region_is_iommu(section->mr);
52 bool is_protected = memory_region_is_protected(section->mr);
53
54 /* vhost-vDPA doesn't allow MMIO to be mapped */
55 bool is_ram_device = memory_region_is_ram_device(section->mr);
56
57 if ((!is_ram && !is_iommu) || is_protected || is_ram_device) {
58 trace_vhost_vdpa_skipped_memory_section(is_ram, is_iommu, is_protected,
59 is_ram_device, iova_min,
60 iova_max, page_mask);
61 return true;
62 }
63
64 if (section->offset_within_address_space < iova_min) {
65 error_report("RAM section out of device range (min=0x%" PRIx64
66 ", addr=0x%" HWADDR_PRIx ")",
67 iova_min, section->offset_within_address_space);
68 return true;
69 }
    /*
     * While using a vIOMMU, the section can be larger than iova_max, but the
     * memory that actually gets mapped is smaller, so defer the check to
     * vhost_vdpa_iommu_map_notify(). That function uses the actual size that
     * is mapped to the kernel.
     */
76
77 if (!is_iommu) {
78 llend = vhost_vdpa_section_end(section, page_mask);
79 if (int128_gt(llend, int128_make64(iova_max))) {
80 error_report("RAM section out of device range (max=0x%" PRIx64
81 ", end addr=0x%" PRIx64 ")",
82 iova_max, int128_get64(llend));
83 return true;
84 }
85 }
86
87 return false;
88 }
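
/*
 * Example (illustrative numbers): with an IOVA range of [0x0, 0x7fffffffff],
 * a RAM section placed at 0x8000000000 is skipped and reported as out of
 * the device range, while an IOMMU section defers that check to
 * vhost_vdpa_iommu_map_notify() as noted above.
 */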
89
90 /*
91 * The caller must set asid = 0 if the device does not support asid.
92 * This is not an ABI break since it is set to 0 by the initializer anyway.
93 */
int vhost_vdpa_dma_map(VhostVDPAShared *s, uint32_t asid, hwaddr iova,
                       hwaddr size, void *vaddr, bool readonly)
96 {
97 struct vhost_msg_v2 msg = {};
98 int fd = s->device_fd;
99 int ret = 0;
100
101 msg.type = VHOST_IOTLB_MSG_V2;
102 msg.asid = asid;
103 msg.iotlb.iova = iova;
104 msg.iotlb.size = size;
105 msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
106 msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
107 msg.iotlb.type = VHOST_IOTLB_UPDATE;
108
109 trace_vhost_vdpa_dma_map(s, fd, msg.type, msg.asid, msg.iotlb.iova,
110 msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
111 msg.iotlb.type);
112
113 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
114 error_report("failed to write, fd=%d, errno=%d (%s)",
115 fd, errno, strerror(errno));
        return -EIO;
117 }
118
119 return ret;
120 }
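
/*
 * Usage sketch (hypothetical values, "buf" is not from this file): map a
 * 4 KiB read-only buffer at IOVA 0x100000 in the guest-PA address space:
 *
 *     vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, 0x100000, 0x1000,
 *                        buf, true);
 *
 * The kernel side is expected to pin the backing pages and install the
 * IOVA translation for the device.
 */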
121
122 /*
123 * The caller must set asid = 0 if the device does not support asid.
124 * This is not an ABI break since it is set to 0 by the initializer anyway.
125 */
int vhost_vdpa_dma_unmap(VhostVDPAShared *s, uint32_t asid, hwaddr iova,
                         hwaddr size)
128 {
129 struct vhost_msg_v2 msg = {};
130 int fd = s->device_fd;
131 int ret = 0;
132
133 msg.type = VHOST_IOTLB_MSG_V2;
134 msg.asid = asid;
135 msg.iotlb.iova = iova;
136 msg.iotlb.size = size;
137 msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
138
139 trace_vhost_vdpa_dma_unmap(s, fd, msg.type, msg.asid, msg.iotlb.iova,
140 msg.iotlb.size, msg.iotlb.type);
141
142 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
143 error_report("failed to write, fd=%d, errno=%d (%s)",
144 fd, errno, strerror(errno));
        return -EIO;
146 }
147
148 return ret;
149 }
150
static void vhost_vdpa_listener_begin_batch(VhostVDPAShared *s)
152 {
153 int fd = s->device_fd;
154 struct vhost_msg_v2 msg = {
155 .type = VHOST_IOTLB_MSG_V2,
156 .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
157 };
158
159 trace_vhost_vdpa_listener_begin_batch(s, fd, msg.type, msg.iotlb.type);
160 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
161 error_report("failed to write, fd=%d, errno=%d (%s)",
162 fd, errno, strerror(errno));
163 }
164 }
165
static void vhost_vdpa_iotlb_batch_begin_once(VhostVDPAShared *s)
167 {
168 if (s->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
169 !s->iotlb_batch_begin_sent) {
170 vhost_vdpa_listener_begin_batch(s);
171 }
172
173 s->iotlb_batch_begin_sent = true;
174 }
175
static void vhost_vdpa_listener_commit(MemoryListener *listener)
177 {
178 VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
179 struct vhost_msg_v2 msg = {};
180 int fd = s->device_fd;
181
182 if (!(s->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
183 return;
184 }
185
186 if (!s->iotlb_batch_begin_sent) {
187 return;
188 }
189
190 msg.type = VHOST_IOTLB_MSG_V2;
191 msg.iotlb.type = VHOST_IOTLB_BATCH_END;
192
193 trace_vhost_vdpa_listener_commit(s, fd, msg.type, msg.iotlb.type);
194 if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
195 error_report("failed to write, fd=%d, errno=%d (%s)",
196 fd, errno, strerror(errno));
197 }
198
199 s->iotlb_batch_begin_sent = false;
200 }
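
/*
 * With VHOST_BACKEND_F_IOTLB_BATCH negotiated, the updates issued from a
 * single listener transaction are framed roughly as:
 *
 *     VHOST_IOTLB_BATCH_BEGIN
 *     VHOST_IOTLB_UPDATE / VHOST_IOTLB_INVALIDATE ...  (one per region)
 *     VHOST_IOTLB_BATCH_END
 *
 * which lets the backend defer or coalesce the work until the batch ends.
 */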
201
static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
203 {
204 struct vdpa_iommu *iommu = container_of(n, struct vdpa_iommu, n);
205
206 hwaddr iova = iotlb->iova + iommu->iommu_offset;
207 VhostVDPAShared *s = iommu->dev_shared;
208 void *vaddr;
209 int ret;
210 Int128 llend;
211 Error *local_err = NULL;
212 MemoryRegion *mr;
213 hwaddr xlat;
214
215 if (iotlb->target_as != &address_space_memory) {
216 error_report("Wrong target AS \"%s\", only system memory is allowed",
217 iotlb->target_as->name ? iotlb->target_as->name : "none");
218 return;
219 }
220 RCU_READ_LOCK_GUARD();
    /* Check whether the RAM section is out of the device range */
222 llend = int128_add(int128_makes64(iotlb->addr_mask), int128_makes64(iova));
223 if (int128_gt(llend, int128_make64(s->iova_range.last))) {
224 error_report("RAM section out of device range (max=0x%" PRIx64
225 ", end addr=0x%" PRIx64 ")",
226 s->iova_range.last, int128_get64(llend));
227 return;
228 }
229
230 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
231 bool read_only;
232
233 mr = memory_translate_iotlb(iotlb, &xlat, &local_err);
234 if (!mr) {
235 error_report_err(local_err);
236 return;
237 }
238 vaddr = memory_region_get_ram_ptr(mr) + xlat;
239 read_only = !(iotlb->perm & IOMMU_WO) || mr->readonly;
240
241 ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
242 iotlb->addr_mask + 1, vaddr, read_only);
243 if (ret) {
244 error_report("vhost_vdpa_dma_map(%p, 0x%" HWADDR_PRIx ", "
245 "0x%" HWADDR_PRIx ", %p) = %d (%m)",
246 s, iova, iotlb->addr_mask + 1, vaddr, ret);
247 }
248 } else {
249 ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova,
250 iotlb->addr_mask + 1);
251 if (ret) {
252 error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
253 "0x%" HWADDR_PRIx ") = %d (%m)",
254 s, iova, iotlb->addr_mask + 1, ret);
255 }
256 }
257 }
258
static void vhost_vdpa_iommu_region_add(MemoryListener *listener,
                                        MemoryRegionSection *section)
261 {
262 VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
263
264 struct vdpa_iommu *iommu;
265 Int128 end;
266 int iommu_idx;
267 IOMMUMemoryRegion *iommu_mr;
268 int ret;
269
270 iommu_mr = IOMMU_MEMORY_REGION(section->mr);
271
272 iommu = g_malloc0(sizeof(*iommu));
273 end = int128_add(int128_make64(section->offset_within_region),
274 section->size);
275 end = int128_sub(end, int128_one());
276 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
277 MEMTXATTRS_UNSPECIFIED);
278 iommu->iommu_mr = iommu_mr;
279 iommu_notifier_init(&iommu->n, vhost_vdpa_iommu_map_notify,
280 IOMMU_NOTIFIER_IOTLB_EVENTS,
281 section->offset_within_region,
282 int128_get64(end),
283 iommu_idx);
284 iommu->iommu_offset = section->offset_within_address_space -
285 section->offset_within_region;
286 iommu->dev_shared = s;
287
288 ret = memory_region_register_iommu_notifier(section->mr, &iommu->n, NULL);
289 if (ret) {
290 g_free(iommu);
291 return;
292 }
293
294 QLIST_INSERT_HEAD(&s->iommu_list, iommu, iommu_next);
295 memory_region_iommu_replay(iommu->iommu_mr, &iommu->n);
296 }
297
static void vhost_vdpa_iommu_region_del(MemoryListener *listener,
                                        MemoryRegionSection *section)
300 {
301 VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
302
303 struct vdpa_iommu *iommu;
304
305 QLIST_FOREACH(iommu, &s->iommu_list, iommu_next)
306 {
307 if (MEMORY_REGION(iommu->iommu_mr) == section->mr &&
308 iommu->n.start == section->offset_within_region) {
309 memory_region_unregister_iommu_notifier(section->mr, &iommu->n);
310 QLIST_REMOVE(iommu, iommu_next);
311 g_free(iommu);
312 break;
313 }
314 }
315 }
316
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
                                           MemoryRegionSection *section)
319 {
320 DMAMap mem_region = {};
321 VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
322 hwaddr iova;
323 Int128 llend, llsize;
324 void *vaddr;
325 int ret;
326 int page_size = qemu_target_page_size();
327 int page_mask = -page_size;
328
329 if (vhost_vdpa_listener_skipped_section(section, s->iova_range.first,
330 s->iova_range.last, page_mask)) {
331 return;
332 }
333 if (memory_region_is_iommu(section->mr)) {
334 vhost_vdpa_iommu_region_add(listener, section);
335 return;
336 }
337
338 if (unlikely((section->offset_within_address_space & ~page_mask) !=
339 (section->offset_within_region & ~page_mask))) {
340 trace_vhost_vdpa_listener_region_add_unaligned(s, section->mr->name,
341 section->offset_within_address_space & ~page_mask,
342 section->offset_within_region & ~page_mask);
343 return;
344 }
345
346 iova = ROUND_UP(section->offset_within_address_space, page_size);
347 llend = vhost_vdpa_section_end(section, page_mask);
348 if (int128_ge(int128_make64(iova), llend)) {
349 return;
350 }
351
352 memory_region_ref(section->mr);
353
354 /* Here we assume that memory_region_is_ram(section->mr)==true */
355
356 vaddr = memory_region_get_ram_ptr(section->mr) +
357 section->offset_within_region +
358 (iova - section->offset_within_address_space);
359
360 trace_vhost_vdpa_listener_region_add(s, iova, int128_get64(llend),
361 vaddr, section->readonly);
362
363 llsize = int128_sub(llend, int128_make64(iova));
364 if (s->shadow_data) {
365 int r;
366 hwaddr gpa = section->offset_within_address_space;
367
        mem_region.size = int128_get64(llsize) - 1;
        mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly);

371 r = vhost_iova_tree_map_alloc_gpa(s->iova_tree, &mem_region, gpa);
372 if (unlikely(r != IOVA_OK)) {
373 error_report("Can't allocate a mapping (%d)", r);
374
375 if (mem_region.translated_addr == gpa) {
376 error_report("Insertion to GPA->IOVA tree failed");
377 /* Remove the mapping from the IOVA-only tree */
378 goto fail_map;
379 }
380 goto fail;
381 }
382
383 iova = mem_region.iova;
384 }
385
386 vhost_vdpa_iotlb_batch_begin_once(s);
387 ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
388 int128_get64(llsize), vaddr, section->readonly);
389 if (ret) {
390 error_report("vhost vdpa map fail!");
391 goto fail_map;
392 }
393
394 return;
395
396 fail_map:
397 if (s->shadow_data) {
398 vhost_iova_tree_remove_gpa(s->iova_tree, mem_region);
399 }
400
401 fail:
    /*
     * On the initfn path, store the first error in the container so we
     * can fail gracefully. At runtime, there's not much we can do other
     * than throw a hardware error.
     */
407 error_report("vhost-vdpa: DMA mapping failed, unable to continue");
408 return;
409
410 }
411
static void vhost_vdpa_listener_region_del(MemoryListener *listener,
                                           MemoryRegionSection *section)
414 {
415 VhostVDPAShared *s = container_of(listener, VhostVDPAShared, listener);
416 hwaddr iova;
417 Int128 llend, llsize;
418 int ret;
419 int page_size = qemu_target_page_size();
420 int page_mask = -page_size;
421
422 if (vhost_vdpa_listener_skipped_section(section, s->iova_range.first,
423 s->iova_range.last, page_mask)) {
424 return;
425 }
426 if (memory_region_is_iommu(section->mr)) {
427 vhost_vdpa_iommu_region_del(listener, section);
428 }
429
430 if (unlikely((section->offset_within_address_space & ~page_mask) !=
431 (section->offset_within_region & ~page_mask))) {
432 trace_vhost_vdpa_listener_region_del_unaligned(s, section->mr->name,
433 section->offset_within_address_space & ~page_mask,
434 section->offset_within_region & ~page_mask);
435 return;
436 }
437
438 iova = ROUND_UP(section->offset_within_address_space, page_size);
439 llend = vhost_vdpa_section_end(section, page_mask);
440
441 trace_vhost_vdpa_listener_region_del(s, iova,
442 int128_get64(int128_sub(llend, int128_one())));
443
444 if (int128_ge(int128_make64(iova), llend)) {
445 return;
446 }
447
448 llsize = int128_sub(llend, int128_make64(iova));
449
450 if (s->shadow_data) {
451 const DMAMap *result;
452 DMAMap mem_region = {
453 .translated_addr = section->offset_within_address_space,
454 .size = int128_get64(llsize) - 1,
455 };
456
457 result = vhost_iova_tree_find_gpa(s->iova_tree, &mem_region);
458 if (!result) {
            /* The region reported by the memory listener was never mapped */
460 return;
461 }
462 iova = result->iova;
463 vhost_iova_tree_remove_gpa(s->iova_tree, *result);
464 }
465 vhost_vdpa_iotlb_batch_begin_once(s);
    /*
     * The unmap ioctl doesn't accept a full 64-bit size, so split the unmap
     * in two when needed.
     */
469 if (int128_eq(llsize, int128_2_64())) {
470 llsize = int128_rshift(llsize, 1);
471 ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova,
472 int128_get64(llsize));
473
474 if (ret) {
475 error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
476 "0x%" HWADDR_PRIx ") = %d (%m)",
477 s, iova, int128_get64(llsize), ret);
478 }
479 iova += int128_get64(llsize);
480 }
481 ret = vhost_vdpa_dma_unmap(s, VHOST_VDPA_GUEST_PA_ASID, iova,
482 int128_get64(llsize));
483
484 if (ret) {
485 error_report("vhost_vdpa_dma_unmap(%p, 0x%" HWADDR_PRIx ", "
486 "0x%" HWADDR_PRIx ") = %d (%m)",
487 s, iova, int128_get64(llsize), ret);
488 }
489
490 memory_region_unref(section->mr);
491 }
/*
 * The IOTLB API used by vhost-vdpa requires incremental updating of the
 * mappings, so we cannot use the generic vhost memory listener, which
 * depends on addnop().
 */
497 static const MemoryListener vhost_vdpa_memory_listener = {
498 .name = "vhost-vdpa",
499 .commit = vhost_vdpa_listener_commit,
500 .region_add = vhost_vdpa_listener_region_add,
501 .region_del = vhost_vdpa_listener_region_del,
502 };
503
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
                           void *arg)
506 {
507 struct vhost_vdpa *v = dev->opaque;
508 int fd = v->shared->device_fd;
509 int ret;
510
511 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
512
513 ret = ioctl(fd, request, arg);
514 return ret < 0 ? -errno : ret;
515 }
516
static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
518 {
519 uint8_t s;
520 int ret;
521
522 trace_vhost_vdpa_add_status(dev, status);
523 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
524 if (ret < 0) {
525 return ret;
526 }
527 if ((s & status) == status) {
528 /* Don't set bits already set */
529 return 0;
530 }
531
532 s |= status;
533
534 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
535 if (ret < 0) {
536 return ret;
537 }
538
539 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
540 if (ret < 0) {
541 return ret;
542 }
543
544 if (!(s & status)) {
545 return -EIO;
546 }
547
548 return 0;
549 }
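
/*
 * Seen together with its callers below, this helper drives the usual
 * virtio status handshake:
 *
 *     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
 *                                VIRTIO_CONFIG_S_DRIVER);       (init)
 *     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);  (set_features)
 *     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);    (dev_start)
 */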
550
int vhost_vdpa_get_iova_range(int fd, struct vhost_vdpa_iova_range *iova_range)
552 {
553 int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);
554
555 return ret < 0 ? -errno : 0;
556 }
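
/*
 * Usage sketch ("device_fd" stands for an already-open vhost-vdpa fd):
 *
 *     struct vhost_vdpa_iova_range range;
 *     int r = vhost_vdpa_get_iova_range(device_fd, &range);
 *
 * On success, [range.first, range.last] is the inclusive IOVA window the
 * device accepts; callers are expected to cache it in VhostVDPAShared's
 * iova_range field, which the memory listener above consults.
 */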
557
/*
 * This function is for requests that only need to be applied once.
 * Typically such a request occurs at the beginning of operation, before
 * the queues are set up. It should not be used for a request that performs
 * an operation once all queues are set, which would need to check
 * dev->vq_index_end instead.
 */
static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
566 {
567 struct vhost_vdpa *v = dev->opaque;
568
569 return v->index == 0;
570 }
571
static bool vhost_vdpa_last_dev(struct vhost_dev *dev)
573 {
574 return dev->vq_index + dev->nvqs == dev->vq_index_end;
575 }
576
static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
                                       uint64_t *features)
579 {
580 int ret;
581
582 ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
583 trace_vhost_vdpa_get_features(dev, *features);
584 return ret;
585 }
586
static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
588 {
589 g_autoptr(GPtrArray) shadow_vqs = NULL;
590
591 shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
592 for (unsigned n = 0; n < hdev->nvqs; ++n) {
593 VhostShadowVirtqueue *svq;
594
595 svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
596 g_ptr_array_add(shadow_vqs, svq);
597 }
598
599 v->shadow_vqs = g_steal_pointer(&shadow_vqs);
600 }
601
static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
603 {
604 struct vhost_vdpa *v = dev->opaque;
605
606 uint64_t features;
607 uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
608 0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
609 0x1ULL << VHOST_BACKEND_F_IOTLB_ASID |
610 0x1ULL << VHOST_BACKEND_F_SUSPEND;
611 int r;
612
613 if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
614 return -EFAULT;
615 }
616
617 features &= f;
618
619 if (vhost_vdpa_first_dev(dev)) {
620 r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
621 if (r) {
622 return -EFAULT;
623 }
624 }
625
626 dev->backend_cap = features;
627 v->shared->backend_cap = features;
628
629 return 0;
630 }
631
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
633 {
634 struct vhost_vdpa *v = opaque;
635 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
636 trace_vhost_vdpa_init(dev, v->shared, opaque);
637 int ret;
638
639 v->dev = dev;
    dev->opaque = opaque;
641
642 ret = vhost_vdpa_set_backend_cap(dev);
643 if (unlikely(ret != 0)) {
644 return ret;
645 }
646
647 vhost_vdpa_init_svq(dev, v);
648
649 error_propagate(&dev->migration_blocker, v->migration_blocker);
650 if (!vhost_vdpa_first_dev(dev)) {
651 return 0;
652 }
653
    /*
     * If dev->shadow_vqs_enabled is set at initialization, the device has
     * been started with x-svq=on, so don't block migration.
     */
658 if (dev->migration_blocker == NULL && !v->shadow_vqs_enabled) {
659 /* We don't have dev->features yet */
660 uint64_t features;
661 ret = vhost_vdpa_get_dev_features(dev, &features);
662 if (unlikely(ret)) {
663 error_setg_errno(errp, -ret, "Could not get device features");
664 return ret;
665 }
666 vhost_svq_valid_features(features, &dev->migration_blocker);
667 }
668
669 /*
670 * Similar to VFIO, we end up pinning all guest memory and have to
671 * disable discarding of RAM.
672 */
673 ret = ram_block_discard_disable(true);
674 if (ret) {
675 error_report("Cannot set discarding of RAM broken");
676 return ret;
677 }
678
679 vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
680 VIRTIO_CONFIG_S_DRIVER);
681
682 v->shared->listener = vhost_vdpa_memory_listener;
683 return 0;
684 }
685
static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
                                            int queue_index)
688 {
689 size_t page_size = qemu_real_host_page_size();
690 struct vhost_vdpa *v = dev->opaque;
691 VirtIODevice *vdev = dev->vdev;
692 VhostVDPAHostNotifier *n;
693
694 n = &v->notifier[queue_index];
695
696 if (n->addr) {
697 virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
698 object_unparent(OBJECT(&n->mr));
699 munmap(n->addr, page_size);
700 n->addr = NULL;
701 }
702 }
703
static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
705 {
706 size_t page_size = qemu_real_host_page_size();
707 struct vhost_vdpa *v = dev->opaque;
708 VirtIODevice *vdev = dev->vdev;
709 VhostVDPAHostNotifier *n;
710 int fd = v->shared->device_fd;
711 void *addr;
712 char *name;
713
714 vhost_vdpa_host_notifier_uninit(dev, queue_index);
715
716 n = &v->notifier[queue_index];
717
718 addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
719 queue_index * page_size);
720 if (addr == MAP_FAILED) {
721 goto err;
722 }
723
724 name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
725 v, queue_index);
726 memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
727 page_size, addr);
728 g_free(name);
729
730 if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
731 object_unparent(OBJECT(&n->mr));
732 munmap(addr, page_size);
733 goto err;
734 }
735 n->addr = addr;
736
737 return 0;
738
739 err:
740 return -1;
741 }
742
static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
744 {
745 int i;
746
    /*
     * Pack all the changes to the memory regions in a single transaction
     * to avoid repeated updates of the address space topology.
     */
752 memory_region_transaction_begin();
753
754 for (i = dev->vq_index; i < dev->vq_index + n; i++) {
755 vhost_vdpa_host_notifier_uninit(dev, i);
756 }
757
758 memory_region_transaction_commit();
759 }
760
static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
762 {
763 struct vhost_vdpa *v = dev->opaque;
764 int i;
765
766 if (v->shadow_vqs_enabled) {
767 /* FIXME SVQ is not compatible with host notifiers mr */
768 return;
769 }
770
    /*
     * Pack all the changes to the memory regions in a single transaction
     * to avoid repeated updates of the address space topology.
     */
776 memory_region_transaction_begin();
777
778 for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
779 if (vhost_vdpa_host_notifier_init(dev, i)) {
780 vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
781 break;
782 }
783 }
784
785 memory_region_transaction_commit();
786 }
787
static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
789 {
790 struct vhost_vdpa *v = dev->opaque;
791 size_t idx;
792
793 for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
794 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
795 }
796 g_ptr_array_free(v->shadow_vqs, true);
797 }
798
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
800 {
801 struct vhost_vdpa *v;
802 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
803 v = dev->opaque;
804 trace_vhost_vdpa_cleanup(dev, v);
805 if (vhost_vdpa_first_dev(dev)) {
806 ram_block_discard_disable(false);
807 memory_listener_unregister(&v->shared->listener);
808 }
809
810 vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
811 vhost_vdpa_svq_cleanup(dev);
812
813 dev->opaque = NULL;
814
815 return 0;
816 }
817
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
819 {
820 trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
821 return INT_MAX;
822 }
823
static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
                                    struct vhost_memory *mem)
826 {
827 if (!vhost_vdpa_first_dev(dev)) {
828 return 0;
829 }
830
831 trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
832 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
833 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
834 int i;
835 for (i = 0; i < mem->nregions; i++) {
836 trace_vhost_vdpa_dump_regions(dev, i,
837 mem->regions[i].guest_phys_addr,
838 mem->regions[i].memory_size,
839 mem->regions[i].userspace_addr,
840 mem->regions[i].flags_padding);
841 }
842 }
843 if (mem->padding) {
844 return -EINVAL;
845 }
846
847 return 0;
848 }
849
static int vhost_vdpa_set_features(struct vhost_dev *dev,
                                   uint64_t features)
852 {
853 struct vhost_vdpa *v = dev->opaque;
854 int ret;
855
856 if (!vhost_vdpa_first_dev(dev)) {
857 return 0;
858 }
859
860 if (v->shadow_vqs_enabled) {
861 if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
            /*
             * QEMU is just trying to enable or disable logging. SVQ handles
             * this separately, so there is no need to forward it.
             */
866 v->acked_features = features;
867 return 0;
868 }
869
870 v->acked_features = features;
871
872 /* We must not ack _F_LOG if SVQ is enabled */
873 features &= ~BIT_ULL(VHOST_F_LOG_ALL);
874 }
875
876 trace_vhost_vdpa_set_features(dev, features);
877 ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
878 if (ret) {
879 return ret;
880 }
881
882 return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
883 }
884
static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
                                    uint32_t *device_id)
887 {
888 int ret;
889 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
890 trace_vhost_vdpa_get_device_id(dev, *device_id);
891 return ret;
892 }
893
static int vhost_vdpa_reset_device(struct vhost_dev *dev)
895 {
896 struct vhost_vdpa *v = dev->opaque;
897 int ret;
898 uint8_t status = 0;
899
900 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
901 trace_vhost_vdpa_reset_device(dev);
902 if (ret) {
903 return ret;
904 }
905
906 memory_listener_unregister(&v->shared->listener);
907 v->shared->listener_registered = false;
908 v->suspended = false;
909 return 0;
910 }
911
static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
913 {
914 assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
915
916 trace_vhost_vdpa_get_vq_index(dev, idx, idx);
917 return idx;
918 }
919
static int vhost_vdpa_set_vring_enable_one(struct vhost_vdpa *v, unsigned idx,
                                           int enable)
922 {
923 struct vhost_dev *dev = v->dev;
924 struct vhost_vring_state state = {
925 .index = idx,
926 .num = enable,
927 };
928 int r = vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
929
930 trace_vhost_vdpa_set_vring_enable_one(dev, idx, enable, r);
931 return r;
932 }
933
static int vhost_vdpa_set_vring_enable(struct vhost_dev *dev, int enable)
935 {
936 struct vhost_vdpa *v = dev->opaque;
937 unsigned int i;
938 int ret;
939
940 for (i = 0; i < dev->nvqs; ++i) {
941 ret = vhost_vdpa_set_vring_enable_one(v, i, enable);
942 if (ret < 0) {
943 return ret;
944 }
945 }
946
947 return 0;
948 }
949
int vhost_vdpa_set_vring_ready(struct vhost_vdpa *v, unsigned idx)
951 {
952 return vhost_vdpa_set_vring_enable_one(v, idx, 1);
953 }
954
static int vhost_vdpa_set_config_call(struct vhost_dev *dev,
                                      int fd)
957 {
958 trace_vhost_vdpa_set_config_call(dev, fd);
959 return vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG_CALL, &fd);
960 }
961
static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
                                   uint32_t config_len)
964 {
965 g_autoptr(GString) str = g_string_sized_new(4 * 16);
966 size_t b, len;
967
968 for (b = 0; b < config_len; b += len) {
969 len = MIN(config_len - b, 16);
970
971 g_string_truncate(str, 0);
972 qemu_hexdump_line(str, config + b, len, 1, 4);
973 trace_vhost_vdpa_dump_config(dev, b, str->str);
974 }
975 }
976
static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
                                 uint32_t offset, uint32_t size,
                                 uint32_t flags)
980 {
981 struct vhost_vdpa_config *config;
982 int ret;
983 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
984
985 trace_vhost_vdpa_set_config(dev, offset, size, flags);
986 config = g_malloc(size + config_size);
987 config->off = offset;
988 config->len = size;
989 memcpy(config->buf, data, size);
990 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
991 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
992 vhost_vdpa_dump_config(dev, data, size);
993 }
994 ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
995 g_free(config);
996 return ret;
997 }
998
static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
                                 uint32_t config_len, Error **errp)
1001 {
1002 struct vhost_vdpa_config *v_config;
1003 unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
1004 int ret;
1005
1006 trace_vhost_vdpa_get_config(dev, config, config_len);
1007 v_config = g_malloc(config_len + config_size);
1008 v_config->len = config_len;
1009 v_config->off = 0;
1010 ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
1011 memcpy(config, v_config->buf, config_len);
1012 g_free(v_config);
1013 if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
1014 trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
1015 vhost_vdpa_dump_config(dev, config, config_len);
1016 }
1017 return ret;
1018 }
1019
static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
                                         struct vhost_vring_state *ring)
1022 {
1023 struct vhost_vdpa *v = dev->opaque;
1024
1025 trace_vhost_vdpa_set_dev_vring_base(dev, ring->index, ring->num,
1026 v->shadow_vqs_enabled);
1027 return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
1028 }
1029
static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
1032 {
1033 trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
1034 return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
1035 }
1036
static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
                                         struct vhost_vring_file *file)
1039 {
1040 trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
1041 return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
1042 }
1043
static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
                                         struct vhost_vring_addr *addr)
1046 {
1047 trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
1048 addr->desc_user_addr, addr->used_user_addr,
1049 addr->avail_user_addr,
1050 addr->log_guest_addr);
1051
1052 return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
1053
1054 }
1055
/**
 * Set the shadow virtqueue descriptors to the device
 *
 * @dev: The vhost device model
 * @svq: The shadow virtqueue
 * @idx: The index of the virtqueue in the vhost device
 * @errp: Error
 *
 * Note that this function does not rewind the kick file descriptor if it
 * cannot set the call one.
 */
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
                                  VhostShadowVirtqueue *svq, unsigned idx,
                                  Error **errp)
1070 {
1071 struct vhost_vring_file file = {
1072 .index = dev->vq_index + idx,
1073 };
1074 const EventNotifier *event_notifier = &svq->hdev_kick;
1075 int r;
1076
1077 r = event_notifier_init(&svq->hdev_kick, 0);
1078 if (r != 0) {
1079 error_setg_errno(errp, -r, "Couldn't create kick event notifier");
1080 goto err_init_hdev_kick;
1081 }
1082
1083 r = event_notifier_init(&svq->hdev_call, 0);
1084 if (r != 0) {
1085 error_setg_errno(errp, -r, "Couldn't create call event notifier");
1086 goto err_init_hdev_call;
1087 }
1088
1089 file.fd = event_notifier_get_fd(event_notifier);
1090 r = vhost_vdpa_set_vring_dev_kick(dev, &file);
1091 if (unlikely(r != 0)) {
1092 error_setg_errno(errp, -r, "Can't set device kick fd");
1093 goto err_init_set_dev_fd;
1094 }
1095
1096 event_notifier = &svq->hdev_call;
1097 file.fd = event_notifier_get_fd(event_notifier);
1098 r = vhost_vdpa_set_vring_dev_call(dev, &file);
1099 if (unlikely(r != 0)) {
1100 error_setg_errno(errp, -r, "Can't set device call fd");
1101 goto err_init_set_dev_fd;
1102 }
1103
1104 return 0;
1105
1106 err_init_set_dev_fd:
1107 event_notifier_set_handler(&svq->hdev_call, NULL);
1108
1109 err_init_hdev_call:
1110 event_notifier_cleanup(&svq->hdev_kick);
1111
1112 err_init_hdev_kick:
1113 return r;
1114 }
1115
/**
 * Unmap an SVQ area in the device
 */
static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
1120 {
1121 const DMAMap needle = {
1122 .translated_addr = addr,
1123 };
1124 const DMAMap *result = vhost_iova_tree_find_iova(v->shared->iova_tree,
1125 &needle);
1126 hwaddr size;
1127 int r;
1128
1129 if (unlikely(!result)) {
1130 error_report("Unable to find SVQ address to unmap");
1131 return;
1132 }
1133
1134 size = ROUND_UP(result->size, qemu_real_host_page_size());
1135 r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id, result->iova,
1136 size);
1137 if (unlikely(r < 0)) {
1138 error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
1139 return;
1140 }
1141
1142 vhost_iova_tree_remove(v->shared->iova_tree, *result);
1143 }
1144
static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
1147 {
1148 struct vhost_vdpa *v = dev->opaque;
1149 struct vhost_vring_addr svq_addr;
1150
1151 vhost_svq_get_vring_addr(svq, &svq_addr);
1152
1153 vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
1154
1155 vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
1156 }
1157
/**
 * Map the SVQ area in the device
 *
 * @v: Vhost-vdpa device
 * @needle: The area to search an IOVA for
 * @taddr: The translated address (HVA)
 * @errp: Error pointer
 */
static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
                                    hwaddr taddr, Error **errp)
1168 {
1169 int r;
1170
1171 r = vhost_iova_tree_map_alloc(v->shared->iova_tree, needle, taddr);
1172 if (unlikely(r != IOVA_OK)) {
1173 error_setg(errp, "Cannot allocate iova (%d)", r);
1174
1175 if (needle->translated_addr == taddr) {
1176 error_append_hint(errp, "Insertion to IOVA->HVA tree failed");
1177 /* Remove the mapping from the IOVA-only tree */
1178 vhost_iova_tree_remove(v->shared->iova_tree, *needle);
1179 }
1180 return false;
1181 }
1182
1183 r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle->iova,
1184 needle->size + 1,
1185 (void *)(uintptr_t)needle->translated_addr,
1186 needle->perm == IOMMU_RO);
1187 if (unlikely(r != 0)) {
1188 error_setg_errno(errp, -r, "Cannot map region to device");
1189 vhost_iova_tree_remove(v->shared->iova_tree, *needle);
1190 }
1191
1192 return r == 0;
1193 }
1194
1195 /**
1196 * Map the shadow virtqueue rings in the device
1197 *
1198 * @dev: The vhost device
1199 * @svq: The shadow virtqueue
1200 * @addr: Assigned IOVA addresses
1201 * @errp: Error pointer
1202 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
1207 {
1208 ERRP_GUARD();
1209 DMAMap device_region, driver_region;
1210 struct vhost_vring_addr svq_addr;
1211 struct vhost_vdpa *v = dev->opaque;
1212 size_t device_size = vhost_svq_device_area_size(svq);
1213 size_t driver_size = vhost_svq_driver_area_size(svq);
1214 size_t avail_offset;
1215 bool ok;
1216
1217 vhost_svq_get_vring_addr(svq, &svq_addr);
1218
1219 driver_region = (DMAMap) {
1220 .size = driver_size - 1,
1221 .perm = IOMMU_RO,
1222 };
1223 ok = vhost_vdpa_svq_map_ring(v, &driver_region, svq_addr.desc_user_addr,
1224 errp);
1225 if (unlikely(!ok)) {
1226 error_prepend(errp, "Cannot create vq driver region: ");
1227 return false;
1228 }
1229 addr->desc_user_addr = driver_region.iova;
1230 avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
1231 addr->avail_user_addr = driver_region.iova + avail_offset;
1232
1233 device_region = (DMAMap) {
1234 .size = device_size - 1,
1235 .perm = IOMMU_RW,
1236 };
1237 ok = vhost_vdpa_svq_map_ring(v, &device_region, svq_addr.used_user_addr,
1238 errp);
1239 if (unlikely(!ok)) {
1240 error_prepend(errp, "Cannot create vq device region: ");
1241 vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
1242 }
1243 addr->used_user_addr = device_region.iova;
1244
1245 return ok;
1246 }
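
/*
 * Resulting layout (illustrative): the driver area (descriptor table plus
 * avail ring) ends up read-only for the device in one IOVA range, and the
 * device area (used ring) read-write in a second one; addr then carries
 * those IOVAs rather than the addresses the generic vhost code would have
 * programmed.
 */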
1247
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
1251 {
1252 uint16_t vq_index = dev->vq_index + idx;
1253 struct vhost_vring_state s = {
1254 .index = vq_index,
1255 };
1256 int r;
1257
1258 r = vhost_vdpa_set_dev_vring_base(dev, &s);
1259 if (unlikely(r)) {
1260 error_setg_errno(errp, -r, "Cannot set vring base");
1261 return false;
1262 }
1263
1264 r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1265 return r == 0;
1266 }
1267
static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1269 {
1270 struct vhost_vdpa *v = dev->opaque;
1271 Error *err = NULL;
1272 unsigned i;
1273
1274 if (!v->shadow_vqs_enabled) {
1275 return true;
1276 }
1277
1278 for (i = 0; i < v->shadow_vqs->len; ++i) {
1279 VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1280 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1281 struct vhost_vring_addr addr = {
1282 .index = dev->vq_index + i,
1283 };
1284 int r;
1285 bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1286 if (unlikely(!ok)) {
1287 goto err;
1288 }
1289
1290 vhost_svq_start(svq, dev->vdev, vq, v->shared->iova_tree);
1291 ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1292 if (unlikely(!ok)) {
1293 goto err_map;
1294 }
1295
1296 /* Override vring GPA set by vhost subsystem */
1297 r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1298 if (unlikely(r != 0)) {
1299 error_setg_errno(&err, -r, "Cannot set device address");
1300 goto err_set_addr;
1301 }
1302 }
1303
1304 return true;
1305
1306 err_set_addr:
1307 vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1308
1309 err_map:
1310 vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1311
1312 err:
1313 error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1314 for (unsigned j = 0; j < i; ++j) {
1315 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1316 vhost_vdpa_svq_unmap_rings(dev, svq);
1317 vhost_svq_stop(svq);
1318 }
1319
1320 return false;
1321 }
1322
static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1324 {
1325 struct vhost_vdpa *v = dev->opaque;
1326
1327 if (!v->shadow_vqs_enabled) {
1328 return;
1329 }
1330
1331 for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1332 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1333
1334 vhost_svq_stop(svq);
1335 vhost_vdpa_svq_unmap_rings(dev, svq);
1336
1337 event_notifier_cleanup(&svq->hdev_kick);
1338 event_notifier_cleanup(&svq->hdev_call);
1339 }
1340 }
1341
static void vhost_vdpa_suspend(struct vhost_dev *dev)
1343 {
1344 struct vhost_vdpa *v = dev->opaque;
1345 int r;
1346
1347 if (!vhost_vdpa_first_dev(dev)) {
1348 return;
1349 }
1350
1351 if (dev->backend_cap & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) {
1352 trace_vhost_vdpa_suspend(dev);
1353 r = ioctl(v->shared->device_fd, VHOST_VDPA_SUSPEND);
1354 if (unlikely(r)) {
1355 error_report("Cannot suspend: %s(%d)", g_strerror(errno), errno);
1356 } else {
1357 v->suspended = true;
1358 return;
1359 }
1360 }
1361
1362 vhost_vdpa_reset_device(dev);
1363 }
1364
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1366 {
1367 struct vhost_vdpa *v = dev->opaque;
1368 bool ok;
1369 trace_vhost_vdpa_dev_start(dev, started);
1370
1371 if (started) {
1372 vhost_vdpa_host_notifiers_init(dev);
1373 ok = vhost_vdpa_svqs_start(dev);
1374 if (unlikely(!ok)) {
1375 return -1;
1376 }
1377 } else {
1378 vhost_vdpa_suspend(dev);
1379 vhost_vdpa_svqs_stop(dev);
1380 vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1381 }
1382
1383 if (!vhost_vdpa_last_dev(dev)) {
1384 return 0;
1385 }
1386
1387 if (started) {
1388 if (vhost_dev_has_iommu(dev) && (v->shadow_vqs_enabled)) {
            error_report("SVQ cannot work while IOMMU is enabled, please "
                         "disable IOMMU and try again");
1391 return -1;
1392 }
1393 if (v->shared->listener_registered &&
1394 dev->vdev->dma_as != v->shared->listener.address_space) {
1395 memory_listener_unregister(&v->shared->listener);
1396 v->shared->listener_registered = false;
1397 }
1398 if (!v->shared->listener_registered) {
1399 memory_listener_register(&v->shared->listener, dev->vdev->dma_as);
1400 v->shared->listener_registered = true;
1401 }
1402
1403 return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1404 }
1405
1406 return 0;
1407 }
1408
static void vhost_vdpa_reset_status(struct vhost_dev *dev)
1410 {
1411 if (!vhost_vdpa_last_dev(dev)) {
1412 return;
1413 }
1414
1415 vhost_vdpa_reset_device(dev);
1416 vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1417 VIRTIO_CONFIG_S_DRIVER);
1418 }
1419
static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
                                   struct vhost_log *log)
1422 {
1423 struct vhost_vdpa *v = dev->opaque;
1424 if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1425 return 0;
1426 }
1427
1428 trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1429 log->log);
1430 return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1431 }
1432
static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
                                     struct vhost_vring_addr *addr)
1435 {
1436 struct vhost_vdpa *v = dev->opaque;
1437
1438 if (v->shadow_vqs_enabled) {
1439 /*
1440 * Device vring addr was set at device start. SVQ base is handled by
1441 * VirtQueue code.
1442 */
1443 return 0;
1444 }
1445
1446 return vhost_vdpa_set_vring_dev_addr(dev, addr);
1447 }
1448
static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
                                    struct vhost_vring_state *ring)
1451 {
1452 trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1453 return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1454 }
1455
static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
1458 {
1459 struct vhost_vdpa *v = dev->opaque;
1460
1461 if (v->shadow_vqs_enabled) {
1462 /*
1463 * Device vring base was set at device start. SVQ base is handled by
1464 * VirtQueue code.
1465 */
1466 return 0;
1467 }
1468
1469 return vhost_vdpa_set_dev_vring_base(dev, ring);
1470 }
1471
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
                                     struct vhost_vring_state *ring)
1474 {
1475 struct vhost_vdpa *v = dev->opaque;
1476 int ret;
1477
1478 if (v->shadow_vqs_enabled) {
1479 ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1480 trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, true);
1481 return 0;
1482 }
1483
1484 if (!v->suspended) {
        /*
         * We cannot trust the value returned by the device, so let vhost
         * recover the used idx from the guest.
         */
1489 return -1;
1490 }
1491
1492 ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1493 trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num, false);
1494 return ret;
1495 }
1496
static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
1499 {
1500 struct vhost_vdpa *v = dev->opaque;
1501 int vdpa_idx = file->index - dev->vq_index;
1502
1503 if (v->shadow_vqs_enabled) {
1504 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1505 vhost_svq_set_svq_kick_fd(svq, file->fd);
1506 return 0;
1507 } else {
1508 return vhost_vdpa_set_vring_dev_kick(dev, file);
1509 }
1510 }
1511
static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
                                     struct vhost_vring_file *file)
1514 {
1515 struct vhost_vdpa *v = dev->opaque;
1516 int vdpa_idx = file->index - dev->vq_index;
1517 VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1518
1519 /* Remember last call fd because we can switch to SVQ anytime. */
1520 vhost_svq_set_svq_call_fd(svq, file->fd);
1521 /*
1522 * When SVQ is transitioning to off, shadow_vqs_enabled has
1523 * not been set back to false yet, but the underlying call fd
1524 * will have to switch back to the guest notifier to signal the
1525 * passthrough virtqueues. In other situations, SVQ's own call
1526 * fd shall be used to signal the device model.
1527 */
1528 if (v->shadow_vqs_enabled &&
1529 v->shared->svq_switching != SVQ_TSTATE_DISABLING) {
1530 return 0;
1531 }
1532
1533 return vhost_vdpa_set_vring_dev_call(dev, file);
1534 }
1535
static int vhost_vdpa_get_features(struct vhost_dev *dev,
                                   uint64_t *features)
1538 {
1539 int ret = vhost_vdpa_get_dev_features(dev, features);
1540
1541 if (ret == 0) {
1542 /* Add SVQ logging capabilities */
1543 *features |= BIT_ULL(VHOST_F_LOG_ALL);
1544 }
1545
1546 return ret;
1547 }
1548
static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1550 {
1551 int r;
1552 struct vhost_vdpa *v;
1553
1554 if (!vhost_vdpa_first_dev(dev)) {
1555 return 0;
1556 }
1557
1558 trace_vhost_vdpa_set_owner(dev);
1559 r = vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1560 if (unlikely(r < 0)) {
1561 return r;
1562 }
1563
    /*
     * Be optimistic and listen to the memory address space. If the device
     * uses a vIOMMU, the listener is re-registered on the right address
     * space at vhost_vdpa_dev_start.
     */
1568 v = dev->opaque;
1569 memory_listener_register(&v->shared->listener, &address_space_memory);
1570 v->shared->listener_registered = true;
1571 return 0;
1572 }
1573
static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
                                  struct vhost_vring_addr *addr,
                                  struct vhost_virtqueue *vq)
1576 {
1577 assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1578 addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1579 addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1580 addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1581 trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1582 addr->avail_user_addr, addr->used_user_addr);
1583 return 0;
1584 }
1585
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
1587 {
1588 return true;
1589 }
1590
1591 const VhostOps vdpa_ops = {
1592 .backend_type = VHOST_BACKEND_TYPE_VDPA,
1593 .vhost_backend_init = vhost_vdpa_init,
1594 .vhost_backend_cleanup = vhost_vdpa_cleanup,
1595 .vhost_set_log_base = vhost_vdpa_set_log_base,
1596 .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1597 .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1598 .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1599 .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1600 .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1601 .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1602 .vhost_get_features = vhost_vdpa_get_features,
1603 .vhost_set_owner = vhost_vdpa_set_owner,
1604 .vhost_set_vring_endian = NULL,
1605 .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1606 .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1607 .vhost_set_features = vhost_vdpa_set_features,
1608 .vhost_reset_device = vhost_vdpa_reset_device,
1609 .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1610 .vhost_set_vring_enable = vhost_vdpa_set_vring_enable,
1611 .vhost_get_config = vhost_vdpa_get_config,
1612 .vhost_set_config = vhost_vdpa_set_config,
1613 .vhost_requires_shm_log = NULL,
1614 .vhost_migration_done = NULL,
1615 .vhost_net_set_mtu = NULL,
1616 .vhost_set_iotlb_callback = NULL,
1617 .vhost_send_device_iotlb_msg = NULL,
1618 .vhost_dev_start = vhost_vdpa_dev_start,
1619 .vhost_get_device_id = vhost_vdpa_get_device_id,
1620 .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1621 .vhost_force_iommu = vhost_vdpa_force_iommu,
1622 .vhost_set_config_call = vhost_vdpa_set_config_call,
1623 .vhost_reset_status = vhost_vdpa_reset_status,
1624 };
1625