xref: /qemu/subprojects/libvduse/libvduse.c (revision 518ac42879560e656a62c3f4ed16569cc169202b)
1 /*
2  * VDUSE (vDPA Device in Userspace) library
3  *
4  * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
5  *   Portions of codes and concepts borrowed from libvhost-user.c, so:
6  *     Copyright IBM, Corp. 2007
7  *     Copyright (c) 2016 Red Hat, Inc.
8  *
9  * Author:
10  *   Xie Yongji <xieyongji@bytedance.com>
11  *   Anthony Liguori <aliguori@us.ibm.com>
12  *   Marc-André Lureau <mlureau@redhat.com>
13  *   Victor Kaplansky <victork@redhat.com>
14  *
15  * This work is licensed under the terms of the GNU GPL, version 2 or
16  * later.  See the COPYING file in the top-level directory.
17  */
18 
19 #ifndef _GNU_SOURCE
20 #define _GNU_SOURCE
21 #endif
22 
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <stdbool.h>
26 #include <stddef.h>
27 #include <errno.h>
28 #include <string.h>
29 #include <assert.h>
30 #include <endian.h>
31 #include <unistd.h>
32 #include <limits.h>
33 #include <fcntl.h>
34 #include <inttypes.h>
35 
36 #include <sys/ioctl.h>
37 #include <sys/eventfd.h>
38 #include <sys/mman.h>
39 
40 #include "include/atomic.h"
41 #include "linux-headers/linux/virtio_ring.h"
42 #include "linux-headers/linux/virtio_config.h"
43 #include "linux-headers/linux/vduse.h"
44 #include "libvduse.h"
45 
/* Virtqueue alignment constant. */
#define VDUSE_VQ_ALIGN 4096
/* Number of slots in the per-device IOVA region table. */
#define MAX_IOVA_REGIONS 256

/* Per-queue reconnect log sizes are rounded up to a multiple of this. */
#define LOG_ALIGNMENT 64

/* Round n down to the nearest multiple of m (relies on integer division). */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round n up to the nearest multiple of m. */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))

/* Branch hint: tell the compiler the condition is expected to be false. */
#ifndef unlikely
#define unlikely(x)   __builtin_expect(!!(x), 0)
#endif
60 
/*
 * Per-descriptor record kept in the reconnect log.
 *
 * The field layout is part of the log file format and must not change.
 */
typedef struct VduseDescStateSplit {
    /* 1 while the descriptor head is popped and not yet pushed back. */
    uint8_t inflight;
    /* Explicit padding so that 'next' sits at offset 6. */
    uint8_t padding[5];
    /* Not referenced by the code visible in this part of the file. */
    uint16_t next;
    /* Value of the queue's submission counter when the head was popped. */
    uint64_t counter;
} VduseDescStateSplit;
67 
68 typedef struct VduseVirtqLogInflight {
69     uint64_t features;
70     uint16_t version;
71     uint16_t desc_num;
72     uint16_t last_batch_head;
73     uint16_t used_idx;
74     VduseDescStateSplit desc[];
75 } VduseVirtqLogInflight;
76 
77 typedef struct VduseVirtqLog {
78     VduseVirtqLogInflight inflight;
79 } VduseVirtqLog;
80 
/* One entry of the in-memory list of descriptors to resubmit. */
typedef struct VduseVirtqInflightDesc {
    /* Descriptor head index. */
    uint16_t index;
    /* Submission counter copied from the log; used as the sort key. */
    uint64_t counter;
} VduseVirtqInflightDesc;
85 
/* Split ring of one virtqueue: IOVAs plus their local mappings. */
typedef struct VduseRing {
    /* Queue size (number of descriptors). */
    unsigned int num;
    /* IOVAs of the three ring areas, as reported for the queue. */
    uint64_t desc_addr;
    uint64_t avail_addr;
    uint64_t used_addr;
    /* Local pointers obtained by translating the IOVAs above. */
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
} VduseRing;
95 
96 struct VduseVirtq {
97     VduseRing vring;
98     uint16_t last_avail_idx;
99     uint16_t shadow_avail_idx;
100     uint16_t used_idx;
101     uint16_t signalled_used;
102     bool signalled_used_valid;
103     int index;
104     int inuse;
105     bool ready;
106     int fd;
107     VduseDev *dev;
108     VduseVirtqInflightDesc *resubmit_list;
109     uint16_t resubmit_num;
110     uint64_t counter;
111     VduseVirtqLog *log;
112 };
113 
/* One mapped IOVA range; a slot is free when mmap_addr is 0. */
typedef struct VduseIovaRegion {
    /* First IOVA covered by the region. */
    uint64_t iova;
    /* Length of the region in bytes. */
    uint64_t size;
    /* Offset of the region's data inside the mapping. */
    uint64_t mmap_offset;
    /* Address returned by mmap(), stored as an integer. */
    uint64_t mmap_addr;
} VduseIovaRegion;
120 
121 struct VduseDev {
122     VduseVirtq *vqs;
123     VduseIovaRegion regions[MAX_IOVA_REGIONS];
124     int num_regions;
125     char *name;
126     uint32_t device_id;
127     uint32_t vendor_id;
128     uint16_t num_queues;
129     uint16_t queue_size;
130     uint64_t features;
131     const VduseOps *ops;
132     int fd;
133     int ctrl_fd;
134     void *priv;
135     void *log;
136 };
137 
138 static inline size_t vduse_vq_log_size(uint16_t queue_size)
139 {
140     return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
141                     sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
142 }
143 
/*
 * Open (creating if needed) the file @filename, resize it to @size bytes
 * and map it shared, read/write.
 *
 * Returns the mapping, or MAP_FAILED if open(), ftruncate() or mmap()
 * fails. The file descriptor is always closed before returning.
 */
static void *vduse_log_get(const char *filename, size_t size)
{
    void *mapping = MAP_FAILED;
    int log_fd = open(filename, O_RDWR | O_CREAT, 0600);

    if (log_fd == -1) {
        return MAP_FAILED;
    }

    if (ftruncate(log_fd, size) != -1) {
        mapping = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                       log_fd, 0);
    }

    close(log_fd);
    return mapping;
}
164 
/* Return true if bit @fbit (0..63) is set in @features. */
static inline bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return ((features >> fbit) & 1ULL) != 0;
}
170 
171 static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
172 {
173     return has_feature(dev->features, fbit);
174 }
175 
176 uint64_t vduse_get_virtio_features(void)
177 {
178     return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
179            (1ULL << VIRTIO_F_VERSION_1) |
180            (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
181            (1ULL << VIRTIO_RING_F_EVENT_IDX) |
182            (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
183 }
184 
185 VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
186 {
187     return vq->dev;
188 }
189 
190 int vduse_queue_get_fd(VduseVirtq *vq)
191 {
192     return vq->fd;
193 }
194 
195 void *vduse_dev_get_priv(VduseDev *dev)
196 {
197     return dev->priv;
198 }
199 
200 VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
201 {
202     return &dev->vqs[index];
203 }
204 
205 int vduse_dev_get_fd(VduseDev *dev)
206 {
207     return dev->fd;
208 }
209 
210 static int vduse_inject_irq(VduseDev *dev, int index)
211 {
212     return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
213 }
214 
215 static int inflight_desc_compare(const void *a, const void *b)
216 {
217     VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
218                            *desc1 = (VduseVirtqInflightDesc *)b;
219 
220     if (desc1->counter > desc0->counter &&
221         (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
222         return 1;
223     }
224 
225     return -1;
226 }
227 
228 static int vduse_queue_check_inflights(VduseVirtq *vq)
229 {
230     int i = 0;
231     VduseDev *dev = vq->dev;
232 
233     vq->used_idx = le16toh(vq->vring.used->idx);
234     vq->resubmit_num = 0;
235     vq->resubmit_list = NULL;
236     vq->counter = 0;
237 
238     if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
239         if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
240             return -1;
241         }
242 
243         vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;
244 
245         barrier();
246 
247         vq->log->inflight.used_idx = vq->used_idx;
248     }
249 
250     for (i = 0; i < vq->log->inflight.desc_num; i++) {
251         if (vq->log->inflight.desc[i].inflight == 1) {
252             vq->inuse++;
253         }
254     }
255 
256     vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
257 
258     if (vq->inuse) {
259         vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
260         if (!vq->resubmit_list) {
261             return -1;
262         }
263 
264         for (i = 0; i < vq->log->inflight.desc_num; i++) {
265             if (vq->log->inflight.desc[i].inflight) {
266                 vq->resubmit_list[vq->resubmit_num].index = i;
267                 vq->resubmit_list[vq->resubmit_num].counter =
268                                         vq->log->inflight.desc[i].counter;
269                 vq->resubmit_num++;
270             }
271         }
272 
273         if (vq->resubmit_num > 1) {
274             qsort(vq->resubmit_list, vq->resubmit_num,
275                   sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
276         }
277         vq->counter = vq->resubmit_list[0].counter + 1;
278     }
279 
280     vduse_inject_irq(dev, vq->index);
281 
282     return 0;
283 }
284 
285 static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
286 {
287     vq->log->inflight.desc[desc_idx].counter = vq->counter++;
288 
289     barrier();
290 
291     vq->log->inflight.desc[desc_idx].inflight = 1;
292 
293     return 0;
294 }
295 
296 static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
297 {
298     vq->log->inflight.last_batch_head = desc_idx;
299 
300     return 0;
301 }
302 
303 static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
304 {
305     vq->log->inflight.desc[desc_idx].inflight = 0;
306 
307     barrier();
308 
309     vq->log->inflight.used_idx = vq->used_idx;
310 
311     return 0;
312 }
313 
314 static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
315                                      uint64_t last)
316 {
317     int i;
318 
319     if (last == start) {
320         return;
321     }
322 
323     for (i = 0; i < MAX_IOVA_REGIONS; i++) {
324         if (!dev->regions[i].mmap_addr) {
325             continue;
326         }
327 
328         if (start <= dev->regions[i].iova &&
329             last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
330             munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
331                    dev->regions[i].mmap_offset + dev->regions[i].size);
332             dev->regions[i].mmap_addr = 0;
333             dev->num_regions--;
334         }
335     }
336 }
337 
338 static int vduse_iova_add_region(VduseDev *dev, int fd,
339                                  uint64_t offset, uint64_t start,
340                                  uint64_t last, int prot)
341 {
342     int i;
343     uint64_t size = last - start + 1;
344     void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
345 
346     if (mmap_addr == MAP_FAILED) {
347         close(fd);
348         return -EINVAL;
349     }
350 
351     for (i = 0; i < MAX_IOVA_REGIONS; i++) {
352         if (!dev->regions[i].mmap_addr) {
353             dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
354             dev->regions[i].mmap_offset = offset;
355             dev->regions[i].iova = start;
356             dev->regions[i].size = size;
357             dev->num_regions++;
358             break;
359         }
360     }
361     assert(i < MAX_IOVA_REGIONS);
362     close(fd);
363 
364     return 0;
365 }
366 
367 static int perm_to_prot(uint8_t perm)
368 {
369     int prot = 0;
370 
371     switch (perm) {
372     case VDUSE_ACCESS_WO:
373         prot |= PROT_WRITE;
374         break;
375     case VDUSE_ACCESS_RO:
376         prot |= PROT_READ;
377         break;
378     case VDUSE_ACCESS_RW:
379         prot |= PROT_READ | PROT_WRITE;
380         break;
381     default:
382         break;
383     }
384 
385     return prot;
386 }
387 
388 static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
389 {
390     int i, ret;
391     struct vduse_iotlb_entry entry;
392 
393     for (i = 0; i < MAX_IOVA_REGIONS; i++) {
394         VduseIovaRegion *r = &dev->regions[i];
395 
396         if (!r->mmap_addr) {
397             continue;
398         }
399 
400         if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
401             if ((iova + *plen) > (r->iova + r->size)) {
402                 *plen = r->iova + r->size - iova;
403             }
404             return (void *)(uintptr_t)(iova - r->iova +
405                    r->mmap_addr + r->mmap_offset);
406         }
407     }
408 
409     entry.start = iova;
410     entry.last = iova + 1;
411     ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
412     if (ret < 0) {
413         return NULL;
414     }
415 
416     if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
417                                entry.last, perm_to_prot(entry.perm))) {
418         return iova_to_va(dev, plen, iova);
419     }
420 
421     return NULL;
422 }
423 
424 static inline uint16_t vring_avail_flags(VduseVirtq *vq)
425 {
426     return le16toh(vq->vring.avail->flags);
427 }
428 
429 static inline uint16_t vring_avail_idx(VduseVirtq *vq)
430 {
431     vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
432 
433     return vq->shadow_avail_idx;
434 }
435 
436 static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
437 {
438     return le16toh(vq->vring.avail->ring[i]);
439 }
440 
441 static inline uint16_t vring_get_used_event(VduseVirtq *vq)
442 {
443     return vring_avail_ring(vq, vq->vring.num);
444 }
445 
446 static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
447                                  unsigned int *head)
448 {
449     /*
450      * Grab the next descriptor number they're advertising, and increment
451      * the index we've seen.
452      */
453     *head = vring_avail_ring(vq, idx % vq->vring.num);
454 
455     /* If their number is silly, that's a fatal mistake. */
456     if (*head >= vq->vring.num) {
457         fprintf(stderr, "Guest says index %u is available\n", *head);
458         return false;
459     }
460 
461     return true;
462 }
463 
464 static int
465 vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
466                                uint64_t addr, size_t len)
467 {
468     struct vring_desc *ori_desc;
469     uint64_t read_len;
470 
471     if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
472         return -1;
473     }
474 
475     if (len == 0) {
476         return -1;
477     }
478 
479     while (len) {
480         read_len = len;
481         ori_desc = iova_to_va(dev, &read_len, addr);
482         if (!ori_desc) {
483             return -1;
484         }
485 
486         memcpy(desc, ori_desc, read_len);
487         len -= read_len;
488         addr += read_len;
489         desc += read_len;
490     }
491 
492     return 0;
493 }
494 
/* Result codes of vduse_queue_read_next_desc(). */
enum {
    VIRTQUEUE_READ_DESC_ERROR = -1, /* next index is out of range */
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};
500 
501 static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
502                                       unsigned int max, unsigned int *next)
503 {
504     /* If this descriptor says it doesn't chain, we're done. */
505     if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
506         return VIRTQUEUE_READ_DESC_DONE;
507     }
508 
509     /* Check they're not leading us off end of descriptors. */
510     *next = desc[i].next;
511     /* Make sure compiler knows to grab that: we don't want it changing! */
512     smp_wmb();
513 
514     if (*next >= max) {
515         fprintf(stderr, "Desc next is %u\n", *next);
516         return VIRTQUEUE_READ_DESC_ERROR;
517     }
518 
519     return VIRTQUEUE_READ_DESC_MORE;
520 }
521 
522 /*
523  * Fetch avail_idx from VQ memory only when we really need to know if
524  * guest has added some buffers.
525  */
526 static bool vduse_queue_empty(VduseVirtq *vq)
527 {
528     if (unlikely(!vq->vring.avail)) {
529         return true;
530     }
531 
532     if (vq->shadow_avail_idx != vq->last_avail_idx) {
533         return false;
534     }
535 
536     return vring_avail_idx(vq) == vq->last_avail_idx;
537 }
538 
539 static bool vduse_queue_should_notify(VduseVirtq *vq)
540 {
541     VduseDev *dev = vq->dev;
542     uint16_t old, new;
543     bool v;
544 
545     /* We need to expose used array entries before checking used event. */
546     smp_mb();
547 
548     /* Always notify when queue is empty (when feature acknowledge) */
549     if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
550         !vq->inuse && vduse_queue_empty(vq)) {
551         return true;
552     }
553 
554     if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
555         return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
556     }
557 
558     v = vq->signalled_used_valid;
559     vq->signalled_used_valid = true;
560     old = vq->signalled_used;
561     new = vq->signalled_used = vq->used_idx;
562     return !v || vring_need_event(vring_get_used_event(vq), new, old);
563 }
564 
565 void vduse_queue_notify(VduseVirtq *vq)
566 {
567     VduseDev *dev = vq->dev;
568 
569     if (unlikely(!vq->vring.avail)) {
570         return;
571     }
572 
573     if (!vduse_queue_should_notify(vq)) {
574         return;
575     }
576 
577     if (vduse_inject_irq(dev, vq->index) < 0) {
578         fprintf(stderr, "Error inject irq for vq %d: %s\n",
579                 vq->index, strerror(errno));
580     }
581 }
582 
583 static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
584 {
585     *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
586 }
587 
588 static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
589                                    struct iovec *iov, unsigned int max_num_sg,
590                                    bool is_write, uint64_t pa, size_t sz)
591 {
592     unsigned num_sg = *p_num_sg;
593     VduseDev *dev = vq->dev;
594 
595     assert(num_sg <= max_num_sg);
596 
597     if (!sz) {
598         fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
599         return false;
600     }
601 
602     while (sz) {
603         uint64_t len = sz;
604 
605         if (num_sg == max_num_sg) {
606             fprintf(stderr,
607                     "virtio: too many descriptors in indirect table\n");
608             return false;
609         }
610 
611         iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
612         if (iov[num_sg].iov_base == NULL) {
613             fprintf(stderr, "virtio: invalid address for buffers\n");
614             return false;
615         }
616         iov[num_sg++].iov_len = len;
617         sz -= len;
618         pa += len;
619     }
620 
621     *p_num_sg = num_sg;
622     return true;
623 }
624 
625 static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
626                                        unsigned in_num)
627 {
628     VduseVirtqElement *elem;
629     size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
630     size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
631     size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
632 
633     assert(sz >= sizeof(VduseVirtqElement));
634     elem = malloc(out_sg_end);
635     if (!elem) {
636         return NULL;
637     }
638     elem->out_num = out_num;
639     elem->in_num = in_num;
640     elem->in_sg = (void *)elem + in_sg_ofs;
641     elem->out_sg = (void *)elem + out_sg_ofs;
642     return elem;
643 }
644 
645 static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
646 {
647     struct vring_desc *desc = vq->vring.desc;
648     VduseDev *dev = vq->dev;
649     uint64_t desc_addr, read_len;
650     unsigned int desc_len;
651     unsigned int max = vq->vring.num;
652     unsigned int i = idx;
653     VduseVirtqElement *elem;
654     struct iovec iov[VIRTQUEUE_MAX_SIZE];
655     struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
656     unsigned int out_num = 0, in_num = 0;
657     int rc;
658 
659     if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
660         if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
661             fprintf(stderr, "Invalid size for indirect buffer table\n");
662             return NULL;
663         }
664 
665         /* loop over the indirect descriptor table */
666         desc_addr = le64toh(desc[i].addr);
667         desc_len = le32toh(desc[i].len);
668         max = desc_len / sizeof(struct vring_desc);
669         read_len = desc_len;
670         desc = iova_to_va(dev, &read_len, desc_addr);
671         if (unlikely(desc && read_len != desc_len)) {
672             /* Failed to use zero copy */
673             desc = NULL;
674             if (!vduse_queue_read_indirect_desc(dev, desc_buf,
675                                                 desc_addr,
676                                                 desc_len)) {
677                 desc = desc_buf;
678             }
679         }
680         if (!desc) {
681             fprintf(stderr, "Invalid indirect buffer table\n");
682             return NULL;
683         }
684         i = 0;
685     }
686 
687     /* Collect all the descriptors */
688     do {
689         if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
690             if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
691                                              VIRTQUEUE_MAX_SIZE - out_num,
692                                              true, le64toh(desc[i].addr),
693                                              le32toh(desc[i].len))) {
694                 return NULL;
695             }
696         } else {
697             if (in_num) {
698                 fprintf(stderr, "Incorrect order for descriptors\n");
699                 return NULL;
700             }
701             if (!vduse_queue_map_single_desc(vq, &out_num, iov,
702                                              VIRTQUEUE_MAX_SIZE, false,
703                                              le64toh(desc[i].addr),
704                                              le32toh(desc[i].len))) {
705                 return NULL;
706             }
707         }
708 
709         /* If we've got too many, that implies a descriptor loop. */
710         if ((in_num + out_num) > max) {
711             fprintf(stderr, "Looped descriptor\n");
712             return NULL;
713         }
714         rc = vduse_queue_read_next_desc(desc, i, max, &i);
715     } while (rc == VIRTQUEUE_READ_DESC_MORE);
716 
717     if (rc == VIRTQUEUE_READ_DESC_ERROR) {
718         fprintf(stderr, "read descriptor error\n");
719         return NULL;
720     }
721 
722     /* Now copy what we have collected and mapped */
723     elem = vduse_queue_alloc_element(sz, out_num, in_num);
724     if (!elem) {
725         fprintf(stderr, "read descriptor error\n");
726         return NULL;
727     }
728     elem->index = idx;
729     for (i = 0; i < out_num; i++) {
730         elem->out_sg[i] = iov[i];
731     }
732     for (i = 0; i < in_num; i++) {
733         elem->in_sg[i] = iov[out_num + i];
734     }
735 
736     return elem;
737 }
738 
739 void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
740 {
741     unsigned int head;
742     VduseVirtqElement *elem;
743     VduseDev *dev = vq->dev;
744     int i;
745 
746     if (unlikely(!vq->vring.avail)) {
747         return NULL;
748     }
749 
750     if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
751         i = (--vq->resubmit_num);
752         elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
753 
754         if (!vq->resubmit_num) {
755             free(vq->resubmit_list);
756             vq->resubmit_list = NULL;
757         }
758 
759         return elem;
760     }
761 
762     if (vduse_queue_empty(vq)) {
763         return NULL;
764     }
765     /* Needed after virtio_queue_empty() */
766     smp_rmb();
767 
768     if (vq->inuse >= vq->vring.num) {
769         fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
770         return NULL;
771     }
772 
773     if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
774         return NULL;
775     }
776 
777     if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
778         vring_set_avail_event(vq, vq->last_avail_idx);
779     }
780 
781     elem = vduse_queue_map_desc(vq, head, sz);
782 
783     if (!elem) {
784         return NULL;
785     }
786 
787     vq->inuse++;
788 
789     vduse_queue_inflight_get(vq, head);
790 
791     return elem;
792 }
793 
794 static inline void vring_used_write(VduseVirtq *vq,
795                                     struct vring_used_elem *uelem, int i)
796 {
797     struct vring_used *used = vq->vring.used;
798 
799     used->ring[i] = *uelem;
800 }
801 
802 static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
803                              unsigned int len, unsigned int idx)
804 {
805     struct vring_used_elem uelem;
806 
807     if (unlikely(!vq->vring.used)) {
808         return;
809     }
810 
811     idx = (idx + vq->used_idx) % vq->vring.num;
812 
813     uelem.id = htole32(elem->index);
814     uelem.len = htole32(len);
815     vring_used_write(vq, &uelem, idx);
816 }
817 
818 static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
819 {
820     vq->vring.used->idx = htole16(val);
821     vq->used_idx = val;
822 }
823 
824 static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
825 {
826     uint16_t old, new;
827 
828     if (unlikely(!vq->vring.used)) {
829         return;
830     }
831 
832     /* Make sure buffer is written before we update index. */
833     smp_wmb();
834 
835     old = vq->used_idx;
836     new = old + count;
837     vring_used_idx_set(vq, new);
838     vq->inuse -= count;
839     if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
840         vq->signalled_used_valid = false;
841     }
842 }
843 
844 void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
845                       unsigned int len)
846 {
847     vduse_queue_fill(vq, elem, len, 0);
848     vduse_queue_inflight_pre_put(vq, elem->index);
849     vduse_queue_flush(vq, 1);
850     vduse_queue_inflight_post_put(vq, elem->index);
851 }
852 
853 static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
854                                     uint64_t avail_addr, uint64_t used_addr)
855 {
856     struct VduseDev *dev = vq->dev;
857     uint64_t len;
858 
859     len = sizeof(struct vring_desc);
860     vq->vring.desc = iova_to_va(dev, &len, desc_addr);
861     if (len != sizeof(struct vring_desc)) {
862         return -EINVAL;
863     }
864 
865     len = sizeof(struct vring_avail);
866     vq->vring.avail = iova_to_va(dev, &len, avail_addr);
867     if (len != sizeof(struct vring_avail)) {
868         return -EINVAL;
869     }
870 
871     len = sizeof(struct vring_used);
872     vq->vring.used = iova_to_va(dev, &len, used_addr);
873     if (len != sizeof(struct vring_used)) {
874         return -EINVAL;
875     }
876 
877     if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
878         fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
879         return -EINVAL;
880     }
881 
882     return 0;
883 }
884 
885 static void vduse_queue_enable(VduseVirtq *vq)
886 {
887     struct VduseDev *dev = vq->dev;
888     struct vduse_vq_info vq_info;
889     struct vduse_vq_eventfd vq_eventfd;
890     int fd;
891 
892     vq_info.index = vq->index;
893     if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
894         fprintf(stderr, "Failed to get vq[%d] info: %s\n",
895                 vq->index, strerror(errno));
896         return;
897     }
898 
899     if (!vq_info.ready) {
900         return;
901     }
902 
903     vq->vring.num = vq_info.num;
904     vq->vring.desc_addr = vq_info.desc_addr;
905     vq->vring.avail_addr = vq_info.driver_addr;
906     vq->vring.used_addr = vq_info.device_addr;
907 
908     if (vduse_queue_update_vring(vq, vq_info.desc_addr,
909                                  vq_info.driver_addr, vq_info.device_addr)) {
910         fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
911         return;
912     }
913 
914     fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
915     if (fd < 0) {
916         fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
917         return;
918     }
919 
920     vq_eventfd.index = vq->index;
921     vq_eventfd.fd = fd;
922     if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
923         fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
924         close(fd);
925         return;
926     }
927 
928     vq->fd = fd;
929     vq->signalled_used_valid = false;
930     vq->ready = true;
931 
932     if (vduse_queue_check_inflights(vq)) {
933         fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
934         close(fd);
935         return;
936     }
937 
938     dev->ops->enable_queue(dev, vq);
939 }
940 
941 static void vduse_queue_disable(VduseVirtq *vq)
942 {
943     struct VduseDev *dev = vq->dev;
944     struct vduse_vq_eventfd eventfd;
945 
946     if (!vq->ready) {
947         return;
948     }
949 
950     dev->ops->disable_queue(dev, vq);
951 
952     eventfd.index = vq->index;
953     eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
954     ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
955     close(vq->fd);
956 
957     assert(vq->inuse == 0);
958 
959     vq->vring.num = 0;
960     vq->vring.desc_addr = 0;
961     vq->vring.avail_addr = 0;
962     vq->vring.used_addr = 0;
963     vq->vring.desc = 0;
964     vq->vring.avail = 0;
965     vq->vring.used = 0;
966     vq->ready = false;
967     vq->fd = -1;
968 }
969 
970 static void vduse_dev_start_dataplane(VduseDev *dev)
971 {
972     int i;
973 
974     if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
975         fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
976         return;
977     }
978     assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
979 
980     for (i = 0; i < dev->num_queues; i++) {
981         vduse_queue_enable(&dev->vqs[i]);
982     }
983 }
984 
985 static void vduse_dev_stop_dataplane(VduseDev *dev)
986 {
987     size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
988     int i;
989 
990     for (i = 0; i < dev->num_queues; i++) {
991         vduse_queue_disable(&dev->vqs[i]);
992     }
993     if (dev->log) {
994         memset(dev->log, 0, log_size);
995     }
996     dev->features = 0;
997     vduse_iova_remove_region(dev, 0, ULONG_MAX);
998 }
999 
1000 int vduse_dev_handler(VduseDev *dev)
1001 {
1002     struct vduse_dev_request req;
1003     struct vduse_dev_response resp = { 0 };
1004     VduseVirtq *vq;
1005     int i, ret;
1006 
1007     ret = read(dev->fd, &req, sizeof(req));
1008     if (ret != sizeof(req)) {
1009         fprintf(stderr, "Read request error [%d]: %s\n",
1010                 ret, strerror(errno));
1011         return -errno;
1012     }
1013     resp.request_id = req.request_id;
1014 
1015     switch (req.type) {
1016     case VDUSE_GET_VQ_STATE:
1017         vq = &dev->vqs[req.vq_state.index];
1018         resp.vq_state.split.avail_index = vq->last_avail_idx;
1019         resp.result = VDUSE_REQ_RESULT_OK;
1020         break;
1021     case VDUSE_SET_STATUS:
1022         if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
1023             vduse_dev_start_dataplane(dev);
1024         } else if (req.s.status == 0) {
1025             vduse_dev_stop_dataplane(dev);
1026         }
1027         resp.result = VDUSE_REQ_RESULT_OK;
1028         break;
1029     case VDUSE_UPDATE_IOTLB:
1030         /* The iova will be updated by iova_to_va() later, so just remove it */
1031         vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
1032         for (i = 0; i < dev->num_queues; i++) {
1033             VduseVirtq *vq = &dev->vqs[i];
1034             if (vq->ready) {
1035                 if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
1036                                              vq->vring.avail_addr,
1037                                              vq->vring.used_addr)) {
1038                     fprintf(stderr, "Failed to update vring for vq[%d]\n",
1039                             vq->index);
1040                 }
1041             }
1042         }
1043         resp.result = VDUSE_REQ_RESULT_OK;
1044         break;
1045     default:
1046         resp.result = VDUSE_REQ_RESULT_FAILED;
1047         break;
1048     }
1049 
1050     ret = write(dev->fd, &resp, sizeof(resp));
1051     if (ret != sizeof(resp)) {
1052         fprintf(stderr, "Write request %d error [%d]: %s\n",
1053                 req.type, ret, strerror(errno));
1054         return -errno;
1055     }
1056     return 0;
1057 }
1058 
1059 int vduse_dev_update_config(VduseDev *dev, uint32_t size,
1060                             uint32_t offset, char *buffer)
1061 {
1062     int ret;
1063     struct vduse_config_data *data;
1064 
1065     data = malloc(offsetof(struct vduse_config_data, buffer) + size);
1066     if (!data) {
1067         return -ENOMEM;
1068     }
1069 
1070     data->offset = offset;
1071     data->length = size;
1072     memcpy(data->buffer, buffer, size);
1073 
1074     ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
1075     free(data);
1076 
1077     if (ret) {
1078         return -errno;
1079     }
1080 
1081     if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
1082         return -errno;
1083     }
1084 
1085     return 0;
1086 }
1087 
1088 int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
1089 {
1090     VduseVirtq *vq = &dev->vqs[index];
1091     struct vduse_vq_config vq_config = { 0 };
1092 
1093     if (max_size > VIRTQUEUE_MAX_SIZE) {
1094         return -EINVAL;
1095     }
1096 
1097     vq_config.index = vq->index;
1098     vq_config.max_size = max_size;
1099 
1100     if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
1101         return -errno;
1102     }
1103 
1104     vduse_queue_enable(vq);
1105 
1106     return 0;
1107 }
1108 
1109 int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
1110 {
1111 
1112     size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1113     void *log;
1114     int i;
1115 
1116     dev->log = log = vduse_log_get(filename, log_size);
1117     if (log == MAP_FAILED) {
1118         fprintf(stderr, "Failed to get vduse log\n");
1119         return -EINVAL;
1120     }
1121 
1122     for (i = 0; i < dev->num_queues; i++) {
1123         dev->vqs[i].log = log;
1124         dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
1125         log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
1126     }
1127 
1128     return 0;
1129 }
1130 
1131 static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
1132 {
1133     VduseVirtq *vqs;
1134     int i;
1135 
1136     vqs = calloc(sizeof(VduseVirtq), num_queues);
1137     if (!vqs) {
1138         return -ENOMEM;
1139     }
1140 
1141     for (i = 0; i < num_queues; i++) {
1142         vqs[i].index = i;
1143         vqs[i].dev = dev;
1144         vqs[i].fd = -1;
1145     }
1146     dev->vqs = vqs;
1147 
1148     return 0;
1149 }
1150 
1151 static int vduse_dev_init(VduseDev *dev, const char *name,
1152                           uint16_t num_queues, const VduseOps *ops,
1153                           void *priv)
1154 {
1155     char *dev_path, *dev_name;
1156     int ret, fd;
1157 
1158     dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
1159     if (!dev_path) {
1160         return -ENOMEM;
1161     }
1162     sprintf(dev_path, "/dev/vduse/%s", name);
1163 
1164     fd = open(dev_path, O_RDWR);
1165     free(dev_path);
1166     if (fd < 0) {
1167         fprintf(stderr, "Failed to open vduse dev %s: %s\n",
1168                 name, strerror(errno));
1169         return -errno;
1170     }
1171 
1172     if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1173         fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1174         close(fd);
1175         return -errno;
1176     }
1177 
1178     dev_name = strdup(name);
1179     if (!dev_name) {
1180         close(fd);
1181         return -ENOMEM;
1182     }
1183 
1184     ret = vduse_dev_init_vqs(dev, num_queues);
1185     if (ret) {
1186         free(dev_name);
1187         close(fd);
1188         return ret;
1189     }
1190 
1191     dev->name = dev_name;
1192     dev->num_queues = num_queues;
1193     dev->fd = fd;
1194     dev->ops = ops;
1195     dev->priv = priv;
1196 
1197     return 0;
1198 }
1199 
1200 static inline bool vduse_name_is_invalid(const char *name)
1201 {
1202     return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
1203 }
1204 
1205 VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
1206                                  const VduseOps *ops, void *priv)
1207 {
1208     VduseDev *dev;
1209     int ret;
1210 
1211     if (!ops || !ops->enable_queue || !ops->disable_queue) {
1212         fprintf(stderr, "Invalid parameter for vduse\n");
1213         return NULL;
1214     }
1215 
1216     dev = calloc(sizeof(VduseDev), 1);
1217     if (!dev) {
1218         fprintf(stderr, "Failed to allocate vduse device\n");
1219         return NULL;
1220     }
1221 
1222     if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1223         fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1224         free(dev);
1225         return NULL;
1226     }
1227 
1228     ret = vduse_dev_init_vqs(dev, num_queues);
1229     if (ret) {
1230         fprintf(stderr, "Failed to init vqs\n");
1231         free(dev);
1232         return NULL;
1233     }
1234 
1235     dev->num_queues = num_queues;
1236     dev->fd = fd;
1237     dev->ops = ops;
1238     dev->priv = priv;
1239 
1240     return dev;
1241 }
1242 
1243 VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
1244                                    const VduseOps *ops, void *priv)
1245 {
1246     VduseDev *dev;
1247     int ret;
1248 
1249     if (!name || vduse_name_is_invalid(name) || !ops ||
1250         !ops->enable_queue || !ops->disable_queue) {
1251         fprintf(stderr, "Invalid parameter for vduse\n");
1252         return NULL;
1253     }
1254 
1255     dev = calloc(sizeof(VduseDev), 1);
1256     if (!dev) {
1257         fprintf(stderr, "Failed to allocate vduse device\n");
1258         return NULL;
1259     }
1260 
1261     ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1262     if (ret < 0) {
1263         fprintf(stderr, "Failed to init vduse device %s: %s\n",
1264                 name, strerror(-ret));
1265         free(dev);
1266         return NULL;
1267     }
1268 
1269     return dev;
1270 }
1271 
1272 VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
1273                            uint32_t vendor_id, uint64_t features,
1274                            uint16_t num_queues, uint32_t config_size,
1275                            char *config, const VduseOps *ops, void *priv)
1276 {
1277     VduseDev *dev;
1278     int ret, ctrl_fd;
1279     uint64_t version;
1280     struct vduse_dev_config *dev_config;
1281     size_t size = offsetof(struct vduse_dev_config, config);
1282 
1283     if (!name || vduse_name_is_invalid(name) ||
1284         !has_feature(features,  VIRTIO_F_VERSION_1) || !config ||
1285         !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
1286         fprintf(stderr, "Invalid parameter for vduse\n");
1287         return NULL;
1288     }
1289 
1290     dev = calloc(sizeof(VduseDev), 1);
1291     if (!dev) {
1292         fprintf(stderr, "Failed to allocate vduse device\n");
1293         return NULL;
1294     }
1295 
1296     ctrl_fd = open("/dev/vduse/control", O_RDWR);
1297     if (ctrl_fd < 0) {
1298         fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
1299                 strerror(errno));
1300         goto err_ctrl;
1301     }
1302 
1303     version = VDUSE_API_VERSION;
1304     if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
1305         fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
1306                 version, strerror(errno));
1307         goto err_dev;
1308     }
1309 
1310     dev_config = calloc(size + config_size, 1);
1311     if (!dev_config) {
1312         fprintf(stderr, "Failed to allocate config space\n");
1313         goto err_dev;
1314     }
1315 
1316     assert(!vduse_name_is_invalid(name));
1317     strcpy(dev_config->name, name);
1318     dev_config->device_id = device_id;
1319     dev_config->vendor_id = vendor_id;
1320     dev_config->features = features;
1321     dev_config->vq_num = num_queues;
1322     dev_config->vq_align = VDUSE_VQ_ALIGN;
1323     dev_config->config_size = config_size;
1324     memcpy(dev_config->config, config, config_size);
1325 
1326     ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
1327     free(dev_config);
1328     if (ret && errno != EEXIST) {
1329         fprintf(stderr, "Failed to create vduse device %s: %s\n",
1330                 name, strerror(errno));
1331         goto err_dev;
1332     }
1333     dev->ctrl_fd = ctrl_fd;
1334 
1335     ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1336     if (ret < 0) {
1337         fprintf(stderr, "Failed to init vduse device %s: %s\n",
1338                 name, strerror(-ret));
1339         goto err;
1340     }
1341 
1342     return dev;
1343 err:
1344     ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
1345 err_dev:
1346     close(ctrl_fd);
1347 err_ctrl:
1348     free(dev);
1349 
1350     return NULL;
1351 }
1352 
1353 int vduse_dev_destroy(VduseDev *dev)
1354 {
1355     size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1356     int i, ret = 0;
1357 
1358     if (dev->log) {
1359         munmap(dev->log, log_size);
1360     }
1361     for (i = 0; i < dev->num_queues; i++) {
1362         free(dev->vqs[i].resubmit_list);
1363     }
1364     free(dev->vqs);
1365     if (dev->fd >= 0) {
1366         close(dev->fd);
1367         dev->fd = -1;
1368     }
1369     if (dev->ctrl_fd >= 0) {
1370         if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
1371             ret = -errno;
1372         }
1373         close(dev->ctrl_fd);
1374         dev->ctrl_fd = -1;
1375     }
1376     free(dev->name);
1377     free(dev);
1378 
1379     return ret;
1380 }
1381