xref: /qemu/hw/vfio-user/proxy.c (revision 3bdb738b734c77f93f93f8119c8f6ba8a9c5947c)
1 /*
2  * vfio protocol over a UNIX socket.
3  *
4  * Copyright © 2018, 2021 Oracle and/or its affiliates.
5  *
6  * SPDX-License-Identifier: GPL-2.0-or-later
7  */
8 
9 #include "qemu/osdep.h"
10 #include <sys/ioctl.h>
11 
12 #include "hw/vfio/vfio-device.h"
13 #include "hw/vfio-user/proxy.h"
14 #include "hw/vfio-user/trace.h"
15 #include "qapi/error.h"
16 #include "qobject/qdict.h"
17 #include "qobject/qjson.h"
18 #include "qobject/qnum.h"
19 #include "qemu/error-report.h"
20 #include "qemu/lockable.h"
21 #include "qemu/main-loop.h"
22 #include "system/iothread.h"
23 
24 static int wait_time = 5000;   /* wait up to 5 sec for busy servers */
25 static IOThread *vfio_user_iothread;
26 
27 static void vfio_user_shutdown(VFIOUserProxy *proxy);
28 static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
29                                      VFIOUserFDs *fds);
30 static VFIOUserFDs *vfio_user_getfds(int numfds);
31 static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg);
32 
33 static void vfio_user_recv(void *opaque);
34 static void vfio_user_send(void *opaque);
35 static void vfio_user_cb(void *opaque);
36 
37 static void vfio_user_request(void *opaque);
38 
/* Mark a message header as an error reply carrying errno value 'err'. */
static inline void vfio_user_set_error(VFIOUserHdr *hdr, uint32_t err)
{
    hdr->flags |= VFIO_USER_ERROR;
    hdr->error_reply = err;
}
44 
45 /*
46  * Functions called by main, CPU, or iothread threads
47  */
48 
/*
 * Stop all communication on the proxy socket: shut down the read side
 * and remove our read/write handlers from the iothread's AioContext.
 */
static void vfio_user_shutdown(VFIOUserProxy *proxy)
{
    qio_channel_shutdown(proxy->ioc, QIO_CHANNEL_SHUTDOWN_READ, NULL);
    qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx, NULL,
                                   proxy->ctx, NULL, NULL);
}
55 
/*
 * Write one message (header + body + optional FDs) to the proxy socket.
 *
 * Same return values as qio_channel_writev_full():
 *
 * QIO_CHANNEL_ERR_BLOCK: *errp not set
 * -1: *errp will be populated
 * otherwise: bytes written
 */
static ssize_t vfio_user_send_qio(VFIOUserProxy *proxy, VFIOUserMsg *msg,
                                  Error **errp)
{
    VFIOUserFDs *fds =  msg->fds;
    struct iovec iov = {
        .iov_base = msg->hdr,
        .iov_len = msg->hdr->size,
    };
    size_t numfds = 0;
    int *fdp = NULL;
    ssize_t ret;

    /* attach any outbound FDs as ancillary data */
    if (fds != NULL && fds->send_fds != 0) {
        numfds = fds->send_fds;
        fdp = fds->fds;
    }

    ret = qio_channel_writev_full(proxy->ioc, &iov, 1, fdp, numfds, 0, errp);

    if (ret == -1) {
        /* hard write error: mark the message failed and close our side */
        vfio_user_set_error(msg->hdr, EIO);
        vfio_user_shutdown(proxy);
    }
    trace_vfio_user_send_write(msg->hdr->id, ret);

    return ret;
}
90 
91 static VFIOUserMsg *vfio_user_getmsg(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
92                                      VFIOUserFDs *fds)
93 {
94     VFIOUserMsg *msg;
95 
96     msg = QTAILQ_FIRST(&proxy->free);
97     if (msg != NULL) {
98         QTAILQ_REMOVE(&proxy->free, msg, next);
99     } else {
100         msg = g_malloc0(sizeof(*msg));
101         qemu_cond_init(&msg->cv);
102     }
103 
104     msg->hdr = hdr;
105     msg->fds = fds;
106     return msg;
107 }
108 
109 /*
110  * Recycle a message list entry to the free list.
111  */
112 static void vfio_user_recycle(VFIOUserProxy *proxy, VFIOUserMsg *msg)
113 {
114     if (msg->type == VFIO_MSG_NONE) {
115         error_printf("vfio_user_recycle - freeing free msg\n");
116         return;
117     }
118 
119     /* free msg buffer if no one is waiting to consume the reply */
120     if (msg->type == VFIO_MSG_NOWAIT || msg->type == VFIO_MSG_ASYNC) {
121         g_free(msg->hdr);
122         if (msg->fds != NULL) {
123             g_free(msg->fds);
124         }
125     }
126 
127     msg->type = VFIO_MSG_NONE;
128     msg->hdr = NULL;
129     msg->fds = NULL;
130     msg->complete = false;
131     msg->pending = false;
132     QTAILQ_INSERT_HEAD(&proxy->free, msg, next);
133 }
134 
135 static VFIOUserFDs *vfio_user_getfds(int numfds)
136 {
137     VFIOUserFDs *fds = g_malloc0(sizeof(*fds) + (numfds * sizeof(int)));
138 
139     fds->fds = (int *)((char *)fds + sizeof(*fds));
140 
141     return fds;
142 }
143 
144 /*
145  * Functions only called by iothread
146  */
147 
/*
 * Process a received message.
 *
 * Runs on the iothread with the proxy lock held.
 */
static void vfio_user_process(VFIOUserProxy *proxy, VFIOUserMsg *msg,
                              bool isreply)
{

    /*
     * Replies signal a waiter, if none just check for errors
     * and free the message buffer.
     *
     * Requests get queued for the BH.
     */
    if (isreply) {
        msg->complete = true;
        if (msg->type == VFIO_MSG_WAIT) {
            /* a caller is blocked in vfio_user_send_wait() on this cv */
            qemu_cond_signal(&msg->cv);
        } else {
            /* nowait/async reply: nobody will consume it, log errors here */
            if (msg->hdr->flags & VFIO_USER_ERROR) {
                error_printf("vfio_user_process: error reply on async ");
                error_printf("request command %x error %s\n",
                             msg->hdr->command,
                             strerror(msg->hdr->error_reply));
            }
            /* youngest nowait msg has been ack'd */
            if (proxy->last_nowait == msg) {
                proxy->last_nowait = NULL;
            }
            vfio_user_recycle(proxy, msg);
        }
    } else {
        /* inbound request: hand off to the request BH */
        QTAILQ_INSERT_TAIL(&proxy->incoming, msg, next);
        qemu_bh_schedule(proxy->req_bh);
    }
}
183 
/*
 * Complete a partial message read.
 *
 * proxy->part_recv holds the message whose body was only partly read;
 * proxy->recv_left is how many bytes of it are still outstanding.
 * Returns 1 when the message was completed and processed, 0/-1 on
 * EOF/error, QIO_CHANNEL_ERR_BLOCK if the socket would block again.
 */
static int vfio_user_complete(VFIOUserProxy *proxy, Error **errp)
{
    VFIOUserMsg *msg = proxy->part_recv;
    size_t msgleft = proxy->recv_left;
    bool isreply;
    char *data;
    int ret;

    /* resume writing where the previous read stopped */
    data = (char *)msg->hdr + (msg->hdr->size - msgleft);
    while (msgleft > 0) {
        ret = qio_channel_read(proxy->ioc, data, msgleft, errp);

        /* error or would block */
        if (ret <= 0) {
            /* try for rest on next iteration */
            if (ret == QIO_CHANNEL_ERR_BLOCK) {
                proxy->recv_left = msgleft;
            }
            return ret;
        }
        trace_vfio_user_recv_read(msg->hdr->id, ret);

        msgleft -= ret;
        data += ret;
    }

    /*
     * Read complete message, process it.
     */
    proxy->part_recv = NULL;
    proxy->recv_left = 0;
    isreply = (msg->hdr->flags & VFIO_USER_TYPE) == VFIO_USER_REPLY;
    vfio_user_process(proxy, msg, isreply);

    /* return positive value */
    return 1;
}
224 
/*
 * Receive and process one incoming message.
 *
 * For replies, find matching outgoing request and wake any waiters.
 * For requests, queue in incoming list and run request BH.
 *
 * Returns 0 on success, QIO_CHANNEL_ERR_BLOCK when the socket would
 * block, -1 on a fatal or per-message error (*errp populated).
 */
static int vfio_user_recv_one(VFIOUserProxy *proxy, Error **errp)
{
    VFIOUserMsg *msg = NULL;
    g_autofree int *fdp = NULL;
    VFIOUserFDs *reqfds;
    VFIOUserHdr hdr;
    struct iovec iov = {
        .iov_base = &hdr,
        .iov_len = sizeof(hdr),
    };
    bool isreply = false;
    int i, ret;
    size_t msgleft, numfds = 0;
    char *data = NULL;
    char *buf = NULL;

    /*
     * Complete any partial reads
     */
    if (proxy->part_recv != NULL) {
        ret = vfio_user_complete(proxy, errp);

        /* still not complete, try later */
        if (ret == QIO_CHANNEL_ERR_BLOCK) {
            return ret;
        }

        if (ret <= 0) {
            goto fatal;
        }
        /* else fall into reading another msg */
    }

    /*
     * Read header (may also deliver FDs as ancillary data into fdp)
     */
    ret = qio_channel_readv_full(proxy->ioc, &iov, 1, &fdp, &numfds, 0,
                                 errp);
    if (ret == QIO_CHANNEL_ERR_BLOCK) {
        return ret;
    }

    /* read error or other side closed connection */
    if (ret <= 0) {
        goto fatal;
    }

    if (ret < sizeof(hdr)) {
        error_setg(errp, "short read of header");
        goto fatal;
    }

    /*
     * Validate header
     */
    if (hdr.size < sizeof(VFIOUserHdr)) {
        error_setg(errp, "bad header size");
        goto fatal;
    }
    switch (hdr.flags & VFIO_USER_TYPE) {
    case VFIO_USER_REQUEST:
        isreply = false;
        break;
    case VFIO_USER_REPLY:
        isreply = true;
        break;
    default:
        error_setg(errp, "unknown message type");
        goto fatal;
    }
    trace_vfio_user_recv_hdr(proxy->sockname, hdr.id, hdr.command, hdr.size,
                             hdr.flags);

    /*
     * For replies, find the matching pending request.
     * For requests, reap incoming FDs.
     */
    if (isreply) {
        QTAILQ_FOREACH(msg, &proxy->pending, next) {
            if (hdr.id == msg->id) {
                break;
            }
        }
        if (msg == NULL) {
            error_setg(errp, "unexpected reply");
            goto err;
        }
        QTAILQ_REMOVE(&proxy->pending, msg, next);

        /*
         * Process any received FDs
         */
        if (numfds != 0) {
            /* requester must have pre-sized a large enough FD array */
            if (msg->fds == NULL || msg->fds->recv_fds < numfds) {
                error_setg(errp, "unexpected FDs");
                goto err;
            }
            msg->fds->recv_fds = numfds;
            memcpy(msg->fds->fds, fdp, numfds * sizeof(int));
        }
    } else {
        if (numfds != 0) {
            reqfds = vfio_user_getfds(numfds);
            memcpy(reqfds->fds, fdp, numfds * sizeof(int));
        } else {
            reqfds = NULL;
        }
    }

    /*
     * Put the whole message into a single buffer.
     */
    if (isreply) {
        /* reply body is copied into the requester's own buffer */
        if (hdr.size > msg->rsize) {
            error_setg(errp, "reply larger than recv buffer");
            goto err;
        }
        *msg->hdr = hdr;
        data = (char *)msg->hdr + sizeof(hdr);
    } else {
        /* requests get a freshly allocated buffer and msg wrapper */
        buf = g_malloc0(hdr.size);
        memcpy(buf, &hdr, sizeof(hdr));
        data = buf + sizeof(hdr);
        msg = vfio_user_getmsg(proxy, (VFIOUserHdr *)buf, reqfds);
        msg->type = VFIO_MSG_REQ;
    }

    /*
     * Read rest of message.
     */
    msgleft = hdr.size - sizeof(hdr);
    while (msgleft > 0) {
        ret = qio_channel_read(proxy->ioc, data, msgleft, errp);

        /* prepare to complete read on next iteration */
        if (ret == QIO_CHANNEL_ERR_BLOCK) {
            proxy->part_recv = msg;
            proxy->recv_left = msgleft;
            return ret;
        }

        if (ret <= 0) {
            goto fatal;
        }
        trace_vfio_user_recv_read(hdr.id, ret);

        msgleft -= ret;
        data += ret;
    }

    vfio_user_process(proxy, msg, isreply);
    return 0;

    /*
     * fatal means the other side closed or we don't trust the stream
     * err means this message is corrupt
     */
fatal:
    vfio_user_shutdown(proxy);
    proxy->state = VFIO_PROXY_ERROR;

    /* set error if server side closed */
    if (ret == 0) {
        error_setg(errp, "server closed socket");
    }

err:
    /* close any FDs we reaped; fdp itself is g_autofree */
    for (i = 0; i < numfds; i++) {
        close(fdp[i]);
    }
    if (isreply && msg != NULL) {
        /* force an error to keep sending thread from hanging */
        vfio_user_set_error(msg->hdr, EINVAL);
        msg->complete = true;
        qemu_cond_signal(&msg->cv);
    }
    return -1;
}
409 
410 static void vfio_user_recv(void *opaque)
411 {
412     VFIOUserProxy *proxy = opaque;
413 
414     QEMU_LOCK_GUARD(&proxy->lock);
415 
416     if (proxy->state == VFIO_PROXY_CONNECTED) {
417         Error *local_err = NULL;
418 
419         while (vfio_user_recv_one(proxy, &local_err) == 0) {
420             ;
421         }
422 
423         if (local_err != NULL) {
424             error_report_err(local_err);
425         }
426     }
427 }
428 
429 /*
430  * Send a single message, same return semantics as vfio_user_send_qio().
431  *
432  * Sent async messages are freed, others are moved to pending queue.
433  */
434 static ssize_t vfio_user_send_one(VFIOUserProxy *proxy, Error **errp)
435 {
436     VFIOUserMsg *msg;
437     ssize_t ret;
438 
439     msg = QTAILQ_FIRST(&proxy->outgoing);
440     ret = vfio_user_send_qio(proxy, msg, errp);
441     if (ret < 0) {
442         return ret;
443     }
444 
445     QTAILQ_REMOVE(&proxy->outgoing, msg, next);
446     if (msg->type == VFIO_MSG_ASYNC) {
447         vfio_user_recycle(proxy, msg);
448     } else {
449         QTAILQ_INSERT_TAIL(&proxy->pending, msg, next);
450         msg->pending = true;
451     }
452 
453     return ret;
454 }
455 
/*
 * Send messages from outgoing queue when the socket buffer has space.
 * If we deplete 'outgoing', remove ourselves from the poll list.
 */
static void vfio_user_send(void *opaque)
{
    VFIOUserProxy *proxy = opaque;

    QEMU_LOCK_GUARD(&proxy->lock);

    if (proxy->state == VFIO_PROXY_CONNECTED) {
        while (!QTAILQ_EMPTY(&proxy->outgoing)) {
            Error *local_err = NULL;
            int ret;

            ret = vfio_user_send_one(proxy, &local_err);

            /* socket full again - wait for the next writable event */
            if (ret == QIO_CHANNEL_ERR_BLOCK) {
                return;
            } else if (ret == -1) {
                error_report_err(local_err);
                return;
            }
        }
        /* queue drained: stop polling for writable, keep reading */
        qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx,
                                       vfio_user_recv, NULL, NULL, proxy);
    }
}
484 
/*
 * Oneshot BH run on the proxy's iothread context by
 * vfio_user_disconnect(); its completion proves the iothread holds
 * no further references to the proxy, and the close_cv signal lets
 * the disconnecting thread proceed with teardown.
 */
static void vfio_user_cb(void *opaque)
{
    VFIOUserProxy *proxy = opaque;

    QEMU_LOCK_GUARD(&proxy->lock);

    proxy->state = VFIO_PROXY_CLOSED;
    qemu_cond_signal(&proxy->close_cv);
}
494 
495 
496 /*
497  * Functions called by main or CPU threads
498  */
499 
500 /*
501  * Process incoming requests.
502  *
503  * The bus-specific callback has the form:
504  *    request(opaque, msg)
505  * where 'opaque' was specified in vfio_user_set_handler
506  * and 'msg' is the inbound message.
507  *
508  * The callback is responsible for disposing of the message buffer,
509  * usually by re-using it when calling vfio_send_reply or vfio_send_error,
510  * both of which free their message buffer when the reply is sent.
511  *
512  * If the callback uses a new buffer, it needs to free the old one.
513  */
static void vfio_user_request(void *opaque)
{
    VFIOUserProxy *proxy = opaque;
    VFIOUserMsgQ new, free;
    VFIOUserMsg *msg, *m1;

    /* reap all incoming */
    QTAILQ_INIT(&new);
    WITH_QEMU_LOCK_GUARD(&proxy->lock) {
        QTAILQ_FOREACH_SAFE(msg, &proxy->incoming, next, m1) {
            QTAILQ_REMOVE(&proxy->incoming, msg, next);
            QTAILQ_INSERT_TAIL(&new, msg, next);
        }
    }

    /* process list - run callbacks without holding the proxy lock */
    QTAILQ_INIT(&free);
    QTAILQ_FOREACH_SAFE(msg, &new, next, m1) {
        QTAILQ_REMOVE(&new, msg, next);
        trace_vfio_user_recv_request(msg->hdr->command);
        proxy->request(proxy->req_arg, msg);
        QTAILQ_INSERT_HEAD(&free, msg, next);
    }

    /* free list - recycle the msg wrappers (callback disposed of hdrs) */
    WITH_QEMU_LOCK_GUARD(&proxy->lock) {
        QTAILQ_FOREACH_SAFE(msg, &free, next, m1) {
            vfio_user_recycle(proxy, msg);
        }
    }
}
545 
546 /*
547  * Messages are queued onto the proxy's outgoing list.
548  *
549  * It handles 3 types of messages:
550  *
551  * async messages - replies and posted writes
552  *
553  * There will be no reply from the server, so message
554  * buffers are freed after they're sent.
555  *
556  * nowait messages - map/unmap during address space transactions
557  *
558  * These are also sent async, but a reply is expected so that
559  * vfio_wait_reqs() can wait for the youngest nowait request.
560  * They transition from the outgoing list to the pending list
561  * when sent, and are freed when the reply is received.
562  *
563  * wait messages - all other requests
564  *
565  * The reply to these messages is waited for by their caller.
566  * They also transition from outgoing to pending when sent, but
567  * the message buffer is returned to the caller with the reply
568  * contents.  The caller is responsible for freeing these messages.
569  *
570  * As an optimization, if the outgoing list and the socket send
571  * buffer are empty, the message is sent inline instead of being
572  * added to the outgoing list.  The rest of the transitions are
573  * unchanged.
574  */
/*
 * Queue 'msg' for sending, or send it inline if possible.
 * Called with proxy->lock held.  Returns false only on a hard send
 * error, with *errp populated.
 */
static bool vfio_user_send_queued(VFIOUserProxy *proxy, VFIOUserMsg *msg,
                                  Error **errp)
{
    int ret;

    /*
     * Unsent outgoing msgs - add to tail
     */
    if (!QTAILQ_EMPTY(&proxy->outgoing)) {
        QTAILQ_INSERT_TAIL(&proxy->outgoing, msg, next);
        return true;
    }

    /*
     * Try inline - if blocked, queue it and kick send poller
     */
    if (proxy->flags & VFIO_PROXY_FORCE_QUEUED) {
        /* testing/debug knob: behave as if the socket were full */
        ret = QIO_CHANNEL_ERR_BLOCK;
    } else {
        ret = vfio_user_send_qio(proxy, msg, errp);
    }

    if (ret == QIO_CHANNEL_ERR_BLOCK) {
        QTAILQ_INSERT_HEAD(&proxy->outgoing, msg, next);
        /* start polling for writable so vfio_user_send() drains the queue */
        qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx,
                                       vfio_user_recv, proxy->ctx,
                                       vfio_user_send, proxy);
        return true;
    }
    if (ret == -1) {
        return false;
    }

    /*
     * Sent - free async, add others to pending
     */
    if (msg->type == VFIO_MSG_ASYNC) {
        vfio_user_recycle(proxy, msg);
    } else {
        QTAILQ_INSERT_TAIL(&proxy->pending, msg, next);
        msg->pending = true;
    }

    return true;
}
620 
621 /*
622  * Returns false if we did not successfully receive a reply message, in which
623  * case @errp will be populated.
624  *
625  * In either case, the caller must free @hdr and @fds if needed.
626  */
/*
 * Send a request and block until its reply arrives (or the wait
 * times out).  On success the reply contents are in *hdr.
 */
bool vfio_user_send_wait(VFIOUserProxy *proxy, VFIOUserHdr *hdr,
                         VFIOUserFDs *fds, int rsize, Error **errp)
{
    VFIOUserMsg *msg;
    bool ok = false;

    /* NO_REPLY requests never get an answer - waiting would hang */
    if (hdr->flags & VFIO_USER_NO_REPLY) {
        error_setg_errno(errp, EINVAL, "%s on NO_REPLY message", __func__);
        return false;
    }

    qemu_mutex_lock(&proxy->lock);

    msg = vfio_user_getmsg(proxy, hdr, fds);
    msg->id = hdr->id;
    /* rsize == 0 means the reply fits in the request buffer */
    msg->rsize = rsize ? rsize : hdr->size;
    msg->type = VFIO_MSG_WAIT;

    ok = vfio_user_send_queued(proxy, msg, errp);

    if (ok) {
        /* loop guards against spurious wakeups */
        while (!msg->complete) {
            if (!qemu_cond_timedwait(&msg->cv, &proxy->lock, wait_time)) {
                VFIOUserMsgQ *list;

                /* timed out: unlink the request from whichever list holds it */
                list = msg->pending ? &proxy->pending : &proxy->outgoing;
                QTAILQ_REMOVE(list, msg, next);
                error_setg_errno(errp, ETIMEDOUT,
                                 "timed out waiting for reply");
                ok = false;
                break;
            }
        }
    }

    /* WAIT msgs keep their hdr: only the wrapper goes back to the free list */
    vfio_user_recycle(proxy, msg);

    qemu_mutex_unlock(&proxy->lock);

    return ok;
}
668 
/* all live proxies; the shared iothread is destroyed when this empties */
static QLIST_HEAD(, VFIOUserProxy) vfio_user_sockets =
    QLIST_HEAD_INITIALIZER(vfio_user_sockets);
671 
/*
 * Connect to a vfio-user server over a UNIX socket and allocate a
 * proxy for it.  The iothread servicing all proxies is created on
 * first use.  Returns NULL with *errp set on failure.
 */
VFIOUserProxy *vfio_user_connect_dev(SocketAddress *addr, Error **errp)
{
    VFIOUserProxy *proxy;
    QIOChannelSocket *sioc;
    QIOChannel *ioc;
    char *sockname;

    if (addr->type != SOCKET_ADDRESS_TYPE_UNIX) {
        error_setg(errp, "vfio_user_connect - bad address family");
        return NULL;
    }
    sockname = addr->u.q_unix.path;

    sioc = qio_channel_socket_new();
    ioc = QIO_CHANNEL(sioc);
    if (qio_channel_socket_connect_sync(sioc, addr, errp)) {
        object_unref(OBJECT(ioc));
        return NULL;
    }
    /* all proxy I/O is event-driven from the iothread */
    qio_channel_set_blocking(ioc, false, NULL);

    proxy = g_malloc0(sizeof(VFIOUserProxy));
    proxy->sockname = g_strdup_printf("unix:%s", sockname);
    proxy->ioc = ioc;

    /* init defaults - may be lowered/overridden during version negotiation */
    proxy->max_xfer_size = VFIO_USER_DEF_MAX_XFER;
    proxy->max_send_fds = VFIO_USER_DEF_MAX_FDS;
    proxy->max_dma = VFIO_USER_DEF_MAP_MAX;
    proxy->dma_pgsizes = VFIO_USER_DEF_PGSIZE;
    proxy->max_bitmap = VFIO_USER_DEF_MAX_BITMAP;
    proxy->migr_pgsize = VFIO_USER_DEF_PGSIZE;

    proxy->flags = VFIO_PROXY_CLIENT;
    proxy->state = VFIO_PROXY_CONNECTED;

    qemu_mutex_init(&proxy->lock);
    qemu_cond_init(&proxy->close_cv);

    if (vfio_user_iothread == NULL) {
        /*
         * NOTE(review): iothread_create() failure is not checked here;
         * if it returns NULL, iothread_get_aio_context() below would
         * dereference it - confirm this path cannot fail in practice.
         */
        vfio_user_iothread = iothread_create("VFIO user", errp);
    }

    proxy->ctx = iothread_get_aio_context(vfio_user_iothread);
    proxy->req_bh = qemu_bh_new(vfio_user_request, proxy);

    QTAILQ_INIT(&proxy->outgoing);
    QTAILQ_INIT(&proxy->incoming);
    QTAILQ_INIT(&proxy->free);
    QTAILQ_INIT(&proxy->pending);
    QLIST_INSERT_HEAD(&vfio_user_sockets, proxy, next);

    return proxy;
}
726 
/*
 * Register the bus-specific request callback and start polling the
 * socket for incoming messages on the proxy's iothread context.
 */
void vfio_user_set_handler(VFIODevice *vbasedev,
                           void (*handler)(void *opaque, VFIOUserMsg *msg),
                           void *req_arg)
{
    VFIOUserProxy *proxy = vbasedev->proxy;

    proxy->request = handler;
    proxy->req_arg = req_arg;
    qio_channel_set_aio_fd_handler(proxy->ioc, proxy->ctx,
                                   vfio_user_recv, NULL, NULL, proxy);
}
738 
/*
 * Tear down a proxy: shut down the socket, free all queued messages,
 * wait for the iothread to drop its references, then free the proxy.
 * Destroys the shared iothread when the last proxy goes away.
 */
void vfio_user_disconnect(VFIOUserProxy *proxy)
{
    VFIOUserMsg *r1, *r2;

    qemu_mutex_lock(&proxy->lock);

    /* our side is quitting */
    if (proxy->state == VFIO_PROXY_CONNECTED) {
        vfio_user_shutdown(proxy);
        if (!QTAILQ_EMPTY(&proxy->pending)) {
            error_printf("vfio_user_disconnect: outstanding requests\n");
        }
    }
    object_unref(OBJECT(proxy->ioc));
    proxy->ioc = NULL;
    qemu_bh_delete(proxy->req_bh);
    proxy->req_bh = NULL;

    proxy->state = VFIO_PROXY_CLOSING;
    /* drain every message list, destroying each wrapper's cond var */
    QTAILQ_FOREACH_SAFE(r1, &proxy->outgoing, next, r2) {
        qemu_cond_destroy(&r1->cv);
        QTAILQ_REMOVE(&proxy->outgoing, r1, next);
        g_free(r1);
    }
    QTAILQ_FOREACH_SAFE(r1, &proxy->incoming, next, r2) {
        qemu_cond_destroy(&r1->cv);
        QTAILQ_REMOVE(&proxy->incoming, r1, next);
        g_free(r1);
    }
    QTAILQ_FOREACH_SAFE(r1, &proxy->pending, next, r2) {
        qemu_cond_destroy(&r1->cv);
        QTAILQ_REMOVE(&proxy->pending, r1, next);
        g_free(r1);
    }
    QTAILQ_FOREACH_SAFE(r1, &proxy->free, next, r2) {
        qemu_cond_destroy(&r1->cv);
        QTAILQ_REMOVE(&proxy->free, r1, next);
        g_free(r1);
    }

    /*
     * Make sure the iothread isn't blocking anywhere
     * with a ref to this proxy by waiting for a BH
     * handler to run after the proxy fd handlers were
     * deleted above.
     */
    aio_bh_schedule_oneshot(proxy->ctx, vfio_user_cb, proxy);
    qemu_cond_wait(&proxy->close_cv, &proxy->lock);

    /* we now hold the only ref to proxy */
    qemu_mutex_unlock(&proxy->lock);
    qemu_cond_destroy(&proxy->close_cv);
    qemu_mutex_destroy(&proxy->lock);

    QLIST_REMOVE(proxy, next);
    if (QLIST_EMPTY(&vfio_user_sockets)) {
        iothread_destroy(vfio_user_iothread);
        vfio_user_iothread = NULL;
    }

    g_free(proxy->sockname);
    g_free(proxy);
}
802 
/*
 * Fill in a request header: assign the next message ID and force the
 * type bits to REQUEST regardless of what 'flags' carried.
 */
void vfio_user_request_msg(VFIOUserHdr *hdr, uint16_t cmd,
                           uint32_t size, uint32_t flags)
{
    /* atomic so concurrent senders never share an ID; wraps at 16 bits */
    static uint16_t next_id;

    hdr->id = qatomic_fetch_inc(&next_id);
    hdr->command = cmd;
    hdr->size = size;
    hdr->flags = (flags & ~VFIO_USER_TYPE) | VFIO_USER_REQUEST;
    hdr->error_reply = 0;
}
814 
/* one negotiable capability: its JSON key and validation callback */
struct cap_entry {
    const char *name;
    bool (*check)(VFIOUserProxy *proxy, QObject *qobj, Error **errp);
};
819 
820 static bool caps_parse(VFIOUserProxy *proxy, QDict *qdict,
821                        struct cap_entry caps[], Error **errp)
822 {
823     QObject *qobj;
824     struct cap_entry *p;
825 
826     for (p = caps; p->name != NULL; p++) {
827         qobj = qdict_get(qdict, p->name);
828         if (qobj != NULL) {
829             if (!p->check(proxy, qobj, errp)) {
830                 return false;
831             }
832             qdict_del(qdict, p->name);
833         }
834     }
835 
836     /* warning, for now */
837     if (qdict_size(qdict) != 0) {
838         warn_report("spurious capabilities");
839     }
840     return true;
841 }
842 
/* Validate and apply the server's migration dirty-page size. */
static bool check_migr_pgsize(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
{
    QNum *qn = qobject_to(QNum, qobj);
    uint64_t pgsize;

    if (qn == NULL || !qnum_get_try_uint(qn, &pgsize)) {
        error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZE);
        return false;
    }

    /* must be a multiple of the (power-of-2) default page size */
    if (pgsize & (VFIO_USER_DEF_PGSIZE - 1)) {
        error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsize);
        return false;
    }

    proxy->migr_pgsize = pgsize;
    return true;
}
862 
863 static bool check_bitmap(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
864 {
865     QNum *qn = qobject_to(QNum, qobj);
866     uint64_t bitmap_size;
867 
868     if (qn == NULL || !qnum_get_try_uint(qn, &bitmap_size)) {
869         error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_BITMAP);
870         return false;
871     }
872 
873     /* can only lower it */
874     if (bitmap_size > VFIO_USER_DEF_MAX_BITMAP) {
875         error_setg(errp, "%s too large", VFIO_USER_CAP_MAX_BITMAP);
876         return false;
877     }
878 
879     proxy->max_bitmap = bitmap_size;
880     return true;
881 }
882 
/* capabilities nested inside the migration object */
static struct cap_entry caps_migr[] = {
    { VFIO_USER_CAP_PGSIZE, check_migr_pgsize },
    { VFIO_USER_CAP_MAX_BITMAP, check_bitmap },
    { NULL }
};
888 
889 static bool check_max_fds(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
890 {
891     QNum *qn = qobject_to(QNum, qobj);
892     uint64_t max_send_fds;
893 
894     if (qn == NULL || !qnum_get_try_uint(qn, &max_send_fds) ||
895         max_send_fds > VFIO_USER_MAX_MAX_FDS) {
896         error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS);
897         return false;
898     }
899     proxy->max_send_fds = max_send_fds;
900     return true;
901 }
902 
903 static bool check_max_xfer(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
904 {
905     QNum *qn = qobject_to(QNum, qobj);
906     uint64_t max_xfer_size;
907 
908     if (qn == NULL || !qnum_get_try_uint(qn, &max_xfer_size) ||
909         max_xfer_size > VFIO_USER_MAX_MAX_XFER) {
910         error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_XFER);
911         return false;
912     }
913     proxy->max_xfer_size = max_xfer_size;
914     return true;
915 }
916 
/* Validate and apply the server's supported DMA page sizes. */
static bool check_pgsizes(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
{
    QNum *qn = qobject_to(QNum, qobj);
    uint64_t pgsizes;

    if (qn == NULL || !qnum_get_try_uint(qn, &pgsizes)) {
        error_setg(errp, "malformed %s", VFIO_USER_CAP_PGSIZES);
        return false;
    }

    /* must be a multiple of the (power-of-2) default page size */
    if (pgsizes & (VFIO_USER_DEF_PGSIZE - 1)) {
        error_setg(errp, "pgsize 0x%"PRIx64" too small", pgsizes);
        return false;
    }

    proxy->dma_pgsizes = pgsizes;
    return true;
}
936 
937 static bool check_max_dma(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
938 {
939     QNum *qn = qobject_to(QNum, qobj);
940     uint64_t max_dma;
941 
942     if (qn == NULL || !qnum_get_try_uint(qn, &max_dma)) {
943         error_setg(errp, "malformed %s", VFIO_USER_CAP_MAP_MAX);
944         return false;
945     }
946 
947     /* can only lower it */
948     if (max_dma > VFIO_USER_DEF_MAP_MAX) {
949         error_setg(errp, "%s too large", VFIO_USER_CAP_MAP_MAX);
950         return false;
951     }
952 
953     proxy->max_dma = max_dma;
954     return true;
955 }
956 
957 static bool check_migr(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
958 {
959     QDict *qdict = qobject_to(QDict, qobj);
960 
961     if (qdict == NULL) {
962         error_setg(errp, "malformed %s", VFIO_USER_CAP_MAX_FDS);
963         return true;
964     }
965     return caps_parse(proxy, qdict, caps_migr, errp);
966 }
967 
/* top-level entries of the capability object */
static struct cap_entry caps_cap[] = {
    { VFIO_USER_CAP_MAX_FDS, check_max_fds },
    { VFIO_USER_CAP_MAX_XFER, check_max_xfer },
    { VFIO_USER_CAP_PGSIZES, check_pgsizes },
    { VFIO_USER_CAP_MAP_MAX, check_max_dma },
    { VFIO_USER_CAP_MIGR, check_migr },
    { NULL }
};
976 
977 static bool check_cap(VFIOUserProxy *proxy, QObject *qobj, Error **errp)
978 {
979    QDict *qdict = qobject_to(QDict, qobj);
980 
981     if (qdict == NULL) {
982         error_setg(errp, "malformed %s", VFIO_USER_CAP);
983         return false;
984     }
985     return caps_parse(proxy, qdict, caps_cap, errp);
986 }
987 
/* root of the capability tree for protocol version 0.x */
static struct cap_entry ver_0_0[] = {
    { VFIO_USER_CAP, check_cap },
    { NULL }
};
992 
/*
 * Parse and apply the server's capability string (a JSON object).
 *
 * 'minor' is currently unused: all 0.x minor versions negotiate
 * capabilities identically.
 */
static bool caps_check(VFIOUserProxy *proxy, int minor, const char *caps,
                       Error **errp)
{
    QObject *qobj;
    QDict *qdict;
    bool ret;

    qobj = qobject_from_json(caps, NULL);
    if (qobj == NULL) {
        error_setg(errp, "malformed capabilities %s", caps);
        return false;
    }
    qdict = qobject_to(QDict, qobj);
    if (qdict == NULL) {
        error_setg(errp, "capabilities %s not an object", caps);
        qobject_unref(qobj);
        return false;
    }
    ret = caps_parse(proxy, qdict, ver_0_0, errp);

    qobject_unref(qobj);
    return ret;
}
1016 
/*
 * Build the client's capability string: a JSON object advertising the
 * maximum/default value for every negotiable capability.  The caller
 * frees the returned GString.
 */
static GString *caps_json(void)
{
    QDict *dict = qdict_new();
    QDict *capdict = qdict_new();
    QDict *migdict = qdict_new();
    GString *str;

    /* nested migration capabilities */
    qdict_put_int(migdict, VFIO_USER_CAP_PGSIZE, VFIO_USER_DEF_PGSIZE);
    qdict_put_int(migdict, VFIO_USER_CAP_MAX_BITMAP, VFIO_USER_DEF_MAX_BITMAP);
    qdict_put_obj(capdict, VFIO_USER_CAP_MIGR, QOBJECT(migdict));

    qdict_put_int(capdict, VFIO_USER_CAP_MAX_FDS, VFIO_USER_MAX_MAX_FDS);
    qdict_put_int(capdict, VFIO_USER_CAP_MAX_XFER, VFIO_USER_DEF_MAX_XFER);
    qdict_put_int(capdict, VFIO_USER_CAP_PGSIZES, VFIO_USER_DEF_PGSIZE);
    qdict_put_int(capdict, VFIO_USER_CAP_MAP_MAX, VFIO_USER_DEF_MAP_MAX);

    qdict_put_obj(dict, VFIO_USER_CAP, QOBJECT(capdict));

    str = qobject_to_json(QOBJECT(dict));
    qobject_unref(dict);
    return str;
}
1039 
/*
 * Exchange VFIO_USER_VERSION with the server: send our version and
 * capability string, then validate the server's reply and apply its
 * negotiated capability values.  Returns false with *errp set on any
 * mismatch or malformed reply.
 */
bool vfio_user_validate_version(VFIOUserProxy *proxy, Error **errp)
{
    g_autofree VFIOUserVersion *msgp = NULL;
    GString *caps;
    char *reply;
    int size, caplen;

    /* message = header + version fields + NUL-terminated JSON caps */
    caps = caps_json();
    caplen = caps->len + 1;
    size = sizeof(*msgp) + caplen;
    msgp = g_malloc0(size);

    vfio_user_request_msg(&msgp->hdr, VFIO_USER_VERSION, size, 0);
    msgp->major = VFIO_USER_MAJOR_VER;
    msgp->minor = VFIO_USER_MINOR_VER;
    memcpy(&msgp->capabilities, caps->str, caplen);
    g_string_free(caps, true);
    trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities);

    /* the reply overwrites msgp in place */
    if (!vfio_user_send_wait(proxy, &msgp->hdr, NULL, 0, errp)) {
        return false;
    }

    if (msgp->hdr.flags & VFIO_USER_ERROR) {
        error_setg_errno(errp, msgp->hdr.error_reply, "version reply");
        return false;
    }

    /* major must match exactly; server's minor must not exceed ours */
    if (msgp->major != VFIO_USER_MAJOR_VER ||
        msgp->minor > VFIO_USER_MINOR_VER) {
        error_setg(errp, "incompatible server version");
        return false;
    }

    /* the server's capability string must be NUL-terminated */
    reply = msgp->capabilities;
    if (reply[msgp->hdr.size - sizeof(*msgp) - 1] != '\0') {
        error_setg(errp, "corrupt version reply");
        return false;
    }

    if (!caps_check(proxy, msgp->minor, reply, errp)) {
        return false;
    }

    /* second trace logs the server's values, now in msgp */
    trace_vfio_user_version(msgp->major, msgp->minor, msgp->capabilities);
    return true;
}
1087