xref: /src/sys/dev/hyperv/hvsock/hv_sock.c (revision 19261079b74319502c6ffa1249920079f0f69a72)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2020 Microsoft Corp.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/bus.h>
34 #include <sys/domain.h>
35 #include <sys/lock.h>
36 #include <sys/kernel.h>
37 #include <sys/types.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/protosw.h>
43 #include <sys/socket.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysproto.h>
46 #include <sys/systm.h>
47 #include <sys/sockbuf.h>
48 #include <sys/sx.h>
49 #include <sys/uio.h>
50 
51 #include <net/vnet.h>
52 
53 #include <dev/hyperv/vmbus/vmbus_reg.h>
54 
55 #include "hv_sock.h"
56 
57 #define HVSOCK_DBG_NONE			0x0
58 #define HVSOCK_DBG_INFO			0x1
59 #define HVSOCK_DBG_ERR			0x2
60 #define HVSOCK_DBG_VERBOSE		0x3
61 
62 
63 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
64 
65 static int hvs_dbg_level;
66 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
67     0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
68 
69 
/*
 * Print a printf-style message when the hvs_dbg_level tunable is at or
 * above 'level'.  The message arguments are not evaluated when the
 * message is suppressed.
 */
#define HVSOCK_DBG(level, ...)						\
	do {								\
		if ((level) <= hvs_dbg_level)				\
			printf(__VA_ARGS__);				\
	} while (0)
74 
75 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
76 
77 static int hvs_dom_probe(void);
78 
79 /* The MTU is 16KB per host side's design */
80 #define HVSOCK_MTU_SIZE		(1024 * 16)
81 #define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
82 
83 #define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
84 
85 #define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
86 					 roundup2(payload_len, 8) + \
87 					 sizeof(uint64_t))
88 
89 
90 static struct domain		hv_socket_domain;
91 
92 /*
93  * HyperV Transport sockets
94  */
95 static struct pr_usrreqs	hvs_trans_usrreqs = {
96 	.pru_attach =		hvs_trans_attach,
97 	.pru_bind =		hvs_trans_bind,
98 	.pru_listen =		hvs_trans_listen,
99 	.pru_accept =		hvs_trans_accept,
100 	.pru_connect =		hvs_trans_connect,
101 	.pru_peeraddr =		hvs_trans_peeraddr,
102 	.pru_sockaddr =		hvs_trans_sockaddr,
103 	.pru_soreceive =	hvs_trans_soreceive,
104 	.pru_sosend =		hvs_trans_sosend,
105 	.pru_disconnect =	hvs_trans_disconnect,
106 	.pru_close =		hvs_trans_close,
107 	.pru_detach =		hvs_trans_detach,
108 	.pru_shutdown =		hvs_trans_shutdown,
109 	.pru_abort =		hvs_trans_abort,
110 };
111 
112 /*
113  * Definitions of protocols supported in HyperV socket domain
114  */
115 static struct protosw		hv_socket_protosw[] = {
116 {
117 	.pr_type =		SOCK_STREAM,
118 	.pr_domain =		&hv_socket_domain,
119 	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
120 	.pr_flags =		PR_CONNREQUIRED,
121 	.pr_init =		hvs_trans_init,
122 	.pr_usrreqs =		&hvs_trans_usrreqs,
123 },
124 };
125 
126 static struct domain		hv_socket_domain = {
127 	.dom_family =		AF_HYPERV,
128 	.dom_name =		"hyperv",
129 	.dom_probe =		hvs_dom_probe,
130 	.dom_protosw =		hv_socket_protosw,
131 	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
132 };
133 
134 VNET_DOMAIN_SET(hv_socket_);
135 
136 #define MAX_PORT			((uint32_t)0xFFFFFFFF)
137 #define MIN_PORT			((uint32_t)0x0)
138 
139 /* 00000000-facb-11e6-bd58-64006a7986d3 */
140 static const struct hyperv_guid srv_id_template = {
141 	.hv_guid = {
142 	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
143 	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
144 };
145 
146 static int		hvsock_br_callback(void *, int, void *);
147 static uint32_t		hvsock_canread_check(struct hvs_pcb *);
148 static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
149 static int		hvsock_send_data(struct vmbus_channel *chan,
150     struct uio *uio, uint32_t to_write, struct sockbuf *sb);
151 
152 
153 
154 /* Globals */
155 static struct sx		hvs_trans_socks_sx;
156 static struct mtx		hvs_trans_socks_mtx;
157 static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
158 static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
159 static uint32_t			previous_auto_bound_port;
160 
161 static void
162 hvsock_print_guid(struct hyperv_guid *guid)
163 {
164 	unsigned char *p = (unsigned char *)guid;
165 
166 	HVSOCK_DBG(HVSOCK_DBG_INFO,
167 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
168 	    *(unsigned int *)p,
169 	    *((unsigned short *) &p[4]),
170 	    *((unsigned short *) &p[6]),
171 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
172 }
173 
174 static bool
175 is_valid_srv_id(const struct hyperv_guid *id)
176 {
177 	return !memcmp(&id->hv_guid[4],
178 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
179 }
180 
/*
 * Extract the port number from a service GUID: the port is the leading
 * unsigned int of the GUID storage.
 */
static unsigned int
get_port_by_srv_id(const struct hyperv_guid *srv_id)
{
	const unsigned int *portp = (const unsigned int *)srv_id;

	return (*portp);
}
186 
/*
 * Store 'port' into the leading unsigned int of a service GUID, leaving
 * the remaining bytes untouched.
 */
static void
set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
{
	unsigned int *portp = (unsigned int *)srv_id;

	*portp = port;
}
192 
193 
194 static void
195 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
196 {
197 	struct hvs_pcb *p = NULL;
198 
199 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
200 
201 	if (!pcb)
202 		return;
203 
204 	if (list & HVS_LIST_BOUND) {
205 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
206 			if  (p == pcb)
207 				LIST_REMOVE(p, bound_next);
208 	}
209 
210 	if (list & HVS_LIST_CONNECTED) {
211 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
212 			if (p == pcb)
213 				LIST_REMOVE(pcb, connected_next);
214 	}
215 }
216 
217 static void
218 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
219 {
220 	struct hvs_pcb *pcb = so2hvspcb(so);
221 
222 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
223 
224 	__hvs_remove_pcb_from_list(pcb, list);
225 }
226 
227 static void
228 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
229 {
230 	struct hvs_pcb *pcb = so2hvspcb(so);
231 
232 	if (list & HVS_LIST_BOUND)
233 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
234 		   pcb, bound_next);
235 
236 	if (list & HVS_LIST_CONNECTED)
237 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
238 		   pcb, connected_next);
239 }
240 
241 void
242 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
243 {
244 	if (!so || !so->so_pcb) {
245 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
246 		    "%s: socket or so_pcb is null\n", __func__);
247 		return;
248 	}
249 
250 	mtx_lock(&hvs_trans_socks_mtx);
251 	__hvs_remove_socket_from_list(so, list);
252 	mtx_unlock(&hvs_trans_socks_mtx);
253 }
254 
255 static void
256 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
257 {
258 	if (!so || !so->so_pcb) {
259 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
260 		    "%s: socket or so_pcb is null\n", __func__);
261 		return;
262 	}
263 
264 	mtx_lock(&hvs_trans_socks_mtx);
265 	__hvs_insert_socket_on_list(so, list);
266 	mtx_unlock(&hvs_trans_socks_mtx);
267 }
268 
269 static struct socket *
270 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
271 {
272 	struct hvs_pcb *p = NULL;
273 
274 	if (list & HVS_LIST_BOUND)
275 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
276 			if (p->so != NULL &&
277 			    addr->hvs_port == p->local_addr.hvs_port)
278 				return p->so;
279 
280 	if (list & HVS_LIST_CONNECTED)
281 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
282 			if (p->so != NULL &&
283 			    addr->hvs_port == p->local_addr.hvs_port)
284 				return p->so;
285 
286 	return NULL;
287 }
288 
289 static struct socket *
290 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
291 {
292 	struct socket *s = NULL;
293 
294 	mtx_lock(&hvs_trans_socks_mtx);
295 	s = __hvs_find_socket_on_list(addr, list);
296 	mtx_unlock(&hvs_trans_socks_mtx);
297 
298 	return s;
299 }
300 
301 static inline void
302 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
303 {
304 	memset(addr, 0, sizeof(*addr));
305 	addr->sa_family = AF_HYPERV;
306 	addr->sa_len = sizeof(*addr);
307 	addr->hvs_port = port;
308 }
309 
/*
 * Initialize 'addr' from a service GUID: the port is taken from the
 * GUID via get_port_by_srv_id().
 */
void
hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
{
	unsigned int port;

	port = get_port_by_srv_id(svr_id);
	hvs_addr_set(addr, port);
}
315 
316 int
317 hvs_trans_lock(void)
318 {
319 	sx_xlock(&hvs_trans_socks_sx);
320 	return (0);
321 }
322 
323 void
324 hvs_trans_unlock(void)
325 {
326 	sx_xunlock(&hvs_trans_socks_sx);
327 }
328 
329 static int
330 hvs_dom_probe(void)
331 {
332 
333 	/* Don't even give us a chance to attach on non-HyperV. */
334 	if (vm_guest != VM_GUEST_HV)
335 		return (ENXIO);
336 	return (0);
337 }
338 
339 void
340 hvs_trans_init(void)
341 {
342 	/* Skip initialization of globals for non-default instances. */
343 	if (!IS_DEFAULT_VNET(curvnet))
344 		return;
345 
346 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
347 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
348 
349 	/* Initialize Globals */
350 	previous_auto_bound_port = MAX_PORT;
351 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
352 	mtx_init(&hvs_trans_socks_mtx,
353 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
354 	LIST_INIT(&hvs_trans_bound_socks);
355 	LIST_INIT(&hvs_trans_connected_socks);
356 }
357 
/*
 * Called in two cases:
 * 1) When user calls socket();
 * 2) When we accept a new incoming connection and call sonewconn().
 */
363 int
364 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
365 {
366 	struct hvs_pcb *pcb = so2hvspcb(so);
367 
368 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
369 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
370 
371 	if (so->so_type != SOCK_STREAM)
372 		return (ESOCKTNOSUPPORT);
373 
374 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
375 		return (EPROTONOSUPPORT);
376 
377 	if (pcb != NULL)
378 		return (EISCONN);
379 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
380 	if (pcb == NULL)
381 		return (ENOMEM);
382 
383 	pcb->so = so;
384 	so->so_pcb = (void *)pcb;
385 
386 	return (0);
387 }
388 
389 void
390 hvs_trans_detach(struct socket *so)
391 {
392 	struct hvs_pcb *pcb;
393 
394 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
395 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
396 
397 	(void) hvs_trans_lock();
398 	pcb = so2hvspcb(so);
399 	if (pcb == NULL) {
400 		hvs_trans_unlock();
401 		return;
402 	}
403 
404 	if (SOLISTENING(so)) {
405 		bzero(pcb, sizeof(*pcb));
406 		free(pcb, M_HVSOCK);
407 	}
408 
409 	so->so_pcb = NULL;
410 
411 	hvs_trans_unlock();
412 }
413 
414 int
415 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
416 {
417 	struct hvs_pcb *pcb = so2hvspcb(so);
418 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
419 	int error = 0;
420 
421 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
422 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
423 
424 	if (sa == NULL) {
425 		return (EINVAL);
426 	}
427 
428 	if (pcb == NULL) {
429 		return (EINVAL);
430 	}
431 
432 	if (sa->sa_family != AF_HYPERV) {
433 		HVSOCK_DBG(HVSOCK_DBG_ERR,
434 		    "%s: Not supported, sa_family is %u\n",
435 		    __func__, sa->sa_family);
436 		return (EAFNOSUPPORT);
437 	}
438 	if (sa->sa_len != sizeof(*sa)) {
439 		HVSOCK_DBG(HVSOCK_DBG_ERR,
440 		    "%s: Not supported, sa_len is %u\n",
441 		    __func__, sa->sa_len);
442 		return (EINVAL);
443 	}
444 
445 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
446 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
447 
448 	mtx_lock(&hvs_trans_socks_mtx);
449 	if (__hvs_find_socket_on_list(sa,
450 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
451 		error = EADDRINUSE;
452 	} else {
453 		/*
454 		 * The address is available for us to bind.
455 		 * Add socket to the bound list.
456 		 */
457 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
458 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
459 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
460 	}
461 	mtx_unlock(&hvs_trans_socks_mtx);
462 
463 	return (error);
464 }
465 
466 int
467 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
468 {
469 	struct hvs_pcb *pcb = so2hvspcb(so);
470 	struct socket *bound_so;
471 	int error;
472 
473 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
474 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
475 
476 	if (pcb == NULL)
477 		return (EINVAL);
478 
479 	/* Check if the address is already bound and it was by us. */
480 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
481 	if (bound_so == NULL || bound_so != so) {
482 		HVSOCK_DBG(HVSOCK_DBG_ERR,
483 		    "%s: Address not bound or not by us.\n", __func__);
484 		return (EADDRNOTAVAIL);
485 	}
486 
487 	SOCK_LOCK(so);
488 	error = solisten_proto_check(so);
489 	if (error == 0)
490 		solisten_proto(so, backlog);
491 	SOCK_UNLOCK(so);
492 
493 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
494 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
495 	return (error);
496 }
497 
498 int
499 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
500 {
501 	struct hvs_pcb *pcb = so2hvspcb(so);
502 
503 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
504 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
505 
506 	if (pcb == NULL)
507 		return (EINVAL);
508 
509 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
510 	    M_NOWAIT);
511 
512 	return ((*nam == NULL) ? ENOMEM : 0);
513 }
514 
515 int
516 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
517 {
518 	struct hvs_pcb *pcb = so2hvspcb(so);
519 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
520 	bool found_auto_bound_port = false;
521 	int i, error = 0;
522 
523 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
524 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
525 	    __func__, raddr->hvs_port);
526 
527 	if (pcb == NULL)
528 		return (EINVAL);
529 
530 	/* Verify the remote address */
531 	if (raddr == NULL)
532 		return (EINVAL);
533 	if (raddr->sa_family != AF_HYPERV)
534 		return (EAFNOSUPPORT);
535 	if (raddr->sa_len != sizeof(*raddr))
536 		return (EINVAL);
537 
538 	mtx_lock(&hvs_trans_socks_mtx);
539 	if (so->so_state &
540 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
541 			HVSOCK_DBG(HVSOCK_DBG_ERR,
542 			    "%s: socket connect in progress\n",
543 			    __func__);
544 			error = EINPROGRESS;
545 			goto out;
546 	}
547 
548 	/*
549 	 * Find an available port for us to auto bind the local
550 	 * address.
551 	 */
552 	hvs_addr_set(&pcb->local_addr, 0);
553 
554 	for (i = previous_auto_bound_port - 1;
555 	    i != previous_auto_bound_port; i --) {
556 		if (i == MIN_PORT)
557 			i = MAX_PORT;
558 
559 		pcb->local_addr.hvs_port = i;
560 
561 		if (__hvs_find_socket_on_list(&pcb->local_addr,
562 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
563 			found_auto_bound_port = true;
564 			previous_auto_bound_port = i;
565 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
566 			    "%s: found local bound port is %x\n",
567 			    __func__, pcb->local_addr.hvs_port);
568 			break;
569 		}
570 	}
571 
572 	if (found_auto_bound_port == true) {
573 		/* Found available port for auto bound, put on list */
574 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
575 		/* Set VM service ID */
576 		pcb->vm_srv_id = srv_id_template;
577 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
578 		/* Set host service ID and remote port */
579 		pcb->host_srv_id = srv_id_template;
580 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
581 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
582 
583 		/* Change the socket state to SS_ISCONNECTING */
584 		soisconnecting(so);
585 	} else {
586 		HVSOCK_DBG(HVSOCK_DBG_ERR,
587 		    "%s: No local port available for auto bound\n",
588 		    __func__);
589 		error = EADDRINUSE;
590 	}
591 
592 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
593 	hvsock_print_guid(&pcb->vm_srv_id);
594 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
595 	hvsock_print_guid(&pcb->host_srv_id);
596 
597 out:
598 	mtx_unlock(&hvs_trans_socks_mtx);
599 
600 	if (found_auto_bound_port == true)
601 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
602 
603 	return (error);
604 }
605 
606 int
607 hvs_trans_disconnect(struct socket *so)
608 {
609 	struct hvs_pcb *pcb;
610 
611 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
612 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
613 
614 	(void) hvs_trans_lock();
615 	pcb = so2hvspcb(so);
616 	if (pcb == NULL) {
617 		hvs_trans_unlock();
618 		return (EINVAL);
619 	}
620 
621 	/* If socket is already disconnected, skip this */
622 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
623 		soisdisconnecting(so);
624 
625 	hvs_trans_unlock();
626 
627 	return (0);
628 }
629 
/* Map MSG_DONTWAIT in 'f' to the I/O lock wait flag: 0 if set, else SBL_WAIT. */
#define SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)

/*
 * Argument bundle passed through vmbus_chan_recv_peek_call() to
 * hvsock_br_callback().
 */
struct hvs_callback_arg {
	struct uio *uio;	/* caller's I/O descriptor */
	struct sockbuf *sb;	/* socket buffer the caller has locked */
};
635 
636 int
637 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
638     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
639 {
640 	struct hvs_pcb *pcb = so2hvspcb(so);
641 	struct sockbuf *sb;
642 	ssize_t orig_resid;
643 	uint32_t canread, to_read;
644 	int flags, error = 0;
645 	struct hvs_callback_arg cbarg;
646 
647 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
648 	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
649 
650 	if (so->so_type != SOCK_STREAM)
651 		return (EINVAL);
652 	if (pcb == NULL)
653 		return (EINVAL);
654 
655 	if (flagsp != NULL)
656 		flags = *flagsp &~ MSG_EOR;
657 	else
658 		flags = 0;
659 
660 	if (flags & MSG_PEEK)
661 		return (EOPNOTSUPP);
662 
663 	/* If no space to copy out anything */
664 	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
665 		return (EINVAL);
666 
667 	orig_resid = uio->uio_resid;
668 
669 	/* Prevent other readers from entering the socket. */
670 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
671 	if (error) {
672 		HVSOCK_DBG(HVSOCK_DBG_ERR,
673 		    "%s: soiolock returned error = %d\n", __func__, error);
674 		return (error);
675 	}
676 
677 	sb = &so->so_rcv;
678 	SOCKBUF_LOCK(sb);
679 
680 	cbarg.uio = uio;
681 	cbarg.sb = sb;
682 	/*
683 	 * If the socket is closing, there might still be some data
684 	 * in rx br to read. However we need to make sure
685 	 * the channel is still open.
686 	 */
687 	if ((sb->sb_state & SBS_CANTRCVMORE) &&
688 	    (so->so_state & SS_ISDISCONNECTED)) {
689 		/* Other thread already closed the channel */
690 		error = EPIPE;
691 		goto out;
692 	}
693 
694 	while (true) {
695 		while (uio->uio_resid > 0 &&
696 		    (canread = hvsock_canread_check(pcb)) > 0) {
697 			to_read = MIN(canread, uio->uio_resid);
698 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
699 			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
700 			    (unsigned int)(sizeof(struct hvs_pkt_header) +
701 			    pcb->recv_data_off));
702 
703 			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
704 			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
705 			    hvsock_br_callback, (void *)&cbarg);
706 			/*
707 			 * It is possible socket is disconnected becasue
708 			 * we released lock in hvsock_br_callback. So we
709 			 * need to check the state to make sure it is not
710 			 * disconnected.
711 			 */
712 			if (error || so->so_state & SS_ISDISCONNECTED) {
713 				break;
714 			}
715 
716 			pcb->recv_data_len -= to_read;
717 			pcb->recv_data_off += to_read;
718 		}
719 
720 		if (error)
721 			break;
722 
723 		/* Abort if socket has reported problems. */
724 		if (so->so_error) {
725 			if (so->so_error == ESHUTDOWN &&
726 			    orig_resid > uio->uio_resid) {
727 				/*
728 				 * Although we got a FIN, we also received
729 				 * some data in this round. Delivery it
730 				 * to user.
731 				 */
732 				error = 0;
733 			} else {
734 				if (so->so_error != ESHUTDOWN)
735 					error = so->so_error;
736 			}
737 
738 			break;
739 		}
740 
741 		/* Cannot received more. */
742 		if (sb->sb_state & SBS_CANTRCVMORE)
743 			break;
744 
745 		/* We are done if buffer has been filled */
746 		if (uio->uio_resid == 0)
747 			break;
748 
749 		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
750 			break;
751 
752 		/* Buffer ring is empty and we shall not block */
753 		if ((so->so_state & SS_NBIO) ||
754 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
755 			if (orig_resid == uio->uio_resid) {
756 				/* We have not read anything */
757 				error = EAGAIN;
758 			}
759 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
760 			    "%s: non blocked read return, error %d.\n",
761 			    __func__, error);
762 			break;
763 		}
764 
765 		/*
766 		 * Wait and block until (more) data comes in.
767 		 * Note: Drops the sockbuf lock during wait.
768 		 */
769 		error = sbwait(sb);
770 
771 		if (error)
772 			break;
773 
774 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
775 		    "%s: wake up from sbwait, read available is %u\n",
776 		    __func__, vmbus_chan_read_available(pcb->chan));
777 	}
778 
779 out:
780 	SOCKBUF_UNLOCK(sb);
781 	SOCK_IO_RECV_UNLOCK(so);
782 
783 	/* We recieved a FIN in this call */
784 	if (so->so_error == ESHUTDOWN) {
785 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
786 			/* Send has already closed */
787 			soisdisconnecting(so);
788 		} else {
789 			/* Just close the receive side */
790 			socantrcvmore(so);
791 		}
792 	}
793 
794 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
795 	    "%s: returning error = %d, so_error = %d\n",
796 	    __func__, error, so->so_error);
797 
798 	return (error);
799 }
800 
801 int
802 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
803     struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
804 {
805 	struct hvs_pcb *pcb = so2hvspcb(so);
806 	struct sockbuf *sb;
807 	ssize_t orig_resid;
808 	uint32_t canwrite, to_write;
809 	int error = 0;
810 
811 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
812 	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
813 	    __func__, uio->uio_resid);
814 
815 	if (so->so_type != SOCK_STREAM)
816 		return (EINVAL);
817 	if (pcb == NULL)
818 		return (EINVAL);
819 
820 	/* If nothing to send */
821 	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
822 		return (EINVAL);
823 
824 	orig_resid = uio->uio_resid;
825 
826 	/* Prevent other writers from entering the socket. */
827 	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
828 	if (error) {
829 		HVSOCK_DBG(HVSOCK_DBG_ERR,
830 		    "%s: soiolocak returned error = %d\n", __func__, error);
831 		return (error);
832 	}
833 
834 	sb = &so->so_snd;
835 	SOCKBUF_LOCK(sb);
836 
837 	if ((sb->sb_state & SBS_CANTSENDMORE) ||
838 	    so->so_error == ESHUTDOWN) {
839 		error = EPIPE;
840 		goto out;
841 	}
842 
843 	while (uio->uio_resid > 0) {
844 		canwrite = hvsock_canwrite_check(pcb);
845 		if (canwrite == 0) {
846 			/* We have sent some data */
847 			if (orig_resid > uio->uio_resid)
848 				break;
849 			/*
850 			 * We have not sent any data and it is
851 			 * non-blocked io
852 			 */
853 			if (so->so_state & SS_NBIO ||
854 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
855 				error = EWOULDBLOCK;
856 				break;
857 			} else {
858 				/*
859 				 * We are here because there is no space on
860 				 * send buffer ring. Signal the other side
861 				 * to read and free more space.
862 				 * Sleep wait until space avaiable to send
863 				 * Note: Drops the sockbuf lock during wait.
864 				 */
865 				error = sbwait(sb);
866 
867 				if (error)
868 					break;
869 
870 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
871 				    "%s: wake up from sbwait, space avail on "
872 				    "tx ring is %u\n",
873 				    __func__,
874 				    vmbus_chan_write_available(pcb->chan));
875 
876 				continue;
877 			}
878 		}
879 		to_write = MIN(canwrite, uio->uio_resid);
880 		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
881 
882 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
883 		    "%s: canwrite is %u, to_write = %u\n", __func__,
884 		    canwrite, to_write);
885 		error = hvsock_send_data(pcb->chan, uio, to_write, sb);
886 
887 		if (error)
888 			break;
889 	}
890 
891 out:
892 	SOCKBUF_UNLOCK(sb);
893 	SOCK_IO_SEND_UNLOCK(so);
894 
895 	return (error);
896 }
897 
898 int
899 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
900 {
901 	struct hvs_pcb *pcb = so2hvspcb(so);
902 
903 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
904 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
905 
906 	if (pcb == NULL)
907 		return (EINVAL);
908 
909 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
910 
911 	return ((*nam == NULL)? ENOMEM : 0);
912 }
913 
914 int
915 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
916 {
917 	struct hvs_pcb *pcb = so2hvspcb(so);
918 
919 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
920 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
921 
922 	if (pcb == NULL)
923 		return (EINVAL);
924 
925 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
926 
927 	return ((*nam == NULL)? ENOMEM : 0);
928 }
929 
930 void
931 hvs_trans_close(struct socket *so)
932 {
933 	struct hvs_pcb *pcb;
934 
935 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
936 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
937 
938 	(void) hvs_trans_lock();
939 	pcb = so2hvspcb(so);
940 	if (!pcb) {
941 		hvs_trans_unlock();
942 		return;
943 	}
944 
945 	if (so->so_state & SS_ISCONNECTED) {
946 		/* Send a FIN to peer */
947 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
948 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
949 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
950 	}
951 
952 	if (so->so_state &
953 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
954 		soisdisconnected(so);
955 
956 	pcb->chan = NULL;
957 	pcb->so = NULL;
958 
959 	if (SOLISTENING(so)) {
960 		mtx_lock(&hvs_trans_socks_mtx);
961 		/* Remove from bound list */
962 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
963 		mtx_unlock(&hvs_trans_socks_mtx);
964 	}
965 
966 	hvs_trans_unlock();
967 
968 	return;
969 }
970 
971 void
972 hvs_trans_abort(struct socket *so)
973 {
974 	struct hvs_pcb *pcb = so2hvspcb(so);
975 
976 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
977 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
978 
979 	(void) hvs_trans_lock();
980 	if (pcb == NULL) {
981 		hvs_trans_unlock();
982 		return;
983 	}
984 
985 	if (SOLISTENING(so)) {
986 		mtx_lock(&hvs_trans_socks_mtx);
987 		/* Remove from bound list */
988 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
989 		mtx_unlock(&hvs_trans_socks_mtx);
990 	}
991 
992 	if (so->so_state & SS_ISCONNECTED) {
993 		(void) sodisconnect(so);
994 	}
995 	hvs_trans_unlock();
996 
997 	return;
998 }
999 
1000 int
1001 hvs_trans_shutdown(struct socket *so)
1002 {
1003 	struct hvs_pcb *pcb = so2hvspcb(so);
1004 	struct sockbuf *sb;
1005 
1006 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1007 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
1008 
1009 	if (pcb == NULL)
1010 		return (EINVAL);
1011 
1012 	/*
1013 	 * Only get called with the shutdown method is SHUT_WR or
1014 	 * SHUT_RDWR.
1015 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
1016 	 * already set the SBS_CANTRCVMORE on receive side socket
1017 	 * buffer.
1018 	 */
1019 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1020 		/*
1021 		 * SHUT_WR only case.
1022 		 * Receive side is still open. Just close
1023 		 * the send side.
1024 		 */
1025 		socantsendmore(so);
1026 	} else {
1027 		/* SHUT_RDWR case */
1028 		if (so->so_state & SS_ISCONNECTED) {
1029 			/* Send a FIN to peer */
1030 			sb = &so->so_snd;
1031 			SOCKBUF_LOCK(sb);
1032 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
1033 			SOCKBUF_UNLOCK(sb);
1034 
1035 			soisdisconnecting(so);
1036 		}
1037 	}
1038 
1039 	return (0);
1040 }
1041 
1042 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
1043  * <port> (see struct sockaddr_hvs).
1044  *
1045  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
1046  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
1047  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
1048  * the below sockaddr:
1049  *
1050  * struct SOCKADDR_HV
1051  * {
1052  *    ADDRESS_FAMILY Family;
1053  *    USHORT Reserved;
1054  *    GUID VmId;
1055  *    GUID ServiceId;
1056  * };
1057  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
1058  * VMBus, because here it's obvious the host and the VM can easily identify
1059  * each other. Though the VmID is useful on the host, especially in the case
1060  * of Windows container, FreeBSD VM doesn't need it at all.
1061  *
1062  * To be compatible with similar infrastructure in Linux VMs, we have
1063  * to limit the available GUID space of SOCKADDR_HV so that we can create
1064  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
1065  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
1066  *
1067  ****************************************************************************
1068  * The only valid Service GUIDs, from the perspectives of both the host and *
1069  * FreeBSD VM, that can be connected by the other end, must conform to this *
1070  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
1071  ****************************************************************************
1072  *
1073  * When we write apps on the host to connect(), the GUID ServiceID is used.
1074  * When we write apps in FreeBSD VM to connect(), we only need to specify the
1075  * port and the driver will form the GUID and use that to request the host.
1076  *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
1081  */
1082 
/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict the HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
 */
1088 #define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
1089 #define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
1090 #define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
1091 
1092 struct hvsock_sc {
1093 	device_t		dev;
1094 	struct hvs_pcb		*pcb;
1095 	struct vmbus_channel	*channel;
1096 };
1097 
/*
 * Return true when the channel's rx ring holds at least as many bytes
 * as a packet with an empty payload, i.e. HVSOCK_PKT_LEN(0).
 */
static bool
hvsock_chan_readable(struct vmbus_channel *chan)
{

	return (vmbus_chan_read_available(chan) >= HVSOCK_PKT_LEN(0));
}
1105 
1106 static void
1107 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
1108 {
1109 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
1110 	struct socket *so;
1111 	uint32_t canwrite;
1112 
1113 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1114 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
1115 	    __func__, pcb);
1116 
1117 	/*
1118 	 * Check if the socket is still attached and valid.
1119 	 * Here we know channel is still open. Need to make
1120 	 * sure the socket has not been closed or freed.
1121 	 */
1122 	(void) hvs_trans_lock();
1123 	so = hsvpcb2so(pcb);
1124 
1125 	if (pcb->chan != NULL && so != NULL) {
1126 		/*
1127 		 * Wake up reader if there are data to read.
1128 		 */
1129 		SOCKBUF_LOCK(&(so)->so_rcv);
1130 
1131 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1132 		    "%s: read available = %u\n", __func__,
1133 		    vmbus_chan_read_available(pcb->chan));
1134 
1135 		if (hvsock_chan_readable(pcb->chan))
1136 			sorwakeup_locked(so);
1137 		else
1138 			SOCKBUF_UNLOCK(&(so)->so_rcv);
1139 
1140 		/*
1141 		 * Wake up sender if space becomes available to write.
1142 		 */
1143 		SOCKBUF_LOCK(&(so)->so_snd);
1144 		canwrite = hvsock_canwrite_check(pcb);
1145 
1146 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1147 		    "%s: canwrite = %u\n", __func__, canwrite);
1148 
1149 		if (canwrite > 0) {
1150 			sowwakeup_locked(so);
1151 		} else {
1152 			SOCKBUF_UNLOCK(&(so)->so_snd);
1153 		}
1154 	}
1155 
1156 	hvs_trans_unlock();
1157 
1158 	return;
1159 }
1160 
1161 static int
1162 hvsock_br_callback(void *datap, int cplen, void *cbarg)
1163 {
1164 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
1165 	struct uio *uio = arg->uio;
1166 	struct sockbuf *sb = arg->sb;
1167 	int error = 0;
1168 
1169 	if (cbarg == NULL || datap == NULL)
1170 		return (EINVAL);
1171 
1172 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1173 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
1174 	    "datap = %p\n",
1175 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
1176 	    uio->uio_resid, cplen, datap);
1177 
1178 	if (sb)
1179 		SOCKBUF_UNLOCK(sb);
1180 
1181 	error = uiomove(datap, cplen, uio);
1182 
1183 	if (sb)
1184 		SOCKBUF_LOCK(sb);
1185 
1186 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1187 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
1188 	    __func__, uio->uio_resid, error);
1189 
1190 	return (error);
1191 }
1192 
1193 static int
1194 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
1195     uint32_t to_write, struct sockbuf *sb)
1196 {
1197 	struct hvs_pkt_header hvs_pkt;
1198 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
1199 	uint64_t pad = 0;
1200 	struct iovec iov[3];
1201 	struct hvs_callback_arg cbarg;
1202 
1203 	if (chan == NULL)
1204 		return (ENOTCONN);
1205 
1206 	hlen = sizeof(struct vmbus_chanpkt_hdr);
1207 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
1208 	hvs_pktlen = hvs_pkthlen + to_write;
1209 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
1210 
1211 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1212 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
1213 	    "pad_pktlen = %u, data_len = %u\n",
1214 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
1215 
1216 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
1217 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
1218 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
1219 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
1220 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
1221 
1222 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
1223 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
1224 
1225 	cbarg.uio = uio;
1226 	cbarg.sb = sb;
1227 
1228 	if (uio && to_write > 0) {
1229 		iov[0].iov_base = &hvs_pkt;
1230 		iov[0].iov_len = hvs_pkthlen;
1231 		iov[1].iov_base = NULL;
1232 		iov[1].iov_len = to_write;
1233 		iov[2].iov_base = &pad;
1234 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
1235 
1236 		error = vmbus_chan_iov_send(chan, iov, 3,
1237 		    hvsock_br_callback, &cbarg);
1238 	} else {
1239 		if (to_write == 0) {
1240 			iov[0].iov_base = &hvs_pkt;
1241 			iov[0].iov_len = hvs_pkthlen;
1242 			iov[1].iov_base = &pad;
1243 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
1244 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
1245 		}
1246 	}
1247 
1248 	if (error) {
1249 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1250 		    "%s: error = %d\n", __func__, error);
1251 	}
1252 
1253 	return (error);
1254 }
1255 
1256 /*
1257  * Check if we have data on current ring buffer to read
1258  * or not. If not, advance the ring buffer read index to
1259  * next packet. Update the recev_data_len and recev_data_off
1260  * to new value.
1261  * Return the number of bytes can read.
1262  */
1263 static uint32_t
1264 hvsock_canread_check(struct hvs_pcb *pcb)
1265 {
1266 	uint32_t advance;
1267 	uint32_t tlen, hlen, dlen;
1268 	uint32_t bytes_canread = 0;
1269 	int error;
1270 
1271 	if (pcb == NULL || pcb->chan == NULL) {
1272 		pcb->so->so_error = EIO;
1273 		return (0);
1274 	}
1275 
1276 	/* Still have data not read yet on current packet */
1277 	if (pcb->recv_data_len > 0)
1278 		return (pcb->recv_data_len);
1279 
1280 	if (pcb->rb_init)
1281 		advance =
1282 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1283 	else
1284 		advance = 0;
1285 
1286 	bytes_canread = vmbus_chan_read_available(pcb->chan);
1287 
1288 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1289 	    "%s: bytes_canread on br = %u, advance = %u\n",
1290 	    __func__, bytes_canread, advance);
1291 
1292 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
1293 		/*
1294 		 * Nothing to read. Need to advance the rindex before
1295 		 * calling sbwait, so host knows to wake us up when data
1296 		 * is available to read on rb.
1297 		 */
1298 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
1299 		if (error) {
1300 			HVSOCK_DBG(HVSOCK_DBG_ERR,
1301 			    "%s: after calling vmbus_chan_recv_idxadv, "
1302 			    "got error = %d\n",  __func__, error);
1303 			return (0);
1304 		} else {
1305 			pcb->rb_init = false;
1306 			pcb->recv_data_len = 0;
1307 			pcb->recv_data_off = 0;
1308 			bytes_canread = vmbus_chan_read_available(pcb->chan);
1309 
1310 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1311 			    "%s: advanced %u bytes, "
1312 			    " bytes_canread on br now = %u\n",
1313 			    __func__, advance, bytes_canread);
1314 
1315 			if (bytes_canread == 0)
1316 				return (0);
1317 			else
1318 				advance = 0;
1319 		}
1320 	}
1321 
1322 	if (bytes_canread <
1323 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
1324 		return (0);
1325 
1326 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
1327 	    sizeof(struct hvs_pkt_header), advance);
1328 
1329 	/* Don't have anything to read */
1330 	if (error) {
1331 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1332 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
1333 		    __func__, error);
1334 		return (0);
1335 	}
1336 
1337 	/*
1338 	 * We just read in a new packet header. Do some sanity checks.
1339 	 */
1340 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
1341 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
1342 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
1343 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
1344 	    __predict_false(hlen > tlen) ||
1345 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
1346 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1347 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
1348 		    tlen, hlen, dlen);
1349 		pcb->so->so_error = EIO;
1350 		return (0);
1351 	}
1352 	if (pcb->rb_init == false)
1353 		pcb->rb_init = true;
1354 
1355 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1356 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
1357 	    tlen, hlen, dlen);
1358 
1359 	/* The other side has sent a close FIN */
1360 	if (dlen == 0) {
1361 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1362 		    "%s: Received FIN from other side\n", __func__);
1363 		/* inform the caller by seting so_error to ESHUTDOWN */
1364 		pcb->so->so_error = ESHUTDOWN;
1365 	}
1366 
1367 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1368 	    "%s: canread on receive ring is %u \n", __func__, dlen);
1369 
1370 	pcb->recv_data_len = dlen;
1371 	pcb->recv_data_off = 0;
1372 
1373 	return (pcb->recv_data_len);
1374 }
1375 
1376 static uint32_t
1377 hvsock_canwrite_check(struct hvs_pcb *pcb)
1378 {
1379 	uint32_t writeable;
1380 	uint32_t ret;
1381 
1382 	if (pcb == NULL || pcb->chan == NULL)
1383 		return (0);
1384 
1385 	writeable = vmbus_chan_write_available(pcb->chan);
1386 
1387 	/*
1388 	 * We must always reserve a 0-length-payload packet for the FIN.
1389 	 */
1390 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1391 	    "%s: writeable is %u, should be greater than %ju\n",
1392 	    __func__, writeable,
1393 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
1394 
1395 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
1396 		/*
1397 		 * The Tx ring seems full.
1398 		 */
1399 		return (0);
1400 	}
1401 
1402 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
1403 
1404 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1405 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
1406 
1407 	return (rounddown2(ret, 8));
1408 }
1409 
1410 static void
1411 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
1412 {
1413 	vmbus_chan_set_pending_send_size(chan,
1414 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
1415 }
1416 
1417 static int
1418 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
1419 {
1420 	unsigned int rcvbuf, sndbuf;
1421 	struct hvs_pcb *pcb = so2hvspcb(so);
1422 	int ret;
1423 
1424 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
1425 		sndbuf = HVS_RINGBUF_SND_SIZE;
1426 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
1427 	} else {
1428 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
1429 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
1430 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
1431 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
1432 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
1433 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
1434 	}
1435 
1436 	/*
1437 	 * Can only read whatever user provided size of data
1438 	 * from ring buffer. Turn off batched reading.
1439 	 */
1440 	vmbus_chan_set_readbatch(chan, false);
1441 
1442 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
1443 	    hvsock_chan_cb, pcb);
1444 
1445 	if (ret != 0) {
1446 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1447 		    "%s: failed to open hvsock channel, sndbuf = %u, "
1448 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1449 	} else {
1450 		HVSOCK_DBG(HVSOCK_DBG_INFO,
1451 		    "%s: hvsock channel opened, sndbuf = %u, i"
1452 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
1453 		/*
1454 		 * Se the pending send size so to receive wakeup
1455 		 * signals from host when there is enough space on
1456 		 * rx buffer ring to write.
1457 		 */
1458 		hvsock_set_chan_pending_send_size(chan);
1459 	}
1460 
1461 	return ret;
1462 }
1463 
1464 /*
1465  * Guest is listening passively on the socket. Open channel and
1466  * create a new socket for the conneciton.
1467  */
1468 static void
1469 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
1470     struct hvsock_sc *sc)
1471 {
1472 	struct socket *new_so;
1473 	struct hvs_pcb *new_pcb, *pcb;
1474 	int error;
1475 
1476 	/* Do nothing if socket is not listening */
1477 	if (!SOLISTENING(so)) {
1478 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1479 		    "%s: socket is not a listening one\n", __func__);
1480 		return;
1481 	}
1482 
1483 	/*
1484 	 * Create a new socket. This will call pru_attach to complete
1485 	 * the socket initialization and put the new socket onto
1486 	 * listening socket's sol_incomp list, waiting to be promoted
1487 	 * to sol_comp list.
1488 	 * The new socket created has ref count 0. There is no other
1489 	 * thread that changes the state of this new one at the
1490 	 * moment, so we don't need to hold its lock while opening
1491 	 * channel and filling out its pcb information.
1492 	 */
1493 	new_so = sonewconn(so, 0);
1494 	if (!new_so)
1495 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1496 		    "%s: creating new socket failed\n", __func__);
1497 
1498 	/*
1499 	 * Now open the vmbus channel. If it fails, the socket will be
1500 	 * on the listening socket's sol_incomp queue until it is
1501 	 * replaced and aborted.
1502 	 */
1503 	error = hvsock_open_channel(chan, new_so);
1504 	if (error) {
1505 		new_so->so_error = error;
1506 		return;
1507 	}
1508 
1509 	pcb = so->so_pcb;
1510 	new_pcb = new_so->so_pcb;
1511 
1512 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
1513 	/* Remote port is unknown to guest in this type of conneciton */
1514 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
1515 	new_pcb->chan = chan;
1516 	new_pcb->recv_data_len = 0;
1517 	new_pcb->recv_data_off = 0;
1518 	new_pcb->rb_init = false;
1519 
1520 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
1521 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
1522 
1523 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
1524 
1525 	sc->pcb = new_pcb;
1526 
1527 	/*
1528 	 * Change the socket state to SS_ISCONNECTED. This will promote
1529 	 * the socket to sol_comp queue and wake up the thread which
1530 	 * is accepting connection.
1531 	 */
1532 	soisconnected(new_so);
1533 }
1534 
1535 
1536 /*
1537  * Guest is actively connecting to host.
1538  */
1539 static void
1540 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
1541 {
1542 	struct hvs_pcb *pcb;
1543 	int error;
1544 
1545 	error = hvsock_open_channel(chan, so);
1546 	if (error) {
1547 		so->so_error = error;
1548 		return;
1549 	}
1550 
1551 	pcb = so->so_pcb;
1552 	pcb->chan = chan;
1553 	pcb->recv_data_len = 0;
1554 	pcb->recv_data_off = 0;
1555 	pcb->rb_init = false;
1556 
1557 	mtx_lock(&hvs_trans_socks_mtx);
1558 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
1559 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
1560 	mtx_unlock(&hvs_trans_socks_mtx);
1561 
1562 	/*
1563 	 * Change the socket state to SS_ISCONNECTED. This will wake up
1564 	 * the thread sleeping in connect call.
1565 	 */
1566 	soisconnected(so);
1567 }
1568 
1569 static void
1570 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
1571 {
1572 	struct hyperv_guid *inst_guid, *type_guid;
1573 	bool conn_from_host;
1574 	struct sockaddr_hvs addr;
1575 	struct socket *so;
1576 	struct hvs_pcb *pcb;
1577 
1578 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
1579 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
1580 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
1581 
1582 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
1583 	hvsock_print_guid(type_guid);
1584 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
1585 	hvsock_print_guid(inst_guid);
1586 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
1587 	    (conn_from_host == true ) ? "from" : "to");
1588 
1589 	/*
1590 	 * The listening port should be in [0, MAX_LISTEN_PORT]
1591 	 */
1592 	if (!is_valid_srv_id(type_guid))
1593 		return;
1594 
1595 	/*
1596 	 * There should be a bound socket already created no matter
1597 	 * it is a passive or active connection.
1598 	 * For host initiated connection (passive on guest side),
1599 	 * the  type_guid contains the port which guest is bound and
1600 	 * listening.
1601 	 * For the guest initiated connection (active on guest side),
1602 	 * the inst_guid contains the port that guest has auto bound
1603 	 * to.
1604 	 */
1605 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
1606 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
1607 	if (!so) {
1608 		HVSOCK_DBG(HVSOCK_DBG_ERR,
1609 		    "%s: no bound socket found for port %u\n",
1610 		    __func__, addr.hvs_port);
1611 		return;
1612 	}
1613 
1614 	if (conn_from_host) {
1615 		hvsock_open_conn_passive(chan, so, sc);
1616 	} else {
1617 		(void) hvs_trans_lock();
1618 		pcb = so->so_pcb;
1619 		if (pcb && pcb->so) {
1620 			sc->pcb = so2hvspcb(so);
1621 			hvsock_open_conn_active(chan, so);
1622 		} else {
1623 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1624 			    "%s: channel detached before open\n", __func__);
1625 		}
1626 		hvs_trans_unlock();
1627 	}
1628 
1629 }
1630 
1631 static int
1632 hvsock_probe(device_t dev)
1633 {
1634 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1635 
1636 	if (!channel || !vmbus_chan_is_hvs(channel)) {
1637 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1638 		    "hvsock_probe called but not a hvsock channel id %u\n",
1639 		    vmbus_chan_id(channel));
1640 
1641 		return ENXIO;
1642 	} else {
1643 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1644 		    "hvsock_probe got a hvsock channel id %u\n",
1645 		    vmbus_chan_id(channel));
1646 
1647 		return BUS_PROBE_DEFAULT;
1648 	}
1649 }
1650 
1651 static int
1652 hvsock_attach(device_t dev)
1653 {
1654 	struct vmbus_channel *channel = vmbus_get_channel(dev);
1655 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1656 
1657 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
1658 
1659 	hvsock_open_connection(channel, sc);
1660 
1661 	/*
1662 	 * Always return success. On error the host will rescind the device
1663 	 * in 30 seconds and we can do cleanup at that time in
1664 	 * vmbus_chan_msgproc_chrescind().
1665 	 */
1666 	return (0);
1667 }
1668 
1669 static int
1670 hvsock_detach(device_t dev)
1671 {
1672 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
1673 	struct socket *so;
1674 	int retry;
1675 
1676 	if (bootverbose)
1677 		device_printf(dev, "hvsock_detach called.\n");
1678 
1679 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
1680 
1681 	if (sc->pcb != NULL) {
1682 		(void) hvs_trans_lock();
1683 
1684 		so = hsvpcb2so(sc->pcb);
1685 		if (so) {
1686 			/* Close the connection */
1687 			if (so->so_state &
1688 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
1689 				soisdisconnected(so);
1690 		}
1691 
1692 		mtx_lock(&hvs_trans_socks_mtx);
1693 		__hvs_remove_pcb_from_list(sc->pcb,
1694 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
1695 		mtx_unlock(&hvs_trans_socks_mtx);
1696 
1697 		/*
1698 		 * Close channel while no reader and sender are working
1699 		 * on the buffer rings.
1700 		 */
1701 		if (so) {
1702 			retry = 0;
1703 			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
1704 				/*
1705 				 * Someone is reading, rx br is busy
1706 				 */
1707 				soisdisconnected(so);
1708 				DELAY(500);
1709 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1710 				    "waiting for rx reader to exit, "
1711 				    "retry = %d\n", retry++);
1712 			}
1713 			retry = 0;
1714 			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
1715 				/*
1716 				 * Someone is sending, tx br is busy
1717 				 */
1718 				soisdisconnected(so);
1719 				DELAY(500);
1720 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
1721 				    "waiting for tx sender to exit, "
1722 				    "retry = %d\n", retry++);
1723 			}
1724 		}
1725 
1726 
1727 		bzero(sc->pcb, sizeof(struct hvs_pcb));
1728 		free(sc->pcb, M_HVSOCK);
1729 		sc->pcb = NULL;
1730 
1731 		if (so) {
1732 			SOCK_IO_RECV_UNLOCK(so);
1733 			SOCK_IO_SEND_UNLOCK(so);
1734 			so->so_pcb = NULL;
1735 		}
1736 
1737 		hvs_trans_unlock();
1738 	}
1739 
1740 	vmbus_chan_close(vmbus_get_channel(dev));
1741 
1742 	return (0);
1743 }
1744 
1745 static device_method_t hvsock_methods[] = {
1746 	/* Device interface: probe/attach/detach methods of this driver */
1747 	DEVMETHOD(device_probe, hvsock_probe),
1748 	DEVMETHOD(device_attach, hvsock_attach),
1749 	DEVMETHOD(device_detach, hvsock_detach),
1750 	DEVMETHOD_END
1751 };
1752 /* Driver description: name string, method table, per-device softc size. */
1753 static driver_t hvsock_driver = {
1754 	"hv_sock",
1755 	hvsock_methods,
1756 	sizeof(struct hvsock_sc)
1757 };
1758 
1759 static devclass_t hvsock_devclass;
1760 /* Register the driver under the vmbus bus; no module event handler is given. */
1761 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL);
1762 MODULE_VERSION(hvsock, 1);
1763 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
1764