1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2020 Microsoft Corp. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/domain.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sockbuf.h>
#include <sys/sx.h>
#include <sys/uio.h>

#include <net/vnet.h>

#include <dev/hyperv/vmbus/vmbus_reg.h>

#include "hv_sock.h"

/*
 * Debug verbosity levels.  HVSOCK_DBG() compares the level of each message
 * against the hvs_dbg_level tunable below.
 */
#define HVSOCK_DBG_NONE			0x0
#define HVSOCK_DBG_INFO			0x1
#define HVSOCK_DBG_ERR			0x2
#define HVSOCK_DBG_VERBOSE		0x3


/* Root of the net.hvsock sysctl tree. */
SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");

/* Current debug level; exported read/write as net.hvsock.hvs_dbg_level. */
static int hvs_dbg_level;
SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
    0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");


/*
 * Print a message with printf() when hvs_dbg_level is at least `level`.
 * The arguments are only evaluated when the message is actually printed.
 * NOTE(review): because the test is `>=`, INFO (1) messages are enabled at
 * a lower setting than ERR (2) messages -- confirm this ordering is intended.
 */
#define HVSOCK_DBG(level, ...) do {					\
	if (hvs_dbg_level >= (level))					\
		printf(__VA_ARGS__);					\
	} while (0)

/* malloc(9) type used for the hvs_pcb allocations in this file. */
MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");

static int hvs_dom_probe(void);

/* The MTU is 16KB per host side's design */
#define HVSOCK_MTU_SIZE		(1024 * 16)
/* Largest payload per send: one page minus the vmpipe protocol header. */
#define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))

/* Size of the combined packet header placed in front of every payload. */
#define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))

/*
 * Ring-buffer footprint of a packet carrying `payload_len` bytes: the
 * header, the payload rounded up to a multiple of 8, and one trailing
 * uint64_t.
 */
#define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
					roundup2(payload_len, 8) + \
					sizeof(uint64_t))


/*
 * Tentative definition so that hv_socket_protosw[] can take the address of
 * the domain before the initialized definition further down.
 */
static struct domain		hv_socket_domain;

/*
 * HyperV Transport sockets
 */
static struct pr_usrreqs	hvs_trans_usrreqs = {
	.pru_attach =		hvs_trans_attach,
	.pru_bind =		hvs_trans_bind,
	.pru_listen =		hvs_trans_listen,
	.pru_accept =		hvs_trans_accept,
	.pru_connect =		hvs_trans_connect,
	.pru_peeraddr =		hvs_trans_peeraddr,
	.pru_sockaddr =		hvs_trans_sockaddr,
	.pru_soreceive =	hvs_trans_soreceive,
	.pru_sosend =		hvs_trans_sosend,
	.pru_disconnect =	hvs_trans_disconnect,
	.pru_close =		hvs_trans_close,
	.pru_detach =		hvs_trans_detach,
	.pru_shutdown =		hvs_trans_shutdown,
	.pru_abort =		hvs_trans_abort,
};

/*
 * Definitions of protocols supported in HyperV socket domain
 */
static struct protosw		hv_socket_protosw[] = {
{
	/* Single connection-oriented stream protocol. */
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&hv_socket_domain,
	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
	.pr_flags =		PR_CONNREQUIRED,
	.pr_init =		hvs_trans_init,
	.pr_usrreqs =		&hvs_trans_usrreqs,
},
};

/* The AF_HYPERV domain; hvs_dom_probe() decides whether it attaches. */
static struct domain		hv_socket_domain = {
	.dom_family =		AF_HYPERV,
	.dom_name =		"hyperv",
	.dom_probe =		hvs_dom_probe,
	.dom_protosw =		hv_socket_protosw,
	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
};

VNET_DOMAIN_SET(hv_socket_);

/* Bounds of the 32-bit port space scanned when auto-binding a local port. */
#define MAX_PORT	((uint32_t)0xFFFFFFFF)
#define MIN_PORT	((uint32_t)0x0)

/*
00000000-facb-11e6-bd58-64006a7986d3 */ 140 static const struct hyperv_guid srv_id_template = { 141 .hv_guid = { 142 0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11, 143 0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 } 144 }; 145 146 static int hvsock_br_callback(void *, int, void *); 147 static uint32_t hvsock_canread_check(struct hvs_pcb *); 148 static uint32_t hvsock_canwrite_check(struct hvs_pcb *); 149 static int hvsock_send_data(struct vmbus_channel *chan, 150 struct uio *uio, uint32_t to_write, struct sockbuf *sb); 151 152 153 154 /* Globals */ 155 static struct sx hvs_trans_socks_sx; 156 static struct mtx hvs_trans_socks_mtx; 157 static LIST_HEAD(, hvs_pcb) hvs_trans_bound_socks; 158 static LIST_HEAD(, hvs_pcb) hvs_trans_connected_socks; 159 static uint32_t previous_auto_bound_port; 160 161 static void 162 hvsock_print_guid(struct hyperv_guid *guid) 163 { 164 unsigned char *p = (unsigned char *)guid; 165 166 HVSOCK_DBG(HVSOCK_DBG_INFO, 167 "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n", 168 *(unsigned int *)p, 169 *((unsigned short *) &p[4]), 170 *((unsigned short *) &p[6]), 171 p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); 172 } 173 174 static bool 175 is_valid_srv_id(const struct hyperv_guid *id) 176 { 177 return !memcmp(&id->hv_guid[4], 178 &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4); 179 } 180 181 static unsigned int 182 get_port_by_srv_id(const struct hyperv_guid *srv_id) 183 { 184 return *((const unsigned int *)srv_id); 185 } 186 187 static void 188 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port) 189 { 190 *((unsigned int *)srv_id) = port; 191 } 192 193 194 static void 195 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list) 196 { 197 struct hvs_pcb *p = NULL; 198 199 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 200 201 if (!pcb) 202 return; 203 204 if (list & HVS_LIST_BOUND) { 205 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 206 if (p == pcb) 207 
LIST_REMOVE(p, bound_next); 208 } 209 210 if (list & HVS_LIST_CONNECTED) { 211 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 212 if (p == pcb) 213 LIST_REMOVE(pcb, connected_next); 214 } 215 } 216 217 static void 218 __hvs_remove_socket_from_list(struct socket *so, unsigned char list) 219 { 220 struct hvs_pcb *pcb = so2hvspcb(so); 221 222 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb); 223 224 __hvs_remove_pcb_from_list(pcb, list); 225 } 226 227 static void 228 __hvs_insert_socket_on_list(struct socket *so, unsigned char list) 229 { 230 struct hvs_pcb *pcb = so2hvspcb(so); 231 232 if (list & HVS_LIST_BOUND) 233 LIST_INSERT_HEAD(&hvs_trans_bound_socks, 234 pcb, bound_next); 235 236 if (list & HVS_LIST_CONNECTED) 237 LIST_INSERT_HEAD(&hvs_trans_connected_socks, 238 pcb, connected_next); 239 } 240 241 void 242 hvs_remove_socket_from_list(struct socket *so, unsigned char list) 243 { 244 if (!so || !so->so_pcb) { 245 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 246 "%s: socket or so_pcb is null\n", __func__); 247 return; 248 } 249 250 mtx_lock(&hvs_trans_socks_mtx); 251 __hvs_remove_socket_from_list(so, list); 252 mtx_unlock(&hvs_trans_socks_mtx); 253 } 254 255 static void 256 hvs_insert_socket_on_list(struct socket *so, unsigned char list) 257 { 258 if (!so || !so->so_pcb) { 259 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 260 "%s: socket or so_pcb is null\n", __func__); 261 return; 262 } 263 264 mtx_lock(&hvs_trans_socks_mtx); 265 __hvs_insert_socket_on_list(so, list); 266 mtx_unlock(&hvs_trans_socks_mtx); 267 } 268 269 static struct socket * 270 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 271 { 272 struct hvs_pcb *p = NULL; 273 274 if (list & HVS_LIST_BOUND) 275 LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next) 276 if (p->so != NULL && 277 addr->hvs_port == p->local_addr.hvs_port) 278 return p->so; 279 280 if (list & HVS_LIST_CONNECTED) 281 LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next) 282 if (p->so != NULL && 283 
addr->hvs_port == p->local_addr.hvs_port) 284 return p->so; 285 286 return NULL; 287 } 288 289 static struct socket * 290 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list) 291 { 292 struct socket *s = NULL; 293 294 mtx_lock(&hvs_trans_socks_mtx); 295 s = __hvs_find_socket_on_list(addr, list); 296 mtx_unlock(&hvs_trans_socks_mtx); 297 298 return s; 299 } 300 301 static inline void 302 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port) 303 { 304 memset(addr, 0, sizeof(*addr)); 305 addr->sa_family = AF_HYPERV; 306 addr->sa_len = sizeof(*addr); 307 addr->hvs_port = port; 308 } 309 310 void 311 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id) 312 { 313 hvs_addr_set(addr, get_port_by_srv_id(svr_id)); 314 } 315 316 int 317 hvs_trans_lock(void) 318 { 319 sx_xlock(&hvs_trans_socks_sx); 320 return (0); 321 } 322 323 void 324 hvs_trans_unlock(void) 325 { 326 sx_xunlock(&hvs_trans_socks_sx); 327 } 328 329 static int 330 hvs_dom_probe(void) 331 { 332 333 /* Don't even give us a chance to attach on non-HyperV. */ 334 if (vm_guest != VM_GUEST_HV) 335 return (ENXIO); 336 return (0); 337 } 338 339 void 340 hvs_trans_init(void) 341 { 342 /* Skip initialization of globals for non-default instances. */ 343 if (!IS_DEFAULT_VNET(curvnet)) 344 return; 345 346 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 347 "%s: HyperV Socket hvs_trans_init called\n", __func__); 348 349 /* Initialize Globals */ 350 previous_auto_bound_port = MAX_PORT; 351 sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx"); 352 mtx_init(&hvs_trans_socks_mtx, 353 "hvs_trans_socks_mtx", NULL, MTX_DEF); 354 LIST_INIT(&hvs_trans_bound_socks); 355 LIST_INIT(&hvs_trans_connected_socks); 356 } 357 358 /* 359 * Called in two cases: 360 * 1) When user calls socket(); 361 * 2) When we accept new incoming conneciton and call sonewconn(). 
362 */ 363 int 364 hvs_trans_attach(struct socket *so, int proto, struct thread *td) 365 { 366 struct hvs_pcb *pcb = so2hvspcb(so); 367 368 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 369 "%s: HyperV Socket hvs_trans_attach called\n", __func__); 370 371 if (so->so_type != SOCK_STREAM) 372 return (ESOCKTNOSUPPORT); 373 374 if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS) 375 return (EPROTONOSUPPORT); 376 377 if (pcb != NULL) 378 return (EISCONN); 379 pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO); 380 if (pcb == NULL) 381 return (ENOMEM); 382 383 pcb->so = so; 384 so->so_pcb = (void *)pcb; 385 386 return (0); 387 } 388 389 void 390 hvs_trans_detach(struct socket *so) 391 { 392 struct hvs_pcb *pcb; 393 394 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 395 "%s: HyperV Socket hvs_trans_detach called\n", __func__); 396 397 (void) hvs_trans_lock(); 398 pcb = so2hvspcb(so); 399 if (pcb == NULL) { 400 hvs_trans_unlock(); 401 return; 402 } 403 404 if (SOLISTENING(so)) { 405 bzero(pcb, sizeof(*pcb)); 406 free(pcb, M_HVSOCK); 407 } 408 409 so->so_pcb = NULL; 410 411 hvs_trans_unlock(); 412 } 413 414 int 415 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td) 416 { 417 struct hvs_pcb *pcb = so2hvspcb(so); 418 struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr; 419 int error = 0; 420 421 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 422 "%s: HyperV Socket hvs_trans_bind called\n", __func__); 423 424 if (sa == NULL) { 425 return (EINVAL); 426 } 427 428 if (pcb == NULL) { 429 return (EINVAL); 430 } 431 432 if (sa->sa_family != AF_HYPERV) { 433 HVSOCK_DBG(HVSOCK_DBG_ERR, 434 "%s: Not supported, sa_family is %u\n", 435 __func__, sa->sa_family); 436 return (EAFNOSUPPORT); 437 } 438 if (sa->sa_len != sizeof(*sa)) { 439 HVSOCK_DBG(HVSOCK_DBG_ERR, 440 "%s: Not supported, sa_len is %u\n", 441 __func__, sa->sa_len); 442 return (EINVAL); 443 } 444 445 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 446 "%s: binding port = 0x%x\n", __func__, sa->hvs_port); 447 448 mtx_lock(&hvs_trans_socks_mtx); 449 
if (__hvs_find_socket_on_list(sa, 450 HVS_LIST_BOUND | HVS_LIST_CONNECTED)) { 451 error = EADDRINUSE; 452 } else { 453 /* 454 * The address is available for us to bind. 455 * Add socket to the bound list. 456 */ 457 hvs_addr_set(&pcb->local_addr, sa->hvs_port); 458 hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY); 459 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 460 } 461 mtx_unlock(&hvs_trans_socks_mtx); 462 463 return (error); 464 } 465 466 int 467 hvs_trans_listen(struct socket *so, int backlog, struct thread *td) 468 { 469 struct hvs_pcb *pcb = so2hvspcb(so); 470 struct socket *bound_so; 471 int error; 472 473 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 474 "%s: HyperV Socket hvs_trans_listen called\n", __func__); 475 476 if (pcb == NULL) 477 return (EINVAL); 478 479 /* Check if the address is already bound and it was by us. */ 480 bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND); 481 if (bound_so == NULL || bound_so != so) { 482 HVSOCK_DBG(HVSOCK_DBG_ERR, 483 "%s: Address not bound or not by us.\n", __func__); 484 return (EADDRNOTAVAIL); 485 } 486 487 SOCK_LOCK(so); 488 error = solisten_proto_check(so); 489 if (error == 0) 490 solisten_proto(so, backlog); 491 SOCK_UNLOCK(so); 492 493 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 494 "%s: HyperV Socket listen error = %d\n", __func__, error); 495 return (error); 496 } 497 498 int 499 hvs_trans_accept(struct socket *so, struct sockaddr **nam) 500 { 501 struct hvs_pcb *pcb = so2hvspcb(so); 502 503 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 504 "%s: HyperV Socket hvs_trans_accept called\n", __func__); 505 506 if (pcb == NULL) 507 return (EINVAL); 508 509 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, 510 M_NOWAIT); 511 512 return ((*nam == NULL) ? 
ENOMEM : 0); 513 } 514 515 int 516 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 517 { 518 struct hvs_pcb *pcb = so2hvspcb(so); 519 struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam; 520 bool found_auto_bound_port = false; 521 int i, error = 0; 522 523 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 524 "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n", 525 __func__, raddr->hvs_port); 526 527 if (pcb == NULL) 528 return (EINVAL); 529 530 /* Verify the remote address */ 531 if (raddr == NULL) 532 return (EINVAL); 533 if (raddr->sa_family != AF_HYPERV) 534 return (EAFNOSUPPORT); 535 if (raddr->sa_len != sizeof(*raddr)) 536 return (EINVAL); 537 538 mtx_lock(&hvs_trans_socks_mtx); 539 if (so->so_state & 540 (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) { 541 HVSOCK_DBG(HVSOCK_DBG_ERR, 542 "%s: socket connect in progress\n", 543 __func__); 544 error = EINPROGRESS; 545 goto out; 546 } 547 548 /* 549 * Find an available port for us to auto bind the local 550 * address. 
551 */ 552 hvs_addr_set(&pcb->local_addr, 0); 553 554 for (i = previous_auto_bound_port - 1; 555 i != previous_auto_bound_port; i --) { 556 if (i == MIN_PORT) 557 i = MAX_PORT; 558 559 pcb->local_addr.hvs_port = i; 560 561 if (__hvs_find_socket_on_list(&pcb->local_addr, 562 HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) { 563 found_auto_bound_port = true; 564 previous_auto_bound_port = i; 565 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 566 "%s: found local bound port is %x\n", 567 __func__, pcb->local_addr.hvs_port); 568 break; 569 } 570 } 571 572 if (found_auto_bound_port == true) { 573 /* Found available port for auto bound, put on list */ 574 __hvs_insert_socket_on_list(so, HVS_LIST_BOUND); 575 /* Set VM service ID */ 576 pcb->vm_srv_id = srv_id_template; 577 set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port); 578 /* Set host service ID and remote port */ 579 pcb->host_srv_id = srv_id_template; 580 set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port); 581 hvs_addr_set(&pcb->remote_addr, raddr->hvs_port); 582 583 /* Change the socket state to SS_ISCONNECTING */ 584 soisconnecting(so); 585 } else { 586 HVSOCK_DBG(HVSOCK_DBG_ERR, 587 "%s: No local port available for auto bound\n", 588 __func__); 589 error = EADDRINUSE; 590 } 591 592 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is "); 593 hvsock_print_guid(&pcb->vm_srv_id); 594 HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is "); 595 hvsock_print_guid(&pcb->host_srv_id); 596 597 out: 598 mtx_unlock(&hvs_trans_socks_mtx); 599 600 if (found_auto_bound_port == true) 601 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id); 602 603 return (error); 604 } 605 606 int 607 hvs_trans_disconnect(struct socket *so) 608 { 609 struct hvs_pcb *pcb; 610 611 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 612 "%s: HyperV Socket hvs_trans_disconnect called\n", __func__); 613 614 (void) hvs_trans_lock(); 615 pcb = so2hvspcb(so); 616 if (pcb == NULL) { 617 hvs_trans_unlock(); 618 return (EINVAL); 619 } 620 621 /* If socket is already 
disconnected, skip this */ 622 if ((so->so_state & SS_ISDISCONNECTED) == 0) 623 soisdisconnecting(so); 624 625 hvs_trans_unlock(); 626 627 return (0); 628 } 629 630 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 631 struct hvs_callback_arg { 632 struct uio *uio; 633 struct sockbuf *sb; 634 }; 635 636 int 637 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr, 638 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 639 { 640 struct hvs_pcb *pcb = so2hvspcb(so); 641 struct sockbuf *sb; 642 ssize_t orig_resid; 643 uint32_t canread, to_read; 644 int flags, error = 0; 645 struct hvs_callback_arg cbarg; 646 647 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 648 "%s: HyperV Socket hvs_trans_soreceive called\n", __func__); 649 650 if (so->so_type != SOCK_STREAM) 651 return (EINVAL); 652 if (pcb == NULL) 653 return (EINVAL); 654 655 if (flagsp != NULL) 656 flags = *flagsp &~ MSG_EOR; 657 else 658 flags = 0; 659 660 if (flags & MSG_PEEK) 661 return (EOPNOTSUPP); 662 663 /* If no space to copy out anything */ 664 if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ) 665 return (EINVAL); 666 667 orig_resid = uio->uio_resid; 668 669 /* Prevent other readers from entering the socket. */ 670 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); 671 if (error) { 672 HVSOCK_DBG(HVSOCK_DBG_ERR, 673 "%s: soiolock returned error = %d\n", __func__, error); 674 return (error); 675 } 676 677 sb = &so->so_rcv; 678 SOCKBUF_LOCK(sb); 679 680 cbarg.uio = uio; 681 cbarg.sb = sb; 682 /* 683 * If the socket is closing, there might still be some data 684 * in rx br to read. However we need to make sure 685 * the channel is still open. 
686 */ 687 if ((sb->sb_state & SBS_CANTRCVMORE) && 688 (so->so_state & SS_ISDISCONNECTED)) { 689 /* Other thread already closed the channel */ 690 error = EPIPE; 691 goto out; 692 } 693 694 while (true) { 695 while (uio->uio_resid > 0 && 696 (canread = hvsock_canread_check(pcb)) > 0) { 697 to_read = MIN(canread, uio->uio_resid); 698 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 699 "%s: to_read = %u, skip = %u\n", __func__, to_read, 700 (unsigned int)(sizeof(struct hvs_pkt_header) + 701 pcb->recv_data_off)); 702 703 error = vmbus_chan_recv_peek_call(pcb->chan, to_read, 704 sizeof(struct hvs_pkt_header) + pcb->recv_data_off, 705 hvsock_br_callback, (void *)&cbarg); 706 /* 707 * It is possible socket is disconnected becasue 708 * we released lock in hvsock_br_callback. So we 709 * need to check the state to make sure it is not 710 * disconnected. 711 */ 712 if (error || so->so_state & SS_ISDISCONNECTED) { 713 break; 714 } 715 716 pcb->recv_data_len -= to_read; 717 pcb->recv_data_off += to_read; 718 } 719 720 if (error) 721 break; 722 723 /* Abort if socket has reported problems. */ 724 if (so->so_error) { 725 if (so->so_error == ESHUTDOWN && 726 orig_resid > uio->uio_resid) { 727 /* 728 * Although we got a FIN, we also received 729 * some data in this round. Delivery it 730 * to user. 731 */ 732 error = 0; 733 } else { 734 if (so->so_error != ESHUTDOWN) 735 error = so->so_error; 736 } 737 738 break; 739 } 740 741 /* Cannot received more. 
*/ 742 if (sb->sb_state & SBS_CANTRCVMORE) 743 break; 744 745 /* We are done if buffer has been filled */ 746 if (uio->uio_resid == 0) 747 break; 748 749 if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid) 750 break; 751 752 /* Buffer ring is empty and we shall not block */ 753 if ((so->so_state & SS_NBIO) || 754 (flags & (MSG_DONTWAIT|MSG_NBIO))) { 755 if (orig_resid == uio->uio_resid) { 756 /* We have not read anything */ 757 error = EAGAIN; 758 } 759 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 760 "%s: non blocked read return, error %d.\n", 761 __func__, error); 762 break; 763 } 764 765 /* 766 * Wait and block until (more) data comes in. 767 * Note: Drops the sockbuf lock during wait. 768 */ 769 error = sbwait(sb); 770 771 if (error) 772 break; 773 774 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 775 "%s: wake up from sbwait, read available is %u\n", 776 __func__, vmbus_chan_read_available(pcb->chan)); 777 } 778 779 out: 780 SOCKBUF_UNLOCK(sb); 781 SOCK_IO_RECV_UNLOCK(so); 782 783 /* We recieved a FIN in this call */ 784 if (so->so_error == ESHUTDOWN) { 785 if (so->so_snd.sb_state & SBS_CANTSENDMORE) { 786 /* Send has already closed */ 787 soisdisconnecting(so); 788 } else { 789 /* Just close the receive side */ 790 socantrcvmore(so); 791 } 792 } 793 794 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 795 "%s: returning error = %d, so_error = %d\n", 796 __func__, error, so->so_error); 797 798 return (error); 799 } 800 801 int 802 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 803 struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td) 804 { 805 struct hvs_pcb *pcb = so2hvspcb(so); 806 struct sockbuf *sb; 807 ssize_t orig_resid; 808 uint32_t canwrite, to_write; 809 int error = 0; 810 811 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 812 "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n", 813 __func__, uio->uio_resid); 814 815 if (so->so_type != SOCK_STREAM) 816 return (EINVAL); 817 if (pcb == NULL) 818 return (EINVAL); 819 820 /* If nothing to send */ 821 if 
(uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE) 822 return (EINVAL); 823 824 orig_resid = uio->uio_resid; 825 826 /* Prevent other writers from entering the socket. */ 827 error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); 828 if (error) { 829 HVSOCK_DBG(HVSOCK_DBG_ERR, 830 "%s: soiolocak returned error = %d\n", __func__, error); 831 return (error); 832 } 833 834 sb = &so->so_snd; 835 SOCKBUF_LOCK(sb); 836 837 if ((sb->sb_state & SBS_CANTSENDMORE) || 838 so->so_error == ESHUTDOWN) { 839 error = EPIPE; 840 goto out; 841 } 842 843 while (uio->uio_resid > 0) { 844 canwrite = hvsock_canwrite_check(pcb); 845 if (canwrite == 0) { 846 /* We have sent some data */ 847 if (orig_resid > uio->uio_resid) 848 break; 849 /* 850 * We have not sent any data and it is 851 * non-blocked io 852 */ 853 if (so->so_state & SS_NBIO || 854 (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { 855 error = EWOULDBLOCK; 856 break; 857 } else { 858 /* 859 * We are here because there is no space on 860 * send buffer ring. Signal the other side 861 * to read and free more space. 862 * Sleep wait until space avaiable to send 863 * Note: Drops the sockbuf lock during wait. 
864 */ 865 error = sbwait(sb); 866 867 if (error) 868 break; 869 870 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 871 "%s: wake up from sbwait, space avail on " 872 "tx ring is %u\n", 873 __func__, 874 vmbus_chan_write_available(pcb->chan)); 875 876 continue; 877 } 878 } 879 to_write = MIN(canwrite, uio->uio_resid); 880 to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ); 881 882 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 883 "%s: canwrite is %u, to_write = %u\n", __func__, 884 canwrite, to_write); 885 error = hvsock_send_data(pcb->chan, uio, to_write, sb); 886 887 if (error) 888 break; 889 } 890 891 out: 892 SOCKBUF_UNLOCK(sb); 893 SOCK_IO_SEND_UNLOCK(so); 894 895 return (error); 896 } 897 898 int 899 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam) 900 { 901 struct hvs_pcb *pcb = so2hvspcb(so); 902 903 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 904 "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__); 905 906 if (pcb == NULL) 907 return (EINVAL); 908 909 *nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT); 910 911 return ((*nam == NULL)? ENOMEM : 0); 912 } 913 914 int 915 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam) 916 { 917 struct hvs_pcb *pcb = so2hvspcb(so); 918 919 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 920 "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__); 921 922 if (pcb == NULL) 923 return (EINVAL); 924 925 *nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT); 926 927 return ((*nam == NULL)? 
ENOMEM : 0); 928 } 929 930 void 931 hvs_trans_close(struct socket *so) 932 { 933 struct hvs_pcb *pcb; 934 935 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 936 "%s: HyperV Socket hvs_trans_close called\n", __func__); 937 938 (void) hvs_trans_lock(); 939 pcb = so2hvspcb(so); 940 if (!pcb) { 941 hvs_trans_unlock(); 942 return; 943 } 944 945 if (so->so_state & SS_ISCONNECTED) { 946 /* Send a FIN to peer */ 947 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 948 "%s: hvs_trans_close sending a FIN to host\n", __func__); 949 (void) hvsock_send_data(pcb->chan, NULL, 0, NULL); 950 } 951 952 if (so->so_state & 953 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 954 soisdisconnected(so); 955 956 pcb->chan = NULL; 957 pcb->so = NULL; 958 959 if (SOLISTENING(so)) { 960 mtx_lock(&hvs_trans_socks_mtx); 961 /* Remove from bound list */ 962 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 963 mtx_unlock(&hvs_trans_socks_mtx); 964 } 965 966 hvs_trans_unlock(); 967 968 return; 969 } 970 971 void 972 hvs_trans_abort(struct socket *so) 973 { 974 struct hvs_pcb *pcb = so2hvspcb(so); 975 976 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 977 "%s: HyperV Socket hvs_trans_abort called\n", __func__); 978 979 (void) hvs_trans_lock(); 980 if (pcb == NULL) { 981 hvs_trans_unlock(); 982 return; 983 } 984 985 if (SOLISTENING(so)) { 986 mtx_lock(&hvs_trans_socks_mtx); 987 /* Remove from bound list */ 988 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 989 mtx_unlock(&hvs_trans_socks_mtx); 990 } 991 992 if (so->so_state & SS_ISCONNECTED) { 993 (void) sodisconnect(so); 994 } 995 hvs_trans_unlock(); 996 997 return; 998 } 999 1000 int 1001 hvs_trans_shutdown(struct socket *so) 1002 { 1003 struct hvs_pcb *pcb = so2hvspcb(so); 1004 struct sockbuf *sb; 1005 1006 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1007 "%s: HyperV Socket hvs_trans_shutdown called\n", __func__); 1008 1009 if (pcb == NULL) 1010 return (EINVAL); 1011 1012 /* 1013 * Only get called with the shutdown method is SHUT_WR or 1014 * SHUT_RDWR. 
1015 * When the method is SHUT_RD or SHUT_RDWR, the caller 1016 * already set the SBS_CANTRCVMORE on receive side socket 1017 * buffer. 1018 */ 1019 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { 1020 /* 1021 * SHUT_WR only case. 1022 * Receive side is still open. Just close 1023 * the send side. 1024 */ 1025 socantsendmore(so); 1026 } else { 1027 /* SHUT_RDWR case */ 1028 if (so->so_state & SS_ISCONNECTED) { 1029 /* Send a FIN to peer */ 1030 sb = &so->so_snd; 1031 SOCKBUF_LOCK(sb); 1032 (void) hvsock_send_data(pcb->chan, NULL, 0, sb); 1033 SOCKBUF_UNLOCK(sb); 1034 1035 soisdisconnecting(so); 1036 } 1037 } 1038 1039 return (0); 1040 } 1041 1042 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is 1043 * <port> (see struct sockaddr_hvs). 1044 * 1045 * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV: 1046 * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user- 1047 * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with 1048 * the below sockaddr: 1049 * 1050 * struct SOCKADDR_HV 1051 * { 1052 * ADDRESS_FAMILY Family; 1053 * USHORT Reserved; 1054 * GUID VmId; 1055 * GUID ServiceId; 1056 * }; 1057 * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via 1058 * VMBus, because here it's obvious the host and the VM can easily identify 1059 * each other. Though the VmID is useful on the host, especially in the case 1060 * of Windows container, FreeBSD VM doesn't need it at all. 1061 * 1062 * To be compatible with similar infrastructure in Linux VMs, we have 1063 * to limit the available GUID space of SOCKADDR_HV so that we can create 1064 * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID. 
 * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
 *
 ****************************************************************************
 * The only valid Service GUIDs, from the perspectives of both the host and *
 * FreeBSD VM, that can be connected by the other end, must conform to this *
 * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
 ****************************************************************************
 *
 * When we write apps on the host to connect(), the GUID ServiceID is used.
 * When we write apps in FreeBSD VM to connect(), we only need to specify the
 * port and the driver will form the GUID and use that to request the host.
 *
 * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
 * auto-generated remote port for a connect request initiated by the host's
 * connect()) is set to HVADDR_PORT_UNKNOWN, which is not really used on the
 * FreeBSD guest.
 */

/*
 * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
 * restrict HyperV socket ring buffer size to six 4K pages. Newer
 * HyperV hosts don't have this limit.
1087 */ 1088 #define HVS_RINGBUF_RCV_SIZE (PAGE_SIZE * 6) 1089 #define HVS_RINGBUF_SND_SIZE (PAGE_SIZE * 6) 1090 #define HVS_RINGBUF_MAX_SIZE (PAGE_SIZE * 64) 1091 1092 struct hvsock_sc { 1093 device_t dev; 1094 struct hvs_pcb *pcb; 1095 struct vmbus_channel *channel; 1096 }; 1097 1098 static bool 1099 hvsock_chan_readable(struct vmbus_channel *chan) 1100 { 1101 uint32_t readable = vmbus_chan_read_available(chan); 1102 1103 return (readable >= HVSOCK_PKT_LEN(0)); 1104 } 1105 1106 static void 1107 hvsock_chan_cb(struct vmbus_channel *chan, void *context) 1108 { 1109 struct hvs_pcb *pcb = (struct hvs_pcb *) context; 1110 struct socket *so; 1111 uint32_t canwrite; 1112 1113 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1114 "%s: host send us a wakeup on rb data, pcb = %p\n", 1115 __func__, pcb); 1116 1117 /* 1118 * Check if the socket is still attached and valid. 1119 * Here we know channel is still open. Need to make 1120 * sure the socket has not been closed or freed. 1121 */ 1122 (void) hvs_trans_lock(); 1123 so = hsvpcb2so(pcb); 1124 1125 if (pcb->chan != NULL && so != NULL) { 1126 /* 1127 * Wake up reader if there are data to read. 1128 */ 1129 SOCKBUF_LOCK(&(so)->so_rcv); 1130 1131 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1132 "%s: read available = %u\n", __func__, 1133 vmbus_chan_read_available(pcb->chan)); 1134 1135 if (hvsock_chan_readable(pcb->chan)) 1136 sorwakeup_locked(so); 1137 else 1138 SOCKBUF_UNLOCK(&(so)->so_rcv); 1139 1140 /* 1141 * Wake up sender if space becomes available to write. 
1142 */ 1143 SOCKBUF_LOCK(&(so)->so_snd); 1144 canwrite = hvsock_canwrite_check(pcb); 1145 1146 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1147 "%s: canwrite = %u\n", __func__, canwrite); 1148 1149 if (canwrite > 0) { 1150 sowwakeup_locked(so); 1151 } else { 1152 SOCKBUF_UNLOCK(&(so)->so_snd); 1153 } 1154 } 1155 1156 hvs_trans_unlock(); 1157 1158 return; 1159 } 1160 1161 static int 1162 hvsock_br_callback(void *datap, int cplen, void *cbarg) 1163 { 1164 struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg; 1165 struct uio *uio = arg->uio; 1166 struct sockbuf *sb = arg->sb; 1167 int error = 0; 1168 1169 if (cbarg == NULL || datap == NULL) 1170 return (EINVAL); 1171 1172 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1173 "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, " 1174 "datap = %p\n", 1175 __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br", 1176 uio->uio_resid, cplen, datap); 1177 1178 if (sb) 1179 SOCKBUF_UNLOCK(sb); 1180 1181 error = uiomove(datap, cplen, uio); 1182 1183 if (sb) 1184 SOCKBUF_LOCK(sb); 1185 1186 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1187 "%s: after uiomove, uio_resid = %zd, error = %d\n", 1188 __func__, uio->uio_resid, error); 1189 1190 return (error); 1191 } 1192 1193 static int 1194 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio, 1195 uint32_t to_write, struct sockbuf *sb) 1196 { 1197 struct hvs_pkt_header hvs_pkt; 1198 int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0; 1199 uint64_t pad = 0; 1200 struct iovec iov[3]; 1201 struct hvs_callback_arg cbarg; 1202 1203 if (chan == NULL) 1204 return (ENOTCONN); 1205 1206 hlen = sizeof(struct vmbus_chanpkt_hdr); 1207 hvs_pkthlen = sizeof(struct hvs_pkt_header); 1208 hvs_pktlen = hvs_pkthlen + to_write; 1209 pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen); 1210 1211 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1212 "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, " 1213 "pad_pktlen = %u, data_len = %u\n", 1214 __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write); 1215 1216 
hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND; 1217 hvs_pkt.chan_pkt_hdr.cph_flags = 0; 1218 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen); 1219 VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen); 1220 hvs_pkt.chan_pkt_hdr.cph_xactid = 0; 1221 1222 hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1; 1223 hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write; 1224 1225 cbarg.uio = uio; 1226 cbarg.sb = sb; 1227 1228 if (uio && to_write > 0) { 1229 iov[0].iov_base = &hvs_pkt; 1230 iov[0].iov_len = hvs_pkthlen; 1231 iov[1].iov_base = NULL; 1232 iov[1].iov_len = to_write; 1233 iov[2].iov_base = &pad; 1234 iov[2].iov_len = pad_pktlen - hvs_pktlen; 1235 1236 error = vmbus_chan_iov_send(chan, iov, 3, 1237 hvsock_br_callback, &cbarg); 1238 } else { 1239 if (to_write == 0) { 1240 iov[0].iov_base = &hvs_pkt; 1241 iov[0].iov_len = hvs_pkthlen; 1242 iov[1].iov_base = &pad; 1243 iov[1].iov_len = pad_pktlen - hvs_pktlen; 1244 error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL); 1245 } 1246 } 1247 1248 if (error) { 1249 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1250 "%s: error = %d\n", __func__, error); 1251 } 1252 1253 return (error); 1254 } 1255 1256 /* 1257 * Check if we have data on current ring buffer to read 1258 * or not. If not, advance the ring buffer read index to 1259 * next packet. Update the recev_data_len and recev_data_off 1260 * to new value. 1261 * Return the number of bytes can read. 
1262 */ 1263 static uint32_t 1264 hvsock_canread_check(struct hvs_pcb *pcb) 1265 { 1266 uint32_t advance; 1267 uint32_t tlen, hlen, dlen; 1268 uint32_t bytes_canread = 0; 1269 int error; 1270 1271 if (pcb == NULL || pcb->chan == NULL) { 1272 pcb->so->so_error = EIO; 1273 return (0); 1274 } 1275 1276 /* Still have data not read yet on current packet */ 1277 if (pcb->recv_data_len > 0) 1278 return (pcb->recv_data_len); 1279 1280 if (pcb->rb_init) 1281 advance = 1282 VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1283 else 1284 advance = 0; 1285 1286 bytes_canread = vmbus_chan_read_available(pcb->chan); 1287 1288 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1289 "%s: bytes_canread on br = %u, advance = %u\n", 1290 __func__, bytes_canread, advance); 1291 1292 if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) { 1293 /* 1294 * Nothing to read. Need to advance the rindex before 1295 * calling sbwait, so host knows to wake us up when data 1296 * is available to read on rb. 1297 */ 1298 error = vmbus_chan_recv_idxadv(pcb->chan, advance); 1299 if (error) { 1300 HVSOCK_DBG(HVSOCK_DBG_ERR, 1301 "%s: after calling vmbus_chan_recv_idxadv, " 1302 "got error = %d\n", __func__, error); 1303 return (0); 1304 } else { 1305 pcb->rb_init = false; 1306 pcb->recv_data_len = 0; 1307 pcb->recv_data_off = 0; 1308 bytes_canread = vmbus_chan_read_available(pcb->chan); 1309 1310 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1311 "%s: advanced %u bytes, " 1312 " bytes_canread on br now = %u\n", 1313 __func__, advance, bytes_canread); 1314 1315 if (bytes_canread == 0) 1316 return (0); 1317 else 1318 advance = 0; 1319 } 1320 } 1321 1322 if (bytes_canread < 1323 advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t))) 1324 return (0); 1325 1326 error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt, 1327 sizeof(struct hvs_pkt_header), advance); 1328 1329 /* Don't have anything to read */ 1330 if (error) { 1331 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1332 "%s: after calling vmbus_chan_recv_peek, got 
error = %d\n", 1333 __func__, error); 1334 return (0); 1335 } 1336 1337 /* 1338 * We just read in a new packet header. Do some sanity checks. 1339 */ 1340 tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen); 1341 hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen); 1342 dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size; 1343 if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) || 1344 __predict_false(hlen > tlen) || 1345 __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) { 1346 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1347 "invalid tlen(%u), hlen(%u) or dlen(%u)\n", 1348 tlen, hlen, dlen); 1349 pcb->so->so_error = EIO; 1350 return (0); 1351 } 1352 if (pcb->rb_init == false) 1353 pcb->rb_init = true; 1354 1355 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1356 "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n", 1357 tlen, hlen, dlen); 1358 1359 /* The other side has sent a close FIN */ 1360 if (dlen == 0) { 1361 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1362 "%s: Received FIN from other side\n", __func__); 1363 /* inform the caller by seting so_error to ESHUTDOWN */ 1364 pcb->so->so_error = ESHUTDOWN; 1365 } 1366 1367 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1368 "%s: canread on receive ring is %u \n", __func__, dlen); 1369 1370 pcb->recv_data_len = dlen; 1371 pcb->recv_data_off = 0; 1372 1373 return (pcb->recv_data_len); 1374 } 1375 1376 static uint32_t 1377 hvsock_canwrite_check(struct hvs_pcb *pcb) 1378 { 1379 uint32_t writeable; 1380 uint32_t ret; 1381 1382 if (pcb == NULL || pcb->chan == NULL) 1383 return (0); 1384 1385 writeable = vmbus_chan_write_available(pcb->chan); 1386 1387 /* 1388 * We must always reserve a 0-length-payload packet for the FIN. 1389 */ 1390 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1391 "%s: writeable is %u, should be greater than %ju\n", 1392 __func__, writeable, 1393 (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0))); 1394 1395 if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) { 1396 /* 1397 * The Tx ring seems full. 
1398 */ 1399 return (0); 1400 } 1401 1402 ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0); 1403 1404 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1405 "%s: available size is %u\n", __func__, rounddown2(ret, 8)); 1406 1407 return (rounddown2(ret, 8)); 1408 } 1409 1410 static void 1411 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan) 1412 { 1413 vmbus_chan_set_pending_send_size(chan, 1414 HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ)); 1415 } 1416 1417 static int 1418 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so) 1419 { 1420 unsigned int rcvbuf, sndbuf; 1421 struct hvs_pcb *pcb = so2hvspcb(so); 1422 int ret; 1423 1424 if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) { 1425 sndbuf = HVS_RINGBUF_SND_SIZE; 1426 rcvbuf = HVS_RINGBUF_RCV_SIZE; 1427 } else { 1428 sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE); 1429 sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE); 1430 sndbuf = rounddown2(sndbuf, PAGE_SIZE); 1431 rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE); 1432 rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE); 1433 rcvbuf = rounddown2(rcvbuf, PAGE_SIZE); 1434 } 1435 1436 /* 1437 * Can only read whatever user provided size of data 1438 * from ring buffer. Turn off batched reading. 1439 */ 1440 vmbus_chan_set_readbatch(chan, false); 1441 1442 ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0, 1443 hvsock_chan_cb, pcb); 1444 1445 if (ret != 0) { 1446 HVSOCK_DBG(HVSOCK_DBG_ERR, 1447 "%s: failed to open hvsock channel, sndbuf = %u, " 1448 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1449 } else { 1450 HVSOCK_DBG(HVSOCK_DBG_INFO, 1451 "%s: hvsock channel opened, sndbuf = %u, i" 1452 "rcvbuf = %u\n", __func__, sndbuf, rcvbuf); 1453 /* 1454 * Se the pending send size so to receive wakeup 1455 * signals from host when there is enough space on 1456 * rx buffer ring to write. 1457 */ 1458 hvsock_set_chan_pending_send_size(chan); 1459 } 1460 1461 return ret; 1462 } 1463 1464 /* 1465 * Guest is listening passively on the socket. 
Open channel and 1466 * create a new socket for the conneciton. 1467 */ 1468 static void 1469 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so, 1470 struct hvsock_sc *sc) 1471 { 1472 struct socket *new_so; 1473 struct hvs_pcb *new_pcb, *pcb; 1474 int error; 1475 1476 /* Do nothing if socket is not listening */ 1477 if (!SOLISTENING(so)) { 1478 HVSOCK_DBG(HVSOCK_DBG_ERR, 1479 "%s: socket is not a listening one\n", __func__); 1480 return; 1481 } 1482 1483 /* 1484 * Create a new socket. This will call pru_attach to complete 1485 * the socket initialization and put the new socket onto 1486 * listening socket's sol_incomp list, waiting to be promoted 1487 * to sol_comp list. 1488 * The new socket created has ref count 0. There is no other 1489 * thread that changes the state of this new one at the 1490 * moment, so we don't need to hold its lock while opening 1491 * channel and filling out its pcb information. 1492 */ 1493 new_so = sonewconn(so, 0); 1494 if (!new_so) 1495 HVSOCK_DBG(HVSOCK_DBG_ERR, 1496 "%s: creating new socket failed\n", __func__); 1497 1498 /* 1499 * Now open the vmbus channel. If it fails, the socket will be 1500 * on the listening socket's sol_incomp queue until it is 1501 * replaced and aborted. 
1502 */ 1503 error = hvsock_open_channel(chan, new_so); 1504 if (error) { 1505 new_so->so_error = error; 1506 return; 1507 } 1508 1509 pcb = so->so_pcb; 1510 new_pcb = new_so->so_pcb; 1511 1512 hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port); 1513 /* Remote port is unknown to guest in this type of conneciton */ 1514 hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN); 1515 new_pcb->chan = chan; 1516 new_pcb->recv_data_len = 0; 1517 new_pcb->recv_data_off = 0; 1518 new_pcb->rb_init = false; 1519 1520 new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan); 1521 new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan); 1522 1523 hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED); 1524 1525 sc->pcb = new_pcb; 1526 1527 /* 1528 * Change the socket state to SS_ISCONNECTED. This will promote 1529 * the socket to sol_comp queue and wake up the thread which 1530 * is accepting connection. 1531 */ 1532 soisconnected(new_so); 1533 } 1534 1535 1536 /* 1537 * Guest is actively connecting to host. 1538 */ 1539 static void 1540 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so) 1541 { 1542 struct hvs_pcb *pcb; 1543 int error; 1544 1545 error = hvsock_open_channel(chan, so); 1546 if (error) { 1547 so->so_error = error; 1548 return; 1549 } 1550 1551 pcb = so->so_pcb; 1552 pcb->chan = chan; 1553 pcb->recv_data_len = 0; 1554 pcb->recv_data_off = 0; 1555 pcb->rb_init = false; 1556 1557 mtx_lock(&hvs_trans_socks_mtx); 1558 __hvs_remove_socket_from_list(so, HVS_LIST_BOUND); 1559 __hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED); 1560 mtx_unlock(&hvs_trans_socks_mtx); 1561 1562 /* 1563 * Change the socket state to SS_ISCONNECTED. This will wake up 1564 * the thread sleeping in connect call. 
1565 */ 1566 soisconnected(so); 1567 } 1568 1569 static void 1570 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc) 1571 { 1572 struct hyperv_guid *inst_guid, *type_guid; 1573 bool conn_from_host; 1574 struct sockaddr_hvs addr; 1575 struct socket *so; 1576 struct hvs_pcb *pcb; 1577 1578 type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan); 1579 inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan); 1580 conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan); 1581 1582 HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is "); 1583 hvsock_print_guid(type_guid); 1584 HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is "); 1585 hvsock_print_guid(inst_guid); 1586 HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n", 1587 (conn_from_host == true ) ? "from" : "to"); 1588 1589 /* 1590 * The listening port should be in [0, MAX_LISTEN_PORT] 1591 */ 1592 if (!is_valid_srv_id(type_guid)) 1593 return; 1594 1595 /* 1596 * There should be a bound socket already created no matter 1597 * it is a passive or active connection. 1598 * For host initiated connection (passive on guest side), 1599 * the type_guid contains the port which guest is bound and 1600 * listening. 1601 * For the guest initiated connection (active on guest side), 1602 * the inst_guid contains the port that guest has auto bound 1603 * to. 1604 */ 1605 hvs_addr_init(&addr, conn_from_host ? 
type_guid : inst_guid); 1606 so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND); 1607 if (!so) { 1608 HVSOCK_DBG(HVSOCK_DBG_ERR, 1609 "%s: no bound socket found for port %u\n", 1610 __func__, addr.hvs_port); 1611 return; 1612 } 1613 1614 if (conn_from_host) { 1615 hvsock_open_conn_passive(chan, so, sc); 1616 } else { 1617 (void) hvs_trans_lock(); 1618 pcb = so->so_pcb; 1619 if (pcb && pcb->so) { 1620 sc->pcb = so2hvspcb(so); 1621 hvsock_open_conn_active(chan, so); 1622 } else { 1623 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1624 "%s: channel detached before open\n", __func__); 1625 } 1626 hvs_trans_unlock(); 1627 } 1628 1629 } 1630 1631 static int 1632 hvsock_probe(device_t dev) 1633 { 1634 struct vmbus_channel *channel = vmbus_get_channel(dev); 1635 1636 if (!channel || !vmbus_chan_is_hvs(channel)) { 1637 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1638 "hvsock_probe called but not a hvsock channel id %u\n", 1639 vmbus_chan_id(channel)); 1640 1641 return ENXIO; 1642 } else { 1643 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1644 "hvsock_probe got a hvsock channel id %u\n", 1645 vmbus_chan_id(channel)); 1646 1647 return BUS_PROBE_DEFAULT; 1648 } 1649 } 1650 1651 static int 1652 hvsock_attach(device_t dev) 1653 { 1654 struct vmbus_channel *channel = vmbus_get_channel(dev); 1655 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1656 1657 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n"); 1658 1659 hvsock_open_connection(channel, sc); 1660 1661 /* 1662 * Always return success. On error the host will rescind the device 1663 * in 30 seconds and we can do cleanup at that time in 1664 * vmbus_chan_msgproc_chrescind(). 
1665 */ 1666 return (0); 1667 } 1668 1669 static int 1670 hvsock_detach(device_t dev) 1671 { 1672 struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev); 1673 struct socket *so; 1674 int retry; 1675 1676 if (bootverbose) 1677 device_printf(dev, "hvsock_detach called.\n"); 1678 1679 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n"); 1680 1681 if (sc->pcb != NULL) { 1682 (void) hvs_trans_lock(); 1683 1684 so = hsvpcb2so(sc->pcb); 1685 if (so) { 1686 /* Close the connection */ 1687 if (so->so_state & 1688 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) 1689 soisdisconnected(so); 1690 } 1691 1692 mtx_lock(&hvs_trans_socks_mtx); 1693 __hvs_remove_pcb_from_list(sc->pcb, 1694 HVS_LIST_BOUND | HVS_LIST_CONNECTED); 1695 mtx_unlock(&hvs_trans_socks_mtx); 1696 1697 /* 1698 * Close channel while no reader and sender are working 1699 * on the buffer rings. 1700 */ 1701 if (so) { 1702 retry = 0; 1703 while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) { 1704 /* 1705 * Someone is reading, rx br is busy 1706 */ 1707 soisdisconnected(so); 1708 DELAY(500); 1709 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1710 "waiting for rx reader to exit, " 1711 "retry = %d\n", retry++); 1712 } 1713 retry = 0; 1714 while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) { 1715 /* 1716 * Someone is sending, tx br is busy 1717 */ 1718 soisdisconnected(so); 1719 DELAY(500); 1720 HVSOCK_DBG(HVSOCK_DBG_VERBOSE, 1721 "waiting for tx sender to exit, " 1722 "retry = %d\n", retry++); 1723 } 1724 } 1725 1726 1727 bzero(sc->pcb, sizeof(struct hvs_pcb)); 1728 free(sc->pcb, M_HVSOCK); 1729 sc->pcb = NULL; 1730 1731 if (so) { 1732 SOCK_IO_RECV_UNLOCK(so); 1733 SOCK_IO_SEND_UNLOCK(so); 1734 so->so_pcb = NULL; 1735 } 1736 1737 hvs_trans_unlock(); 1738 } 1739 1740 vmbus_chan_close(vmbus_get_channel(dev)); 1741 1742 return (0); 1743 } 1744 1745 static device_method_t hvsock_methods[] = { 1746 /* Device interface */ 1747 DEVMETHOD(device_probe, hvsock_probe), 1748 DEVMETHOD(device_attach, hvsock_attach), 1749 
DEVMETHOD(device_detach, hvsock_detach), 1750 DEVMETHOD_END 1751 }; 1752 1753 static driver_t hvsock_driver = { 1754 "hv_sock", 1755 hvsock_methods, 1756 sizeof(struct hvsock_sc) 1757 }; 1758 1759 static devclass_t hvsock_devclass; 1760 1761 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, hvsock_devclass, NULL, NULL); 1762 MODULE_VERSION(hvsock, 1); 1763 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1); 1764