1 /* 2 * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 * 32 */ 33 #include <linux/module.h> 34 #include <linux/errno.h> 35 #include <linux/kernel.h> 36 #include <linux/gfp.h> 37 #include <linux/in.h> 38 #include <linux/ipv6.h> 39 #include <linux/poll.h> 40 #include <net/sock.h> 41 42 #include "rds.h" 43 44 /* this is just used for stats gathering :/ */ 45 static DEFINE_SPINLOCK(rds_sock_lock); 46 static unsigned long rds_sock_count; 47 static LIST_HEAD(rds_sock_list); 48 DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq); 49 50 /* 51 * This is called as the final descriptor referencing this socket is closed. 52 * We have to unbind the socket so that another socket can be bound to the 53 * address it was using. 54 * 55 * We have to be careful about racing with the incoming path. sock_orphan() 56 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 57 * messages shouldn't be queued. 58 */ 59 static int rds_release(struct socket *sock) 60 { 61 struct sock *sk = sock->sk; 62 struct rds_sock *rs; 63 64 if (!sk) 65 goto out; 66 67 rs = rds_sk_to_rs(sk); 68 69 sock_orphan(sk); 70 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so 71 * that ensures the recv path has completed messing 72 * with the socket. */ 73 rds_clear_recv_queue(rs); 74 rds_cong_remove_socket(rs); 75 76 rds_remove_bound(rs); 77 78 rds_send_drop_to(rs, NULL); 79 rds_rdma_drop_keys(rs); 80 rds_notify_queue_get(rs, NULL); 81 rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); 82 83 spin_lock_bh(&rds_sock_lock); 84 list_del_init(&rs->rs_item); 85 rds_sock_count--; 86 spin_unlock_bh(&rds_sock_lock); 87 88 rds_trans_put(rs->rs_transport); 89 90 sock->sk = NULL; 91 sock_put(sk); 92 out: 93 return 0; 94 } 95 96 /* 97 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep. 98 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 99 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 100 * this seems more conservative. 101 * NB - normally, one would use sk_callback_lock for this, but we can 102 * get here from interrupts, whereas the network code grabs sk_callback_lock 103 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 104 */ 105 void rds_wake_sk_sleep(struct rds_sock *rs) 106 { 107 unsigned long flags; 108 109 read_lock_irqsave(&rs->rs_recv_lock, flags); 110 __rds_wake_sk_sleep(rds_rs_to_sk(rs)); 111 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 112 } 113 114 static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 115 int peer) 116 { 117 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 118 struct sockaddr_in6 *sin6; 119 struct sockaddr_in *sin; 120 int uaddr_len; 121 122 /* racey, don't care */ 123 if (peer) { 124 if (ipv6_addr_any(&rs->rs_conn_addr)) 125 return -ENOTCONN; 126 127 if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { 128 sin = (struct sockaddr_in *)uaddr; 129 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 130 sin->sin_family = AF_INET; 131 sin->sin_port = rs->rs_conn_port; 132 sin->sin_addr.s_addr = rs->rs_conn_addr_v4; 133 uaddr_len = sizeof(*sin); 134 } else { 135 sin6 = (struct sockaddr_in6 *)uaddr; 136 sin6->sin6_family = AF_INET6; 137 sin6->sin6_port = rs->rs_conn_port; 138 sin6->sin6_addr = rs->rs_conn_addr; 139 sin6->sin6_flowinfo = 0; 140 /* scope_id is the same as in the bound address. */ 141 sin6->sin6_scope_id = rs->rs_bound_scope_id; 142 uaddr_len = sizeof(*sin6); 143 } 144 } else { 145 /* If socket is not yet bound and the socket is connected, 146 * set the return address family to be the same as the 147 * connected address, but with 0 address value. If it is not 148 * connected, set the family to be AF_UNSPEC (value 0) and 149 * the address size to be that of an IPv4 address. 150 */ 151 if (ipv6_addr_any(&rs->rs_bound_addr)) { 152 if (ipv6_addr_any(&rs->rs_conn_addr)) { 153 sin = (struct sockaddr_in *)uaddr; 154 memset(sin, 0, sizeof(*sin)); 155 sin->sin_family = AF_UNSPEC; 156 return sizeof(*sin); 157 } 158 159 #if IS_ENABLED(CONFIG_IPV6) 160 if (!(ipv6_addr_type(&rs->rs_conn_addr) & 161 IPV6_ADDR_MAPPED)) { 162 sin6 = (struct sockaddr_in6 *)uaddr; 163 memset(sin6, 0, sizeof(*sin6)); 164 sin6->sin6_family = AF_INET6; 165 return sizeof(*sin6); 166 } 167 #endif 168 169 sin = (struct sockaddr_in *)uaddr; 170 memset(sin, 0, sizeof(*sin)); 171 sin->sin_family = AF_INET; 172 return sizeof(*sin); 173 } 174 if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 175 sin = (struct sockaddr_in *)uaddr; 176 memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 177 sin->sin_family = AF_INET; 178 sin->sin_port = rs->rs_bound_port; 179 sin->sin_addr.s_addr = rs->rs_bound_addr_v4; 180 uaddr_len = sizeof(*sin); 181 } else { 182 sin6 = (struct sockaddr_in6 *)uaddr; 183 sin6->sin6_family = AF_INET6; 184 sin6->sin6_port = rs->rs_bound_port; 185 sin6->sin6_addr = rs->rs_bound_addr; 186 sin6->sin6_flowinfo = 0; 187 sin6->sin6_scope_id = rs->rs_bound_scope_id; 188 uaddr_len = sizeof(*sin6); 189 } 190 } 191 192 return uaddr_len; 193 } 194 195 /* 196 * RDS' poll is without a doubt the least intuitive part of the interface, 197 * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from 198 * a network protocol. 199 * 200 * EPOLLIN is asserted if 201 * - there is data on the receive queue. 202 * - to signal that a previously congested destination may have become 203 * uncongested 204 * - A notification has been queued to the socket (this can be a congestion 205 * update, or a RDMA completion, or a MSG_ZEROCOPY completion). 206 * 207 * EPOLLOUT is asserted if there is room on the send queue. This does not mean 208 * however, that the next sendmsg() call will succeed. If the application tries 209 * to send to a congested destination, the system call may still fail (and 210 * return ENOBUFS). 211 */ 212 static __poll_t rds_poll(struct file *file, struct socket *sock, 213 poll_table *wait) 214 { 215 struct sock *sk = sock->sk; 216 struct rds_sock *rs = rds_sk_to_rs(sk); 217 __poll_t mask = 0; 218 unsigned long flags; 219 220 poll_wait(file, sk_sleep(sk), wait); 221 222 if (rs->rs_seen_congestion) 223 poll_wait(file, &rds_poll_waitq, wait); 224 225 read_lock_irqsave(&rs->rs_recv_lock, flags); 226 if (!rs->rs_cong_monitor) { 227 /* When a congestion map was updated, we signal EPOLLIN for 228 * "historical" reasons. Applications can also poll for 229 * WRBAND instead. */ 230 if (rds_cong_updated_since(&rs->rs_cong_track)) 231 mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND); 232 } else { 233 spin_lock(&rs->rs_lock); 234 if (rs->rs_cong_notify) 235 mask |= (EPOLLIN | EPOLLRDNORM); 236 spin_unlock(&rs->rs_lock); 237 } 238 if (!list_empty(&rs->rs_recv_queue) || 239 !list_empty(&rs->rs_notify_queue) || 240 !list_empty(&rs->rs_zcookie_queue.zcookie_head)) 241 mask |= (EPOLLIN | EPOLLRDNORM); 242 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) 243 mask |= (EPOLLOUT | EPOLLWRNORM); 244 if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) 245 mask |= EPOLLERR; 246 read_unlock_irqrestore(&rs->rs_recv_lock, flags); 247 248 /* clear state any time we wake a seen-congested socket */ 249 if (mask) 250 rs->rs_seen_congestion = 0; 251 252 return mask; 253 } 254 255 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 256 { 257 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 258 rds_tos_t utos, tos = 0; 259 260 switch (cmd) { 261 case SIOCRDSSETTOS: 262 if (get_user(utos, (rds_tos_t __user *)arg)) 263 return -EFAULT; 264 265 if (rs->rs_transport && 266 rs->rs_transport->get_tos_map) 267 tos = rs->rs_transport->get_tos_map(utos); 268 else 269 return -ENOIOCTLCMD; 270 271 spin_lock_bh(&rds_sock_lock); 272 if (rs->rs_tos || rs->rs_conn) { 273 spin_unlock_bh(&rds_sock_lock); 274 return -EINVAL; 275 } 276 rs->rs_tos = tos; 277 spin_unlock_bh(&rds_sock_lock); 278 break; 279 case SIOCRDSGETTOS: 280 spin_lock_bh(&rds_sock_lock); 281 tos = rs->rs_tos; 282 spin_unlock_bh(&rds_sock_lock); 283 if (put_user(tos, (rds_tos_t __user *)arg)) 284 return -EFAULT; 285 break; 286 default: 287 return -ENOIOCTLCMD; 288 } 289 290 return 0; 291 } 292 293 static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len) 294 { 295 struct sockaddr_in6 sin6; 296 struct sockaddr_in sin; 297 int ret = 0; 298 299 /* racing with another thread binding seems ok here */ 300 if (ipv6_addr_any(&rs->rs_bound_addr)) { 301 ret = -ENOTCONN; /* XXX not a great errno */ 302 goto out; 303 } 304 305 if (len < sizeof(struct sockaddr_in)) { 306 ret = -EINVAL; 307 goto out; 308 } else if (len < sizeof(struct sockaddr_in6)) { 309 /* Assume IPv4 */ 310 if (copy_from_sockptr(&sin, optval, 311 sizeof(struct sockaddr_in))) { 312 ret = -EFAULT; 313 goto out; 314 } 315 ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); 316 sin6.sin6_port = sin.sin_port; 317 } else { 318 if (copy_from_sockptr(&sin6, optval, 319 sizeof(struct sockaddr_in6))) { 320 ret = -EFAULT; 321 goto out; 322 } 323 } 324 325 rds_send_drop_to(rs, &sin6); 326 out: 327 return ret; 328 } 329 330 static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval, 331 int optlen) 332 { 333 int value; 334 335 if (optlen < sizeof(int)) 336 return -EINVAL; 337 if (copy_from_sockptr(&value, optval, sizeof(int))) 338 return -EFAULT; 339 *optvar = !!value; 340 return 0; 341 } 342 343 static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen) 344 { 345 int ret; 346 347 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 348 if (ret == 0) { 349 if (rs->rs_cong_monitor) { 350 rds_cong_add_socket(rs); 351 } else { 352 rds_cong_remove_socket(rs); 353 rs->rs_cong_mask = 0; 354 rs->rs_cong_notify = 0; 355 } 356 } 357 return ret; 358 } 359 360 static int rds_set_transport(struct net *net, struct rds_sock *rs, 361 sockptr_t optval, int optlen) 362 { 363 int t_type; 364 365 if (rs->rs_transport) 366 return -EOPNOTSUPP; /* previously attached to transport */ 367 368 if (optlen != sizeof(int)) 369 return -EINVAL; 370 371 if (copy_from_sockptr(&t_type, optval, sizeof(t_type))) 372 return -EFAULT; 373 374 if (t_type < 0 || t_type >= RDS_TRANS_COUNT) 375 return -EINVAL; 376 377 /* RDS/IB is restricted to the initial network namespace */ 378 if (t_type != RDS_TRANS_TCP && !net_eq(net, &init_net)) 379 return -EPROTOTYPE; 380 381 rs->rs_transport = rds_trans_get(t_type); 382 383 return rs->rs_transport ? 0 : -ENOPROTOOPT; 384 } 385 386 static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval, 387 int optlen, int optname) 388 { 389 int val, valbool; 390 391 if (optlen != sizeof(int)) 392 return -EFAULT; 393 394 if (copy_from_sockptr(&val, optval, sizeof(int))) 395 return -EFAULT; 396 397 valbool = val ? 1 : 0; 398 399 if (optname == SO_TIMESTAMP_NEW) 400 sock_set_flag(sk, SOCK_TSTAMP_NEW); 401 402 if (valbool) 403 sock_set_flag(sk, SOCK_RCVTSTAMP); 404 else 405 sock_reset_flag(sk, SOCK_RCVTSTAMP); 406 407 return 0; 408 } 409 410 static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, 411 int optlen) 412 { 413 struct rds_rx_trace_so trace; 414 int i; 415 416 if (optlen != sizeof(struct rds_rx_trace_so)) 417 return -EFAULT; 418 419 if (copy_from_sockptr(&trace, optval, sizeof(trace))) 420 return -EFAULT; 421 422 if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) 423 return -EFAULT; 424 425 rs->rs_rx_traces = trace.rx_traces; 426 for (i = 0; i < rs->rs_rx_traces; i++) { 427 if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { 428 rs->rs_rx_traces = 0; 429 return -EFAULT; 430 } 431 rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; 432 } 433 434 return 0; 435 } 436 437 static int rds_setsockopt(struct socket *sock, int level, int optname, 438 sockptr_t optval, unsigned int optlen) 439 { 440 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 441 struct net *net = sock_net(sock->sk); 442 int ret; 443 444 if (level != SOL_RDS) { 445 ret = -ENOPROTOOPT; 446 goto out; 447 } 448 449 switch (optname) { 450 case RDS_CANCEL_SENT_TO: 451 ret = rds_cancel_sent_to(rs, optval, optlen); 452 break; 453 case RDS_GET_MR: 454 ret = rds_get_mr(rs, optval, optlen); 455 break; 456 case RDS_GET_MR_FOR_DEST: 457 ret = rds_get_mr_for_dest(rs, optval, optlen); 458 break; 459 case RDS_FREE_MR: 460 ret = rds_free_mr(rs, optval, optlen); 461 break; 462 case RDS_RECVERR: 463 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen); 464 break; 465 case RDS_CONG_MONITOR: 466 ret = rds_cong_monitor(rs, optval, optlen); 467 break; 468 case SO_RDS_TRANSPORT: 469 lock_sock(sock->sk); 470 ret = rds_set_transport(net, rs, optval, optlen); 471 release_sock(sock->sk); 472 break; 473 case SO_TIMESTAMP_OLD: 474 case SO_TIMESTAMP_NEW: 475 lock_sock(sock->sk); 476 ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname); 477 release_sock(sock->sk); 478 break; 479 case SO_RDS_MSG_RXPATH_LATENCY: 480 ret = rds_recv_track_latency(rs, optval, optlen); 481 break; 482 default: 483 ret = -ENOPROTOOPT; 484 } 485 out: 486 return ret; 487 } 488 489 static int rds_getsockopt(struct socket *sock, int level, int optname, 490 char __user *optval, int __user *optlen) 491 { 492 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 493 int ret = -ENOPROTOOPT, len; 494 int trans; 495 496 if (level != SOL_RDS) 497 goto out; 498 499 if (get_user(len, optlen)) { 500 ret = -EFAULT; 501 goto out; 502 } 503 504 switch (optname) { 505 case RDS_INFO_FIRST ... RDS_INFO_LAST: 506 ret = rds_info_getsockopt(sock, optname, optval, 507 optlen); 508 break; 509 510 case RDS_RECVERR: 511 if (len < sizeof(int)) 512 ret = -EINVAL; 513 else 514 if (put_user(rs->rs_recverr, (int __user *) optval) || 515 put_user(sizeof(int), optlen)) 516 ret = -EFAULT; 517 else 518 ret = 0; 519 break; 520 case SO_RDS_TRANSPORT: 521 if (len < sizeof(int)) { 522 ret = -EINVAL; 523 break; 524 } 525 trans = (rs->rs_transport ? rs->rs_transport->t_type : 526 RDS_TRANS_NONE); /* unbound */ 527 if (put_user(trans, (int __user *)optval) || 528 put_user(sizeof(int), optlen)) 529 ret = -EFAULT; 530 else 531 ret = 0; 532 break; 533 default: 534 break; 535 } 536 537 out: 538 return ret; 539 540 } 541 542 static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, 543 int addr_len, int flags) 544 { 545 struct sock *sk = sock->sk; 546 struct sockaddr_in *sin; 547 struct rds_sock *rs = rds_sk_to_rs(sk); 548 int ret = 0; 549 550 if (addr_len < offsetofend(struct sockaddr, sa_family)) 551 return -EINVAL; 552 553 lock_sock(sk); 554 555 switch (uaddr->sa_family) { 556 case AF_INET: 557 sin = (struct sockaddr_in *)uaddr; 558 if (addr_len < sizeof(struct sockaddr_in)) { 559 ret = -EINVAL; 560 break; 561 } 562 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 563 ret = -EDESTADDRREQ; 564 break; 565 } 566 if (ipv4_is_multicast(sin->sin_addr.s_addr) || 567 sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { 568 ret = -EINVAL; 569 break; 570 } 571 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); 572 rs->rs_conn_port = sin->sin_port; 573 break; 574 575 #if IS_ENABLED(CONFIG_IPV6) 576 case AF_INET6: { 577 struct sockaddr_in6 *sin6; 578 int addr_type; 579 580 sin6 = (struct sockaddr_in6 *)uaddr; 581 if (addr_len < sizeof(struct sockaddr_in6)) { 582 ret = -EINVAL; 583 break; 584 } 585 addr_type = ipv6_addr_type(&sin6->sin6_addr); 586 if (!(addr_type & IPV6_ADDR_UNICAST)) { 587 __be32 addr4; 588 589 if (!(addr_type & IPV6_ADDR_MAPPED)) { 590 ret = -EPROTOTYPE; 591 break; 592 } 593 594 /* It is a mapped address. Need to do some sanity 595 * checks. 596 */ 597 addr4 = sin6->sin6_addr.s6_addr32[3]; 598 if (addr4 == htonl(INADDR_ANY) || 599 addr4 == htonl(INADDR_BROADCAST) || 600 ipv4_is_multicast(addr4)) { 601 ret = -EPROTOTYPE; 602 break; 603 } 604 } 605 606 if (addr_type & IPV6_ADDR_LINKLOCAL) { 607 /* If socket is already bound to a link local address, 608 * the peer address must be on the same link. 609 */ 610 if (sin6->sin6_scope_id == 0 || 611 (!ipv6_addr_any(&rs->rs_bound_addr) && 612 rs->rs_bound_scope_id && 613 sin6->sin6_scope_id != rs->rs_bound_scope_id)) { 614 ret = -EINVAL; 615 break; 616 } 617 /* Remember the connected address scope ID. It will 618 * be checked against the binding local address when 619 * the socket is bound. 620 */ 621 rs->rs_bound_scope_id = sin6->sin6_scope_id; 622 } 623 rs->rs_conn_addr = sin6->sin6_addr; 624 rs->rs_conn_port = sin6->sin6_port; 625 break; 626 } 627 #endif 628 629 default: 630 ret = -EAFNOSUPPORT; 631 break; 632 } 633 634 release_sock(sk); 635 return ret; 636 } 637 638 static struct proto rds_proto = { 639 .name = "RDS", 640 .owner = THIS_MODULE, 641 .obj_size = sizeof(struct rds_sock), 642 }; 643 644 static const struct proto_ops rds_proto_ops = { 645 .family = AF_RDS, 646 .owner = THIS_MODULE, 647 .release = rds_release, 648 .bind = rds_bind, 649 .connect = rds_connect, 650 .socketpair = sock_no_socketpair, 651 .accept = sock_no_accept, 652 .getname = rds_getname, 653 .poll = rds_poll, 654 .ioctl = rds_ioctl, 655 .listen = sock_no_listen, 656 .shutdown = sock_no_shutdown, 657 .setsockopt = rds_setsockopt, 658 .getsockopt = rds_getsockopt, 659 .sendmsg = rds_sendmsg, 660 .recvmsg = rds_recvmsg, 661 .mmap = sock_no_mmap, 662 }; 663 664 static void rds_sock_destruct(struct sock *sk) 665 { 666 struct rds_sock *rs = rds_sk_to_rs(sk); 667 668 WARN_ON((&rs->rs_item != rs->rs_item.next || 669 &rs->rs_item != rs->rs_item.prev)); 670 } 671 672 static int __rds_create(struct socket *sock, struct sock *sk, int protocol) 673 { 674 struct rds_sock *rs; 675 676 sock_init_data(sock, sk); 677 sock->ops = &rds_proto_ops; 678 sk->sk_protocol = protocol; 679 sk->sk_destruct = rds_sock_destruct; 680 681 rs = rds_sk_to_rs(sk); 682 spin_lock_init(&rs->rs_lock); 683 rwlock_init(&rs->rs_recv_lock); 684 INIT_LIST_HEAD(&rs->rs_send_queue); 685 INIT_LIST_HEAD(&rs->rs_recv_queue); 686 INIT_LIST_HEAD(&rs->rs_notify_queue); 687 INIT_LIST_HEAD(&rs->rs_cong_list); 688 rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); 689 spin_lock_init(&rs->rs_rdma_lock); 690 rs->rs_rdma_keys = RB_ROOT; 691 rs->rs_rx_traces = 0; 692 rs->rs_tos = 0; 693 rs->rs_conn = NULL; 694 695 spin_lock_bh(&rds_sock_lock); 696 list_add_tail(&rs->rs_item, &rds_sock_list); 697 rds_sock_count++; 698 spin_unlock_bh(&rds_sock_lock); 699 700 return 0; 701 } 702 703 static int rds_create(struct net *net, struct socket *sock, int protocol, 704 int kern) 705 { 706 struct sock *sk; 707 708 if (sock->type != SOCK_SEQPACKET || protocol) 709 return -ESOCKTNOSUPPORT; 710 711 sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern); 712 if (!sk) 713 return -ENOMEM; 714 715 return __rds_create(sock, sk, protocol); 716 } 717 718 void rds_sock_addref(struct rds_sock *rs) 719 { 720 sock_hold(rds_rs_to_sk(rs)); 721 } 722 723 void rds_sock_put(struct rds_sock *rs) 724 { 725 sock_put(rds_rs_to_sk(rs)); 726 } 727 728 static const struct net_proto_family rds_family_ops = { 729 .family = AF_RDS, 730 .create = rds_create, 731 .owner = THIS_MODULE, 732 }; 733 734 static void rds_sock_inc_info(struct socket *sock, unsigned int len, 735 struct rds_info_iterator *iter, 736 struct rds_info_lengths *lens) 737 { 738 struct rds_sock *rs; 739 struct rds_incoming *inc; 740 unsigned int total = 0; 741 742 len /= sizeof(struct rds_info_message); 743 744 spin_lock_bh(&rds_sock_lock); 745 746 list_for_each_entry(rs, &rds_sock_list, rs_item) { 747 /* This option only supports IPv4 sockets. */ 748 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 749 continue; 750 751 read_lock(&rs->rs_recv_lock); 752 753 /* XXX too lazy to maintain counts.. */ 754 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 755 total++; 756 if (total <= len) 757 rds_inc_info_copy(inc, iter, 758 inc->i_saddr.s6_addr32[3], 759 rs->rs_bound_addr_v4, 760 1); 761 } 762 763 read_unlock(&rs->rs_recv_lock); 764 } 765 766 spin_unlock_bh(&rds_sock_lock); 767 768 lens->nr = total; 769 lens->each = sizeof(struct rds_info_message); 770 } 771 772 #if IS_ENABLED(CONFIG_IPV6) 773 static void rds6_sock_inc_info(struct socket *sock, unsigned int len, 774 struct rds_info_iterator *iter, 775 struct rds_info_lengths *lens) 776 { 777 struct rds_incoming *inc; 778 unsigned int total = 0; 779 struct rds_sock *rs; 780 781 len /= sizeof(struct rds6_info_message); 782 783 spin_lock_bh(&rds_sock_lock); 784 785 list_for_each_entry(rs, &rds_sock_list, rs_item) { 786 read_lock(&rs->rs_recv_lock); 787 788 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 789 total++; 790 if (total <= len) 791 rds6_inc_info_copy(inc, iter, &inc->i_saddr, 792 &rs->rs_bound_addr, 1); 793 } 794 795 read_unlock(&rs->rs_recv_lock); 796 } 797 798 spin_unlock_bh(&rds_sock_lock); 799 800 lens->nr = total; 801 lens->each = sizeof(struct rds6_info_message); 802 } 803 #endif 804 805 static void rds_sock_info(struct socket *sock, unsigned int len, 806 struct rds_info_iterator *iter, 807 struct rds_info_lengths *lens) 808 { 809 struct rds_info_socket sinfo; 810 unsigned int cnt = 0; 811 struct rds_sock *rs; 812 813 len /= sizeof(struct rds_info_socket); 814 815 spin_lock_bh(&rds_sock_lock); 816 817 if (len < rds_sock_count) { 818 cnt = rds_sock_count; 819 goto out; 820 } 821 822 list_for_each_entry(rs, &rds_sock_list, rs_item) { 823 /* This option only supports IPv4 sockets. */ 824 if (!ipv6_addr_v4mapped(&rs->rs_bound_addr)) 825 continue; 826 sinfo.sndbuf = rds_sk_sndbuf(rs); 827 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 828 sinfo.bound_addr = rs->rs_bound_addr_v4; 829 sinfo.connected_addr = rs->rs_conn_addr_v4; 830 sinfo.bound_port = rs->rs_bound_port; 831 sinfo.connected_port = rs->rs_conn_port; 832 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); 833 834 rds_info_copy(iter, &sinfo, sizeof(sinfo)); 835 cnt++; 836 } 837 838 out: 839 lens->nr = cnt; 840 lens->each = sizeof(struct rds_info_socket); 841 842 spin_unlock_bh(&rds_sock_lock); 843 } 844 845 #if IS_ENABLED(CONFIG_IPV6) 846 static void rds6_sock_info(struct socket *sock, unsigned int len, 847 struct rds_info_iterator *iter, 848 struct rds_info_lengths *lens) 849 { 850 struct rds6_info_socket sinfo6; 851 struct rds_sock *rs; 852 853 len /= sizeof(struct rds6_info_socket); 854 855 spin_lock_bh(&rds_sock_lock); 856 857 if (len < rds_sock_count) 858 goto out; 859 860 list_for_each_entry(rs, &rds_sock_list, rs_item) { 861 sinfo6.sndbuf = rds_sk_sndbuf(rs); 862 sinfo6.rcvbuf = rds_sk_rcvbuf(rs); 863 sinfo6.bound_addr = rs->rs_bound_addr; 864 sinfo6.connected_addr = rs->rs_conn_addr; 865 sinfo6.bound_port = rs->rs_bound_port; 866 sinfo6.connected_port = rs->rs_conn_port; 867 sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); 868 869 rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); 870 } 871 872 out: 873 lens->nr = rds_sock_count; 874 lens->each = sizeof(struct rds6_info_socket); 875 876 spin_unlock_bh(&rds_sock_lock); 877 } 878 #endif 879 880 static void rds_exit(void) 881 { 882 sock_unregister(rds_family_ops.family); 883 proto_unregister(&rds_proto); 884 rds_conn_exit(); 885 rds_cong_exit(); 886 rds_sysctl_exit(); 887 rds_threads_exit(); 888 rds_stats_exit(); 889 rds_page_exit(); 890 rds_bind_lock_destroy(); 891 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); 892 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 893 #if IS_ENABLED(CONFIG_IPV6) 894 rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); 895 rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); 896 #endif 897 } 898 module_exit(rds_exit); 899 900 u32 rds_gen_num; 901 902 static int __init rds_init(void) 903 { 904 int ret; 905 906 net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); 907 908 ret = rds_bind_lock_init(); 909 if (ret) 910 goto out; 911 912 ret = rds_conn_init(); 913 if (ret) 914 goto out_bind; 915 916 ret = rds_threads_init(); 917 if (ret) 918 goto out_conn; 919 ret = rds_sysctl_init(); 920 if (ret) 921 goto out_threads; 922 ret = rds_stats_init(); 923 if (ret) 924 goto out_sysctl; 925 ret = proto_register(&rds_proto, 1); 926 if (ret) 927 goto out_stats; 928 ret = sock_register(&rds_family_ops); 929 if (ret) 930 goto out_proto; 931 932 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); 933 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); 934 #if IS_ENABLED(CONFIG_IPV6) 935 rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); 936 rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); 937 #endif 938 939 goto out; 940 941 out_proto: 942 proto_unregister(&rds_proto); 943 out_stats: 944 rds_stats_exit(); 945 out_sysctl: 946 rds_sysctl_exit(); 947 out_threads: 948 rds_threads_exit(); 949 out_conn: 950 rds_conn_exit(); 951 rds_cong_exit(); 952 rds_page_exit(); 953 out_bind: 954 rds_bind_lock_destroy(); 955 out: 956 return ret; 957 } 958 module_init(rds_init); 959 960 #define DRV_VERSION "4.0" 961 #define DRV_RELDATE "Feb 12, 2009" 962 963 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>"); 964 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets" 965 " v" DRV_VERSION " (" DRV_RELDATE ")"); 966 MODULE_VERSION(DRV_VERSION); 967 MODULE_LICENSE("Dual BSD/GPL"); 968 MODULE_ALIAS_NETPROTO(PF_RDS); 969