// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include <uapi/linux/pidfd.h>

#include "dev.h"

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in all user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap over the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
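 * (Illustrative note: sysctl_wmem_max/sysctl_rmem_max cap what an
 * unprivileged SO_SNDBUF/SO_RCVBUF request may set in sk_setsockopt()
 * below; all four values are normally tuned through the corresponding
 * net.core sysctls, e.g. /proc/sys/net/core/wmem_max.)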
*/ 284 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 285 EXPORT_SYMBOL(sysctl_wmem_max); 286 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 287 EXPORT_SYMBOL(sysctl_rmem_max); 288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 290 291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 292 EXPORT_SYMBOL_GPL(memalloc_socks_key); 293 294 /** 295 * sk_set_memalloc - sets %SOCK_MEMALLOC 296 * @sk: socket to set it on 297 * 298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 299 * It's the responsibility of the admin to adjust min_free_kbytes 300 * to meet the requirements 301 */ 302 void sk_set_memalloc(struct sock *sk) 303 { 304 sock_set_flag(sk, SOCK_MEMALLOC); 305 sk->sk_allocation |= __GFP_MEMALLOC; 306 static_branch_inc(&memalloc_socks_key); 307 } 308 EXPORT_SYMBOL_GPL(sk_set_memalloc); 309 310 void sk_clear_memalloc(struct sock *sk) 311 { 312 sock_reset_flag(sk, SOCK_MEMALLOC); 313 sk->sk_allocation &= ~__GFP_MEMALLOC; 314 static_branch_dec(&memalloc_socks_key); 315 316 /* 317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 318 * progress of swapping. SOCK_MEMALLOC may be cleared while 319 * it has rmem allocations due to the last swapfile being deactivated 320 * but there is a risk that the socket is unusable due to exceeding 321 * the rmem limits. Reclaim the reserves and obey rmem limits again. 322 */ 323 sk_mem_reclaim(sk); 324 } 325 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 326 327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 328 { 329 int ret; 330 unsigned int noreclaim_flag; 331 332 /* these should have been dropped before queueing */ 333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 334 335 noreclaim_flag = memalloc_noreclaim_save(); 336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 337 tcp_v6_do_rcv, 338 tcp_v4_do_rcv, 339 sk, skb); 340 memalloc_noreclaim_restore(noreclaim_flag); 341 342 return ret; 343 } 344 EXPORT_SYMBOL(__sk_backlog_rcv); 345 346 void sk_error_report(struct sock *sk) 347 { 348 sk->sk_error_report(sk); 349 350 switch (sk->sk_family) { 351 case AF_INET: 352 fallthrough; 353 case AF_INET6: 354 trace_inet_sk_error_report(sk); 355 break; 356 default: 357 break; 358 } 359 } 360 EXPORT_SYMBOL(sk_error_report); 361 362 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 363 { 364 struct __kernel_sock_timeval tv; 365 366 if (timeo == MAX_SCHEDULE_TIMEOUT) { 367 tv.tv_sec = 0; 368 tv.tv_usec = 0; 369 } else { 370 tv.tv_sec = timeo / HZ; 371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 372 } 373 374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 376 *(struct old_timeval32 *)optval = tv32; 377 return sizeof(tv32); 378 } 379 380 if (old_timeval) { 381 struct __kernel_old_timeval old_tv; 382 old_tv.tv_sec = tv.tv_sec; 383 old_tv.tv_usec = tv.tv_usec; 384 *(struct __kernel_old_timeval *)optval = old_tv; 385 return sizeof(old_tv); 386 } 387 388 *(struct __kernel_sock_timeval *)optval = tv; 389 return sizeof(tv); 390 } 391 EXPORT_SYMBOL(sock_get_timeout); 392 393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 394 sockptr_t optval, int optlen, bool old_timeval) 395 { 396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 397 struct old_timeval32 tv32; 398 399 if (optlen < sizeof(tv32)) 400 return -EINVAL; 401 402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 403 return -EFAULT; 404 tv->tv_sec = tv32.tv_sec; 405 tv->tv_usec = tv32.tv_usec; 
406 } else if (old_timeval) { 407 struct __kernel_old_timeval old_tv; 408 409 if (optlen < sizeof(old_tv)) 410 return -EINVAL; 411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 412 return -EFAULT; 413 tv->tv_sec = old_tv.tv_sec; 414 tv->tv_usec = old_tv.tv_usec; 415 } else { 416 if (optlen < sizeof(*tv)) 417 return -EINVAL; 418 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 419 return -EFAULT; 420 } 421 422 return 0; 423 } 424 EXPORT_SYMBOL(sock_copy_user_timeval); 425 426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 427 bool old_timeval) 428 { 429 struct __kernel_sock_timeval tv; 430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 431 long val; 432 433 if (err) 434 return err; 435 436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 437 return -EDOM; 438 439 if (tv.tv_sec < 0) { 440 static int warned __read_mostly; 441 442 WRITE_ONCE(*timeo_p, 0); 443 if (warned < 10 && net_ratelimit()) { 444 warned++; 445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 446 __func__, current->comm, task_pid_nr(current)); 447 } 448 return 0; 449 } 450 val = MAX_SCHEDULE_TIMEOUT; 451 if ((tv.tv_sec || tv.tv_usec) && 452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 454 USEC_PER_SEC / HZ); 455 WRITE_ONCE(*timeo_p, val); 456 return 0; 457 } 458 459 static bool sk_set_prio_allowed(const struct sock *sk, int val) 460 { 461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || 462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); 464 } 465 466 static bool sock_needs_netstamp(const struct sock *sk) 467 { 468 switch (sk->sk_family) { 469 case AF_UNSPEC: 470 case AF_UNIX: 471 return false; 472 default: 473 return true; 474 } 475 } 476 477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 478 { 479 if (sk->sk_flags & flags) { 480 sk->sk_flags &= ~flags; 481 if (sock_needs_netstamp(sk) && 482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 483 net_disable_timestamp(); 484 } 485 } 486 487 488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 489 { 490 unsigned long flags; 491 struct sk_buff_head *list = &sk->sk_receive_queue; 492 493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { 494 atomic_inc(&sk->sk_drops); 495 trace_sock_rcvqueue_full(sk, skb); 496 return -ENOMEM; 497 } 498 499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 500 atomic_inc(&sk->sk_drops); 501 return -ENOBUFS; 502 } 503 504 skb->dev = NULL; 505 skb_set_owner_r(skb, sk); 506 507 /* we escape from rcu protected region, make sure we dont leak 508 * a norefcounted dst 509 */ 510 skb_dst_force(skb); 511 512 spin_lock_irqsave(&list->lock, flags); 513 sock_skb_set_dropcount(sk, skb); 514 __skb_queue_tail(list, skb); 515 spin_unlock_irqrestore(&list->lock, flags); 516 517 if (!sock_flag(sk, SOCK_DEAD)) 518 sk->sk_data_ready(sk); 519 return 0; 520 } 521 EXPORT_SYMBOL(__sock_queue_rcv_skb); 522 523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 524 enum skb_drop_reason *reason) 525 { 526 enum skb_drop_reason drop_reason; 527 int err; 528 529 err = sk_filter(sk, skb); 530 if (err) { 531 drop_reason = SKB_DROP_REASON_SOCKET_FILTER; 532 goto out; 533 } 534 err = __sock_queue_rcv_skb(sk, skb); 535 switch (err) { 536 case -ENOMEM: 537 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 538 break; 539 case -ENOBUFS: 540 drop_reason = SKB_DROP_REASON_PROTO_MEM; 541 break; 542 
default: 543 drop_reason = SKB_NOT_DROPPED_YET; 544 break; 545 } 546 out: 547 if (reason) 548 *reason = drop_reason; 549 return err; 550 } 551 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 552 553 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 554 const int nested, unsigned int trim_cap, bool refcounted) 555 { 556 int rc = NET_RX_SUCCESS; 557 558 if (sk_filter_trim_cap(sk, skb, trim_cap)) 559 goto discard_and_relse; 560 561 skb->dev = NULL; 562 563 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 564 atomic_inc(&sk->sk_drops); 565 goto discard_and_relse; 566 } 567 if (nested) 568 bh_lock_sock_nested(sk); 569 else 570 bh_lock_sock(sk); 571 if (!sock_owned_by_user(sk)) { 572 /* 573 * trylock + unlock semantics: 574 */ 575 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 576 577 rc = sk_backlog_rcv(sk, skb); 578 579 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 580 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { 581 bh_unlock_sock(sk); 582 atomic_inc(&sk->sk_drops); 583 goto discard_and_relse; 584 } 585 586 bh_unlock_sock(sk); 587 out: 588 if (refcounted) 589 sock_put(sk); 590 return rc; 591 discard_and_relse: 592 kfree_skb(skb); 593 goto out; 594 } 595 EXPORT_SYMBOL(__sk_receive_skb); 596 597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 598 u32)); 599 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 600 u32)); 601 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 602 { 603 struct dst_entry *dst = __sk_dst_get(sk); 604 605 if (dst && dst->obsolete && 606 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 607 dst, cookie) == NULL) { 608 sk_tx_queue_clear(sk); 609 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 610 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 611 dst_release(dst); 612 return NULL; 613 } 614 615 return dst; 616 } 617 EXPORT_SYMBOL(__sk_dst_check); 618 619 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 620 { 621 struct dst_entry *dst = sk_dst_get(sk); 622 623 if (dst && dst->obsolete && 624 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 625 dst, cookie) == NULL) { 626 sk_dst_reset(sk); 627 dst_release(dst); 628 return NULL; 629 } 630 631 return dst; 632 } 633 EXPORT_SYMBOL(sk_dst_check); 634 635 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 636 { 637 int ret = -ENOPROTOOPT; 638 #ifdef CONFIG_NETDEVICES 639 struct net *net = sock_net(sk); 640 641 /* Sorry... */ 642 ret = -EPERM; 643 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 644 goto out; 645 646 ret = -EINVAL; 647 if (ifindex < 0) 648 goto out; 649 650 /* Paired with all READ_ONCE() done locklessly. 
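 * e.g. the READ_ONCE(sk->sk_bound_dev_if) in sock_getbindtodevice() and
 * in the SO_BINDTOIFINDEX case of sk_getsockopt().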
*/ 651 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 652 653 if (sk->sk_prot->rehash) 654 sk->sk_prot->rehash(sk); 655 sk_dst_reset(sk); 656 657 ret = 0; 658 659 out: 660 #endif 661 662 return ret; 663 } 664 665 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 666 { 667 int ret; 668 669 if (lock_sk) 670 lock_sock(sk); 671 ret = sock_bindtoindex_locked(sk, ifindex); 672 if (lock_sk) 673 release_sock(sk); 674 675 return ret; 676 } 677 EXPORT_SYMBOL(sock_bindtoindex); 678 679 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 680 { 681 int ret = -ENOPROTOOPT; 682 #ifdef CONFIG_NETDEVICES 683 struct net *net = sock_net(sk); 684 char devname[IFNAMSIZ]; 685 int index; 686 687 ret = -EINVAL; 688 if (optlen < 0) 689 goto out; 690 691 /* Bind this socket to a particular device like "eth0", 692 * as specified in the passed interface name. If the 693 * name is "" or the option length is zero the socket 694 * is not bound. 695 */ 696 if (optlen > IFNAMSIZ - 1) 697 optlen = IFNAMSIZ - 1; 698 memset(devname, 0, sizeof(devname)); 699 700 ret = -EFAULT; 701 if (copy_from_sockptr(devname, optval, optlen)) 702 goto out; 703 704 index = 0; 705 if (devname[0] != '\0') { 706 struct net_device *dev; 707 708 rcu_read_lock(); 709 dev = dev_get_by_name_rcu(net, devname); 710 if (dev) 711 index = dev->ifindex; 712 rcu_read_unlock(); 713 ret = -ENODEV; 714 if (!dev) 715 goto out; 716 } 717 718 sockopt_lock_sock(sk); 719 ret = sock_bindtoindex_locked(sk, index); 720 sockopt_release_sock(sk); 721 out: 722 #endif 723 724 return ret; 725 } 726 727 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 728 sockptr_t optlen, int len) 729 { 730 int ret = -ENOPROTOOPT; 731 #ifdef CONFIG_NETDEVICES 732 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 733 struct net *net = sock_net(sk); 734 char devname[IFNAMSIZ]; 735 736 if (bound_dev_if == 0) { 737 len = 0; 738 goto zero; 739 } 740 741 ret = -EINVAL; 742 if (len < IFNAMSIZ) 743 goto out; 744 745 ret = netdev_get_name(net, devname, bound_dev_if); 746 if (ret) 747 goto out; 748 749 len = strlen(devname) + 1; 750 751 ret = -EFAULT; 752 if (copy_to_sockptr(optval, devname, len)) 753 goto out; 754 755 zero: 756 ret = -EFAULT; 757 if (copy_to_sockptr(optlen, &len, sizeof(int))) 758 goto out; 759 760 ret = 0; 761 762 out: 763 #endif 764 765 return ret; 766 } 767 768 bool sk_mc_loop(const struct sock *sk) 769 { 770 if (dev_recursion_level()) 771 return false; 772 if (!sk) 773 return true; 774 /* IPV6_ADDRFORM can change sk->sk_family under us. 
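 * Read it once with READ_ONCE() so the switch below sees one consistent value.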
*/ 775 switch (READ_ONCE(sk->sk_family)) { 776 case AF_INET: 777 return inet_test_bit(MC_LOOP, sk); 778 #if IS_ENABLED(CONFIG_IPV6) 779 case AF_INET6: 780 return inet6_test_bit(MC6_LOOP, sk); 781 #endif 782 } 783 WARN_ON_ONCE(1); 784 return true; 785 } 786 EXPORT_SYMBOL(sk_mc_loop); 787 788 void sock_set_reuseaddr(struct sock *sk) 789 { 790 lock_sock(sk); 791 sk->sk_reuse = SK_CAN_REUSE; 792 release_sock(sk); 793 } 794 EXPORT_SYMBOL(sock_set_reuseaddr); 795 796 void sock_set_reuseport(struct sock *sk) 797 { 798 lock_sock(sk); 799 sk->sk_reuseport = true; 800 release_sock(sk); 801 } 802 EXPORT_SYMBOL(sock_set_reuseport); 803 804 void sock_no_linger(struct sock *sk) 805 { 806 lock_sock(sk); 807 WRITE_ONCE(sk->sk_lingertime, 0); 808 sock_set_flag(sk, SOCK_LINGER); 809 release_sock(sk); 810 } 811 EXPORT_SYMBOL(sock_no_linger); 812 813 void sock_set_priority(struct sock *sk, u32 priority) 814 { 815 WRITE_ONCE(sk->sk_priority, priority); 816 } 817 EXPORT_SYMBOL(sock_set_priority); 818 819 void sock_set_sndtimeo(struct sock *sk, s64 secs) 820 { 821 lock_sock(sk); 822 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 823 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 824 else 825 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 826 release_sock(sk); 827 } 828 EXPORT_SYMBOL(sock_set_sndtimeo); 829 830 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 831 { 832 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); 833 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); 834 if (val) { 835 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 836 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 837 } 838 } 839 840 void sock_enable_timestamps(struct sock *sk) 841 { 842 lock_sock(sk); 843 __sock_set_timestamps(sk, true, false, true); 844 release_sock(sk); 845 } 846 EXPORT_SYMBOL(sock_enable_timestamps); 847 848 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 849 { 850 switch (optname) { 851 case SO_TIMESTAMP_OLD: 852 __sock_set_timestamps(sk, valbool, false, false); 853 break; 854 case SO_TIMESTAMP_NEW: 855 __sock_set_timestamps(sk, valbool, true, false); 856 break; 857 case SO_TIMESTAMPNS_OLD: 858 __sock_set_timestamps(sk, valbool, false, true); 859 break; 860 case SO_TIMESTAMPNS_NEW: 861 __sock_set_timestamps(sk, valbool, true, true); 862 break; 863 } 864 } 865 866 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 867 { 868 struct net *net = sock_net(sk); 869 struct net_device *dev = NULL; 870 bool match = false; 871 int *vclock_index; 872 int i, num; 873 874 if (sk->sk_bound_dev_if) 875 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 876 877 if (!dev) { 878 pr_err("%s: sock not bind to device\n", __func__); 879 return -EOPNOTSUPP; 880 } 881 882 num = ethtool_get_phc_vclocks(dev, &vclock_index); 883 dev_put(dev); 884 885 for (i = 0; i < num; i++) { 886 if (*(vclock_index + i) == phc_index) { 887 match = true; 888 break; 889 } 890 } 891 892 if (num > 0) 893 kfree(vclock_index); 894 895 if (!match) 896 return -EINVAL; 897 898 WRITE_ONCE(sk->sk_bind_phc, phc_index); 899 900 return 0; 901 } 902 903 int sock_set_timestamping(struct sock *sk, int optname, 904 struct so_timestamping timestamping) 905 { 906 int val = timestamping.flags; 907 int ret; 908 909 if (val & ~SOF_TIMESTAMPING_MASK) 910 return -EINVAL; 911 912 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 913 !(val & SOF_TIMESTAMPING_OPT_ID)) 914 return -EINVAL; 915 916 if (val & SOF_TIMESTAMPING_OPT_ID && 917 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 918 if (sk_is_tcp(sk)) { 919 if ((1 << sk->sk_state) & 920 
(TCPF_CLOSE | TCPF_LISTEN)) 921 return -EINVAL; 922 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 923 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 924 else 925 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 926 } else { 927 atomic_set(&sk->sk_tskey, 0); 928 } 929 } 930 931 if (val & SOF_TIMESTAMPING_OPT_STATS && 932 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 933 return -EINVAL; 934 935 if (val & SOF_TIMESTAMPING_BIND_PHC) { 936 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 937 if (ret) 938 return ret; 939 } 940 941 WRITE_ONCE(sk->sk_tsflags, val); 942 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 943 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); 944 945 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 946 sock_enable_timestamp(sk, 947 SOCK_TIMESTAMPING_RX_SOFTWARE); 948 else 949 sock_disable_timestamp(sk, 950 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 951 return 0; 952 } 953 954 #if defined(CONFIG_CGROUP_BPF) 955 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) 956 { 957 struct bpf_sock_ops_kern sock_ops; 958 959 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); 960 sock_ops.op = op; 961 sock_ops.is_fullsock = 1; 962 sock_ops.sk = sk; 963 bpf_skops_init_skb(&sock_ops, skb, 0); 964 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); 965 } 966 #endif 967 968 void sock_set_keepalive(struct sock *sk) 969 { 970 lock_sock(sk); 971 if (sk->sk_prot->keepalive) 972 sk->sk_prot->keepalive(sk, true); 973 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 974 release_sock(sk); 975 } 976 EXPORT_SYMBOL(sock_set_keepalive); 977 978 static void __sock_set_rcvbuf(struct sock *sk, int val) 979 { 980 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 981 * as a negative value. 982 */ 983 val = min_t(int, val, INT_MAX / 2); 984 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 985 986 /* We double it on the way in to account for "struct sk_buff" etc. 987 * overhead. Applications assume that the SO_RCVBUF setting they make 988 * will allow that much actual data to be received on that socket. 989 * 990 * Applications are unaware that "struct sk_buff" and other overheads 991 * allocate from the receive buffer during socket buffer allocation. 992 * 993 * And after considering the possible alternatives, returning the value 994 * we actually used in getsockopt is the most desirable behavior. 
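 *
 * For example (illustrative numbers): a SO_RCVBUF request of 65536 stores
 * 131072 in sk_rcvbuf, and that doubled value is what getsockopt(SO_RCVBUF)
 * reports back; requests whose doubled value would fall below
 * SOCK_MIN_RCVBUF are rounded up to that floor.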
995 */ 996 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 997 } 998 999 void sock_set_rcvbuf(struct sock *sk, int val) 1000 { 1001 lock_sock(sk); 1002 __sock_set_rcvbuf(sk, val); 1003 release_sock(sk); 1004 } 1005 EXPORT_SYMBOL(sock_set_rcvbuf); 1006 1007 static void __sock_set_mark(struct sock *sk, u32 val) 1008 { 1009 if (val != sk->sk_mark) { 1010 WRITE_ONCE(sk->sk_mark, val); 1011 sk_dst_reset(sk); 1012 } 1013 } 1014 1015 void sock_set_mark(struct sock *sk, u32 val) 1016 { 1017 lock_sock(sk); 1018 __sock_set_mark(sk, val); 1019 release_sock(sk); 1020 } 1021 EXPORT_SYMBOL(sock_set_mark); 1022 1023 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1024 { 1025 /* Round down bytes to multiple of pages */ 1026 bytes = round_down(bytes, PAGE_SIZE); 1027 1028 WARN_ON(bytes > sk->sk_reserved_mem); 1029 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1030 sk_mem_reclaim(sk); 1031 } 1032 1033 static int sock_reserve_memory(struct sock *sk, int bytes) 1034 { 1035 long allocated; 1036 bool charged; 1037 int pages; 1038 1039 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) 1040 return -EOPNOTSUPP; 1041 1042 if (!bytes) 1043 return 0; 1044 1045 pages = sk_mem_pages(bytes); 1046 1047 /* pre-charge to memcg */ 1048 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, 1049 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1050 if (!charged) 1051 return -ENOMEM; 1052 1053 /* pre-charge to forward_alloc */ 1054 sk_memory_allocated_add(sk, pages); 1055 allocated = sk_memory_allocated(sk); 1056 /* If the system goes into memory pressure with this 1057 * precharge, give up and return error. 1058 */ 1059 if (allocated > sk_prot_mem_limits(sk, 1)) { 1060 sk_memory_allocated_sub(sk, pages); 1061 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); 1062 return -ENOMEM; 1063 } 1064 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1065 1066 WRITE_ONCE(sk->sk_reserved_mem, 1067 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1068 1069 return 0; 1070 } 1071 1072 #ifdef CONFIG_PAGE_POOL 1073 1074 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED 1075 * in 1 syscall. The limit exists to limit the amount of memory the kernel 1076 * allocates to copy these tokens, and to prevent looping over the frags for 1077 * too long. 
1078 */ 1079 #define MAX_DONTNEED_TOKENS 128 1080 #define MAX_DONTNEED_FRAGS 1024 1081 1082 static noinline_for_stack int 1083 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) 1084 { 1085 unsigned int num_tokens, i, j, k, netmem_num = 0; 1086 struct dmabuf_token *tokens; 1087 int ret = 0, num_frags = 0; 1088 netmem_ref netmems[16]; 1089 1090 if (!sk_is_tcp(sk)) 1091 return -EBADF; 1092 1093 if (optlen % sizeof(*tokens) || 1094 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) 1095 return -EINVAL; 1096 1097 num_tokens = optlen / sizeof(*tokens); 1098 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); 1099 if (!tokens) 1100 return -ENOMEM; 1101 1102 if (copy_from_sockptr(tokens, optval, optlen)) { 1103 kvfree(tokens); 1104 return -EFAULT; 1105 } 1106 1107 xa_lock_bh(&sk->sk_user_frags); 1108 for (i = 0; i < num_tokens; i++) { 1109 for (j = 0; j < tokens[i].token_count; j++) { 1110 if (++num_frags > MAX_DONTNEED_FRAGS) 1111 goto frag_limit_reached; 1112 1113 netmem_ref netmem = (__force netmem_ref)__xa_erase( 1114 &sk->sk_user_frags, tokens[i].token_start + j); 1115 1116 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1117 continue; 1118 1119 netmems[netmem_num++] = netmem; 1120 if (netmem_num == ARRAY_SIZE(netmems)) { 1121 xa_unlock_bh(&sk->sk_user_frags); 1122 for (k = 0; k < netmem_num; k++) 1123 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1124 netmem_num = 0; 1125 xa_lock_bh(&sk->sk_user_frags); 1126 } 1127 ret++; 1128 } 1129 } 1130 1131 frag_limit_reached: 1132 xa_unlock_bh(&sk->sk_user_frags); 1133 for (k = 0; k < netmem_num; k++) 1134 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1135 1136 kvfree(tokens); 1137 return ret; 1138 } 1139 #endif 1140 1141 void sockopt_lock_sock(struct sock *sk) 1142 { 1143 /* When current->bpf_ctx is set, the setsockopt is called from 1144 * a bpf prog. bpf has ensured the sk lock has been 1145 * acquired before calling setsockopt(). 1146 */ 1147 if (has_current_bpf_ctx()) 1148 return; 1149 1150 lock_sock(sk); 1151 } 1152 EXPORT_SYMBOL(sockopt_lock_sock); 1153 1154 void sockopt_release_sock(struct sock *sk) 1155 { 1156 if (has_current_bpf_ctx()) 1157 return; 1158 1159 release_sock(sk); 1160 } 1161 EXPORT_SYMBOL(sockopt_release_sock); 1162 1163 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1164 { 1165 return has_current_bpf_ctx() || ns_capable(ns, cap); 1166 } 1167 EXPORT_SYMBOL(sockopt_ns_capable); 1168 1169 bool sockopt_capable(int cap) 1170 { 1171 return has_current_bpf_ctx() || capable(cap); 1172 } 1173 EXPORT_SYMBOL(sockopt_capable); 1174 1175 static int sockopt_validate_clockid(__kernel_clockid_t value) 1176 { 1177 switch (value) { 1178 case CLOCK_REALTIME: 1179 case CLOCK_MONOTONIC: 1180 case CLOCK_TAI: 1181 return 0; 1182 } 1183 return -EINVAL; 1184 } 1185 1186 /* 1187 * This is meant for all protocols to use and covers goings on 1188 * at the socket level. Everything here is generic. 
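 *
 * Illustrative userspace view of the SOL_SOCKET handling below (a sketch
 * only, assuming a hypothetical connected fd):
 *
 *	int val = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val));
 *	socklen_t len = sizeof(val);
 *	getsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, &len);
 *
 * Both calls are ultimately serviced by sk_setsockopt()/sk_getsockopt()
 * below, the set side arriving via sock_setsockopt().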
1189 */ 1190 1191 int sk_setsockopt(struct sock *sk, int level, int optname, 1192 sockptr_t optval, unsigned int optlen) 1193 { 1194 struct so_timestamping timestamping; 1195 struct socket *sock = sk->sk_socket; 1196 struct sock_txtime sk_txtime; 1197 int val; 1198 int valbool; 1199 struct linger ling; 1200 int ret = 0; 1201 1202 /* 1203 * Options without arguments 1204 */ 1205 1206 if (optname == SO_BINDTODEVICE) 1207 return sock_setbindtodevice(sk, optval, optlen); 1208 1209 if (optlen < sizeof(int)) 1210 return -EINVAL; 1211 1212 if (copy_from_sockptr(&val, optval, sizeof(val))) 1213 return -EFAULT; 1214 1215 valbool = val ? 1 : 0; 1216 1217 /* handle options which do not require locking the socket. */ 1218 switch (optname) { 1219 case SO_PRIORITY: 1220 if (sk_set_prio_allowed(sk, val)) { 1221 sock_set_priority(sk, val); 1222 return 0; 1223 } 1224 return -EPERM; 1225 case SO_TYPE: 1226 case SO_PROTOCOL: 1227 case SO_DOMAIN: 1228 case SO_ERROR: 1229 return -ENOPROTOOPT; 1230 #ifdef CONFIG_NET_RX_BUSY_POLL 1231 case SO_BUSY_POLL: 1232 if (val < 0) 1233 return -EINVAL; 1234 WRITE_ONCE(sk->sk_ll_usec, val); 1235 return 0; 1236 case SO_PREFER_BUSY_POLL: 1237 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1238 return -EPERM; 1239 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1240 return 0; 1241 case SO_BUSY_POLL_BUDGET: 1242 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1243 !sockopt_capable(CAP_NET_ADMIN)) 1244 return -EPERM; 1245 if (val < 0 || val > U16_MAX) 1246 return -EINVAL; 1247 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1248 return 0; 1249 #endif 1250 case SO_MAX_PACING_RATE: 1251 { 1252 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1253 unsigned long pacing_rate; 1254 1255 if (sizeof(ulval) != sizeof(val) && 1256 optlen >= sizeof(ulval) && 1257 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1258 return -EFAULT; 1259 } 1260 if (ulval != ~0UL) 1261 cmpxchg(&sk->sk_pacing_status, 1262 SK_PACING_NONE, 1263 SK_PACING_NEEDED); 1264 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1265 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1266 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1267 if (ulval < pacing_rate) 1268 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1269 return 0; 1270 } 1271 case SO_TXREHASH: 1272 if (!sk_is_tcp(sk)) 1273 return -EOPNOTSUPP; 1274 if (val < -1 || val > 1) 1275 return -EINVAL; 1276 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1277 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1278 /* Paired with READ_ONCE() in tcp_rtx_synack() 1279 * and sk_getsockopt(). 1280 */ 1281 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1282 return 0; 1283 case SO_PEEK_OFF: 1284 { 1285 int (*set_peek_off)(struct sock *sk, int val); 1286 1287 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1288 if (set_peek_off) 1289 ret = set_peek_off(sk, val); 1290 else 1291 ret = -EOPNOTSUPP; 1292 return ret; 1293 } 1294 #ifdef CONFIG_PAGE_POOL 1295 case SO_DEVMEM_DONTNEED: 1296 return sock_devmem_dontneed(sk, optval, optlen); 1297 #endif 1298 } 1299 1300 sockopt_lock_sock(sk); 1301 1302 switch (optname) { 1303 case SO_DEBUG: 1304 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1305 ret = -EACCES; 1306 else 1307 sock_valbool_flag(sk, SOCK_DBG, valbool); 1308 break; 1309 case SO_REUSEADDR: 1310 sk->sk_reuse = (valbool ? 
SK_CAN_REUSE : SK_NO_REUSE); 1311 break; 1312 case SO_REUSEPORT: 1313 if (valbool && !sk_is_inet(sk)) 1314 ret = -EOPNOTSUPP; 1315 else 1316 sk->sk_reuseport = valbool; 1317 break; 1318 case SO_DONTROUTE: 1319 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1320 sk_dst_reset(sk); 1321 break; 1322 case SO_BROADCAST: 1323 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1324 break; 1325 case SO_SNDBUF: 1326 /* Don't error on this BSD doesn't and if you think 1327 * about it this is right. Otherwise apps have to 1328 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1329 * are treated in BSD as hints 1330 */ 1331 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1332 set_sndbuf: 1333 /* Ensure val * 2 fits into an int, to prevent max_t() 1334 * from treating it as a negative value. 1335 */ 1336 val = min_t(int, val, INT_MAX / 2); 1337 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1338 WRITE_ONCE(sk->sk_sndbuf, 1339 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1340 /* Wake up sending tasks if we upped the value. */ 1341 sk->sk_write_space(sk); 1342 break; 1343 1344 case SO_SNDBUFFORCE: 1345 if (!sockopt_capable(CAP_NET_ADMIN)) { 1346 ret = -EPERM; 1347 break; 1348 } 1349 1350 /* No negative values (to prevent underflow, as val will be 1351 * multiplied by 2). 1352 */ 1353 if (val < 0) 1354 val = 0; 1355 goto set_sndbuf; 1356 1357 case SO_RCVBUF: 1358 /* Don't error on this BSD doesn't and if you think 1359 * about it this is right. Otherwise apps have to 1360 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1361 * are treated in BSD as hints 1362 */ 1363 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1364 break; 1365 1366 case SO_RCVBUFFORCE: 1367 if (!sockopt_capable(CAP_NET_ADMIN)) { 1368 ret = -EPERM; 1369 break; 1370 } 1371 1372 /* No negative values (to prevent underflow, as val will be 1373 * multiplied by 2). 
	 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff) {
			sock_reset_flag(sk, SOCK_LINGER);
		} else {
			unsigned long t_sec = ling.l_linger;

			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
			else
				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
	{
		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

		if (val < 0)
			val = INT_MAX;
		if (sock)
			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
		if (set_rcvlowat)
			ret = set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); 1451 break; 1452 } 1453 case SO_RCVTIMEO_OLD: 1454 case SO_RCVTIMEO_NEW: 1455 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, 1456 optlen, optname == SO_RCVTIMEO_OLD); 1457 break; 1458 1459 case SO_SNDTIMEO_OLD: 1460 case SO_SNDTIMEO_NEW: 1461 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, 1462 optlen, optname == SO_SNDTIMEO_OLD); 1463 break; 1464 1465 case SO_ATTACH_FILTER: { 1466 struct sock_fprog fprog; 1467 1468 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1469 if (!ret) 1470 ret = sk_attach_filter(&fprog, sk); 1471 break; 1472 } 1473 case SO_ATTACH_BPF: 1474 ret = -EINVAL; 1475 if (optlen == sizeof(u32)) { 1476 u32 ufd; 1477 1478 ret = -EFAULT; 1479 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1480 break; 1481 1482 ret = sk_attach_bpf(ufd, sk); 1483 } 1484 break; 1485 1486 case SO_ATTACH_REUSEPORT_CBPF: { 1487 struct sock_fprog fprog; 1488 1489 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1490 if (!ret) 1491 ret = sk_reuseport_attach_filter(&fprog, sk); 1492 break; 1493 } 1494 case SO_ATTACH_REUSEPORT_EBPF: 1495 ret = -EINVAL; 1496 if (optlen == sizeof(u32)) { 1497 u32 ufd; 1498 1499 ret = -EFAULT; 1500 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1501 break; 1502 1503 ret = sk_reuseport_attach_bpf(ufd, sk); 1504 } 1505 break; 1506 1507 case SO_DETACH_REUSEPORT_BPF: 1508 ret = reuseport_detach_prog(sk); 1509 break; 1510 1511 case SO_DETACH_FILTER: 1512 ret = sk_detach_filter(sk); 1513 break; 1514 1515 case SO_LOCK_FILTER: 1516 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1517 ret = -EPERM; 1518 else 1519 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1520 break; 1521 1522 case SO_MARK: 1523 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1524 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1525 ret = -EPERM; 1526 break; 1527 } 1528 1529 __sock_set_mark(sk, val); 1530 break; 1531 case SO_RCVMARK: 1532 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1533 break; 1534 1535 case SO_RCVPRIORITY: 1536 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); 1537 break; 1538 1539 case SO_RXQ_OVFL: 1540 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1541 break; 1542 1543 case SO_WIFI_STATUS: 1544 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1545 break; 1546 1547 case SO_NOFCS: 1548 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1549 break; 1550 1551 case SO_SELECT_ERR_QUEUE: 1552 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1553 break; 1554 1555 case SO_PASSCRED: 1556 if (sk_may_scm_recv(sk)) 1557 sk->sk_scm_credentials = valbool; 1558 else 1559 ret = -EOPNOTSUPP; 1560 break; 1561 1562 case SO_PASSSEC: 1563 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) 1564 sk->sk_scm_security = valbool; 1565 else 1566 ret = -EOPNOTSUPP; 1567 break; 1568 1569 case SO_PASSPIDFD: 1570 if (sk_is_unix(sk)) 1571 sk->sk_scm_pidfd = valbool; 1572 else 1573 ret = -EOPNOTSUPP; 1574 break; 1575 1576 case SO_PASSRIGHTS: 1577 if (sk_is_unix(sk)) 1578 sk->sk_scm_rights = valbool; 1579 else 1580 ret = -EOPNOTSUPP; 1581 break; 1582 1583 case SO_INCOMING_CPU: 1584 reuseport_update_incoming_cpu(sk, val); 1585 break; 1586 1587 case SO_CNX_ADVICE: 1588 if (val == 1) 1589 dst_negative_advice(sk); 1590 break; 1591 1592 case SO_ZEROCOPY: 1593 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1594 if (!(sk_is_tcp(sk) || 1595 (sk->sk_type == SOCK_DGRAM && 1596 sk->sk_protocol == IPPROTO_UDP))) 1597 ret = -EOPNOTSUPP; 1598 } else if (sk->sk_family != PF_RDS) { 1599 ret = -EOPNOTSUPP; 1600 } 1601 if (!ret) { 1602 if (val < 0 
|| val > 1) 1603 ret = -EINVAL; 1604 else 1605 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1606 } 1607 break; 1608 1609 case SO_TXTIME: 1610 if (optlen != sizeof(struct sock_txtime)) { 1611 ret = -EINVAL; 1612 break; 1613 } else if (copy_from_sockptr(&sk_txtime, optval, 1614 sizeof(struct sock_txtime))) { 1615 ret = -EFAULT; 1616 break; 1617 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1618 ret = -EINVAL; 1619 break; 1620 } 1621 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1622 * scheduler has enough safe guards. 1623 */ 1624 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1625 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1626 ret = -EPERM; 1627 break; 1628 } 1629 1630 ret = sockopt_validate_clockid(sk_txtime.clockid); 1631 if (ret) 1632 break; 1633 1634 sock_valbool_flag(sk, SOCK_TXTIME, true); 1635 sk->sk_clockid = sk_txtime.clockid; 1636 sk->sk_txtime_deadline_mode = 1637 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1638 sk->sk_txtime_report_errors = 1639 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1640 break; 1641 1642 case SO_BINDTOIFINDEX: 1643 ret = sock_bindtoindex_locked(sk, val); 1644 break; 1645 1646 case SO_BUF_LOCK: 1647 if (val & ~SOCK_BUF_LOCK_MASK) { 1648 ret = -EINVAL; 1649 break; 1650 } 1651 sk->sk_userlocks = val | (sk->sk_userlocks & 1652 ~SOCK_BUF_LOCK_MASK); 1653 break; 1654 1655 case SO_RESERVE_MEM: 1656 { 1657 int delta; 1658 1659 if (val < 0) { 1660 ret = -EINVAL; 1661 break; 1662 } 1663 1664 delta = val - sk->sk_reserved_mem; 1665 if (delta < 0) 1666 sock_release_reserved_memory(sk, -delta); 1667 else 1668 ret = sock_reserve_memory(sk, delta); 1669 break; 1670 } 1671 1672 default: 1673 ret = -ENOPROTOOPT; 1674 break; 1675 } 1676 sockopt_release_sock(sk); 1677 return ret; 1678 } 1679 1680 int sock_setsockopt(struct socket *sock, int level, int optname, 1681 sockptr_t optval, unsigned int optlen) 1682 { 1683 return sk_setsockopt(sock->sk, level, optname, 1684 optval, optlen); 1685 } 1686 EXPORT_SYMBOL(sock_setsockopt); 1687 1688 static const struct cred *sk_get_peer_cred(struct sock *sk) 1689 { 1690 const struct cred *cred; 1691 1692 spin_lock(&sk->sk_peer_lock); 1693 cred = get_cred(sk->sk_peer_cred); 1694 spin_unlock(&sk->sk_peer_lock); 1695 1696 return cred; 1697 } 1698 1699 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1700 struct ucred *ucred) 1701 { 1702 ucred->pid = pid_vnr(pid); 1703 ucred->uid = ucred->gid = -1; 1704 if (cred) { 1705 struct user_namespace *current_ns = current_user_ns(); 1706 1707 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1708 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1709 } 1710 } 1711 1712 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1713 { 1714 struct user_namespace *user_ns = current_user_ns(); 1715 int i; 1716 1717 for (i = 0; i < src->ngroups; i++) { 1718 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1719 1720 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1721 return -EFAULT; 1722 } 1723 1724 return 0; 1725 } 1726 1727 int sk_getsockopt(struct sock *sk, int level, int optname, 1728 sockptr_t optval, sockptr_t optlen) 1729 { 1730 struct socket *sock = sk->sk_socket; 1731 1732 union { 1733 int val; 1734 u64 val64; 1735 unsigned long ulval; 1736 struct linger ling; 1737 struct old_timeval32 tm32; 1738 struct __kernel_old_timeval tm; 1739 struct __kernel_sock_timeval stm; 1740 struct sock_txtime txtime; 1741 struct so_timestamping timestamping; 1742 } v; 1743 1744 int lv = sizeof(int); 1745 
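	/* lv: size of the value we will return, defaulting to an int;
	 * len: caller-supplied buffer length read back from optlen.
	 */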
	int len;

	if (copy_from_sockptr(&len, optlen, sizeof(int)))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = READ_ONCE(sk->sk_sndbuf);
		break;

	case SO_RCVBUF:
		v.val = READ_ONCE(sk->sk_rcvbuf);
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = READ_ONCE(sk->sk_priority);
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
			!sock_flag(sk, SOCK_TSTAMP_NEW) &&
			!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		lv = sizeof(v.timestamping);
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1850 */ 1851 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1852 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1853 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1854 } 1855 break; 1856 1857 case SO_RCVTIMEO_OLD: 1858 case SO_RCVTIMEO_NEW: 1859 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1860 SO_RCVTIMEO_OLD == optname); 1861 break; 1862 1863 case SO_SNDTIMEO_OLD: 1864 case SO_SNDTIMEO_NEW: 1865 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1866 SO_SNDTIMEO_OLD == optname); 1867 break; 1868 1869 case SO_RCVLOWAT: 1870 v.val = READ_ONCE(sk->sk_rcvlowat); 1871 break; 1872 1873 case SO_SNDLOWAT: 1874 v.val = 1; 1875 break; 1876 1877 case SO_PASSCRED: 1878 if (!sk_may_scm_recv(sk)) 1879 return -EOPNOTSUPP; 1880 1881 v.val = sk->sk_scm_credentials; 1882 break; 1883 1884 case SO_PASSPIDFD: 1885 if (!sk_is_unix(sk)) 1886 return -EOPNOTSUPP; 1887 1888 v.val = sk->sk_scm_pidfd; 1889 break; 1890 1891 case SO_PASSRIGHTS: 1892 if (!sk_is_unix(sk)) 1893 return -EOPNOTSUPP; 1894 1895 v.val = sk->sk_scm_rights; 1896 break; 1897 1898 case SO_PEERCRED: 1899 { 1900 struct ucred peercred; 1901 if (len > sizeof(peercred)) 1902 len = sizeof(peercred); 1903 1904 spin_lock(&sk->sk_peer_lock); 1905 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1906 spin_unlock(&sk->sk_peer_lock); 1907 1908 if (copy_to_sockptr(optval, &peercred, len)) 1909 return -EFAULT; 1910 goto lenout; 1911 } 1912 1913 case SO_PEERPIDFD: 1914 { 1915 struct pid *peer_pid; 1916 struct file *pidfd_file = NULL; 1917 unsigned int flags = 0; 1918 int pidfd; 1919 1920 if (len > sizeof(pidfd)) 1921 len = sizeof(pidfd); 1922 1923 spin_lock(&sk->sk_peer_lock); 1924 peer_pid = get_pid(sk->sk_peer_pid); 1925 spin_unlock(&sk->sk_peer_lock); 1926 1927 if (!peer_pid) 1928 return -ENODATA; 1929 1930 /* The use of PIDFD_STALE requires stashing of struct pid 1931 * on pidfs with pidfs_register_pid() and only AF_UNIX 1932 * were prepared for this. 1933 */ 1934 if (sk->sk_family == AF_UNIX) 1935 flags = PIDFD_STALE; 1936 1937 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); 1938 put_pid(peer_pid); 1939 if (pidfd < 0) 1940 return pidfd; 1941 1942 if (copy_to_sockptr(optval, &pidfd, len) || 1943 copy_to_sockptr(optlen, &len, sizeof(int))) { 1944 put_unused_fd(pidfd); 1945 fput(pidfd_file); 1946 1947 return -EFAULT; 1948 } 1949 1950 fd_install(pidfd, pidfd_file); 1951 return 0; 1952 } 1953 1954 case SO_PEERGROUPS: 1955 { 1956 const struct cred *cred; 1957 int ret, n; 1958 1959 cred = sk_get_peer_cred(sk); 1960 if (!cred) 1961 return -ENODATA; 1962 1963 n = cred->group_info->ngroups; 1964 if (len < n * sizeof(gid_t)) { 1965 len = n * sizeof(gid_t); 1966 put_cred(cred); 1967 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1968 } 1969 len = n * sizeof(gid_t); 1970 1971 ret = groups_to_user(optval, cred->group_info); 1972 put_cred(cred); 1973 if (ret) 1974 return ret; 1975 goto lenout; 1976 } 1977 1978 case SO_PEERNAME: 1979 { 1980 struct sockaddr_storage address; 1981 1982 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1983 if (lv < 0) 1984 return -ENOTCONN; 1985 if (lv < len) 1986 return -EINVAL; 1987 if (copy_to_sockptr(optval, &address, len)) 1988 return -EFAULT; 1989 goto lenout; 1990 } 1991 1992 /* Dubious BSD thing... Probably nobody even uses it, but 1993 * the UNIX standard wants it for whatever reason... 
-DaveM 1994 */ 1995 case SO_ACCEPTCONN: 1996 v.val = sk->sk_state == TCP_LISTEN; 1997 break; 1998 1999 case SO_PASSSEC: 2000 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) 2001 return -EOPNOTSUPP; 2002 2003 v.val = sk->sk_scm_security; 2004 break; 2005 2006 case SO_PEERSEC: 2007 return security_socket_getpeersec_stream(sock, 2008 optval, optlen, len); 2009 2010 case SO_MARK: 2011 v.val = READ_ONCE(sk->sk_mark); 2012 break; 2013 2014 case SO_RCVMARK: 2015 v.val = sock_flag(sk, SOCK_RCVMARK); 2016 break; 2017 2018 case SO_RCVPRIORITY: 2019 v.val = sock_flag(sk, SOCK_RCVPRIORITY); 2020 break; 2021 2022 case SO_RXQ_OVFL: 2023 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 2024 break; 2025 2026 case SO_WIFI_STATUS: 2027 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 2028 break; 2029 2030 case SO_PEEK_OFF: 2031 if (!READ_ONCE(sock->ops)->set_peek_off) 2032 return -EOPNOTSUPP; 2033 2034 v.val = READ_ONCE(sk->sk_peek_off); 2035 break; 2036 case SO_NOFCS: 2037 v.val = sock_flag(sk, SOCK_NOFCS); 2038 break; 2039 2040 case SO_BINDTODEVICE: 2041 return sock_getbindtodevice(sk, optval, optlen, len); 2042 2043 case SO_GET_FILTER: 2044 len = sk_get_filter(sk, optval, len); 2045 if (len < 0) 2046 return len; 2047 2048 goto lenout; 2049 2050 case SO_LOCK_FILTER: 2051 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 2052 break; 2053 2054 case SO_BPF_EXTENSIONS: 2055 v.val = bpf_tell_extensions(); 2056 break; 2057 2058 case SO_SELECT_ERR_QUEUE: 2059 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 2060 break; 2061 2062 #ifdef CONFIG_NET_RX_BUSY_POLL 2063 case SO_BUSY_POLL: 2064 v.val = READ_ONCE(sk->sk_ll_usec); 2065 break; 2066 case SO_PREFER_BUSY_POLL: 2067 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 2068 break; 2069 #endif 2070 2071 case SO_MAX_PACING_RATE: 2072 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 2073 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 2074 lv = sizeof(v.ulval); 2075 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 2076 } else { 2077 /* 32bit version */ 2078 v.val = min_t(unsigned long, ~0U, 2079 READ_ONCE(sk->sk_max_pacing_rate)); 2080 } 2081 break; 2082 2083 case SO_INCOMING_CPU: 2084 v.val = READ_ONCE(sk->sk_incoming_cpu); 2085 break; 2086 2087 case SO_MEMINFO: 2088 { 2089 u32 meminfo[SK_MEMINFO_VARS]; 2090 2091 sk_get_meminfo(sk, meminfo); 2092 2093 len = min_t(unsigned int, len, sizeof(meminfo)); 2094 if (copy_to_sockptr(optval, &meminfo, len)) 2095 return -EFAULT; 2096 2097 goto lenout; 2098 } 2099 2100 #ifdef CONFIG_NET_RX_BUSY_POLL 2101 case SO_INCOMING_NAPI_ID: 2102 v.val = READ_ONCE(sk->sk_napi_id); 2103 2104 /* aggregate non-NAPI IDs down to 0 */ 2105 if (!napi_id_valid(v.val)) 2106 v.val = 0; 2107 2108 break; 2109 #endif 2110 2111 case SO_COOKIE: 2112 lv = sizeof(u64); 2113 if (len < lv) 2114 return -EINVAL; 2115 v.val64 = sock_gen_cookie(sk); 2116 break; 2117 2118 case SO_ZEROCOPY: 2119 v.val = sock_flag(sk, SOCK_ZEROCOPY); 2120 break; 2121 2122 case SO_TXTIME: 2123 lv = sizeof(v.txtime); 2124 v.txtime.clockid = sk->sk_clockid; 2125 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 2126 SOF_TXTIME_DEADLINE_MODE : 0; 2127 v.txtime.flags |= sk->sk_txtime_report_errors ? 
2128 SOF_TXTIME_REPORT_ERRORS : 0; 2129 break; 2130 2131 case SO_BINDTOIFINDEX: 2132 v.val = READ_ONCE(sk->sk_bound_dev_if); 2133 break; 2134 2135 case SO_NETNS_COOKIE: 2136 lv = sizeof(u64); 2137 if (len != lv) 2138 return -EINVAL; 2139 v.val64 = sock_net(sk)->net_cookie; 2140 break; 2141 2142 case SO_BUF_LOCK: 2143 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 2144 break; 2145 2146 case SO_RESERVE_MEM: 2147 v.val = READ_ONCE(sk->sk_reserved_mem); 2148 break; 2149 2150 case SO_TXREHASH: 2151 if (!sk_is_tcp(sk)) 2152 return -EOPNOTSUPP; 2153 2154 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2155 v.val = READ_ONCE(sk->sk_txrehash); 2156 break; 2157 2158 default: 2159 /* We implement the SO_SNDLOWAT etc to not be settable 2160 * (1003.1g 7). 2161 */ 2162 return -ENOPROTOOPT; 2163 } 2164 2165 if (len > lv) 2166 len = lv; 2167 if (copy_to_sockptr(optval, &v, len)) 2168 return -EFAULT; 2169 lenout: 2170 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2171 return -EFAULT; 2172 return 0; 2173 } 2174 2175 /* 2176 * Initialize an sk_lock. 2177 * 2178 * (We also register the sk_lock with the lock validator.) 2179 */ 2180 static inline void sock_lock_init(struct sock *sk) 2181 { 2182 sk_owner_clear(sk); 2183 2184 if (sk->sk_kern_sock) 2185 sock_lock_init_class_and_name( 2186 sk, 2187 af_family_kern_slock_key_strings[sk->sk_family], 2188 af_family_kern_slock_keys + sk->sk_family, 2189 af_family_kern_key_strings[sk->sk_family], 2190 af_family_kern_keys + sk->sk_family); 2191 else 2192 sock_lock_init_class_and_name( 2193 sk, 2194 af_family_slock_key_strings[sk->sk_family], 2195 af_family_slock_keys + sk->sk_family, 2196 af_family_key_strings[sk->sk_family], 2197 af_family_keys + sk->sk_family); 2198 } 2199 2200 /* 2201 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2202 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2203 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2204 */ 2205 static void sock_copy(struct sock *nsk, const struct sock *osk) 2206 { 2207 const struct proto *prot = READ_ONCE(osk->sk_prot); 2208 #ifdef CONFIG_SECURITY_NETWORK 2209 void *sptr = nsk->sk_security; 2210 #endif 2211 2212 /* If we move sk_tx_queue_mapping out of the private section, 2213 * we must check if sk_tx_queue_clear() is called after 2214 * sock_copy() in sk_clone_lock(). 
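 * The BUILD_BUG_ON() below enforces this at compile time: the build
 * fails if sk_tx_queue_mapping ever moves outside the
 * [sk_dontcopy_begin, sk_dontcopy_end) private section.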
2215 */ 2216 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2217 offsetof(struct sock, sk_dontcopy_begin) || 2218 offsetof(struct sock, sk_tx_queue_mapping) >= 2219 offsetof(struct sock, sk_dontcopy_end)); 2220 2221 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2222 2223 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2224 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2225 /* alloc is larger than struct, see sk_prot_alloc() */); 2226 2227 #ifdef CONFIG_SECURITY_NETWORK 2228 nsk->sk_security = sptr; 2229 security_sk_clone(osk, nsk); 2230 #endif 2231 } 2232 2233 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2234 int family) 2235 { 2236 struct sock *sk; 2237 struct kmem_cache *slab; 2238 2239 slab = prot->slab; 2240 if (slab != NULL) { 2241 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2242 if (!sk) 2243 return sk; 2244 if (want_init_on_alloc(priority)) 2245 sk_prot_clear_nulls(sk, prot->obj_size); 2246 } else 2247 sk = kmalloc(prot->obj_size, priority); 2248 2249 if (sk != NULL) { 2250 if (security_sk_alloc(sk, family, priority)) 2251 goto out_free; 2252 2253 if (!try_module_get(prot->owner)) 2254 goto out_free_sec; 2255 } 2256 2257 return sk; 2258 2259 out_free_sec: 2260 security_sk_free(sk); 2261 out_free: 2262 if (slab != NULL) 2263 kmem_cache_free(slab, sk); 2264 else 2265 kfree(sk); 2266 return NULL; 2267 } 2268 2269 static void sk_prot_free(struct proto *prot, struct sock *sk) 2270 { 2271 struct kmem_cache *slab; 2272 struct module *owner; 2273 2274 owner = prot->owner; 2275 slab = prot->slab; 2276 2277 cgroup_sk_free(&sk->sk_cgrp_data); 2278 mem_cgroup_sk_free(sk); 2279 security_sk_free(sk); 2280 2281 sk_owner_put(sk); 2282 2283 if (slab != NULL) 2284 kmem_cache_free(slab, sk); 2285 else 2286 kfree(sk); 2287 module_put(owner); 2288 } 2289 2290 /** 2291 * sk_alloc - All socket objects are allocated here 2292 * @net: the applicable net namespace 2293 * @family: protocol family 2294 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2295 * @prot: struct proto associated with this new sock instance 2296 * @kern: is this to be a kernel socket? 2297 */ 2298 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2299 struct proto *prot, int kern) 2300 { 2301 struct sock *sk; 2302 2303 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2304 if (sk) { 2305 sk->sk_family = family; 2306 /* 2307 * See comment in struct sock definition to understand 2308 * why we need sk_prot_creator -acme 2309 */ 2310 sk->sk_prot = sk->sk_prot_creator = prot; 2311 sk->sk_kern_sock = kern; 2312 sock_lock_init(sk); 2313 sk->sk_net_refcnt = kern ? 0 : 1; 2314 if (likely(sk->sk_net_refcnt)) { 2315 get_net_track(net, &sk->ns_tracker, priority); 2316 sock_inuse_add(net, 1); 2317 } else { 2318 net_passive_inc(net); 2319 __netns_tracker_alloc(net, &sk->ns_tracker, 2320 false, priority); 2321 } 2322 2323 sock_net_set(sk, net); 2324 refcount_set(&sk->sk_wmem_alloc, 1); 2325 2326 mem_cgroup_sk_alloc(sk); 2327 cgroup_sk_alloc(&sk->sk_cgrp_data); 2328 sock_update_classid(&sk->sk_cgrp_data); 2329 sock_update_netprioidx(&sk->sk_cgrp_data); 2330 sk_tx_queue_clear(sk); 2331 } 2332 2333 return sk; 2334 } 2335 EXPORT_SYMBOL(sk_alloc); 2336 2337 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2338 * grace period. This is the case for UDP sockets and TCP listeners. 
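 * Deferring the actual destruction to call_rcu() lets lockless (RCU)
 * readers that may still hold a pointer to the socket finish safely
 * before its memory is released.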
2339 */ 2340 static void __sk_destruct(struct rcu_head *head) 2341 { 2342 struct sock *sk = container_of(head, struct sock, sk_rcu); 2343 struct net *net = sock_net(sk); 2344 struct sk_filter *filter; 2345 2346 if (sk->sk_destruct) 2347 sk->sk_destruct(sk); 2348 2349 filter = rcu_dereference_check(sk->sk_filter, 2350 refcount_read(&sk->sk_wmem_alloc) == 0); 2351 if (filter) { 2352 sk_filter_uncharge(sk, filter); 2353 RCU_INIT_POINTER(sk->sk_filter, NULL); 2354 } 2355 2356 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2357 2358 #ifdef CONFIG_BPF_SYSCALL 2359 bpf_sk_storage_free(sk); 2360 #endif 2361 2362 if (atomic_read(&sk->sk_omem_alloc)) 2363 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2364 __func__, atomic_read(&sk->sk_omem_alloc)); 2365 2366 if (sk->sk_frag.page) { 2367 put_page(sk->sk_frag.page); 2368 sk->sk_frag.page = NULL; 2369 } 2370 2371 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2372 put_cred(sk->sk_peer_cred); 2373 put_pid(sk->sk_peer_pid); 2374 2375 if (likely(sk->sk_net_refcnt)) { 2376 put_net_track(net, &sk->ns_tracker); 2377 } else { 2378 __netns_tracker_free(net, &sk->ns_tracker, false); 2379 net_passive_dec(net); 2380 } 2381 sk_prot_free(sk->sk_prot_creator, sk); 2382 } 2383 2384 void sk_net_refcnt_upgrade(struct sock *sk) 2385 { 2386 struct net *net = sock_net(sk); 2387 2388 WARN_ON_ONCE(sk->sk_net_refcnt); 2389 __netns_tracker_free(net, &sk->ns_tracker, false); 2390 net_passive_dec(net); 2391 sk->sk_net_refcnt = 1; 2392 get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 2393 sock_inuse_add(net, 1); 2394 } 2395 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); 2396 2397 void sk_destruct(struct sock *sk) 2398 { 2399 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2400 2401 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2402 reuseport_detach_sock(sk); 2403 use_call_rcu = true; 2404 } 2405 2406 if (use_call_rcu) 2407 call_rcu(&sk->sk_rcu, __sk_destruct); 2408 else 2409 __sk_destruct(&sk->sk_rcu); 2410 } 2411 2412 static void __sk_free(struct sock *sk) 2413 { 2414 if (likely(sk->sk_net_refcnt)) 2415 sock_inuse_add(sock_net(sk), -1); 2416 2417 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2418 sock_diag_broadcast_destroy(sk); 2419 else 2420 sk_destruct(sk); 2421 } 2422 2423 void sk_free(struct sock *sk) 2424 { 2425 /* 2426 * We subtract one from sk_wmem_alloc and can know if 2427 * some packets are still in some tx queue. 
2428 * If not null, sock_wfree() will call __sk_free(sk) later 2429 */ 2430 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2431 __sk_free(sk); 2432 } 2433 EXPORT_SYMBOL(sk_free); 2434 2435 static void sk_init_common(struct sock *sk) 2436 { 2437 skb_queue_head_init(&sk->sk_receive_queue); 2438 skb_queue_head_init(&sk->sk_write_queue); 2439 skb_queue_head_init(&sk->sk_error_queue); 2440 2441 rwlock_init(&sk->sk_callback_lock); 2442 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2443 af_rlock_keys + sk->sk_family, 2444 af_family_rlock_key_strings[sk->sk_family]); 2445 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2446 af_wlock_keys + sk->sk_family, 2447 af_family_wlock_key_strings[sk->sk_family]); 2448 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2449 af_elock_keys + sk->sk_family, 2450 af_family_elock_key_strings[sk->sk_family]); 2451 if (sk->sk_kern_sock) 2452 lockdep_set_class_and_name(&sk->sk_callback_lock, 2453 af_kern_callback_keys + sk->sk_family, 2454 af_family_kern_clock_key_strings[sk->sk_family]); 2455 else 2456 lockdep_set_class_and_name(&sk->sk_callback_lock, 2457 af_callback_keys + sk->sk_family, 2458 af_family_clock_key_strings[sk->sk_family]); 2459 } 2460 2461 /** 2462 * sk_clone_lock - clone a socket, and lock its clone 2463 * @sk: the socket to clone 2464 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2465 * 2466 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2467 */ 2468 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2469 { 2470 struct proto *prot = READ_ONCE(sk->sk_prot); 2471 struct sk_filter *filter; 2472 bool is_charged = true; 2473 struct sock *newsk; 2474 2475 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2476 if (!newsk) 2477 goto out; 2478 2479 sock_copy(newsk, sk); 2480 2481 newsk->sk_prot_creator = prot; 2482 2483 /* SANITY */ 2484 if (likely(newsk->sk_net_refcnt)) { 2485 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2486 sock_inuse_add(sock_net(newsk), 1); 2487 } else { 2488 /* Kernel sockets are not elevating the struct net refcount. 2489 * Instead, use a tracker to more easily detect if a layer 2490 * is not properly dismantling its kernel sockets at netns 2491 * destroy time. 
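 * The same convention is used in sk_alloc(); such a socket can later
 * be converted with sk_net_refcnt_upgrade() if it ends up needing a
 * full netns reference.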
2492 */ 2493 net_passive_inc(sock_net(newsk)); 2494 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2495 false, priority); 2496 } 2497 sk_node_init(&newsk->sk_node); 2498 sock_lock_init(newsk); 2499 bh_lock_sock(newsk); 2500 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2501 newsk->sk_backlog.len = 0; 2502 2503 atomic_set(&newsk->sk_rmem_alloc, 0); 2504 2505 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ 2506 refcount_set(&newsk->sk_wmem_alloc, 1); 2507 2508 atomic_set(&newsk->sk_omem_alloc, 0); 2509 sk_init_common(newsk); 2510 2511 newsk->sk_dst_cache = NULL; 2512 newsk->sk_dst_pending_confirm = 0; 2513 newsk->sk_wmem_queued = 0; 2514 newsk->sk_forward_alloc = 0; 2515 newsk->sk_reserved_mem = 0; 2516 atomic_set(&newsk->sk_drops, 0); 2517 newsk->sk_send_head = NULL; 2518 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2519 atomic_set(&newsk->sk_zckey, 0); 2520 2521 sock_reset_flag(newsk, SOCK_DONE); 2522 2523 /* sk->sk_memcg will be populated at accept() time */ 2524 newsk->sk_memcg = NULL; 2525 2526 cgroup_sk_clone(&newsk->sk_cgrp_data); 2527 2528 rcu_read_lock(); 2529 filter = rcu_dereference(sk->sk_filter); 2530 if (filter != NULL) 2531 /* though it's an empty new sock, the charging may fail 2532 * if sysctl_optmem_max was changed between creation of 2533 * original socket and cloning 2534 */ 2535 is_charged = sk_filter_charge(newsk, filter); 2536 RCU_INIT_POINTER(newsk->sk_filter, filter); 2537 rcu_read_unlock(); 2538 2539 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2540 /* We need to make sure that we don't uncharge the new 2541 * socket if we couldn't charge it in the first place 2542 * as otherwise we uncharge the parent's filter. 2543 */ 2544 if (!is_charged) 2545 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2546 2547 goto free; 2548 } 2549 2550 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2551 2552 if (bpf_sk_storage_clone(sk, newsk)) 2553 goto free; 2554 2555 /* Clear sk_user_data if parent had the pointer tagged 2556 * as not suitable for copying when cloning. 2557 */ 2558 if (sk_user_data_is_nocopy(newsk)) 2559 newsk->sk_user_data = NULL; 2560 2561 newsk->sk_err = 0; 2562 newsk->sk_err_soft = 0; 2563 newsk->sk_priority = 0; 2564 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2565 2566 /* Before updating sk_refcnt, we must commit prior changes to memory 2567 * (Documentation/RCU/rculist_nulls.rst for details) 2568 */ 2569 smp_wmb(); 2570 refcount_set(&newsk->sk_refcnt, 2); 2571 2572 sk_set_socket(newsk, NULL); 2573 sk_tx_queue_clear(newsk); 2574 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2575 2576 if (newsk->sk_prot->sockets_allocated) 2577 sk_sockets_allocated_inc(newsk); 2578 2579 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2580 net_enable_timestamp(); 2581 out: 2582 return newsk; 2583 free: 2584 /* It is still raw copy of parent, so invalidate 2585 * destructor and make plain sk_free() 2586 */ 2587 newsk->sk_destruct = NULL; 2588 bh_unlock_sock(newsk); 2589 sk_free(newsk); 2590 newsk = NULL; 2591 goto out; 2592 } 2593 EXPORT_SYMBOL_GPL(sk_clone_lock); 2594 2595 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) 2596 { 2597 bool is_ipv6 = false; 2598 u32 max_size; 2599 2600 #if IS_ENABLED(CONFIG_IPV6) 2601 is_ipv6 = (sk->sk_family == AF_INET6 && 2602 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2603 #endif 2604 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2605 max_size = is_ipv6 ? 
READ_ONCE(dst->dev->gso_max_size) : 2606 READ_ONCE(dst->dev->gso_ipv4_max_size); 2607 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2608 max_size = GSO_LEGACY_MAX_SIZE; 2609 2610 return max_size - (MAX_TCP_HEADER + 1); 2611 } 2612 2613 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2614 { 2615 u32 max_segs = 1; 2616 2617 sk->sk_route_caps = dst->dev->features; 2618 if (sk_is_tcp(sk)) { 2619 struct inet_connection_sock *icsk = inet_csk(sk); 2620 2621 sk->sk_route_caps |= NETIF_F_GSO; 2622 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); 2623 } 2624 if (sk->sk_route_caps & NETIF_F_GSO) 2625 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2626 if (unlikely(sk->sk_gso_disabled)) 2627 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2628 if (sk_can_gso(sk)) { 2629 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2630 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2631 } else { 2632 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2633 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); 2634 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2635 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); 2636 } 2637 } 2638 sk->sk_gso_max_segs = max_segs; 2639 sk_dst_set(sk, dst); 2640 } 2641 EXPORT_SYMBOL_GPL(sk_setup_caps); 2642 2643 /* 2644 * Simple resource managers for sockets. 2645 */ 2646 2647 2648 /* 2649 * Write buffer destructor automatically called from kfree_skb. 2650 */ 2651 void sock_wfree(struct sk_buff *skb) 2652 { 2653 struct sock *sk = skb->sk; 2654 unsigned int len = skb->truesize; 2655 bool free; 2656 2657 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2658 if (sock_flag(sk, SOCK_RCU_FREE) && 2659 sk->sk_write_space == sock_def_write_space) { 2660 rcu_read_lock(); 2661 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); 2662 sock_def_write_space_wfree(sk); 2663 rcu_read_unlock(); 2664 if (unlikely(free)) 2665 __sk_free(sk); 2666 return; 2667 } 2668 2669 /* 2670 * Keep a reference on sk_wmem_alloc, this will be released 2671 * after sk_write_space() call 2672 */ 2673 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2674 sk->sk_write_space(sk); 2675 len = 1; 2676 } 2677 /* 2678 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2679 * could not do because of in-flight packets 2680 */ 2681 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2682 __sk_free(sk); 2683 } 2684 EXPORT_SYMBOL(sock_wfree); 2685 2686 /* This variant of sock_wfree() is used by TCP, 2687 * since it sets SOCK_USE_WRITE_QUEUE. 2688 */ 2689 void __sock_wfree(struct sk_buff *skb) 2690 { 2691 struct sock *sk = skb->sk; 2692 2693 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2694 __sk_free(sk); 2695 } 2696 2697 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2698 { 2699 skb_orphan(skb); 2700 #ifdef CONFIG_INET 2701 if (unlikely(!sk_fullsock(sk))) 2702 return skb_set_owner_edemux(skb, sk); 2703 #endif 2704 skb->sk = sk; 2705 skb->destructor = sock_wfree; 2706 skb_set_hash_from_sk(skb, sk); 2707 /* 2708 * We used to take a refcount on sk, but following operation 2709 * is enough to guarantee sk_free() won't free this sock until 2710 * all in-flight packets are completed 2711 */ 2712 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 2713 } 2714 EXPORT_SYMBOL(skb_set_owner_w); 2715 2716 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2717 { 2718 /* Drivers depend on in-order delivery for crypto offload, 2719 * partial orphan breaks out-of-order-OK logic. 
2720 */ 2721 if (skb_is_decrypted(skb)) 2722 return false; 2723 2724 return (skb->destructor == sock_wfree || 2725 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2726 } 2727 2728 /* This helper is used by netem, as it can hold packets in its 2729 * delay queue. We want to allow the owner socket to send more 2730 * packets, as if they were already TX completed by a typical driver. 2731 * But we also want to keep skb->sk set because some packet schedulers 2732 * rely on it (sch_fq for example). 2733 */ 2734 void skb_orphan_partial(struct sk_buff *skb) 2735 { 2736 if (skb_is_tcp_pure_ack(skb)) 2737 return; 2738 2739 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2740 return; 2741 2742 skb_orphan(skb); 2743 } 2744 EXPORT_SYMBOL(skb_orphan_partial); 2745 2746 /* 2747 * Read buffer destructor automatically called from kfree_skb. 2748 */ 2749 void sock_rfree(struct sk_buff *skb) 2750 { 2751 struct sock *sk = skb->sk; 2752 unsigned int len = skb->truesize; 2753 2754 atomic_sub(len, &sk->sk_rmem_alloc); 2755 sk_mem_uncharge(sk, len); 2756 } 2757 EXPORT_SYMBOL(sock_rfree); 2758 2759 /* 2760 * Buffer destructor for skbs that are not used directly in read or write 2761 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2762 */ 2763 void sock_efree(struct sk_buff *skb) 2764 { 2765 sock_put(skb->sk); 2766 } 2767 EXPORT_SYMBOL(sock_efree); 2768 2769 /* Buffer destructor for prefetch/receive path where reference count may 2770 * not be held, e.g. for listen sockets. 2771 */ 2772 #ifdef CONFIG_INET 2773 void sock_pfree(struct sk_buff *skb) 2774 { 2775 struct sock *sk = skb->sk; 2776 2777 if (!sk_is_refcounted(sk)) 2778 return; 2779 2780 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { 2781 inet_reqsk(sk)->rsk_listener = NULL; 2782 reqsk_free(inet_reqsk(sk)); 2783 return; 2784 } 2785 2786 sock_gen_put(sk); 2787 } 2788 EXPORT_SYMBOL(sock_pfree); 2789 #endif /* CONFIG_INET */ 2790 2791 kuid_t sock_i_uid(struct sock *sk) 2792 { 2793 kuid_t uid; 2794 2795 read_lock_bh(&sk->sk_callback_lock); 2796 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 2797 read_unlock_bh(&sk->sk_callback_lock); 2798 return uid; 2799 } 2800 EXPORT_SYMBOL(sock_i_uid); 2801 2802 unsigned long __sock_i_ino(struct sock *sk) 2803 { 2804 unsigned long ino; 2805 2806 read_lock(&sk->sk_callback_lock); 2807 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 2808 read_unlock(&sk->sk_callback_lock); 2809 return ino; 2810 } 2811 EXPORT_SYMBOL(__sock_i_ino); 2812 2813 unsigned long sock_i_ino(struct sock *sk) 2814 { 2815 unsigned long ino; 2816 2817 local_bh_disable(); 2818 ino = __sock_i_ino(sk); 2819 local_bh_enable(); 2820 return ino; 2821 } 2822 EXPORT_SYMBOL(sock_i_ino); 2823 2824 /* 2825 * Allocate a skb from the socket's send buffer. 
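 * The allocation is attempted only when @force is set or sk_wmem_alloc
 * is still below sk_sndbuf; on success the skb is charged to the
 * socket via skb_set_owner_w().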
2826 */ 2827 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2828 gfp_t priority) 2829 { 2830 if (force || 2831 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2832 struct sk_buff *skb = alloc_skb(size, priority); 2833 2834 if (skb) { 2835 skb_set_owner_w(skb, sk); 2836 return skb; 2837 } 2838 } 2839 return NULL; 2840 } 2841 EXPORT_SYMBOL(sock_wmalloc); 2842 2843 static void sock_ofree(struct sk_buff *skb) 2844 { 2845 struct sock *sk = skb->sk; 2846 2847 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2848 } 2849 2850 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2851 gfp_t priority) 2852 { 2853 struct sk_buff *skb; 2854 2855 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2856 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2857 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2858 return NULL; 2859 2860 skb = alloc_skb(size, priority); 2861 if (!skb) 2862 return NULL; 2863 2864 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2865 skb->sk = sk; 2866 skb->destructor = sock_ofree; 2867 return skb; 2868 } 2869 2870 /* 2871 * Allocate a memory block from the socket's option memory buffer. 2872 */ 2873 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2874 { 2875 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2876 2877 if ((unsigned int)size <= optmem_max && 2878 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2879 void *mem; 2880 /* First do the add, to avoid the race if kmalloc 2881 * might sleep. 2882 */ 2883 atomic_add(size, &sk->sk_omem_alloc); 2884 mem = kmalloc(size, priority); 2885 if (mem) 2886 return mem; 2887 atomic_sub(size, &sk->sk_omem_alloc); 2888 } 2889 return NULL; 2890 } 2891 EXPORT_SYMBOL(sock_kmalloc); 2892 2893 /* 2894 * Duplicate the input "src" memory block using the socket's 2895 * option memory buffer. 2896 */ 2897 void *sock_kmemdup(struct sock *sk, const void *src, 2898 int size, gfp_t priority) 2899 { 2900 void *mem; 2901 2902 mem = sock_kmalloc(sk, size, priority); 2903 if (mem) 2904 memcpy(mem, src, size); 2905 return mem; 2906 } 2907 EXPORT_SYMBOL(sock_kmemdup); 2908 2909 /* Free an option memory block. Note, we actually want the inline 2910 * here as this allows gcc to detect the nullify and fold away the 2911 * condition entirely. 2912 */ 2913 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2914 const bool nullify) 2915 { 2916 if (WARN_ON_ONCE(!mem)) 2917 return; 2918 if (nullify) 2919 kfree_sensitive(mem); 2920 else 2921 kfree(mem); 2922 atomic_sub(size, &sk->sk_omem_alloc); 2923 } 2924 2925 void sock_kfree_s(struct sock *sk, void *mem, int size) 2926 { 2927 __sock_kfree_s(sk, mem, size, false); 2928 } 2929 EXPORT_SYMBOL(sock_kfree_s); 2930 2931 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2932 { 2933 __sock_kfree_s(sk, mem, size, true); 2934 } 2935 EXPORT_SYMBOL(sock_kzfree_s); 2936 2937 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2938 I think, these locks should be removed for datagram sockets. 
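   The wait ends once sk_wmem_alloc drops below sk_sndbuf, the socket is
   shut down for sending, an error is pending, a signal arrives or the
   timeout expires; the remaining timeout is returned.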
2939 */ 2940 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2941 { 2942 DEFINE_WAIT(wait); 2943 2944 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2945 for (;;) { 2946 if (!timeo) 2947 break; 2948 if (signal_pending(current)) 2949 break; 2950 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2951 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2952 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2953 break; 2954 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2955 break; 2956 if (READ_ONCE(sk->sk_err)) 2957 break; 2958 timeo = schedule_timeout(timeo); 2959 } 2960 finish_wait(sk_sleep(sk), &wait); 2961 return timeo; 2962 } 2963 2964 2965 /* 2966 * Generic send/receive buffer handlers 2967 */ 2968 2969 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2970 unsigned long data_len, int noblock, 2971 int *errcode, int max_page_order) 2972 { 2973 struct sk_buff *skb; 2974 long timeo; 2975 int err; 2976 2977 timeo = sock_sndtimeo(sk, noblock); 2978 for (;;) { 2979 err = sock_error(sk); 2980 if (err != 0) 2981 goto failure; 2982 2983 err = -EPIPE; 2984 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2985 goto failure; 2986 2987 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2988 break; 2989 2990 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2991 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2992 err = -EAGAIN; 2993 if (!timeo) 2994 goto failure; 2995 if (signal_pending(current)) 2996 goto interrupted; 2997 timeo = sock_wait_for_wmem(sk, timeo); 2998 } 2999 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 3000 errcode, sk->sk_allocation); 3001 if (skb) 3002 skb_set_owner_w(skb, sk); 3003 return skb; 3004 3005 interrupted: 3006 err = sock_intr_errno(timeo); 3007 failure: 3008 *errcode = err; 3009 return NULL; 3010 } 3011 EXPORT_SYMBOL(sock_alloc_send_pskb); 3012 3013 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 3014 struct sockcm_cookie *sockc) 3015 { 3016 u32 tsflags; 3017 3018 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); 3019 3020 switch (cmsg->cmsg_type) { 3021 case SO_MARK: 3022 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 3023 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3024 return -EPERM; 3025 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3026 return -EINVAL; 3027 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 3028 break; 3029 case SO_TIMESTAMPING_OLD: 3030 case SO_TIMESTAMPING_NEW: 3031 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3032 return -EINVAL; 3033 3034 tsflags = *(u32 *)CMSG_DATA(cmsg); 3035 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 3036 return -EINVAL; 3037 3038 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 3039 sockc->tsflags |= tsflags; 3040 break; 3041 case SCM_TXTIME: 3042 if (!sock_flag(sk, SOCK_TXTIME)) 3043 return -EINVAL; 3044 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 3045 return -EINVAL; 3046 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 3047 break; 3048 case SCM_TS_OPT_ID: 3049 if (sk_is_tcp(sk)) 3050 return -EINVAL; 3051 tsflags = READ_ONCE(sk->sk_tsflags); 3052 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) 3053 return -EINVAL; 3054 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3055 return -EINVAL; 3056 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); 3057 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; 3058 break; 3059 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
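 * They are handled by the SCM layer (see __scm_send()) for the
 * families that support them, so they are simply accepted and skipped
 * here.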
*/ 3060 case SCM_RIGHTS: 3061 case SCM_CREDENTIALS: 3062 break; 3063 case SO_PRIORITY: 3064 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3065 return -EINVAL; 3066 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) 3067 return -EPERM; 3068 sockc->priority = *(u32 *)CMSG_DATA(cmsg); 3069 break; 3070 case SCM_DEVMEM_DMABUF: 3071 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3072 return -EINVAL; 3073 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); 3074 break; 3075 default: 3076 return -EINVAL; 3077 } 3078 return 0; 3079 } 3080 EXPORT_SYMBOL(__sock_cmsg_send); 3081 3082 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 3083 struct sockcm_cookie *sockc) 3084 { 3085 struct cmsghdr *cmsg; 3086 int ret; 3087 3088 for_each_cmsghdr(cmsg, msg) { 3089 if (!CMSG_OK(msg, cmsg)) 3090 return -EINVAL; 3091 if (cmsg->cmsg_level != SOL_SOCKET) 3092 continue; 3093 ret = __sock_cmsg_send(sk, cmsg, sockc); 3094 if (ret) 3095 return ret; 3096 } 3097 return 0; 3098 } 3099 EXPORT_SYMBOL(sock_cmsg_send); 3100 3101 static void sk_enter_memory_pressure(struct sock *sk) 3102 { 3103 if (!sk->sk_prot->enter_memory_pressure) 3104 return; 3105 3106 sk->sk_prot->enter_memory_pressure(sk); 3107 } 3108 3109 static void sk_leave_memory_pressure(struct sock *sk) 3110 { 3111 if (sk->sk_prot->leave_memory_pressure) { 3112 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 3113 tcp_leave_memory_pressure, sk); 3114 } else { 3115 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 3116 3117 if (memory_pressure && READ_ONCE(*memory_pressure)) 3118 WRITE_ONCE(*memory_pressure, 0); 3119 } 3120 } 3121 3122 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 3123 3124 /** 3125 * skb_page_frag_refill - check that a page_frag contains enough room 3126 * @sz: minimum size of the fragment we want to get 3127 * @pfrag: pointer to page_frag 3128 * @gfp: priority for memory allocation 3129 * 3130 * Note: While this allocator tries to use high order pages, there is 3131 * no guarantee that allocations succeed. Therefore, @sz MUST be 3132 * less or equal than PAGE_SIZE. 
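 *
 * Return: true when @pfrag has at least @sz bytes of room available,
 * false if a fresh page could not be allocated.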
3133 */ 3134 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 3135 { 3136 if (pfrag->page) { 3137 if (page_ref_count(pfrag->page) == 1) { 3138 pfrag->offset = 0; 3139 return true; 3140 } 3141 if (pfrag->offset + sz <= pfrag->size) 3142 return true; 3143 put_page(pfrag->page); 3144 } 3145 3146 pfrag->offset = 0; 3147 if (SKB_FRAG_PAGE_ORDER && 3148 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 3149 /* Avoid direct reclaim but allow kswapd to wake */ 3150 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 3151 __GFP_COMP | __GFP_NOWARN | 3152 __GFP_NORETRY, 3153 SKB_FRAG_PAGE_ORDER); 3154 if (likely(pfrag->page)) { 3155 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 3156 return true; 3157 } 3158 } 3159 pfrag->page = alloc_page(gfp); 3160 if (likely(pfrag->page)) { 3161 pfrag->size = PAGE_SIZE; 3162 return true; 3163 } 3164 return false; 3165 } 3166 EXPORT_SYMBOL(skb_page_frag_refill); 3167 3168 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 3169 { 3170 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 3171 return true; 3172 3173 sk_enter_memory_pressure(sk); 3174 sk_stream_moderate_sndbuf(sk); 3175 return false; 3176 } 3177 EXPORT_SYMBOL(sk_page_frag_refill); 3178 3179 void __lock_sock(struct sock *sk) 3180 __releases(&sk->sk_lock.slock) 3181 __acquires(&sk->sk_lock.slock) 3182 { 3183 DEFINE_WAIT(wait); 3184 3185 for (;;) { 3186 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 3187 TASK_UNINTERRUPTIBLE); 3188 spin_unlock_bh(&sk->sk_lock.slock); 3189 schedule(); 3190 spin_lock_bh(&sk->sk_lock.slock); 3191 if (!sock_owned_by_user(sk)) 3192 break; 3193 } 3194 finish_wait(&sk->sk_lock.wq, &wait); 3195 } 3196 3197 void __release_sock(struct sock *sk) 3198 __releases(&sk->sk_lock.slock) 3199 __acquires(&sk->sk_lock.slock) 3200 { 3201 struct sk_buff *skb, *next; 3202 3203 while ((skb = sk->sk_backlog.head) != NULL) { 3204 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 3205 3206 spin_unlock_bh(&sk->sk_lock.slock); 3207 3208 do { 3209 next = skb->next; 3210 prefetch(next); 3211 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 3212 skb_mark_not_on_list(skb); 3213 sk_backlog_rcv(sk, skb); 3214 3215 cond_resched(); 3216 3217 skb = next; 3218 } while (skb != NULL); 3219 3220 spin_lock_bh(&sk->sk_lock.slock); 3221 } 3222 3223 /* 3224 * Doing the zeroing here guarantee we can not loop forever 3225 * while a wild producer attempts to flood us. 3226 */ 3227 sk->sk_backlog.len = 0; 3228 } 3229 3230 void __sk_flush_backlog(struct sock *sk) 3231 { 3232 spin_lock_bh(&sk->sk_lock.slock); 3233 __release_sock(sk); 3234 3235 if (sk->sk_prot->release_cb) 3236 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3237 tcp_release_cb, sk); 3238 3239 spin_unlock_bh(&sk->sk_lock.slock); 3240 } 3241 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3242 3243 /** 3244 * sk_wait_data - wait for data to arrive at sk_receive_queue 3245 * @sk: sock to wait on 3246 * @timeo: for how long 3247 * @skb: last skb seen on sk_receive_queue 3248 * 3249 * Now socket state including sk->sk_err is changed only under lock, 3250 * hence we may omit checks after joining wait queue. 3251 * We check receive queue before schedule() only as optimization; 3252 * it is very likely that release_sock() added new data. 
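 *
 * Return: the result of sk_wait_event() for the condition "tail of
 * sk_receive_queue differs from @skb".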
3253 */ 3254 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3255 { 3256 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3257 int rc; 3258 3259 add_wait_queue(sk_sleep(sk), &wait); 3260 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3261 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3262 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3263 remove_wait_queue(sk_sleep(sk), &wait); 3264 return rc; 3265 } 3266 EXPORT_SYMBOL(sk_wait_data); 3267 3268 /** 3269 * __sk_mem_raise_allocated - increase memory_allocated 3270 * @sk: socket 3271 * @size: memory size to allocate 3272 * @amt: pages to allocate 3273 * @kind: allocation type 3274 * 3275 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. 3276 * 3277 * Unlike the globally shared limits among the sockets under same protocol, 3278 * consuming the budget of a memcg won't have direct effect on other ones. 3279 * So be optimistic about memcg's tolerance, and leave the callers to decide 3280 * whether or not to raise allocated through sk_under_memory_pressure() or 3281 * its variants. 3282 */ 3283 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3284 { 3285 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; 3286 struct proto *prot = sk->sk_prot; 3287 bool charged = true; 3288 long allocated; 3289 3290 sk_memory_allocated_add(sk, amt); 3291 allocated = sk_memory_allocated(sk); 3292 3293 if (memcg) { 3294 charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); 3295 if (!charged) 3296 goto suppress_allocation; 3297 } 3298 3299 /* Under limit. */ 3300 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3301 sk_leave_memory_pressure(sk); 3302 return 1; 3303 } 3304 3305 /* Under pressure. */ 3306 if (allocated > sk_prot_mem_limits(sk, 1)) 3307 sk_enter_memory_pressure(sk); 3308 3309 /* Over hard limit. */ 3310 if (allocated > sk_prot_mem_limits(sk, 2)) 3311 goto suppress_allocation; 3312 3313 /* Guarantee minimum buffer size under pressure (either global 3314 * or memcg) to make sure features described in RFC 7323 (TCP 3315 * Extensions for High Performance) work properly. 3316 * 3317 * This rule does NOT stand when exceeds global or memcg's hard 3318 * limit, or else a DoS attack can be taken place by spawning 3319 * lots of sockets whose usage are under minimum buffer size. 3320 */ 3321 if (kind == SK_MEM_RECV) { 3322 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3323 return 1; 3324 3325 } else { /* SK_MEM_SEND */ 3326 int wmem0 = sk_get_wmem0(sk, prot); 3327 3328 if (sk->sk_type == SOCK_STREAM) { 3329 if (sk->sk_wmem_queued < wmem0) 3330 return 1; 3331 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3332 return 1; 3333 } 3334 } 3335 3336 if (sk_has_memory_pressure(sk)) { 3337 u64 alloc; 3338 3339 /* The following 'average' heuristic is within the 3340 * scope of global accounting, so it only makes 3341 * sense for global memory pressure. 3342 */ 3343 if (!sk_under_global_memory_pressure(sk)) 3344 return 1; 3345 3346 /* Try to be fair among all the sockets under global 3347 * pressure by allowing the ones that below average 3348 * usage to raise. 
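 * The check below succeeds when this socket's rounded-up page usage,
 * multiplied by the number of allocated sockets, still fits under the
 * hard limit, i.e. the socket uses no more than its fair share.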
3349 */ 3350 alloc = sk_sockets_allocated_read_positive(sk); 3351 if (sk_prot_mem_limits(sk, 2) > alloc * 3352 sk_mem_pages(sk->sk_wmem_queued + 3353 atomic_read(&sk->sk_rmem_alloc) + 3354 sk->sk_forward_alloc)) 3355 return 1; 3356 } 3357 3358 suppress_allocation: 3359 3360 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3361 sk_stream_moderate_sndbuf(sk); 3362 3363 /* Fail only if socket is _under_ its sndbuf. 3364 * In this case we cannot block, so that we have to fail. 3365 */ 3366 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3367 /* Force charge with __GFP_NOFAIL */ 3368 if (memcg && !charged) { 3369 mem_cgroup_charge_skmem(memcg, amt, 3370 gfp_memcg_charge() | __GFP_NOFAIL); 3371 } 3372 return 1; 3373 } 3374 } 3375 3376 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 3377 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3378 3379 sk_memory_allocated_sub(sk, amt); 3380 3381 if (memcg && charged) 3382 mem_cgroup_uncharge_skmem(memcg, amt); 3383 3384 return 0; 3385 } 3386 3387 /** 3388 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3389 * @sk: socket 3390 * @size: memory size to allocate 3391 * @kind: allocation type 3392 * 3393 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3394 * rmem allocation. This function assumes that protocols which have 3395 * memory_pressure use sk_wmem_queued as write buffer accounting. 3396 */ 3397 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3398 { 3399 int ret, amt = sk_mem_pages(size); 3400 3401 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3402 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3403 if (!ret) 3404 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3405 return ret; 3406 } 3407 EXPORT_SYMBOL(__sk_mem_schedule); 3408 3409 /** 3410 * __sk_mem_reduce_allocated - reclaim memory_allocated 3411 * @sk: socket 3412 * @amount: number of quanta 3413 * 3414 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3415 */ 3416 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3417 { 3418 sk_memory_allocated_sub(sk, amount); 3419 3420 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 3421 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 3422 3423 if (sk_under_global_memory_pressure(sk) && 3424 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3425 sk_leave_memory_pressure(sk); 3426 } 3427 3428 /** 3429 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3430 * @sk: socket 3431 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3432 */ 3433 void __sk_mem_reclaim(struct sock *sk, int amount) 3434 { 3435 amount >>= PAGE_SHIFT; 3436 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3437 __sk_mem_reduce_allocated(sk, amount); 3438 } 3439 EXPORT_SYMBOL(__sk_mem_reclaim); 3440 3441 int sk_set_peek_off(struct sock *sk, int val) 3442 { 3443 WRITE_ONCE(sk->sk_peek_off, val); 3444 return 0; 3445 } 3446 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3447 3448 /* 3449 * Set of default routines for initialising struct proto_ops when 3450 * the protocol does not support a particular function. In certain 3451 * cases where it makes no sense for a protocol to have a "do nothing" 3452 * function, some default processing is provided. 
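 *
 * Most of them just return -EOPNOTSUPP; sock_no_mmap() returns -ENODEV
 * to mirror a missing ->mmap file operation. For illustration only, a
 * hypothetical proto_ops could wire them up like this:
 *
 *	static const struct proto_ops foo_dgram_ops = {
 *		.family		= PF_UNSPEC,	/- placeholder family -/
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};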
3453 */ 3454 3455 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 3456 { 3457 return -EOPNOTSUPP; 3458 } 3459 EXPORT_SYMBOL(sock_no_bind); 3460 3461 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 3462 int len, int flags) 3463 { 3464 return -EOPNOTSUPP; 3465 } 3466 EXPORT_SYMBOL(sock_no_connect); 3467 3468 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3469 { 3470 return -EOPNOTSUPP; 3471 } 3472 EXPORT_SYMBOL(sock_no_socketpair); 3473 3474 int sock_no_accept(struct socket *sock, struct socket *newsock, 3475 struct proto_accept_arg *arg) 3476 { 3477 return -EOPNOTSUPP; 3478 } 3479 EXPORT_SYMBOL(sock_no_accept); 3480 3481 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3482 int peer) 3483 { 3484 return -EOPNOTSUPP; 3485 } 3486 EXPORT_SYMBOL(sock_no_getname); 3487 3488 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3489 { 3490 return -EOPNOTSUPP; 3491 } 3492 EXPORT_SYMBOL(sock_no_ioctl); 3493 3494 int sock_no_listen(struct socket *sock, int backlog) 3495 { 3496 return -EOPNOTSUPP; 3497 } 3498 EXPORT_SYMBOL(sock_no_listen); 3499 3500 int sock_no_shutdown(struct socket *sock, int how) 3501 { 3502 return -EOPNOTSUPP; 3503 } 3504 EXPORT_SYMBOL(sock_no_shutdown); 3505 3506 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3507 { 3508 return -EOPNOTSUPP; 3509 } 3510 EXPORT_SYMBOL(sock_no_sendmsg); 3511 3512 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3513 { 3514 return -EOPNOTSUPP; 3515 } 3516 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3517 3518 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3519 int flags) 3520 { 3521 return -EOPNOTSUPP; 3522 } 3523 EXPORT_SYMBOL(sock_no_recvmsg); 3524 3525 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3526 { 3527 /* Mirror missing mmap method error code */ 3528 return -ENODEV; 3529 } 3530 EXPORT_SYMBOL(sock_no_mmap); 3531 3532 /* 3533 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3534 * various sock-based usage counts. 
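 * In practice this refreshes the netprio and classid cgroup data
 * cached in sk->sk_cgrp_data for the receiving process.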
3535 */ 3536 void __receive_sock(struct file *file) 3537 { 3538 struct socket *sock; 3539 3540 sock = sock_from_file(file); 3541 if (sock) { 3542 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3543 sock_update_classid(&sock->sk->sk_cgrp_data); 3544 } 3545 } 3546 3547 /* 3548 * Default Socket Callbacks 3549 */ 3550 3551 static void sock_def_wakeup(struct sock *sk) 3552 { 3553 struct socket_wq *wq; 3554 3555 rcu_read_lock(); 3556 wq = rcu_dereference(sk->sk_wq); 3557 if (skwq_has_sleeper(wq)) 3558 wake_up_interruptible_all(&wq->wait); 3559 rcu_read_unlock(); 3560 } 3561 3562 static void sock_def_error_report(struct sock *sk) 3563 { 3564 struct socket_wq *wq; 3565 3566 rcu_read_lock(); 3567 wq = rcu_dereference(sk->sk_wq); 3568 if (skwq_has_sleeper(wq)) 3569 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3570 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3571 rcu_read_unlock(); 3572 } 3573 3574 void sock_def_readable(struct sock *sk) 3575 { 3576 struct socket_wq *wq; 3577 3578 trace_sk_data_ready(sk); 3579 3580 rcu_read_lock(); 3581 wq = rcu_dereference(sk->sk_wq); 3582 if (skwq_has_sleeper(wq)) 3583 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3584 EPOLLRDNORM | EPOLLRDBAND); 3585 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3586 rcu_read_unlock(); 3587 } 3588 3589 static void sock_def_write_space(struct sock *sk) 3590 { 3591 struct socket_wq *wq; 3592 3593 rcu_read_lock(); 3594 3595 /* Do not wake up a writer until he can make "significant" 3596 * progress. --DaveM 3597 */ 3598 if (sock_writeable(sk)) { 3599 wq = rcu_dereference(sk->sk_wq); 3600 if (skwq_has_sleeper(wq)) 3601 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3602 EPOLLWRNORM | EPOLLWRBAND); 3603 3604 /* Should agree with poll, otherwise some programs break */ 3605 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3606 } 3607 3608 rcu_read_unlock(); 3609 } 3610 3611 /* An optimised version of sock_def_write_space(), should only be called 3612 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3613 * ->sk_wmem_alloc. 3614 */ 3615 static void sock_def_write_space_wfree(struct sock *sk) 3616 { 3617 /* Do not wake up a writer until he can make "significant" 3618 * progress. 
--DaveM 3619 */ 3620 if (sock_writeable(sk)) { 3621 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3622 3623 /* rely on refcount_sub from sock_wfree() */ 3624 smp_mb__after_atomic(); 3625 if (wq && waitqueue_active(&wq->wait)) 3626 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3627 EPOLLWRNORM | EPOLLWRBAND); 3628 3629 /* Should agree with poll, otherwise some programs break */ 3630 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3631 } 3632 } 3633 3634 static void sock_def_destruct(struct sock *sk) 3635 { 3636 } 3637 3638 void sk_send_sigurg(struct sock *sk) 3639 { 3640 if (sk->sk_socket && sk->sk_socket->file) 3641 if (send_sigurg(sk->sk_socket->file)) 3642 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3643 } 3644 EXPORT_SYMBOL(sk_send_sigurg); 3645 3646 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3647 unsigned long expires) 3648 { 3649 if (!mod_timer(timer, expires)) 3650 sock_hold(sk); 3651 } 3652 EXPORT_SYMBOL(sk_reset_timer); 3653 3654 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3655 { 3656 if (timer_delete(timer)) 3657 __sock_put(sk); 3658 } 3659 EXPORT_SYMBOL(sk_stop_timer); 3660 3661 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3662 { 3663 if (timer_delete_sync(timer)) 3664 __sock_put(sk); 3665 } 3666 EXPORT_SYMBOL(sk_stop_timer_sync); 3667 3668 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3669 { 3670 sk_init_common(sk); 3671 sk->sk_send_head = NULL; 3672 3673 timer_setup(&sk->sk_timer, NULL, 0); 3674 3675 sk->sk_allocation = GFP_KERNEL; 3676 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3677 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3678 sk->sk_state = TCP_CLOSE; 3679 sk->sk_use_task_frag = true; 3680 sk_set_socket(sk, sock); 3681 3682 sock_set_flag(sk, SOCK_ZAPPED); 3683 3684 if (sock) { 3685 sk->sk_type = sock->type; 3686 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3687 sock->sk = sk; 3688 } else { 3689 RCU_INIT_POINTER(sk->sk_wq, NULL); 3690 } 3691 sk->sk_uid = uid; 3692 3693 sk->sk_state_change = sock_def_wakeup; 3694 sk->sk_data_ready = sock_def_readable; 3695 sk->sk_write_space = sock_def_write_space; 3696 sk->sk_error_report = sock_def_error_report; 3697 sk->sk_destruct = sock_def_destruct; 3698 3699 sk->sk_frag.page = NULL; 3700 sk->sk_frag.offset = 0; 3701 sk->sk_peek_off = -1; 3702 3703 sk->sk_peer_pid = NULL; 3704 sk->sk_peer_cred = NULL; 3705 spin_lock_init(&sk->sk_peer_lock); 3706 3707 sk->sk_write_pending = 0; 3708 sk->sk_rcvlowat = 1; 3709 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3710 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3711 3712 sk->sk_stamp = SK_DEFAULT_STAMP; 3713 #if BITS_PER_LONG==32 3714 seqlock_init(&sk->sk_stamp_seq); 3715 #endif 3716 atomic_set(&sk->sk_zckey, 0); 3717 3718 #ifdef CONFIG_NET_RX_BUSY_POLL 3719 sk->sk_napi_id = 0; 3720 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3721 #endif 3722 3723 sk->sk_max_pacing_rate = ~0UL; 3724 sk->sk_pacing_rate = ~0UL; 3725 WRITE_ONCE(sk->sk_pacing_shift, 10); 3726 sk->sk_incoming_cpu = -1; 3727 3728 sk_rx_queue_clear(sk); 3729 /* 3730 * Before updating sk_refcnt, we must commit prior changes to memory 3731 * (Documentation/RCU/rculist_nulls.rst for details) 3732 */ 3733 smp_wmb(); 3734 refcount_set(&sk->sk_refcnt, 1); 3735 atomic_set(&sk->sk_drops, 0); 3736 } 3737 EXPORT_SYMBOL(sock_init_data_uid); 3738 3739 void sock_init_data(struct socket *sock, struct sock *sk) 3740 { 3741 kuid_t uid = sock ? 
3742 SOCK_INODE(sock)->i_uid : 3743 make_kuid(sock_net(sk)->user_ns, 0); 3744 3745 sock_init_data_uid(sock, sk, uid); 3746 } 3747 EXPORT_SYMBOL(sock_init_data); 3748 3749 void lock_sock_nested(struct sock *sk, int subclass) 3750 { 3751 /* The sk_lock has mutex_lock() semantics here. */ 3752 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3753 3754 might_sleep(); 3755 spin_lock_bh(&sk->sk_lock.slock); 3756 if (sock_owned_by_user_nocheck(sk)) 3757 __lock_sock(sk); 3758 sk->sk_lock.owned = 1; 3759 spin_unlock_bh(&sk->sk_lock.slock); 3760 } 3761 EXPORT_SYMBOL(lock_sock_nested); 3762 3763 void release_sock(struct sock *sk) 3764 { 3765 spin_lock_bh(&sk->sk_lock.slock); 3766 if (sk->sk_backlog.tail) 3767 __release_sock(sk); 3768 3769 if (sk->sk_prot->release_cb) 3770 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3771 tcp_release_cb, sk); 3772 3773 sock_release_ownership(sk); 3774 if (waitqueue_active(&sk->sk_lock.wq)) 3775 wake_up(&sk->sk_lock.wq); 3776 spin_unlock_bh(&sk->sk_lock.slock); 3777 } 3778 EXPORT_SYMBOL(release_sock); 3779 3780 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3781 { 3782 might_sleep(); 3783 spin_lock_bh(&sk->sk_lock.slock); 3784 3785 if (!sock_owned_by_user_nocheck(sk)) { 3786 /* 3787 * Fast path return with bottom halves disabled and 3788 * sock::sk_lock.slock held. 3789 * 3790 * The 'mutex' is not contended and holding 3791 * sock::sk_lock.slock prevents all other lockers to 3792 * proceed so the corresponding unlock_sock_fast() can 3793 * avoid the slow path of release_sock() completely and 3794 * just release slock. 3795 * 3796 * From a semantical POV this is equivalent to 'acquiring' 3797 * the 'mutex', hence the corresponding lockdep 3798 * mutex_release() has to happen in the fast path of 3799 * unlock_sock_fast(). 
3800 */ 3801 return false; 3802 } 3803 3804 __lock_sock(sk); 3805 sk->sk_lock.owned = 1; 3806 __acquire(&sk->sk_lock.slock); 3807 spin_unlock_bh(&sk->sk_lock.slock); 3808 return true; 3809 } 3810 EXPORT_SYMBOL(__lock_sock_fast); 3811 3812 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3813 bool timeval, bool time32) 3814 { 3815 struct sock *sk = sock->sk; 3816 struct timespec64 ts; 3817 3818 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3819 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3820 if (ts.tv_sec == -1) 3821 return -ENOENT; 3822 if (ts.tv_sec == 0) { 3823 ktime_t kt = ktime_get_real(); 3824 sock_write_timestamp(sk, kt); 3825 ts = ktime_to_timespec64(kt); 3826 } 3827 3828 if (timeval) 3829 ts.tv_nsec /= 1000; 3830 3831 #ifdef CONFIG_COMPAT_32BIT_TIME 3832 if (time32) 3833 return put_old_timespec32(&ts, userstamp); 3834 #endif 3835 #ifdef CONFIG_SPARC64 3836 /* beware of padding in sparc64 timeval */ 3837 if (timeval && !in_compat_syscall()) { 3838 struct __kernel_old_timeval __user tv = { 3839 .tv_sec = ts.tv_sec, 3840 .tv_usec = ts.tv_nsec, 3841 }; 3842 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3843 return -EFAULT; 3844 return 0; 3845 } 3846 #endif 3847 return put_timespec64(&ts, userstamp); 3848 } 3849 EXPORT_SYMBOL(sock_gettstamp); 3850 3851 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3852 { 3853 if (!sock_flag(sk, flag)) { 3854 unsigned long previous_flags = sk->sk_flags; 3855 3856 sock_set_flag(sk, flag); 3857 /* 3858 * we just set one of the two flags which require net 3859 * time stamping, but time stamping might have been on 3860 * already because of the other one 3861 */ 3862 if (sock_needs_netstamp(sk) && 3863 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3864 net_enable_timestamp(); 3865 } 3866 } 3867 3868 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3869 int level, int type) 3870 { 3871 struct sock_exterr_skb *serr; 3872 struct sk_buff *skb; 3873 int copied, err; 3874 3875 err = -EAGAIN; 3876 skb = sock_dequeue_err_skb(sk); 3877 if (skb == NULL) 3878 goto out; 3879 3880 copied = skb->len; 3881 if (copied > len) { 3882 msg->msg_flags |= MSG_TRUNC; 3883 copied = len; 3884 } 3885 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3886 if (err) 3887 goto out_free_skb; 3888 3889 sock_recv_timestamp(msg, sk, skb); 3890 3891 serr = SKB_EXT_ERR(skb); 3892 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3893 3894 msg->msg_flags |= MSG_ERRQUEUE; 3895 err = copied; 3896 3897 out_free_skb: 3898 kfree_skb(skb); 3899 out: 3900 return err; 3901 } 3902 EXPORT_SYMBOL(sock_recv_errqueue); 3903 3904 /* 3905 * Get a socket option on an socket. 3906 * 3907 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3908 * asynchronous errors should be reported by getsockopt. We assume 3909 * this means if you specify SO_ERROR (otherwise what is the point of it). 3910 */ 3911 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3912 char __user *optval, int __user *optlen) 3913 { 3914 struct sock *sk = sock->sk; 3915 3916 /* IPV6_ADDRFORM can change sk->sk_prot under us. 
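 * Hence the READ_ONCE() when fetching sk->sk_prot below.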
*/ 3917 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3918 } 3919 EXPORT_SYMBOL(sock_common_getsockopt); 3920 3921 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3922 int flags) 3923 { 3924 struct sock *sk = sock->sk; 3925 int addr_len = 0; 3926 int err; 3927 3928 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3929 if (err >= 0) 3930 msg->msg_namelen = addr_len; 3931 return err; 3932 } 3933 EXPORT_SYMBOL(sock_common_recvmsg); 3934 3935 /* 3936 * Set socket options on an inet socket. 3937 */ 3938 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3939 sockptr_t optval, unsigned int optlen) 3940 { 3941 struct sock *sk = sock->sk; 3942 3943 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3944 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3945 } 3946 EXPORT_SYMBOL(sock_common_setsockopt); 3947 3948 void sk_common_release(struct sock *sk) 3949 { 3950 if (sk->sk_prot->destroy) 3951 sk->sk_prot->destroy(sk); 3952 3953 /* 3954 * Observation: when sk_common_release is called, processes have 3955 * no access to socket. But net still has. 3956 * Step one, detach it from networking: 3957 * 3958 * A. Remove from hash tables. 3959 */ 3960 3961 sk->sk_prot->unhash(sk); 3962 3963 /* 3964 * In this point socket cannot receive new packets, but it is possible 3965 * that some packets are in flight because some CPU runs receiver and 3966 * did hash table lookup before we unhashed socket. They will achieve 3967 * receive queue and will be purged by socket destructor. 3968 * 3969 * Also we still have packets pending on receive queue and probably, 3970 * our own packets waiting in device queues. sock_destroy will drain 3971 * receive queue, but transmitted packets will delay socket destruction 3972 * until the last reference will be released. 3973 */ 3974 3975 sock_orphan(sk); 3976 3977 xfrm_sk_free_policy(sk); 3978 3979 sock_put(sk); 3980 } 3981 EXPORT_SYMBOL(sk_common_release); 3982 3983 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3984 { 3985 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3986 3987 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3988 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3989 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3990 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3991 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); 3992 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3993 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3994 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3995 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); 3996 } 3997 3998 #ifdef CONFIG_PROC_FS 3999 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 4000 4001 int sock_prot_inuse_get(struct net *net, struct proto *prot) 4002 { 4003 int cpu, idx = prot->inuse_idx; 4004 int res = 0; 4005 4006 for_each_possible_cpu(cpu) 4007 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 4008 4009 return res >= 0 ? 
res : 0; 4010 } 4011 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 4012 4013 int sock_inuse_get(struct net *net) 4014 { 4015 int cpu, res = 0; 4016 4017 for_each_possible_cpu(cpu) 4018 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 4019 4020 return res; 4021 } 4022 4023 EXPORT_SYMBOL_GPL(sock_inuse_get); 4024 4025 static int __net_init sock_inuse_init_net(struct net *net) 4026 { 4027 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 4028 if (net->core.prot_inuse == NULL) 4029 return -ENOMEM; 4030 return 0; 4031 } 4032 4033 static void __net_exit sock_inuse_exit_net(struct net *net) 4034 { 4035 free_percpu(net->core.prot_inuse); 4036 } 4037 4038 static struct pernet_operations net_inuse_ops = { 4039 .init = sock_inuse_init_net, 4040 .exit = sock_inuse_exit_net, 4041 }; 4042 4043 static __init int net_inuse_init(void) 4044 { 4045 if (register_pernet_subsys(&net_inuse_ops)) 4046 panic("Cannot initialize net inuse counters"); 4047 4048 return 0; 4049 } 4050 4051 core_initcall(net_inuse_init); 4052 4053 static int assign_proto_idx(struct proto *prot) 4054 { 4055 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 4056 4057 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { 4058 pr_err("PROTO_INUSE_NR exhausted\n"); 4059 return -ENOSPC; 4060 } 4061 4062 set_bit(prot->inuse_idx, proto_inuse_idx); 4063 return 0; 4064 } 4065 4066 static void release_proto_idx(struct proto *prot) 4067 { 4068 if (prot->inuse_idx != PROTO_INUSE_NR) 4069 clear_bit(prot->inuse_idx, proto_inuse_idx); 4070 } 4071 #else 4072 static inline int assign_proto_idx(struct proto *prot) 4073 { 4074 return 0; 4075 } 4076 4077 static inline void release_proto_idx(struct proto *prot) 4078 { 4079 } 4080 4081 #endif 4082 4083 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 4084 { 4085 if (!twsk_prot) 4086 return; 4087 kfree(twsk_prot->twsk_slab_name); 4088 twsk_prot->twsk_slab_name = NULL; 4089 kmem_cache_destroy(twsk_prot->twsk_slab); 4090 twsk_prot->twsk_slab = NULL; 4091 } 4092 4093 static int tw_prot_init(const struct proto *prot) 4094 { 4095 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 4096 4097 if (!twsk_prot) 4098 return 0; 4099 4100 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 4101 prot->name); 4102 if (!twsk_prot->twsk_slab_name) 4103 return -ENOMEM; 4104 4105 twsk_prot->twsk_slab = 4106 kmem_cache_create(twsk_prot->twsk_slab_name, 4107 twsk_prot->twsk_obj_size, 0, 4108 SLAB_ACCOUNT | prot->slab_flags, 4109 NULL); 4110 if (!twsk_prot->twsk_slab) { 4111 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 4112 prot->name); 4113 return -ENOMEM; 4114 } 4115 4116 return 0; 4117 } 4118 4119 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 4120 { 4121 if (!rsk_prot) 4122 return; 4123 kfree(rsk_prot->slab_name); 4124 rsk_prot->slab_name = NULL; 4125 kmem_cache_destroy(rsk_prot->slab); 4126 rsk_prot->slab = NULL; 4127 } 4128 4129 static int req_prot_init(const struct proto *prot) 4130 { 4131 struct request_sock_ops *rsk_prot = prot->rsk_prot; 4132 4133 if (!rsk_prot) 4134 return 0; 4135 4136 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 4137 prot->name); 4138 if (!rsk_prot->slab_name) 4139 return -ENOMEM; 4140 4141 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 4142 rsk_prot->obj_size, 0, 4143 SLAB_ACCOUNT | prot->slab_flags, 4144 NULL); 4145 4146 if (!rsk_prot->slab) { 4147 pr_crit("%s: Can't create request sock SLAB cache!\n", 4148 prot->name); 4149 return -ENOMEM; 4150 } 4151 return 0; 4152 } 4153 4154 int 
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start	= proto_seq_start,
	.next	= proto_seq_next,
	.stop	= proto_seq_stop,
	.show	= proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);
/* Copy 'size' bytes from userspace and return 'size' back to userspace */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* This is the most common ioctl prep function, where the result (4 bytes) is
 * copied back to userspace if the ioctl() returns successfully. No input is
 * copied from userspace as an argument.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}

/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl) and copies the result back to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks, instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If the ioctl was processed, return its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
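/*
 * Example, for illustration only: with the sk_ioctl() wrapper above, a
 * protocol's struct proto ->ioctl() callback receives a kernel pointer
 * rather than a user pointer, and for the common "return one int" case
 * sock_ioctl_out() performs the put_user() on its behalf.  SIOCEXAMPLEQ and
 * example_sk_ioctl() are hypothetical names.
 *
 *	static int example_sk_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		if (cmd != SIOCEXAMPLEQ)
 *			return -ENOIOCTLCMD;
 *
 *		*karg = sk_rmem_alloc_get(sk);
 *		return 0;
 *	}
 */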
static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);
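/*
 * Example, for illustration only: the assertions above rely on the
 * cacheline-group annotations from <linux/cache.h>.  A structure opts in by
 * bracketing related fields with __cacheline_group_begin() and
 * __cacheline_group_end(), and a build-time check like sock_struct_check()
 * then fails the compile if a field drifts out of its group during a later
 * reorganization.  struct example_counters and its fields are hypothetical.
 *
 *	struct example_counters {
 *		__cacheline_group_begin(example_hot);
 *		u64	rx_packets;
 *		u64	rx_bytes;
 *		__cacheline_group_end(example_hot);
 *		u64	rx_errors_rare;
 *	};
 *
 *	static int __init example_struct_check(void)
 *	{
 *		CACHELINE_ASSERT_GROUP_MEMBER(struct example_counters,
 *					      example_hot, rx_bytes);
 *		return 0;
 *	}
 */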