// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					Mike Shaver's work.
 *		Marty Leisner	:	Fixes to fd passing
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector
 *		Heiko Eißfeldt	:	Missing verify_area check
 *		Alan Cox	:	Started POSIXisms
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting
 *		Kirk Petersen	:	Made this a module
 *	    Christoph Rohland	:	Elegant non-blocking accept/connect
 *					algorithm. Lots of bug fixes.
 *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *	     Andrea Arcangeli	:	If possible we block in connect(2)
 *					if the max backlog of the listening
 *					socket has been reached. This won't
 *					break old apps and it avoids a huge
 *					number of hashed socks (for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations
 *	     Alexey Kuznetsov	:	Full scale SMP. Lots of bugs are introduced 8)
 *	      Malcolm Beattie	:	Set peercred for socketpair
 *	     Michal Ostrowski	:	Module initialization cleanup.
 *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+)
 *
 * Known differences from the reference BSD that was tested:
 *
 *	[TO FIX]
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as the high water
 *		mark and a fake inode identifier (nor the BSD "first socket fstat
 *		twice" bug).
 *	[NOT TO FIX]
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns a 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has a connect that forgets to block properly.
 *		(need to check this against the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed the server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero-terminated)
 *		  starting with a 0 byte, so that this name space does not
 *		  intersect with BSD names.
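 *
 *		  For example, from user space a bind() to the abstract
 *		  name "foo" might look like this (sketch only):
 *
 *			struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *			memcpy(sun.sun_path, "\0foo", 4);
 *			bind(fd, (struct sockaddr *)&sun,
 *			     offsetof(struct sockaddr_un, sun_path) + 4);
 *
 *		  The name is the four bytes "\0foo"; no trailing NUL is
 *		  implied, and the addrlen passed to bind() decides where
 *		  the name ends.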
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/scm.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

#include "af_unix.h"

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 *    the hash table is protected with a spinlock.
 *    each socket state is protected by a separate spinlock.
 */
#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   2. b is not a.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., b's state can be any possible
		 * value due to a concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
		case TCP_LISTEN:
			return -1;
		default:
			/* Invalid case. */
			return 0;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (a->sk.sk_state) {
		case TCP_CLOSE:
		case TCP_ESTABLISHED:
			return 1;
		default:
			return 0;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric.
*/ 189 if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b) 190 return 1; 191 192 return 0; 193 } 194 #endif 195 196 static unsigned int unix_unbound_hash(struct sock *sk) 197 { 198 unsigned long hash = (unsigned long)sk; 199 200 hash ^= hash >> 16; 201 hash ^= hash >> 8; 202 hash ^= sk->sk_type; 203 204 return hash & UNIX_HASH_MOD; 205 } 206 207 static unsigned int unix_bsd_hash(struct inode *i) 208 { 209 return i->i_ino & UNIX_HASH_MOD; 210 } 211 212 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, 213 int addr_len, int type) 214 { 215 __wsum csum = csum_partial(sunaddr, addr_len, 0); 216 unsigned int hash; 217 218 hash = (__force unsigned int)csum_fold(csum); 219 hash ^= hash >> 8; 220 hash ^= type; 221 222 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); 223 } 224 225 static void unix_table_double_lock(struct net *net, 226 unsigned int hash1, unsigned int hash2) 227 { 228 if (hash1 == hash2) { 229 spin_lock(&net->unx.table.locks[hash1]); 230 return; 231 } 232 233 if (hash1 > hash2) 234 swap(hash1, hash2); 235 236 spin_lock(&net->unx.table.locks[hash1]); 237 spin_lock(&net->unx.table.locks[hash2]); 238 } 239 240 static void unix_table_double_unlock(struct net *net, 241 unsigned int hash1, unsigned int hash2) 242 { 243 if (hash1 == hash2) { 244 spin_unlock(&net->unx.table.locks[hash1]); 245 return; 246 } 247 248 spin_unlock(&net->unx.table.locks[hash1]); 249 spin_unlock(&net->unx.table.locks[hash2]); 250 } 251 252 #ifdef CONFIG_SECURITY_NETWORK 253 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 254 { 255 UNIXCB(skb).secid = scm->secid; 256 } 257 258 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 259 { 260 scm->secid = UNIXCB(skb).secid; 261 } 262 263 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 264 { 265 return (scm->secid == UNIXCB(skb).secid); 266 } 267 #else 268 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) 269 { } 270 271 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) 272 { } 273 274 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) 275 { 276 return true; 277 } 278 #endif /* CONFIG_SECURITY_NETWORK */ 279 280 static inline int unix_may_send(struct sock *sk, struct sock *osk) 281 { 282 return !unix_peer(osk) || unix_peer(osk) == sk; 283 } 284 285 static inline int unix_recvq_full_lockless(const struct sock *sk) 286 { 287 return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog; 288 } 289 290 struct sock *unix_peer_get(struct sock *s) 291 { 292 struct sock *peer; 293 294 unix_state_lock(s); 295 peer = unix_peer(s); 296 if (peer) 297 sock_hold(peer); 298 unix_state_unlock(s); 299 return peer; 300 } 301 EXPORT_SYMBOL_GPL(unix_peer_get); 302 303 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, 304 int addr_len) 305 { 306 struct unix_address *addr; 307 308 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); 309 if (!addr) 310 return NULL; 311 312 refcount_set(&addr->refcnt, 1); 313 addr->len = addr_len; 314 memcpy(addr->name, sunaddr, addr_len); 315 316 return addr; 317 } 318 319 static inline void unix_release_addr(struct unix_address *addr) 320 { 321 if (refcount_dec_and_test(&addr->refcnt)) 322 kfree(addr); 323 } 324 325 /* 326 * Check unix socket name: 327 * - should be not zero length. 328 * - if started by not zero, should be NULL terminated (FS object) 329 * - if started by zero, it is abstract name. 
330 */ 331 332 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) 333 { 334 if (addr_len <= offsetof(struct sockaddr_un, sun_path) || 335 addr_len > sizeof(*sunaddr)) 336 return -EINVAL; 337 338 if (sunaddr->sun_family != AF_UNIX) 339 return -EINVAL; 340 341 return 0; 342 } 343 344 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) 345 { 346 struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr; 347 short offset = offsetof(struct sockaddr_storage, __data); 348 349 BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path)); 350 351 /* This may look like an off by one error but it is a bit more 352 * subtle. 108 is the longest valid AF_UNIX path for a binding. 353 * sun_path[108] doesn't as such exist. However in kernel space 354 * we are guaranteed that it is a valid memory location in our 355 * kernel address buffer because syscall functions always pass 356 * a pointer of struct sockaddr_storage which has a bigger buffer 357 * than 108. Also, we must terminate sun_path for strlen() in 358 * getname_kernel(). 359 */ 360 addr->__data[addr_len - offset] = 0; 361 362 /* Don't pass sunaddr->sun_path to strlen(). Otherwise, 108 will 363 * cause panic if CONFIG_FORTIFY_SOURCE=y. Let __fortify_strlen() 364 * know the actual buffer. 365 */ 366 return strlen(addr->__data) + offset + 1; 367 } 368 369 static void __unix_remove_socket(struct sock *sk) 370 { 371 sk_del_node_init(sk); 372 } 373 374 static void __unix_insert_socket(struct net *net, struct sock *sk) 375 { 376 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 377 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); 378 } 379 380 static void __unix_set_addr_hash(struct net *net, struct sock *sk, 381 struct unix_address *addr, unsigned int hash) 382 { 383 __unix_remove_socket(sk); 384 smp_store_release(&unix_sk(sk)->addr, addr); 385 386 sk->sk_hash = hash; 387 __unix_insert_socket(net, sk); 388 } 389 390 static void unix_remove_socket(struct net *net, struct sock *sk) 391 { 392 spin_lock(&net->unx.table.locks[sk->sk_hash]); 393 __unix_remove_socket(sk); 394 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 395 } 396 397 static void unix_insert_unbound_socket(struct net *net, struct sock *sk) 398 { 399 spin_lock(&net->unx.table.locks[sk->sk_hash]); 400 __unix_insert_socket(net, sk); 401 spin_unlock(&net->unx.table.locks[sk->sk_hash]); 402 } 403 404 static void unix_insert_bsd_socket(struct sock *sk) 405 { 406 spin_lock(&bsd_socket_locks[sk->sk_hash]); 407 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); 408 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 409 } 410 411 static void unix_remove_bsd_socket(struct sock *sk) 412 { 413 if (!hlist_unhashed(&sk->sk_bind_node)) { 414 spin_lock(&bsd_socket_locks[sk->sk_hash]); 415 __sk_del_bind_node(sk); 416 spin_unlock(&bsd_socket_locks[sk->sk_hash]); 417 418 sk_node_init(&sk->sk_bind_node); 419 } 420 } 421 422 static struct sock *__unix_find_socket_byname(struct net *net, 423 struct sockaddr_un *sunname, 424 int len, unsigned int hash) 425 { 426 struct sock *s; 427 428 sk_for_each(s, &net->unx.table.buckets[hash]) { 429 struct unix_sock *u = unix_sk(s); 430 431 if (u->addr->len == len && 432 !memcmp(u->addr->name, sunname, len)) 433 return s; 434 } 435 return NULL; 436 } 437 438 static inline struct sock *unix_find_socket_byname(struct net *net, 439 struct sockaddr_un *sunname, 440 int len, unsigned int hash) 441 { 442 struct sock *s; 443 444 spin_lock(&net->unx.table.locks[hash]); 445 s = __unix_find_socket_byname(net, sunname, len, hash); 446 if 
(s)
		sock_hold(s);
	spin_unlock(&net->unx.table.locks[hash]);
	return s;
}

static struct sock *unix_find_socket_byinode(struct inode *i)
{
	unsigned int hash = unix_bsd_hash(i);
	struct sock *s;

	spin_lock(&bsd_socket_locks[hash]);
	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
		struct dentry *dentry = unix_sk(s)->path.dentry;

		if (dentry && d_backing_inode(dentry) == i) {
			sock_hold(s);
			spin_unlock(&bsd_socket_locks[hash]);
			return s;
		}
	}
	spin_unlock(&bsd_socket_locks[hash]);
	return NULL;
}

/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket that is not itself
 * connected to the first socket (e.g., /dev/log), clients may only
 * enqueue more messages if the present receive queue of the server
 * socket is not "too large". This means there's a second writeability
 * condition that poll and sendmsg need to test. The dgram recv code
 * will do a wake up on the peer_wait wait queue of a socket upon
 * reception of a datagram, which needs to be propagated to sleeping
 * would-be writers since these might not have sent anything so far.
 * This can't be accomplished via poll_wait because the lifetime of the
 * server socket might be less than that of its clients if these break
 * their association with it or if the server socket is closed while
 * clients are still connected to it, and there's no way to inform "a
 * polling implementation" that it should let go of a certain wait
 * queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and broken when the
 * association to the server socket is dissolved or after a wake up
 * was relayed.
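 *
 * Concretely: unix_dgram_peer_wake_me() is called from poll()/sendmsg()
 * when the peer looks full; via unix_dgram_peer_wake_connect() it hooks
 * sk's peer_wake entry onto the peer's peer_wait queue.
 * unix_dgram_peer_wake_relay() is that entry's wake function and
 * forwards the wake up to sk's own sleepers, and
 * unix_dgram_peer_wake_disconnect() tears the link down again.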
494 */ 495 496 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags, 497 void *key) 498 { 499 struct unix_sock *u; 500 wait_queue_head_t *u_sleep; 501 502 u = container_of(q, struct unix_sock, peer_wake); 503 504 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait, 505 q); 506 u->peer_wake.private = NULL; 507 508 /* relaying can only happen while the wq still exists */ 509 u_sleep = sk_sleep(&u->sk); 510 if (u_sleep) 511 wake_up_interruptible_poll(u_sleep, key_to_poll(key)); 512 513 return 0; 514 } 515 516 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other) 517 { 518 struct unix_sock *u, *u_other; 519 int rc; 520 521 u = unix_sk(sk); 522 u_other = unix_sk(other); 523 rc = 0; 524 spin_lock(&u_other->peer_wait.lock); 525 526 if (!u->peer_wake.private) { 527 u->peer_wake.private = other; 528 __add_wait_queue(&u_other->peer_wait, &u->peer_wake); 529 530 rc = 1; 531 } 532 533 spin_unlock(&u_other->peer_wait.lock); 534 return rc; 535 } 536 537 static void unix_dgram_peer_wake_disconnect(struct sock *sk, 538 struct sock *other) 539 { 540 struct unix_sock *u, *u_other; 541 542 u = unix_sk(sk); 543 u_other = unix_sk(other); 544 spin_lock(&u_other->peer_wait.lock); 545 546 if (u->peer_wake.private == other) { 547 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake); 548 u->peer_wake.private = NULL; 549 } 550 551 spin_unlock(&u_other->peer_wait.lock); 552 } 553 554 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk, 555 struct sock *other) 556 { 557 unix_dgram_peer_wake_disconnect(sk, other); 558 wake_up_interruptible_poll(sk_sleep(sk), 559 EPOLLOUT | 560 EPOLLWRNORM | 561 EPOLLWRBAND); 562 } 563 564 /* preconditions: 565 * - unix_peer(sk) == other 566 * - association is stable 567 */ 568 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) 569 { 570 int connected; 571 572 connected = unix_dgram_peer_wake_connect(sk, other); 573 574 /* If other is SOCK_DEAD, we want to make sure we signal 575 * POLLOUT, such that a subsequent write() can get a 576 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs 577 * to other and its full, we will hang waiting for POLLOUT. 578 */ 579 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) 580 return 1; 581 582 if (connected) 583 unix_dgram_peer_wake_disconnect(sk, other); 584 585 return 0; 586 } 587 588 static int unix_writable(const struct sock *sk, unsigned char state) 589 { 590 return state != TCP_LISTEN && 591 (refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf); 592 } 593 594 static void unix_write_space(struct sock *sk) 595 { 596 struct socket_wq *wq; 597 598 rcu_read_lock(); 599 if (unix_writable(sk, READ_ONCE(sk->sk_state))) { 600 wq = rcu_dereference(sk->sk_wq); 601 if (skwq_has_sleeper(wq)) 602 wake_up_interruptible_sync_poll(&wq->wait, 603 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 604 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 605 } 606 rcu_read_unlock(); 607 } 608 609 /* When dgram socket disconnects (or changes its peer), we clear its receive 610 * queue of packets arrived from previous peer. First, it allows to do 611 * flow control based only on wmem_alloc; second, sk connected to peer 612 * may receive messages only from that peer. 
*/ 613 static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 614 { 615 if (!skb_queue_empty(&sk->sk_receive_queue)) { 616 skb_queue_purge_reason(&sk->sk_receive_queue, 617 SKB_DROP_REASON_UNIX_DISCONNECT); 618 619 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 620 621 /* If one link of bidirectional dgram pipe is disconnected, 622 * we signal error. Messages are lost. Do not make this, 623 * when peer was not connected to us. 624 */ 625 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { 626 WRITE_ONCE(other->sk_err, ECONNRESET); 627 sk_error_report(other); 628 } 629 } 630 } 631 632 static void unix_sock_destructor(struct sock *sk) 633 { 634 struct unix_sock *u = unix_sk(sk); 635 636 skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); 637 638 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); 639 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); 640 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); 641 if (!sock_flag(sk, SOCK_DEAD)) { 642 pr_info("Attempt to release alive unix socket: %p\n", sk); 643 return; 644 } 645 646 if (u->addr) 647 unix_release_addr(u->addr); 648 649 atomic_long_dec(&unix_nr_socks); 650 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 651 #ifdef UNIX_REFCNT_DEBUG 652 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk, 653 atomic_long_read(&unix_nr_socks)); 654 #endif 655 } 656 657 static void unix_release_sock(struct sock *sk, int embrion) 658 { 659 struct unix_sock *u = unix_sk(sk); 660 struct sock *skpair; 661 struct sk_buff *skb; 662 struct path path; 663 int state; 664 665 unix_remove_socket(sock_net(sk), sk); 666 unix_remove_bsd_socket(sk); 667 668 /* Clear state */ 669 unix_state_lock(sk); 670 sock_orphan(sk); 671 WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); 672 path = u->path; 673 u->path.dentry = NULL; 674 u->path.mnt = NULL; 675 state = sk->sk_state; 676 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 677 678 skpair = unix_peer(sk); 679 unix_peer(sk) = NULL; 680 681 unix_state_unlock(sk); 682 683 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 684 u->oob_skb = NULL; 685 #endif 686 687 wake_up_interruptible_all(&u->peer_wait); 688 689 if (skpair != NULL) { 690 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { 691 unix_state_lock(skpair); 692 /* No more writes */ 693 WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); 694 if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) 695 WRITE_ONCE(skpair->sk_err, ECONNRESET); 696 unix_state_unlock(skpair); 697 skpair->sk_state_change(skpair); 698 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); 699 } 700 701 unix_dgram_peer_wake_disconnect(sk, skpair); 702 sock_put(skpair); /* It may now die */ 703 } 704 705 /* Try to flush out this socket. Throw out buffers at least */ 706 707 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { 708 if (state == TCP_LISTEN) 709 unix_release_sock(skb->sk, 1); 710 711 /* passed fds are erased in the kfree_skb hook */ 712 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 713 } 714 715 if (path.dentry) 716 path_put(&path); 717 718 sock_put(sk); 719 720 /* ---- Socket is dead now and most probably destroyed ---- */ 721 722 /* 723 * Fixme: BSD difference: In BSD all sockets connected to us get 724 * ECONNRESET and we die on the spot. In Linux we behave 725 * like files and pipes do and wait for the last 726 * dereference. 727 * 728 * Can't we simply set sock->err? 729 * 730 * What the above comment does talk about? 
--ANK(980817) 731 */ 732 733 if (READ_ONCE(unix_tot_inflight)) 734 unix_gc(); /* Garbage collect fds */ 735 } 736 737 static void init_peercred(struct sock *sk) 738 { 739 sk->sk_peer_pid = get_pid(task_tgid(current)); 740 sk->sk_peer_cred = get_current_cred(); 741 } 742 743 static void update_peercred(struct sock *sk) 744 { 745 const struct cred *old_cred; 746 struct pid *old_pid; 747 748 spin_lock(&sk->sk_peer_lock); 749 old_pid = sk->sk_peer_pid; 750 old_cred = sk->sk_peer_cred; 751 init_peercred(sk); 752 spin_unlock(&sk->sk_peer_lock); 753 754 put_pid(old_pid); 755 put_cred(old_cred); 756 } 757 758 static void copy_peercred(struct sock *sk, struct sock *peersk) 759 { 760 lockdep_assert_held(&unix_sk(peersk)->lock); 761 762 spin_lock(&sk->sk_peer_lock); 763 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); 764 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); 765 spin_unlock(&sk->sk_peer_lock); 766 } 767 768 static int unix_listen(struct socket *sock, int backlog) 769 { 770 int err; 771 struct sock *sk = sock->sk; 772 struct unix_sock *u = unix_sk(sk); 773 774 err = -EOPNOTSUPP; 775 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 776 goto out; /* Only stream/seqpacket sockets accept */ 777 err = -EINVAL; 778 if (!READ_ONCE(u->addr)) 779 goto out; /* No listens on an unbound socket */ 780 unix_state_lock(sk); 781 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) 782 goto out_unlock; 783 if (backlog > sk->sk_max_ack_backlog) 784 wake_up_interruptible_all(&u->peer_wait); 785 sk->sk_max_ack_backlog = backlog; 786 WRITE_ONCE(sk->sk_state, TCP_LISTEN); 787 788 /* set credentials so connect can copy them */ 789 update_peercred(sk); 790 err = 0; 791 792 out_unlock: 793 unix_state_unlock(sk); 794 out: 795 return err; 796 } 797 798 static int unix_release(struct socket *); 799 static int unix_bind(struct socket *, struct sockaddr *, int); 800 static int unix_stream_connect(struct socket *, struct sockaddr *, 801 int addr_len, int flags); 802 static int unix_socketpair(struct socket *, struct socket *); 803 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); 804 static int unix_getname(struct socket *, struct sockaddr *, int); 805 static __poll_t unix_poll(struct file *, struct socket *, poll_table *); 806 static __poll_t unix_dgram_poll(struct file *, struct socket *, 807 poll_table *); 808 static int unix_ioctl(struct socket *, unsigned int, unsigned long); 809 #ifdef CONFIG_COMPAT 810 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); 811 #endif 812 static int unix_shutdown(struct socket *, int); 813 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); 814 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); 815 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, 816 struct pipe_inode_info *, size_t size, 817 unsigned int flags); 818 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); 819 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); 820 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 821 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); 822 static int unix_dgram_connect(struct socket *, struct sockaddr *, 823 int, int); 824 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); 825 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, 826 int); 827 828 #ifdef CONFIG_PROC_FS 829 static 
int unix_count_nr_fds(struct sock *sk) 830 { 831 struct sk_buff *skb; 832 struct unix_sock *u; 833 int nr_fds = 0; 834 835 spin_lock(&sk->sk_receive_queue.lock); 836 skb = skb_peek(&sk->sk_receive_queue); 837 while (skb) { 838 u = unix_sk(skb->sk); 839 nr_fds += atomic_read(&u->scm_stat.nr_fds); 840 skb = skb_peek_next(skb, &sk->sk_receive_queue); 841 } 842 spin_unlock(&sk->sk_receive_queue.lock); 843 844 return nr_fds; 845 } 846 847 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) 848 { 849 struct sock *sk = sock->sk; 850 unsigned char s_state; 851 struct unix_sock *u; 852 int nr_fds = 0; 853 854 if (sk) { 855 s_state = READ_ONCE(sk->sk_state); 856 u = unix_sk(sk); 857 858 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their 859 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN. 860 * SOCK_DGRAM is ordinary. So, no lock is needed. 861 */ 862 if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED) 863 nr_fds = atomic_read(&u->scm_stat.nr_fds); 864 else if (s_state == TCP_LISTEN) 865 nr_fds = unix_count_nr_fds(sk); 866 867 seq_printf(m, "scm_fds: %u\n", nr_fds); 868 } 869 } 870 #else 871 #define unix_show_fdinfo NULL 872 #endif 873 874 static const struct proto_ops unix_stream_ops = { 875 .family = PF_UNIX, 876 .owner = THIS_MODULE, 877 .release = unix_release, 878 .bind = unix_bind, 879 .connect = unix_stream_connect, 880 .socketpair = unix_socketpair, 881 .accept = unix_accept, 882 .getname = unix_getname, 883 .poll = unix_poll, 884 .ioctl = unix_ioctl, 885 #ifdef CONFIG_COMPAT 886 .compat_ioctl = unix_compat_ioctl, 887 #endif 888 .listen = unix_listen, 889 .shutdown = unix_shutdown, 890 .sendmsg = unix_stream_sendmsg, 891 .recvmsg = unix_stream_recvmsg, 892 .read_skb = unix_stream_read_skb, 893 .mmap = sock_no_mmap, 894 .splice_read = unix_stream_splice_read, 895 .set_peek_off = sk_set_peek_off, 896 .show_fdinfo = unix_show_fdinfo, 897 }; 898 899 static const struct proto_ops unix_dgram_ops = { 900 .family = PF_UNIX, 901 .owner = THIS_MODULE, 902 .release = unix_release, 903 .bind = unix_bind, 904 .connect = unix_dgram_connect, 905 .socketpair = unix_socketpair, 906 .accept = sock_no_accept, 907 .getname = unix_getname, 908 .poll = unix_dgram_poll, 909 .ioctl = unix_ioctl, 910 #ifdef CONFIG_COMPAT 911 .compat_ioctl = unix_compat_ioctl, 912 #endif 913 .listen = sock_no_listen, 914 .shutdown = unix_shutdown, 915 .sendmsg = unix_dgram_sendmsg, 916 .read_skb = unix_read_skb, 917 .recvmsg = unix_dgram_recvmsg, 918 .mmap = sock_no_mmap, 919 .set_peek_off = sk_set_peek_off, 920 .show_fdinfo = unix_show_fdinfo, 921 }; 922 923 static const struct proto_ops unix_seqpacket_ops = { 924 .family = PF_UNIX, 925 .owner = THIS_MODULE, 926 .release = unix_release, 927 .bind = unix_bind, 928 .connect = unix_stream_connect, 929 .socketpair = unix_socketpair, 930 .accept = unix_accept, 931 .getname = unix_getname, 932 .poll = unix_dgram_poll, 933 .ioctl = unix_ioctl, 934 #ifdef CONFIG_COMPAT 935 .compat_ioctl = unix_compat_ioctl, 936 #endif 937 .listen = unix_listen, 938 .shutdown = unix_shutdown, 939 .sendmsg = unix_seqpacket_sendmsg, 940 .recvmsg = unix_seqpacket_recvmsg, 941 .mmap = sock_no_mmap, 942 .set_peek_off = sk_set_peek_off, 943 .show_fdinfo = unix_show_fdinfo, 944 }; 945 946 static void unix_close(struct sock *sk, long timeout) 947 { 948 /* Nothing to do here, unix socket does not need a ->close(). 949 * This is merely for sockmap. 
950 */ 951 } 952 953 static bool unix_bpf_bypass_getsockopt(int level, int optname) 954 { 955 if (level == SOL_SOCKET) { 956 switch (optname) { 957 case SO_PEERPIDFD: 958 return true; 959 default: 960 return false; 961 } 962 } 963 964 return false; 965 } 966 967 struct proto unix_dgram_proto = { 968 .name = "UNIX", 969 .owner = THIS_MODULE, 970 .obj_size = sizeof(struct unix_sock), 971 .close = unix_close, 972 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 973 #ifdef CONFIG_BPF_SYSCALL 974 .psock_update_sk_prot = unix_dgram_bpf_update_proto, 975 #endif 976 }; 977 978 struct proto unix_stream_proto = { 979 .name = "UNIX-STREAM", 980 .owner = THIS_MODULE, 981 .obj_size = sizeof(struct unix_sock), 982 .close = unix_close, 983 .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, 984 #ifdef CONFIG_BPF_SYSCALL 985 .psock_update_sk_prot = unix_stream_bpf_update_proto, 986 #endif 987 }; 988 989 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) 990 { 991 struct unix_sock *u; 992 struct sock *sk; 993 int err; 994 995 atomic_long_inc(&unix_nr_socks); 996 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { 997 err = -ENFILE; 998 goto err; 999 } 1000 1001 if (type == SOCK_STREAM) 1002 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); 1003 else /*dgram and seqpacket */ 1004 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); 1005 1006 if (!sk) { 1007 err = -ENOMEM; 1008 goto err; 1009 } 1010 1011 sock_init_data(sock, sk); 1012 1013 sk->sk_hash = unix_unbound_hash(sk); 1014 sk->sk_allocation = GFP_KERNEL_ACCOUNT; 1015 sk->sk_write_space = unix_write_space; 1016 sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); 1017 sk->sk_destruct = unix_sock_destructor; 1018 lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL); 1019 1020 u = unix_sk(sk); 1021 u->listener = NULL; 1022 u->vertex = NULL; 1023 u->path.dentry = NULL; 1024 u->path.mnt = NULL; 1025 spin_lock_init(&u->lock); 1026 lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL); 1027 mutex_init(&u->iolock); /* single task reading lock */ 1028 mutex_init(&u->bindlock); /* single task binding lock */ 1029 init_waitqueue_head(&u->peer_wait); 1030 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); 1031 memset(&u->scm_stat, 0, sizeof(struct scm_stat)); 1032 unix_insert_unbound_socket(net, sk); 1033 1034 sock_prot_inuse_add(net, sk->sk_prot, 1); 1035 1036 return sk; 1037 1038 err: 1039 atomic_long_dec(&unix_nr_socks); 1040 return ERR_PTR(err); 1041 } 1042 1043 static int unix_create(struct net *net, struct socket *sock, int protocol, 1044 int kern) 1045 { 1046 struct sock *sk; 1047 1048 if (protocol && protocol != PF_UNIX) 1049 return -EPROTONOSUPPORT; 1050 1051 sock->state = SS_UNCONNECTED; 1052 1053 switch (sock->type) { 1054 case SOCK_STREAM: 1055 sock->ops = &unix_stream_ops; 1056 break; 1057 /* 1058 * Believe it or not BSD has AF_UNIX, SOCK_RAW though 1059 * nothing uses it. 
1060 */ 1061 case SOCK_RAW: 1062 sock->type = SOCK_DGRAM; 1063 fallthrough; 1064 case SOCK_DGRAM: 1065 sock->ops = &unix_dgram_ops; 1066 break; 1067 case SOCK_SEQPACKET: 1068 sock->ops = &unix_seqpacket_ops; 1069 break; 1070 default: 1071 return -ESOCKTNOSUPPORT; 1072 } 1073 1074 sk = unix_create1(net, sock, kern, sock->type); 1075 if (IS_ERR(sk)) 1076 return PTR_ERR(sk); 1077 1078 return 0; 1079 } 1080 1081 static int unix_release(struct socket *sock) 1082 { 1083 struct sock *sk = sock->sk; 1084 1085 if (!sk) 1086 return 0; 1087 1088 sk->sk_prot->close(sk, 0); 1089 unix_release_sock(sk, 0); 1090 sock->sk = NULL; 1091 1092 return 0; 1093 } 1094 1095 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, 1096 int type) 1097 { 1098 struct inode *inode; 1099 struct path path; 1100 struct sock *sk; 1101 int err; 1102 1103 unix_mkname_bsd(sunaddr, addr_len); 1104 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); 1105 if (err) 1106 goto fail; 1107 1108 err = path_permission(&path, MAY_WRITE); 1109 if (err) 1110 goto path_put; 1111 1112 err = -ECONNREFUSED; 1113 inode = d_backing_inode(path.dentry); 1114 if (!S_ISSOCK(inode->i_mode)) 1115 goto path_put; 1116 1117 sk = unix_find_socket_byinode(inode); 1118 if (!sk) 1119 goto path_put; 1120 1121 err = -EPROTOTYPE; 1122 if (sk->sk_type == type) 1123 touch_atime(&path); 1124 else 1125 goto sock_put; 1126 1127 path_put(&path); 1128 1129 return sk; 1130 1131 sock_put: 1132 sock_put(sk); 1133 path_put: 1134 path_put(&path); 1135 fail: 1136 return ERR_PTR(err); 1137 } 1138 1139 static struct sock *unix_find_abstract(struct net *net, 1140 struct sockaddr_un *sunaddr, 1141 int addr_len, int type) 1142 { 1143 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); 1144 struct dentry *dentry; 1145 struct sock *sk; 1146 1147 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); 1148 if (!sk) 1149 return ERR_PTR(-ECONNREFUSED); 1150 1151 dentry = unix_sk(sk)->path.dentry; 1152 if (dentry) 1153 touch_atime(&unix_sk(sk)->path); 1154 1155 return sk; 1156 } 1157 1158 static struct sock *unix_find_other(struct net *net, 1159 struct sockaddr_un *sunaddr, 1160 int addr_len, int type) 1161 { 1162 struct sock *sk; 1163 1164 if (sunaddr->sun_path[0]) 1165 sk = unix_find_bsd(sunaddr, addr_len, type); 1166 else 1167 sk = unix_find_abstract(net, sunaddr, addr_len, type); 1168 1169 return sk; 1170 } 1171 1172 static int unix_autobind(struct sock *sk) 1173 { 1174 struct unix_sock *u = unix_sk(sk); 1175 unsigned int new_hash, old_hash; 1176 struct net *net = sock_net(sk); 1177 struct unix_address *addr; 1178 u32 lastnum, ordernum; 1179 int err; 1180 1181 err = mutex_lock_interruptible(&u->bindlock); 1182 if (err) 1183 return err; 1184 1185 if (u->addr) 1186 goto out; 1187 1188 err = -ENOMEM; 1189 addr = kzalloc(sizeof(*addr) + 1190 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); 1191 if (!addr) 1192 goto out; 1193 1194 addr->len = offsetof(struct sockaddr_un, sun_path) + 6; 1195 addr->name->sun_family = AF_UNIX; 1196 refcount_set(&addr->refcnt, 1); 1197 1198 old_hash = sk->sk_hash; 1199 ordernum = get_random_u32(); 1200 lastnum = ordernum & 0xFFFFF; 1201 retry: 1202 ordernum = (ordernum + 1) & 0xFFFFF; 1203 sprintf(addr->name->sun_path + 1, "%05x", ordernum); 1204 1205 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1206 unix_table_double_lock(net, old_hash, new_hash); 1207 1208 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { 1209 unix_table_double_unlock(net, old_hash, new_hash); 
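
		/* The name is "%05x" built from a 20-bit counter, so the
		 * retry loop below can visit at most 0xFFFFF + 1 names
		 * before ordernum wraps back to lastnum and we give up
		 * with -ENOSPC.
		 */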
1210 1211 /* __unix_find_socket_byname() may take long time if many names 1212 * are already in use. 1213 */ 1214 cond_resched(); 1215 1216 if (ordernum == lastnum) { 1217 /* Give up if all names seems to be in use. */ 1218 err = -ENOSPC; 1219 unix_release_addr(addr); 1220 goto out; 1221 } 1222 1223 goto retry; 1224 } 1225 1226 __unix_set_addr_hash(net, sk, addr, new_hash); 1227 unix_table_double_unlock(net, old_hash, new_hash); 1228 err = 0; 1229 1230 out: mutex_unlock(&u->bindlock); 1231 return err; 1232 } 1233 1234 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, 1235 int addr_len) 1236 { 1237 umode_t mode = S_IFSOCK | 1238 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); 1239 struct unix_sock *u = unix_sk(sk); 1240 unsigned int new_hash, old_hash; 1241 struct net *net = sock_net(sk); 1242 struct mnt_idmap *idmap; 1243 struct unix_address *addr; 1244 struct dentry *dentry; 1245 struct path parent; 1246 int err; 1247 1248 addr_len = unix_mkname_bsd(sunaddr, addr_len); 1249 addr = unix_create_addr(sunaddr, addr_len); 1250 if (!addr) 1251 return -ENOMEM; 1252 1253 /* 1254 * Get the parent directory, calculate the hash for last 1255 * component. 1256 */ 1257 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); 1258 if (IS_ERR(dentry)) { 1259 err = PTR_ERR(dentry); 1260 goto out; 1261 } 1262 1263 /* 1264 * All right, let's create it. 1265 */ 1266 idmap = mnt_idmap(parent.mnt); 1267 err = security_path_mknod(&parent, dentry, mode, 0); 1268 if (!err) 1269 err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); 1270 if (err) 1271 goto out_path; 1272 err = mutex_lock_interruptible(&u->bindlock); 1273 if (err) 1274 goto out_unlink; 1275 if (u->addr) 1276 goto out_unlock; 1277 1278 old_hash = sk->sk_hash; 1279 new_hash = unix_bsd_hash(d_backing_inode(dentry)); 1280 unix_table_double_lock(net, old_hash, new_hash); 1281 u->path.mnt = mntget(parent.mnt); 1282 u->path.dentry = dget(dentry); 1283 __unix_set_addr_hash(net, sk, addr, new_hash); 1284 unix_table_double_unlock(net, old_hash, new_hash); 1285 unix_insert_bsd_socket(sk); 1286 mutex_unlock(&u->bindlock); 1287 done_path_create(&parent, dentry); 1288 return 0; 1289 1290 out_unlock: 1291 mutex_unlock(&u->bindlock); 1292 err = -EINVAL; 1293 out_unlink: 1294 /* failed after successful mknod? unlink what we'd created... */ 1295 vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); 1296 out_path: 1297 done_path_create(&parent, dentry); 1298 out: 1299 unix_release_addr(addr); 1300 return err == -EEXIST ? 
-EADDRINUSE : err; 1301 } 1302 1303 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, 1304 int addr_len) 1305 { 1306 struct unix_sock *u = unix_sk(sk); 1307 unsigned int new_hash, old_hash; 1308 struct net *net = sock_net(sk); 1309 struct unix_address *addr; 1310 int err; 1311 1312 addr = unix_create_addr(sunaddr, addr_len); 1313 if (!addr) 1314 return -ENOMEM; 1315 1316 err = mutex_lock_interruptible(&u->bindlock); 1317 if (err) 1318 goto out; 1319 1320 if (u->addr) { 1321 err = -EINVAL; 1322 goto out_mutex; 1323 } 1324 1325 old_hash = sk->sk_hash; 1326 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); 1327 unix_table_double_lock(net, old_hash, new_hash); 1328 1329 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) 1330 goto out_spin; 1331 1332 __unix_set_addr_hash(net, sk, addr, new_hash); 1333 unix_table_double_unlock(net, old_hash, new_hash); 1334 mutex_unlock(&u->bindlock); 1335 return 0; 1336 1337 out_spin: 1338 unix_table_double_unlock(net, old_hash, new_hash); 1339 err = -EADDRINUSE; 1340 out_mutex: 1341 mutex_unlock(&u->bindlock); 1342 out: 1343 unix_release_addr(addr); 1344 return err; 1345 } 1346 1347 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 1348 { 1349 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1350 struct sock *sk = sock->sk; 1351 int err; 1352 1353 if (addr_len == offsetof(struct sockaddr_un, sun_path) && 1354 sunaddr->sun_family == AF_UNIX) 1355 return unix_autobind(sk); 1356 1357 err = unix_validate_addr(sunaddr, addr_len); 1358 if (err) 1359 return err; 1360 1361 if (sunaddr->sun_path[0]) 1362 err = unix_bind_bsd(sk, sunaddr, addr_len); 1363 else 1364 err = unix_bind_abstract(sk, sunaddr, addr_len); 1365 1366 return err; 1367 } 1368 1369 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) 1370 { 1371 if (unlikely(sk1 == sk2) || !sk2) { 1372 unix_state_lock(sk1); 1373 return; 1374 } 1375 1376 if (sk1 > sk2) 1377 swap(sk1, sk2); 1378 1379 unix_state_lock(sk1); 1380 unix_state_lock(sk2); 1381 } 1382 1383 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) 1384 { 1385 if (unlikely(sk1 == sk2) || !sk2) { 1386 unix_state_unlock(sk1); 1387 return; 1388 } 1389 unix_state_unlock(sk1); 1390 unix_state_unlock(sk2); 1391 } 1392 1393 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, 1394 int alen, int flags) 1395 { 1396 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; 1397 struct sock *sk = sock->sk; 1398 struct sock *other; 1399 int err; 1400 1401 err = -EINVAL; 1402 if (alen < offsetofend(struct sockaddr, sa_family)) 1403 goto out; 1404 1405 if (addr->sa_family != AF_UNSPEC) { 1406 err = unix_validate_addr(sunaddr, alen); 1407 if (err) 1408 goto out; 1409 1410 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen); 1411 if (err) 1412 goto out; 1413 1414 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1415 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1416 !READ_ONCE(unix_sk(sk)->addr)) { 1417 err = unix_autobind(sk); 1418 if (err) 1419 goto out; 1420 } 1421 1422 restart: 1423 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); 1424 if (IS_ERR(other)) { 1425 err = PTR_ERR(other); 1426 goto out; 1427 } 1428 1429 unix_state_double_lock(sk, other); 1430 1431 /* Apparently VFS overslept socket death. Retry. 
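	 * The lookup raced with unix_release_sock(): the socket was
	 * marked SOCK_DEAD after we found it but before we took its
	 * lock, so drop the reference and repeat the lookup.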
*/ 1432 if (sock_flag(other, SOCK_DEAD)) { 1433 unix_state_double_unlock(sk, other); 1434 sock_put(other); 1435 goto restart; 1436 } 1437 1438 err = -EPERM; 1439 if (!unix_may_send(sk, other)) 1440 goto out_unlock; 1441 1442 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 1443 if (err) 1444 goto out_unlock; 1445 1446 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1447 WRITE_ONCE(other->sk_state, TCP_ESTABLISHED); 1448 } else { 1449 /* 1450 * 1003.1g breaking connected state with AF_UNSPEC 1451 */ 1452 other = NULL; 1453 unix_state_double_lock(sk, other); 1454 } 1455 1456 /* 1457 * If it was connected, reconnect. 1458 */ 1459 if (unix_peer(sk)) { 1460 struct sock *old_peer = unix_peer(sk); 1461 1462 unix_peer(sk) = other; 1463 if (!other) 1464 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 1465 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); 1466 1467 unix_state_double_unlock(sk, other); 1468 1469 if (other != old_peer) { 1470 unix_dgram_disconnected(sk, old_peer); 1471 1472 unix_state_lock(old_peer); 1473 if (!unix_peer(old_peer)) 1474 WRITE_ONCE(old_peer->sk_state, TCP_CLOSE); 1475 unix_state_unlock(old_peer); 1476 } 1477 1478 sock_put(old_peer); 1479 } else { 1480 unix_peer(sk) = other; 1481 unix_state_double_unlock(sk, other); 1482 } 1483 1484 return 0; 1485 1486 out_unlock: 1487 unix_state_double_unlock(sk, other); 1488 sock_put(other); 1489 out: 1490 return err; 1491 } 1492 1493 static long unix_wait_for_peer(struct sock *other, long timeo) 1494 { 1495 struct unix_sock *u = unix_sk(other); 1496 int sched; 1497 DEFINE_WAIT(wait); 1498 1499 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); 1500 1501 sched = !sock_flag(other, SOCK_DEAD) && 1502 !(other->sk_shutdown & RCV_SHUTDOWN) && 1503 unix_recvq_full_lockless(other); 1504 1505 unix_state_unlock(other); 1506 1507 if (sched) 1508 timeo = schedule_timeout(timeo); 1509 1510 finish_wait(&u->peer_wait, &wait); 1511 return timeo; 1512 } 1513 1514 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, 1515 int addr_len, int flags) 1516 { 1517 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; 1518 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; 1519 struct unix_sock *u = unix_sk(sk), *newu, *otheru; 1520 struct net *net = sock_net(sk); 1521 struct sk_buff *skb = NULL; 1522 unsigned char state; 1523 long timeo; 1524 int err; 1525 1526 err = unix_validate_addr(sunaddr, addr_len); 1527 if (err) 1528 goto out; 1529 1530 err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len); 1531 if (err) 1532 goto out; 1533 1534 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1535 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1536 !READ_ONCE(u->addr)) { 1537 err = unix_autobind(sk); 1538 if (err) 1539 goto out; 1540 } 1541 1542 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); 1543 1544 /* First of all allocate resources. 1545 * If we will make it after state is locked, 1546 * we will have to recheck all again in any case. 1547 */ 1548 1549 /* create new sock for complete connection */ 1550 newsk = unix_create1(net, NULL, 0, sock->type); 1551 if (IS_ERR(newsk)) { 1552 err = PTR_ERR(newsk); 1553 goto out; 1554 } 1555 1556 /* Allocate skb for sending to listening sock */ 1557 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); 1558 if (!skb) { 1559 err = -ENOMEM; 1560 goto out_free_sk; 1561 } 1562 1563 restart: 1564 /* Find listening sock. 
*/ 1565 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); 1566 if (IS_ERR(other)) { 1567 err = PTR_ERR(other); 1568 goto out_free_skb; 1569 } 1570 1571 unix_state_lock(other); 1572 1573 /* Apparently VFS overslept socket death. Retry. */ 1574 if (sock_flag(other, SOCK_DEAD)) { 1575 unix_state_unlock(other); 1576 sock_put(other); 1577 goto restart; 1578 } 1579 1580 if (other->sk_state != TCP_LISTEN || 1581 other->sk_shutdown & RCV_SHUTDOWN) { 1582 err = -ECONNREFUSED; 1583 goto out_unlock; 1584 } 1585 1586 if (unix_recvq_full_lockless(other)) { 1587 if (!timeo) { 1588 err = -EAGAIN; 1589 goto out_unlock; 1590 } 1591 1592 timeo = unix_wait_for_peer(other, timeo); 1593 sock_put(other); 1594 1595 err = sock_intr_errno(timeo); 1596 if (signal_pending(current)) 1597 goto out_free_skb; 1598 1599 goto restart; 1600 } 1601 1602 /* self connect and simultaneous connect are eliminated 1603 * by rejecting TCP_LISTEN socket to avoid deadlock. 1604 */ 1605 state = READ_ONCE(sk->sk_state); 1606 if (unlikely(state != TCP_CLOSE)) { 1607 err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1608 goto out_unlock; 1609 } 1610 1611 unix_state_lock(sk); 1612 1613 if (unlikely(sk->sk_state != TCP_CLOSE)) { 1614 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL; 1615 unix_state_unlock(sk); 1616 goto out_unlock; 1617 } 1618 1619 err = security_unix_stream_connect(sk, other, newsk); 1620 if (err) { 1621 unix_state_unlock(sk); 1622 goto out_unlock; 1623 } 1624 1625 /* The way is open! Fastly set all the necessary fields... */ 1626 1627 sock_hold(sk); 1628 unix_peer(newsk) = sk; 1629 newsk->sk_state = TCP_ESTABLISHED; 1630 newsk->sk_type = sk->sk_type; 1631 init_peercred(newsk); 1632 newu = unix_sk(newsk); 1633 newu->listener = other; 1634 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1635 otheru = unix_sk(other); 1636 1637 /* copy address information from listening to new sock 1638 * 1639 * The contents of *(otheru->addr) and otheru->path 1640 * are seen fully set up here, since we have found 1641 * otheru in hash under its lock. Insertion into the 1642 * hash chain we'd found it in had been done in an 1643 * earlier critical area protected by the chain's lock, 1644 * the same one where we'd set *(otheru->addr) contents, 1645 * as well as otheru->path and otheru->addr itself. 1646 * 1647 * Using smp_store_release() here to set newu->addr 1648 * is enough to make those stores, as well as stores 1649 * to newu->path visible to anyone who gets newu->addr 1650 * by smp_load_acquire(). IOW, the same warranties 1651 * as for unix_sock instances bound in unix_bind() or 1652 * in unix_autobind(). 
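	 *
	 * The matching smp_load_acquire() is the one readers such as
	 * unix_getname() use to fetch unix_sk(sk)->addr before looking
	 * at the address bytes.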
1653 */ 1654 if (otheru->path.dentry) { 1655 path_get(&otheru->path); 1656 newu->path = otheru->path; 1657 } 1658 refcount_inc(&otheru->addr->refcnt); 1659 smp_store_release(&newu->addr, otheru->addr); 1660 1661 /* Set credentials */ 1662 copy_peercred(sk, other); 1663 1664 sock->state = SS_CONNECTED; 1665 WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED); 1666 sock_hold(newsk); 1667 1668 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */ 1669 unix_peer(sk) = newsk; 1670 1671 unix_state_unlock(sk); 1672 1673 /* take ten and send info to listening sock */ 1674 spin_lock(&other->sk_receive_queue.lock); 1675 __skb_queue_tail(&other->sk_receive_queue, skb); 1676 spin_unlock(&other->sk_receive_queue.lock); 1677 unix_state_unlock(other); 1678 other->sk_data_ready(other); 1679 sock_put(other); 1680 return 0; 1681 1682 out_unlock: 1683 unix_state_unlock(other); 1684 sock_put(other); 1685 out_free_skb: 1686 consume_skb(skb); 1687 out_free_sk: 1688 unix_release_sock(newsk, 0); 1689 out: 1690 return err; 1691 } 1692 1693 static int unix_socketpair(struct socket *socka, struct socket *sockb) 1694 { 1695 struct sock *ska = socka->sk, *skb = sockb->sk; 1696 1697 /* Join our sockets back to back */ 1698 sock_hold(ska); 1699 sock_hold(skb); 1700 unix_peer(ska) = skb; 1701 unix_peer(skb) = ska; 1702 init_peercred(ska); 1703 init_peercred(skb); 1704 1705 ska->sk_state = TCP_ESTABLISHED; 1706 skb->sk_state = TCP_ESTABLISHED; 1707 socka->state = SS_CONNECTED; 1708 sockb->state = SS_CONNECTED; 1709 return 0; 1710 } 1711 1712 static void unix_sock_inherit_flags(const struct socket *old, 1713 struct socket *new) 1714 { 1715 if (test_bit(SOCK_PASSCRED, &old->flags)) 1716 set_bit(SOCK_PASSCRED, &new->flags); 1717 if (test_bit(SOCK_PASSPIDFD, &old->flags)) 1718 set_bit(SOCK_PASSPIDFD, &new->flags); 1719 if (test_bit(SOCK_PASSSEC, &old->flags)) 1720 set_bit(SOCK_PASSSEC, &new->flags); 1721 } 1722 1723 static int unix_accept(struct socket *sock, struct socket *newsock, 1724 struct proto_accept_arg *arg) 1725 { 1726 struct sock *sk = sock->sk; 1727 struct sk_buff *skb; 1728 struct sock *tsk; 1729 1730 arg->err = -EOPNOTSUPP; 1731 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) 1732 goto out; 1733 1734 arg->err = -EINVAL; 1735 if (READ_ONCE(sk->sk_state) != TCP_LISTEN) 1736 goto out; 1737 1738 /* If socket state is TCP_LISTEN it cannot change (for now...), 1739 * so that no locks are necessary. 1740 */ 1741 1742 skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, 1743 &arg->err); 1744 if (!skb) { 1745 /* This means receive shutdown. 
*/ 1746 if (arg->err == 0) 1747 arg->err = -EINVAL; 1748 goto out; 1749 } 1750 1751 tsk = skb->sk; 1752 skb_free_datagram(sk, skb); 1753 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1754 1755 /* attach accepted sock to socket */ 1756 unix_state_lock(tsk); 1757 unix_update_edges(unix_sk(tsk)); 1758 newsock->state = SS_CONNECTED; 1759 unix_sock_inherit_flags(sock, newsock); 1760 sock_graft(tsk, newsock); 1761 unix_state_unlock(tsk); 1762 return 0; 1763 1764 out: 1765 return arg->err; 1766 } 1767 1768 1769 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) 1770 { 1771 struct sock *sk = sock->sk; 1772 struct unix_address *addr; 1773 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); 1774 int err = 0; 1775 1776 if (peer) { 1777 sk = unix_peer_get(sk); 1778 1779 err = -ENOTCONN; 1780 if (!sk) 1781 goto out; 1782 err = 0; 1783 } else { 1784 sock_hold(sk); 1785 } 1786 1787 addr = smp_load_acquire(&unix_sk(sk)->addr); 1788 if (!addr) { 1789 sunaddr->sun_family = AF_UNIX; 1790 sunaddr->sun_path[0] = 0; 1791 err = offsetof(struct sockaddr_un, sun_path); 1792 } else { 1793 err = addr->len; 1794 memcpy(sunaddr, addr->name, addr->len); 1795 1796 if (peer) 1797 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1798 CGROUP_UNIX_GETPEERNAME); 1799 else 1800 BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err, 1801 CGROUP_UNIX_GETSOCKNAME); 1802 } 1803 sock_put(sk); 1804 out: 1805 return err; 1806 } 1807 1808 /* The "user->unix_inflight" variable is protected by the garbage 1809 * collection lock, and we just read it locklessly here. If you go 1810 * over the limit, there might be a tiny race in actually noticing 1811 * it across threads. Tough. 1812 */ 1813 static inline bool too_many_unix_fds(struct task_struct *p) 1814 { 1815 struct user_struct *user = current_user(); 1816 1817 if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) 1818 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 1819 return false; 1820 } 1821 1822 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1823 { 1824 if (too_many_unix_fds(current)) 1825 return -ETOOMANYREFS; 1826 1827 UNIXCB(skb).fp = scm->fp; 1828 scm->fp = NULL; 1829 1830 if (unix_prepare_fpl(UNIXCB(skb).fp)) 1831 return -ENOMEM; 1832 1833 return 0; 1834 } 1835 1836 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1837 { 1838 scm->fp = UNIXCB(skb).fp; 1839 UNIXCB(skb).fp = NULL; 1840 1841 unix_destroy_fpl(scm->fp); 1842 } 1843 1844 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) 1845 { 1846 scm->fp = scm_fp_dup(UNIXCB(skb).fp); 1847 } 1848 1849 static void unix_destruct_scm(struct sk_buff *skb) 1850 { 1851 struct scm_cookie scm; 1852 1853 memset(&scm, 0, sizeof(scm)); 1854 scm.pid = UNIXCB(skb).pid; 1855 if (UNIXCB(skb).fp) 1856 unix_detach_fds(&scm, skb); 1857 1858 /* Alas, it calls VFS */ 1859 /* So fscking what? 
fput() had been SMP-safe since the last Summer */ 1860 scm_destroy(&scm); 1861 sock_wfree(skb); 1862 } 1863 1864 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) 1865 { 1866 int err = 0; 1867 1868 UNIXCB(skb).pid = get_pid(scm->pid); 1869 UNIXCB(skb).uid = scm->creds.uid; 1870 UNIXCB(skb).gid = scm->creds.gid; 1871 UNIXCB(skb).fp = NULL; 1872 unix_get_secdata(scm, skb); 1873 if (scm->fp && send_fds) 1874 err = unix_attach_fds(scm, skb); 1875 1876 skb->destructor = unix_destruct_scm; 1877 return err; 1878 } 1879 1880 static bool unix_passcred_enabled(const struct socket *sock, 1881 const struct sock *other) 1882 { 1883 return test_bit(SOCK_PASSCRED, &sock->flags) || 1884 test_bit(SOCK_PASSPIDFD, &sock->flags) || 1885 !other->sk_socket || 1886 test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || 1887 test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); 1888 } 1889 1890 /* 1891 * Some apps rely on write() giving SCM_CREDENTIALS 1892 * We include credentials if source or destination socket 1893 * asserted SOCK_PASSCRED. 1894 */ 1895 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, 1896 const struct sock *other) 1897 { 1898 if (UNIXCB(skb).pid) 1899 return; 1900 if (unix_passcred_enabled(sock, other)) { 1901 UNIXCB(skb).pid = get_pid(task_tgid(current)); 1902 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); 1903 } 1904 } 1905 1906 static bool unix_skb_scm_eq(struct sk_buff *skb, 1907 struct scm_cookie *scm) 1908 { 1909 return UNIXCB(skb).pid == scm->pid && 1910 uid_eq(UNIXCB(skb).uid, scm->creds.uid) && 1911 gid_eq(UNIXCB(skb).gid, scm->creds.gid) && 1912 unix_secdata_eq(scm, skb); 1913 } 1914 1915 static void scm_stat_add(struct sock *sk, struct sk_buff *skb) 1916 { 1917 struct scm_fp_list *fp = UNIXCB(skb).fp; 1918 struct unix_sock *u = unix_sk(sk); 1919 1920 if (unlikely(fp && fp->count)) { 1921 atomic_add(fp->count, &u->scm_stat.nr_fds); 1922 unix_add_edges(fp, u); 1923 } 1924 } 1925 1926 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) 1927 { 1928 struct scm_fp_list *fp = UNIXCB(skb).fp; 1929 struct unix_sock *u = unix_sk(sk); 1930 1931 if (unlikely(fp && fp->count)) { 1932 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1933 unix_del_edges(fp); 1934 } 1935 } 1936 1937 /* 1938 * Send AF_UNIX data. 
1939 */ 1940 1941 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, 1942 size_t len) 1943 { 1944 struct sock *sk = sock->sk, *other = NULL; 1945 struct unix_sock *u = unix_sk(sk); 1946 struct scm_cookie scm; 1947 struct sk_buff *skb; 1948 int data_len = 0; 1949 int sk_locked; 1950 long timeo; 1951 int err; 1952 1953 err = scm_send(sock, msg, &scm, false); 1954 if (err < 0) 1955 return err; 1956 1957 wait_for_unix_gc(scm.fp); 1958 1959 if (msg->msg_flags & MSG_OOB) { 1960 err = -EOPNOTSUPP; 1961 goto out; 1962 } 1963 1964 if (msg->msg_namelen) { 1965 err = unix_validate_addr(msg->msg_name, msg->msg_namelen); 1966 if (err) 1967 goto out; 1968 1969 err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, 1970 msg->msg_name, 1971 &msg->msg_namelen, 1972 NULL); 1973 if (err) 1974 goto out; 1975 } 1976 1977 if ((test_bit(SOCK_PASSCRED, &sock->flags) || 1978 test_bit(SOCK_PASSPIDFD, &sock->flags)) && 1979 !READ_ONCE(u->addr)) { 1980 err = unix_autobind(sk); 1981 if (err) 1982 goto out; 1983 } 1984 1985 if (len > READ_ONCE(sk->sk_sndbuf) - 32) { 1986 err = -EMSGSIZE; 1987 goto out; 1988 } 1989 1990 if (len > SKB_MAX_ALLOC) { 1991 data_len = min_t(size_t, 1992 len - SKB_MAX_ALLOC, 1993 MAX_SKB_FRAGS * PAGE_SIZE); 1994 data_len = PAGE_ALIGN(data_len); 1995 1996 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE); 1997 } 1998 1999 skb = sock_alloc_send_pskb(sk, len - data_len, data_len, 2000 msg->msg_flags & MSG_DONTWAIT, &err, 2001 PAGE_ALLOC_COSTLY_ORDER); 2002 if (!skb) 2003 goto out; 2004 2005 err = unix_scm_to_skb(&scm, skb, true); 2006 if (err < 0) 2007 goto out_free; 2008 2009 skb_put(skb, len - data_len); 2010 skb->data_len = data_len; 2011 skb->len = len; 2012 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len); 2013 if (err) 2014 goto out_free; 2015 2016 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); 2017 2018 if (msg->msg_namelen) { 2019 lookup: 2020 other = unix_find_other(sock_net(sk), msg->msg_name, 2021 msg->msg_namelen, sk->sk_type); 2022 if (IS_ERR(other)) { 2023 err = PTR_ERR(other); 2024 goto out_free; 2025 } 2026 } else { 2027 other = unix_peer_get(sk); 2028 if (!other) { 2029 err = -ENOTCONN; 2030 goto out_free; 2031 } 2032 } 2033 2034 if (sk_filter(other, skb) < 0) { 2035 /* Toss the packet but do not return any error to the sender */ 2036 err = len; 2037 goto out_sock_put; 2038 } 2039 2040 restart: 2041 sk_locked = 0; 2042 unix_state_lock(other); 2043 restart_locked: 2044 2045 if (!unix_may_send(sk, other)) { 2046 err = -EPERM; 2047 goto out_unlock; 2048 } 2049 2050 if (unlikely(sock_flag(other, SOCK_DEAD))) { 2051 /* Check with 1003.1g - what should datagram error */ 2052 2053 unix_state_unlock(other); 2054 2055 if (sk->sk_type == SOCK_SEQPACKET) { 2056 /* We are here only when racing with unix_release_sock() 2057 * is clearing @other. Never change state to TCP_CLOSE 2058 * unlike SOCK_DGRAM wants. 
2059 */ 2060 err = -EPIPE; 2061 goto out_sock_put; 2062 } 2063 2064 if (!sk_locked) 2065 unix_state_lock(sk); 2066 2067 if (unix_peer(sk) == other) { 2068 unix_peer(sk) = NULL; 2069 unix_dgram_peer_wake_disconnect_wakeup(sk, other); 2070 2071 WRITE_ONCE(sk->sk_state, TCP_CLOSE); 2072 unix_state_unlock(sk); 2073 2074 unix_dgram_disconnected(sk, other); 2075 sock_put(other); 2076 err = -ECONNREFUSED; 2077 goto out_sock_put; 2078 } 2079 2080 unix_state_unlock(sk); 2081 2082 if (!msg->msg_namelen) { 2083 err = -ECONNRESET; 2084 goto out_sock_put; 2085 } 2086 2087 sock_put(other); 2088 goto lookup; 2089 } 2090 2091 if (other->sk_shutdown & RCV_SHUTDOWN) { 2092 err = -EPIPE; 2093 goto out_unlock; 2094 } 2095 2096 if (sk->sk_type != SOCK_SEQPACKET) { 2097 err = security_unix_may_send(sk->sk_socket, other->sk_socket); 2098 if (err) 2099 goto out_unlock; 2100 } 2101 2102 /* other == sk && unix_peer(other) != sk if 2103 * - unix_peer(sk) == NULL, destination address bound to sk 2104 * - unix_peer(sk) == sk by time of get but disconnected before lock 2105 */ 2106 if (other != sk && 2107 unlikely(unix_peer(other) != sk && 2108 unix_recvq_full_lockless(other))) { 2109 if (timeo) { 2110 timeo = unix_wait_for_peer(other, timeo); 2111 2112 err = sock_intr_errno(timeo); 2113 if (signal_pending(current)) 2114 goto out_sock_put; 2115 2116 goto restart; 2117 } 2118 2119 if (!sk_locked) { 2120 unix_state_unlock(other); 2121 unix_state_double_lock(sk, other); 2122 } 2123 2124 if (unix_peer(sk) != other || 2125 unix_dgram_peer_wake_me(sk, other)) { 2126 err = -EAGAIN; 2127 sk_locked = 1; 2128 goto out_unlock; 2129 } 2130 2131 if (!sk_locked) { 2132 sk_locked = 1; 2133 goto restart_locked; 2134 } 2135 } 2136 2137 if (unlikely(sk_locked)) 2138 unix_state_unlock(sk); 2139 2140 if (sock_flag(other, SOCK_RCVTSTAMP)) 2141 __net_timestamp(skb); 2142 maybe_add_creds(skb, sock, other); 2143 scm_stat_add(other, skb); 2144 skb_queue_tail(&other->sk_receive_queue, skb); 2145 unix_state_unlock(other); 2146 other->sk_data_ready(other); 2147 sock_put(other); 2148 scm_destroy(&scm); 2149 return len; 2150 2151 out_unlock: 2152 if (sk_locked) 2153 unix_state_unlock(sk); 2154 unix_state_unlock(other); 2155 out_sock_put: 2156 sock_put(other); 2157 out_free: 2158 consume_skb(skb); 2159 out: 2160 scm_destroy(&scm); 2161 return err; 2162 } 2163 2164 /* We use paged skbs for stream sockets, and limit occupancy to 32768 2165 * bytes, and a minimum of a full page. 
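 *
 * With 4 KiB pages, UNIX_SKB_FRAGS_SZ below works out to
 * PAGE_SIZE << get_order(32768) == 4096 << 3 == 32768 bytes of
 * paged data per skb; unix_stream_sendmsg() additionally caps each
 * skb at roughly half of sk_sndbuf so that two messages can sit in
 * the pipe at once.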
2166 */ 2167 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) 2168 2169 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2170 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, 2171 struct scm_cookie *scm, bool fds_sent) 2172 { 2173 struct unix_sock *ousk = unix_sk(other); 2174 struct sk_buff *skb; 2175 int err; 2176 2177 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); 2178 2179 if (!skb) 2180 return err; 2181 2182 err = unix_scm_to_skb(scm, skb, !fds_sent); 2183 if (err < 0) 2184 goto out; 2185 2186 skb_put(skb, 1); 2187 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); 2188 2189 if (err) 2190 goto out; 2191 2192 unix_state_lock(other); 2193 2194 if (sock_flag(other, SOCK_DEAD) || 2195 (other->sk_shutdown & RCV_SHUTDOWN)) { 2196 unix_state_unlock(other); 2197 err = -EPIPE; 2198 goto out; 2199 } 2200 2201 maybe_add_creds(skb, sock, other); 2202 scm_stat_add(other, skb); 2203 2204 spin_lock(&other->sk_receive_queue.lock); 2205 WRITE_ONCE(ousk->oob_skb, skb); 2206 __skb_queue_tail(&other->sk_receive_queue, skb); 2207 spin_unlock(&other->sk_receive_queue.lock); 2208 2209 sk_send_sigurg(other); 2210 unix_state_unlock(other); 2211 other->sk_data_ready(other); 2212 2213 return 0; 2214 out: 2215 consume_skb(skb); 2216 return err; 2217 } 2218 #endif 2219 2220 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, 2221 size_t len) 2222 { 2223 struct sock *sk = sock->sk; 2224 struct sk_buff *skb = NULL; 2225 struct sock *other = NULL; 2226 struct scm_cookie scm; 2227 bool fds_sent = false; 2228 int err, sent = 0; 2229 2230 err = scm_send(sock, msg, &scm, false); 2231 if (err < 0) 2232 return err; 2233 2234 wait_for_unix_gc(scm.fp); 2235 2236 if (msg->msg_flags & MSG_OOB) { 2237 err = -EOPNOTSUPP; 2238 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2239 if (len) 2240 len--; 2241 else 2242 #endif 2243 goto out_err; 2244 } 2245 2246 if (msg->msg_namelen) { 2247 err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; 2248 goto out_err; 2249 } else { 2250 other = unix_peer(sk); 2251 if (!other) { 2252 err = -ENOTCONN; 2253 goto out_err; 2254 } 2255 } 2256 2257 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2258 goto out_pipe; 2259 2260 while (sent < len) { 2261 int size = len - sent; 2262 int data_len; 2263 2264 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2265 skb = sock_alloc_send_pskb(sk, 0, 0, 2266 msg->msg_flags & MSG_DONTWAIT, 2267 &err, 0); 2268 } else { 2269 /* Keep two messages in the pipe so it schedules better */ 2270 size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64); 2271 2272 /* allow fallback to order-0 allocations */ 2273 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); 2274 2275 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); 2276 2277 data_len = min_t(size_t, size, PAGE_ALIGN(data_len)); 2278 2279 skb = sock_alloc_send_pskb(sk, size - data_len, data_len, 2280 msg->msg_flags & MSG_DONTWAIT, &err, 2281 get_order(UNIX_SKB_FRAGS_SZ)); 2282 } 2283 if (!skb) 2284 goto out_err; 2285 2286 /* Only send the fds in the first buffer */ 2287 err = unix_scm_to_skb(&scm, skb, !fds_sent); 2288 if (err < 0) 2289 goto out_free; 2290 2291 fds_sent = true; 2292 2293 if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { 2294 skb->ip_summed = CHECKSUM_UNNECESSARY; 2295 err = skb_splice_from_iter(skb, &msg->msg_iter, size, 2296 sk->sk_allocation); 2297 if (err < 0) 2298 goto out_free; 2299 2300 size = err; 2301 refcount_add(size, &sk->sk_wmem_alloc); 2302 } else { 2303 skb_put(skb, size - data_len); 2304 skb->data_len = data_len; 2305 skb->len = size; 2306 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 2307 if (err) 2308 goto out_free; 2309 } 2310 2311 unix_state_lock(other); 2312 2313 if (sock_flag(other, SOCK_DEAD) || 2314 (other->sk_shutdown & RCV_SHUTDOWN)) 2315 goto out_pipe_unlock; 2316 2317 maybe_add_creds(skb, sock, other); 2318 scm_stat_add(other, skb); 2319 skb_queue_tail(&other->sk_receive_queue, skb); 2320 unix_state_unlock(other); 2321 other->sk_data_ready(other); 2322 sent += size; 2323 } 2324 2325 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2326 if (msg->msg_flags & MSG_OOB) { 2327 err = queue_oob(sock, msg, other, &scm, fds_sent); 2328 if (err) 2329 goto out_err; 2330 sent++; 2331 } 2332 #endif 2333 2334 scm_destroy(&scm); 2335 2336 return sent; 2337 2338 out_pipe_unlock: 2339 unix_state_unlock(other); 2340 out_pipe: 2341 if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) 2342 send_sig(SIGPIPE, current, 0); 2343 err = -EPIPE; 2344 out_free: 2345 consume_skb(skb); 2346 out_err: 2347 scm_destroy(&scm); 2348 return sent ? 
: err; 2349 } 2350 2351 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg, 2352 size_t len) 2353 { 2354 int err; 2355 struct sock *sk = sock->sk; 2356 2357 err = sock_error(sk); 2358 if (err) 2359 return err; 2360 2361 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2362 return -ENOTCONN; 2363 2364 if (msg->msg_namelen) 2365 msg->msg_namelen = 0; 2366 2367 return unix_dgram_sendmsg(sock, msg, len); 2368 } 2369 2370 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, 2371 size_t size, int flags) 2372 { 2373 struct sock *sk = sock->sk; 2374 2375 if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED) 2376 return -ENOTCONN; 2377 2378 return unix_dgram_recvmsg(sock, msg, size, flags); 2379 } 2380 2381 static void unix_copy_addr(struct msghdr *msg, struct sock *sk) 2382 { 2383 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); 2384 2385 if (addr) { 2386 msg->msg_namelen = addr->len; 2387 memcpy(msg->msg_name, addr->name, addr->len); 2388 } 2389 } 2390 2391 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, 2392 int flags) 2393 { 2394 struct scm_cookie scm; 2395 struct socket *sock = sk->sk_socket; 2396 struct unix_sock *u = unix_sk(sk); 2397 struct sk_buff *skb, *last; 2398 long timeo; 2399 int skip; 2400 int err; 2401 2402 err = -EOPNOTSUPP; 2403 if (flags&MSG_OOB) 2404 goto out; 2405 2406 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 2407 2408 do { 2409 mutex_lock(&u->iolock); 2410 2411 skip = sk_peek_offset(sk, flags); 2412 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags, 2413 &skip, &err, &last); 2414 if (skb) { 2415 if (!(flags & MSG_PEEK)) 2416 scm_stat_del(sk, skb); 2417 break; 2418 } 2419 2420 mutex_unlock(&u->iolock); 2421 2422 if (err != -EAGAIN) 2423 break; 2424 } while (timeo && 2425 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, 2426 &err, &timeo, last)); 2427 2428 if (!skb) { /* implies iolock unlocked */ 2429 unix_state_lock(sk); 2430 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ 2431 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && 2432 (sk->sk_shutdown & RCV_SHUTDOWN)) 2433 err = 0; 2434 unix_state_unlock(sk); 2435 goto out; 2436 } 2437 2438 if (wq_has_sleeper(&u->peer_wait)) 2439 wake_up_interruptible_sync_poll(&u->peer_wait, 2440 EPOLLOUT | EPOLLWRNORM | 2441 EPOLLWRBAND); 2442 2443 if (msg->msg_name) { 2444 unix_copy_addr(msg, skb->sk); 2445 2446 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2447 msg->msg_name, 2448 &msg->msg_namelen); 2449 } 2450 2451 if (size > skb->len - skip) 2452 size = skb->len - skip; 2453 else if (size < skb->len - skip) 2454 msg->msg_flags |= MSG_TRUNC; 2455 2456 err = skb_copy_datagram_msg(skb, skip, msg, size); 2457 if (err) 2458 goto out_free; 2459 2460 if (sock_flag(sk, SOCK_RCVTSTAMP)) 2461 __sock_recv_timestamp(msg, sk, skb); 2462 2463 memset(&scm, 0, sizeof(scm)); 2464 2465 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2466 unix_set_secdata(&scm, skb); 2467 2468 if (!(flags & MSG_PEEK)) { 2469 if (UNIXCB(skb).fp) 2470 unix_detach_fds(&scm, skb); 2471 2472 sk_peek_offset_bwd(sk, skb->len); 2473 } else { 2474 /* It is questionable: on PEEK we could: 2475 - do not return fds - good, but too simple 8) 2476 - return fds, and do not return them on read (old strategy, 2477 apparently wrong) 2478 - clone fds (I chose it for now, it is the most universal 2479 solution) 2480 2481 POSIX 1003.1g does not actually define this clearly 2482 at all. 
POSIX 1003.1g doesn't define a lot of things 2483 clearly however! 2484 2485 */ 2486 2487 sk_peek_offset_fwd(sk, size); 2488 2489 if (UNIXCB(skb).fp) 2490 unix_peek_fds(&scm, skb); 2491 } 2492 err = (flags & MSG_TRUNC) ? skb->len - skip : size; 2493 2494 scm_recv_unix(sock, msg, &scm, flags); 2495 2496 out_free: 2497 skb_free_datagram(sk, skb); 2498 mutex_unlock(&u->iolock); 2499 out: 2500 return err; 2501 } 2502 2503 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 2504 int flags) 2505 { 2506 struct sock *sk = sock->sk; 2507 2508 #ifdef CONFIG_BPF_SYSCALL 2509 const struct proto *prot = READ_ONCE(sk->sk_prot); 2510 2511 if (prot != &unix_dgram_proto) 2512 return prot->recvmsg(sk, msg, size, flags, NULL); 2513 #endif 2514 return __unix_dgram_recvmsg(sk, msg, size, flags); 2515 } 2516 2517 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2518 { 2519 struct unix_sock *u = unix_sk(sk); 2520 struct sk_buff *skb; 2521 int err; 2522 2523 mutex_lock(&u->iolock); 2524 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2525 mutex_unlock(&u->iolock); 2526 if (!skb) 2527 return err; 2528 2529 return recv_actor(sk, skb); 2530 } 2531 2532 /* 2533 * Sleep until more data has arrived. But check for races.. 2534 */ 2535 static long unix_stream_data_wait(struct sock *sk, long timeo, 2536 struct sk_buff *last, unsigned int last_len, 2537 bool freezable) 2538 { 2539 unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2540 struct sk_buff *tail; 2541 DEFINE_WAIT(wait); 2542 2543 unix_state_lock(sk); 2544 2545 for (;;) { 2546 prepare_to_wait(sk_sleep(sk), &wait, state); 2547 2548 tail = skb_peek_tail(&sk->sk_receive_queue); 2549 if (tail != last || 2550 (tail && tail->len != last_len) || 2551 sk->sk_err || 2552 (sk->sk_shutdown & RCV_SHUTDOWN) || 2553 signal_pending(current) || 2554 !timeo) 2555 break; 2556 2557 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2558 unix_state_unlock(sk); 2559 timeo = schedule_timeout(timeo); 2560 unix_state_lock(sk); 2561 2562 if (sock_flag(sk, SOCK_DEAD)) 2563 break; 2564 2565 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2566 } 2567 2568 finish_wait(sk_sleep(sk), &wait); 2569 unix_state_unlock(sk); 2570 return timeo; 2571 } 2572 2573 static unsigned int unix_skb_len(const struct sk_buff *skb) 2574 { 2575 return skb->len - UNIXCB(skb).consumed; 2576 } 2577 2578 struct unix_stream_read_state { 2579 int (*recv_actor)(struct sk_buff *, int, int, 2580 struct unix_stream_read_state *); 2581 struct socket *socket; 2582 struct msghdr *msg; 2583 struct pipe_inode_info *pipe; 2584 size_t size; 2585 int flags; 2586 unsigned int splice_flags; 2587 }; 2588 2589 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2590 static int unix_stream_recv_urg(struct unix_stream_read_state *state) 2591 { 2592 struct socket *sock = state->socket; 2593 struct sock *sk = sock->sk; 2594 struct unix_sock *u = unix_sk(sk); 2595 int chunk = 1; 2596 struct sk_buff *oob_skb; 2597 2598 mutex_lock(&u->iolock); 2599 unix_state_lock(sk); 2600 spin_lock(&sk->sk_receive_queue.lock); 2601 2602 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { 2603 spin_unlock(&sk->sk_receive_queue.lock); 2604 unix_state_unlock(sk); 2605 mutex_unlock(&u->iolock); 2606 return -EINVAL; 2607 } 2608 2609 oob_skb = u->oob_skb; 2610 2611 if (!(state->flags & MSG_PEEK)) 2612 WRITE_ONCE(u->oob_skb, NULL); 2613 2614 spin_unlock(&sk->sk_receive_queue.lock); 2615 unix_state_unlock(sk); 2616 2617 chunk = state->recv_actor(oob_skb, 0, chunk, state); 2618 2619 if (!(state->flags & MSG_PEEK)) 2620 
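/* not peeking: account the OOB byte as consumed for the stream receive path */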
UNIXCB(oob_skb).consumed += 1; 2621 2622 mutex_unlock(&u->iolock); 2623 2624 if (chunk < 0) 2625 return -EFAULT; 2626 2627 state->msg->msg_flags |= MSG_OOB; 2628 return 1; 2629 } 2630 2631 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, 2632 int flags, int copied) 2633 { 2634 struct sk_buff *read_skb = NULL, *unread_skb = NULL; 2635 struct unix_sock *u = unix_sk(sk); 2636 2637 if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb))) 2638 return skb; 2639 2640 spin_lock(&sk->sk_receive_queue.lock); 2641 2642 if (!unix_skb_len(skb)) { 2643 if (copied && (!u->oob_skb || skb == u->oob_skb)) { 2644 skb = NULL; 2645 } else if (flags & MSG_PEEK) { 2646 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2647 } else { 2648 read_skb = skb; 2649 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2650 __skb_unlink(read_skb, &sk->sk_receive_queue); 2651 } 2652 2653 if (!skb) 2654 goto unlock; 2655 } 2656 2657 if (skb != u->oob_skb) 2658 goto unlock; 2659 2660 if (copied) { 2661 skb = NULL; 2662 } else if (!(flags & MSG_PEEK)) { 2663 WRITE_ONCE(u->oob_skb, NULL); 2664 2665 if (!sock_flag(sk, SOCK_URGINLINE)) { 2666 __skb_unlink(skb, &sk->sk_receive_queue); 2667 unread_skb = skb; 2668 skb = skb_peek(&sk->sk_receive_queue); 2669 } 2670 } else if (!sock_flag(sk, SOCK_URGINLINE)) { 2671 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2672 } 2673 2674 unlock: 2675 spin_unlock(&sk->sk_receive_queue.lock); 2676 2677 consume_skb(read_skb); 2678 kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2679 2680 return skb; 2681 } 2682 #endif 2683 2684 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) 2685 { 2686 struct unix_sock *u = unix_sk(sk); 2687 struct sk_buff *skb; 2688 int err; 2689 2690 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) 2691 return -ENOTCONN; 2692 2693 mutex_lock(&u->iolock); 2694 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); 2695 mutex_unlock(&u->iolock); 2696 if (!skb) 2697 return err; 2698 2699 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2700 if (unlikely(skb == READ_ONCE(u->oob_skb))) { 2701 bool drop = false; 2702 2703 unix_state_lock(sk); 2704 2705 if (sock_flag(sk, SOCK_DEAD)) { 2706 unix_state_unlock(sk); 2707 kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); 2708 return -ECONNRESET; 2709 } 2710 2711 spin_lock(&sk->sk_receive_queue.lock); 2712 if (likely(skb == u->oob_skb)) { 2713 WRITE_ONCE(u->oob_skb, NULL); 2714 drop = true; 2715 } 2716 spin_unlock(&sk->sk_receive_queue.lock); 2717 2718 unix_state_unlock(sk); 2719 2720 if (drop) { 2721 kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); 2722 return -EAGAIN; 2723 } 2724 } 2725 #endif 2726 2727 return recv_actor(sk, skb); 2728 } 2729 2730 static int unix_stream_read_generic(struct unix_stream_read_state *state, 2731 bool freezable) 2732 { 2733 struct scm_cookie scm; 2734 struct socket *sock = state->socket; 2735 struct sock *sk = sock->sk; 2736 struct unix_sock *u = unix_sk(sk); 2737 int copied = 0; 2738 int flags = state->flags; 2739 int noblock = flags & MSG_DONTWAIT; 2740 bool check_creds = false; 2741 int target; 2742 int err = 0; 2743 long timeo; 2744 int skip; 2745 size_t size = state->size; 2746 unsigned int last_len; 2747 2748 if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { 2749 err = -EINVAL; 2750 goto out; 2751 } 2752 2753 if (unlikely(flags & MSG_OOB)) { 2754 err = -EOPNOTSUPP; 2755 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2756 err = unix_stream_recv_urg(state); 2757 #endif 2758 goto out; 2759 } 2760 2761 target = sock_rcvlowat(sk, flags & 
MSG_WAITALL, size); 2762 timeo = sock_rcvtimeo(sk, noblock); 2763 2764 memset(&scm, 0, sizeof(scm)); 2765 2766 /* Lock the socket to prevent queue disordering 2767 * while sleeps in memcpy_tomsg 2768 */ 2769 mutex_lock(&u->iolock); 2770 2771 skip = max(sk_peek_offset(sk, flags), 0); 2772 2773 do { 2774 struct sk_buff *skb, *last; 2775 int chunk; 2776 2777 redo: 2778 unix_state_lock(sk); 2779 if (sock_flag(sk, SOCK_DEAD)) { 2780 err = -ECONNRESET; 2781 goto unlock; 2782 } 2783 last = skb = skb_peek(&sk->sk_receive_queue); 2784 last_len = last ? last->len : 0; 2785 2786 again: 2787 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 2788 if (skb) { 2789 skb = manage_oob(skb, sk, flags, copied); 2790 if (!skb && copied) { 2791 unix_state_unlock(sk); 2792 break; 2793 } 2794 } 2795 #endif 2796 if (skb == NULL) { 2797 if (copied >= target) 2798 goto unlock; 2799 2800 /* 2801 * POSIX 1003.1g mandates this order. 2802 */ 2803 2804 err = sock_error(sk); 2805 if (err) 2806 goto unlock; 2807 if (sk->sk_shutdown & RCV_SHUTDOWN) 2808 goto unlock; 2809 2810 unix_state_unlock(sk); 2811 if (!timeo) { 2812 err = -EAGAIN; 2813 break; 2814 } 2815 2816 mutex_unlock(&u->iolock); 2817 2818 timeo = unix_stream_data_wait(sk, timeo, last, 2819 last_len, freezable); 2820 2821 if (signal_pending(current)) { 2822 err = sock_intr_errno(timeo); 2823 scm_destroy(&scm); 2824 goto out; 2825 } 2826 2827 mutex_lock(&u->iolock); 2828 goto redo; 2829 unlock: 2830 unix_state_unlock(sk); 2831 break; 2832 } 2833 2834 while (skip >= unix_skb_len(skb)) { 2835 skip -= unix_skb_len(skb); 2836 last = skb; 2837 last_len = skb->len; 2838 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2839 if (!skb) 2840 goto again; 2841 } 2842 2843 unix_state_unlock(sk); 2844 2845 if (check_creds) { 2846 /* Never glue messages from different writers */ 2847 if (!unix_skb_scm_eq(skb, &scm)) 2848 break; 2849 } else if (test_bit(SOCK_PASSCRED, &sock->flags) || 2850 test_bit(SOCK_PASSPIDFD, &sock->flags)) { 2851 /* Copy credentials */ 2852 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); 2853 unix_set_secdata(&scm, skb); 2854 check_creds = true; 2855 } 2856 2857 /* Copy address just once */ 2858 if (state->msg && state->msg->msg_name) { 2859 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, 2860 state->msg->msg_name); 2861 unix_copy_addr(state->msg, skb->sk); 2862 2863 BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, 2864 state->msg->msg_name, 2865 &state->msg->msg_namelen); 2866 2867 sunaddr = NULL; 2868 } 2869 2870 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); 2871 chunk = state->recv_actor(skb, skip, chunk, state); 2872 if (chunk < 0) { 2873 if (copied == 0) 2874 copied = -EFAULT; 2875 break; 2876 } 2877 copied += chunk; 2878 size -= chunk; 2879 2880 /* Mark read part of skb as used */ 2881 if (!(flags & MSG_PEEK)) { 2882 UNIXCB(skb).consumed += chunk; 2883 2884 sk_peek_offset_bwd(sk, chunk); 2885 2886 if (UNIXCB(skb).fp) { 2887 scm_stat_del(sk, skb); 2888 unix_detach_fds(&scm, skb); 2889 } 2890 2891 if (unix_skb_len(skb)) 2892 break; 2893 2894 skb_unlink(skb, &sk->sk_receive_queue); 2895 consume_skb(skb); 2896 2897 if (scm.fp) 2898 break; 2899 } else { 2900 /* It is questionable, see note in unix_dgram_recvmsg. 
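 * (on MSG_PEEK the passed file descriptors are cloned rather than consumed)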
2901 */ 2902 if (UNIXCB(skb).fp) 2903 unix_peek_fds(&scm, skb); 2904 2905 sk_peek_offset_fwd(sk, chunk); 2906 2907 if (UNIXCB(skb).fp) 2908 break; 2909 2910 skip = 0; 2911 last = skb; 2912 last_len = skb->len; 2913 unix_state_lock(sk); 2914 skb = skb_peek_next(skb, &sk->sk_receive_queue); 2915 if (skb) 2916 goto again; 2917 unix_state_unlock(sk); 2918 break; 2919 } 2920 } while (size); 2921 2922 mutex_unlock(&u->iolock); 2923 if (state->msg) 2924 scm_recv_unix(sock, state->msg, &scm, flags); 2925 else 2926 scm_destroy(&scm); 2927 out: 2928 return copied ? : err; 2929 } 2930 2931 static int unix_stream_read_actor(struct sk_buff *skb, 2932 int skip, int chunk, 2933 struct unix_stream_read_state *state) 2934 { 2935 int ret; 2936 2937 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, 2938 state->msg, chunk); 2939 return ret ?: chunk; 2940 } 2941 2942 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, 2943 size_t size, int flags) 2944 { 2945 struct unix_stream_read_state state = { 2946 .recv_actor = unix_stream_read_actor, 2947 .socket = sk->sk_socket, 2948 .msg = msg, 2949 .size = size, 2950 .flags = flags 2951 }; 2952 2953 return unix_stream_read_generic(&state, true); 2954 } 2955 2956 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, 2957 size_t size, int flags) 2958 { 2959 struct unix_stream_read_state state = { 2960 .recv_actor = unix_stream_read_actor, 2961 .socket = sock, 2962 .msg = msg, 2963 .size = size, 2964 .flags = flags 2965 }; 2966 2967 #ifdef CONFIG_BPF_SYSCALL 2968 struct sock *sk = sock->sk; 2969 const struct proto *prot = READ_ONCE(sk->sk_prot); 2970 2971 if (prot != &unix_stream_proto) 2972 return prot->recvmsg(sk, msg, size, flags, NULL); 2973 #endif 2974 return unix_stream_read_generic(&state, true); 2975 } 2976 2977 static int unix_stream_splice_actor(struct sk_buff *skb, 2978 int skip, int chunk, 2979 struct unix_stream_read_state *state) 2980 { 2981 return skb_splice_bits(skb, state->socket->sk, 2982 UNIXCB(skb).consumed + skip, 2983 state->pipe, chunk, state->splice_flags); 2984 } 2985 2986 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, 2987 struct pipe_inode_info *pipe, 2988 size_t size, unsigned int flags) 2989 { 2990 struct unix_stream_read_state state = { 2991 .recv_actor = unix_stream_splice_actor, 2992 .socket = sock, 2993 .pipe = pipe, 2994 .size = size, 2995 .splice_flags = flags, 2996 }; 2997 2998 if (unlikely(*ppos)) 2999 return -ESPIPE; 3000 3001 if (sock->file->f_flags & O_NONBLOCK || 3002 flags & SPLICE_F_NONBLOCK) 3003 state.flags = MSG_DONTWAIT; 3004 3005 return unix_stream_read_generic(&state, false); 3006 } 3007 3008 static int unix_shutdown(struct socket *sock, int mode) 3009 { 3010 struct sock *sk = sock->sk; 3011 struct sock *other; 3012 3013 if (mode < SHUT_RD || mode > SHUT_RDWR) 3014 return -EINVAL; 3015 /* This maps: 3016 * SHUT_RD (0) -> RCV_SHUTDOWN (1) 3017 * SHUT_WR (1) -> SEND_SHUTDOWN (2) 3018 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3) 3019 */ 3020 ++mode; 3021 3022 unix_state_lock(sk); 3023 WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); 3024 other = unix_peer(sk); 3025 if (other) 3026 sock_hold(other); 3027 unix_state_unlock(sk); 3028 sk->sk_state_change(sk); 3029 3030 if (other && 3031 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { 3032 3033 int peer_mode = 0; 3034 const struct proto *prot = READ_ONCE(other->sk_prot); 3035 3036 if (prot->unhash) 3037 prot->unhash(other); 3038 if (mode&RCV_SHUTDOWN) 3039 peer_mode |= SEND_SHUTDOWN; 3040 if 
(mode&SEND_SHUTDOWN) 3041 peer_mode |= RCV_SHUTDOWN; 3042 unix_state_lock(other); 3043 WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); 3044 unix_state_unlock(other); 3045 other->sk_state_change(other); 3046 if (peer_mode == SHUTDOWN_MASK) 3047 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); 3048 else if (peer_mode & RCV_SHUTDOWN) 3049 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); 3050 } 3051 if (other) 3052 sock_put(other); 3053 3054 return 0; 3055 } 3056 3057 long unix_inq_len(struct sock *sk) 3058 { 3059 struct sk_buff *skb; 3060 long amount = 0; 3061 3062 if (READ_ONCE(sk->sk_state) == TCP_LISTEN) 3063 return -EINVAL; 3064 3065 spin_lock(&sk->sk_receive_queue.lock); 3066 if (sk->sk_type == SOCK_STREAM || 3067 sk->sk_type == SOCK_SEQPACKET) { 3068 skb_queue_walk(&sk->sk_receive_queue, skb) 3069 amount += unix_skb_len(skb); 3070 } else { 3071 skb = skb_peek(&sk->sk_receive_queue); 3072 if (skb) 3073 amount = skb->len; 3074 } 3075 spin_unlock(&sk->sk_receive_queue.lock); 3076 3077 return amount; 3078 } 3079 EXPORT_SYMBOL_GPL(unix_inq_len); 3080 3081 long unix_outq_len(struct sock *sk) 3082 { 3083 return sk_wmem_alloc_get(sk); 3084 } 3085 EXPORT_SYMBOL_GPL(unix_outq_len); 3086 3087 static int unix_open_file(struct sock *sk) 3088 { 3089 struct path path; 3090 struct file *f; 3091 int fd; 3092 3093 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3094 return -EPERM; 3095 3096 if (!smp_load_acquire(&unix_sk(sk)->addr)) 3097 return -ENOENT; 3098 3099 path = unix_sk(sk)->path; 3100 if (!path.dentry) 3101 return -ENOENT; 3102 3103 path_get(&path); 3104 3105 fd = get_unused_fd_flags(O_CLOEXEC); 3106 if (fd < 0) 3107 goto out; 3108 3109 f = dentry_open(&path, O_PATH, current_cred()); 3110 if (IS_ERR(f)) { 3111 put_unused_fd(fd); 3112 fd = PTR_ERR(f); 3113 goto out; 3114 } 3115 3116 fd_install(fd, f); 3117 out: 3118 path_put(&path); 3119 3120 return fd; 3121 } 3122 3123 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3124 { 3125 struct sock *sk = sock->sk; 3126 long amount = 0; 3127 int err; 3128 3129 switch (cmd) { 3130 case SIOCOUTQ: 3131 amount = unix_outq_len(sk); 3132 err = put_user(amount, (int __user *)arg); 3133 break; 3134 case SIOCINQ: 3135 amount = unix_inq_len(sk); 3136 if (amount < 0) 3137 err = amount; 3138 else 3139 err = put_user(amount, (int __user *)arg); 3140 break; 3141 case SIOCUNIXFILE: 3142 err = unix_open_file(sk); 3143 break; 3144 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3145 case SIOCATMARK: 3146 { 3147 struct unix_sock *u = unix_sk(sk); 3148 struct sk_buff *skb; 3149 int answ = 0; 3150 3151 mutex_lock(&u->iolock); 3152 3153 skb = skb_peek(&sk->sk_receive_queue); 3154 if (skb) { 3155 struct sk_buff *oob_skb = READ_ONCE(u->oob_skb); 3156 struct sk_buff *next_skb; 3157 3158 next_skb = skb_peek_next(skb, &sk->sk_receive_queue); 3159 3160 if (skb == oob_skb || 3161 (!unix_skb_len(skb) && 3162 (!oob_skb || next_skb == oob_skb))) 3163 answ = 1; 3164 } 3165 3166 mutex_unlock(&u->iolock); 3167 3168 err = put_user(answ, (int __user *)arg); 3169 } 3170 break; 3171 #endif 3172 default: 3173 err = -ENOIOCTLCMD; 3174 break; 3175 } 3176 return err; 3177 } 3178 3179 #ifdef CONFIG_COMPAT 3180 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3181 { 3182 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); 3183 } 3184 #endif 3185 3186 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait) 3187 { 3188 struct sock *sk = sock->sk; 3189 unsigned char state; 3190 __poll_t 
mask; 3191 u8 shutdown; 3192 3193 sock_poll_wait(file, sock, wait); 3194 mask = 0; 3195 shutdown = READ_ONCE(sk->sk_shutdown); 3196 state = READ_ONCE(sk->sk_state); 3197 3198 /* exceptional events? */ 3199 if (READ_ONCE(sk->sk_err)) 3200 mask |= EPOLLERR; 3201 if (shutdown == SHUTDOWN_MASK) 3202 mask |= EPOLLHUP; 3203 if (shutdown & RCV_SHUTDOWN) 3204 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3205 3206 /* readable? */ 3207 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3208 mask |= EPOLLIN | EPOLLRDNORM; 3209 if (sk_is_readable(sk)) 3210 mask |= EPOLLIN | EPOLLRDNORM; 3211 #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 3212 if (READ_ONCE(unix_sk(sk)->oob_skb)) 3213 mask |= EPOLLPRI; 3214 #endif 3215 3216 /* Connection-based need to check for termination and startup */ 3217 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && 3218 state == TCP_CLOSE) 3219 mask |= EPOLLHUP; 3220 3221 /* 3222 * we set writable also when the other side has shut down the 3223 * connection. This prevents stuck sockets. 3224 */ 3225 if (unix_writable(sk, state)) 3226 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3227 3228 return mask; 3229 } 3230 3231 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, 3232 poll_table *wait) 3233 { 3234 struct sock *sk = sock->sk, *other; 3235 unsigned int writable; 3236 unsigned char state; 3237 __poll_t mask; 3238 u8 shutdown; 3239 3240 sock_poll_wait(file, sock, wait); 3241 mask = 0; 3242 shutdown = READ_ONCE(sk->sk_shutdown); 3243 state = READ_ONCE(sk->sk_state); 3244 3245 /* exceptional events? */ 3246 if (READ_ONCE(sk->sk_err) || 3247 !skb_queue_empty_lockless(&sk->sk_error_queue)) 3248 mask |= EPOLLERR | 3249 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); 3250 3251 if (shutdown & RCV_SHUTDOWN) 3252 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; 3253 if (shutdown == SHUTDOWN_MASK) 3254 mask |= EPOLLHUP; 3255 3256 /* readable? */ 3257 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 3258 mask |= EPOLLIN | EPOLLRDNORM; 3259 if (sk_is_readable(sk)) 3260 mask |= EPOLLIN | EPOLLRDNORM; 3261 3262 /* Connection-based need to check for termination and startup */ 3263 if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE) 3264 mask |= EPOLLHUP; 3265 3266 /* No write status requested, avoid expensive OUT tests. 
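 *
 * For a connected datagram socket the expensive part is the peer
 * receive-queue check and the peer_wait registration done below,
 * so skip all of it unless the caller polled for writability.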
*/ 3267 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT))) 3268 return mask; 3269 3270 writable = unix_writable(sk, state); 3271 if (writable) { 3272 unix_state_lock(sk); 3273 3274 other = unix_peer(sk); 3275 if (other && unix_peer(other) != sk && 3276 unix_recvq_full_lockless(other) && 3277 unix_dgram_peer_wake_me(sk, other)) 3278 writable = 0; 3279 3280 unix_state_unlock(sk); 3281 } 3282 3283 if (writable) 3284 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; 3285 else 3286 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 3287 3288 return mask; 3289 } 3290 3291 #ifdef CONFIG_PROC_FS 3292 3293 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) 3294 3295 #define get_bucket(x) ((x) >> BUCKET_SPACE) 3296 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) 3297 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) 3298 3299 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) 3300 { 3301 unsigned long offset = get_offset(*pos); 3302 unsigned long bucket = get_bucket(*pos); 3303 unsigned long count = 0; 3304 struct sock *sk; 3305 3306 for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); 3307 sk; sk = sk_next(sk)) { 3308 if (++count == offset) 3309 break; 3310 } 3311 3312 return sk; 3313 } 3314 3315 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) 3316 { 3317 unsigned long bucket = get_bucket(*pos); 3318 struct net *net = seq_file_net(seq); 3319 struct sock *sk; 3320 3321 while (bucket < UNIX_HASH_SIZE) { 3322 spin_lock(&net->unx.table.locks[bucket]); 3323 3324 sk = unix_from_bucket(seq, pos); 3325 if (sk) 3326 return sk; 3327 3328 spin_unlock(&net->unx.table.locks[bucket]); 3329 3330 *pos = set_bucket_offset(++bucket, 1); 3331 } 3332 3333 return NULL; 3334 } 3335 3336 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, 3337 loff_t *pos) 3338 { 3339 unsigned long bucket = get_bucket(*pos); 3340 3341 sk = sk_next(sk); 3342 if (sk) 3343 return sk; 3344 3345 3346 spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); 3347 3348 *pos = set_bucket_offset(++bucket, 1); 3349 3350 return unix_get_first(seq, pos); 3351 } 3352 3353 static void *unix_seq_start(struct seq_file *seq, loff_t *pos) 3354 { 3355 if (!*pos) 3356 return SEQ_START_TOKEN; 3357 3358 return unix_get_first(seq, pos); 3359 } 3360 3361 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3362 { 3363 ++*pos; 3364 3365 if (v == SEQ_START_TOKEN) 3366 return unix_get_first(seq, pos); 3367 3368 return unix_get_next(seq, v, pos); 3369 } 3370 3371 static void unix_seq_stop(struct seq_file *seq, void *v) 3372 { 3373 struct sock *sk = v; 3374 3375 if (sk) 3376 spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); 3377 } 3378 3379 static int unix_seq_show(struct seq_file *seq, void *v) 3380 { 3381 3382 if (v == SEQ_START_TOKEN) 3383 seq_puts(seq, "Num RefCount Protocol Flags Type St " 3384 "Inode Path\n"); 3385 else { 3386 struct sock *s = v; 3387 struct unix_sock *u = unix_sk(s); 3388 unix_state_lock(s); 3389 3390 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu", 3391 s, 3392 refcount_read(&s->sk_refcnt), 3393 0, 3394 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, 3395 s->sk_type, 3396 s->sk_socket ? 3397 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : 3398 (s->sk_state == TCP_ESTABLISHED ? 
SS_CONNECTING : SS_DISCONNECTING), 3399 sock_i_ino(s)); 3400 3401 if (u->addr) { // under a hash table lock here 3402 int i, len; 3403 seq_putc(seq, ' '); 3404 3405 i = 0; 3406 len = u->addr->len - 3407 offsetof(struct sockaddr_un, sun_path); 3408 if (u->addr->name->sun_path[0]) { 3409 len--; 3410 } else { 3411 seq_putc(seq, '@'); 3412 i++; 3413 } 3414 for ( ; i < len; i++) 3415 seq_putc(seq, u->addr->name->sun_path[i] ?: 3416 '@'); 3417 } 3418 unix_state_unlock(s); 3419 seq_putc(seq, '\n'); 3420 } 3421 3422 return 0; 3423 } 3424 3425 static const struct seq_operations unix_seq_ops = { 3426 .start = unix_seq_start, 3427 .next = unix_seq_next, 3428 .stop = unix_seq_stop, 3429 .show = unix_seq_show, 3430 }; 3431 3432 #ifdef CONFIG_BPF_SYSCALL 3433 struct bpf_unix_iter_state { 3434 struct seq_net_private p; 3435 unsigned int cur_sk; 3436 unsigned int end_sk; 3437 unsigned int max_sk; 3438 struct sock **batch; 3439 bool st_bucket_done; 3440 }; 3441 3442 struct bpf_iter__unix { 3443 __bpf_md_ptr(struct bpf_iter_meta *, meta); 3444 __bpf_md_ptr(struct unix_sock *, unix_sk); 3445 uid_t uid __aligned(8); 3446 }; 3447 3448 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, 3449 struct unix_sock *unix_sk, uid_t uid) 3450 { 3451 struct bpf_iter__unix ctx; 3452 3453 meta->seq_num--; /* skip SEQ_START_TOKEN */ 3454 ctx.meta = meta; 3455 ctx.unix_sk = unix_sk; 3456 ctx.uid = uid; 3457 return bpf_iter_run_prog(prog, &ctx); 3458 } 3459 3460 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) 3461 3462 { 3463 struct bpf_unix_iter_state *iter = seq->private; 3464 unsigned int expected = 1; 3465 struct sock *sk; 3466 3467 sock_hold(start_sk); 3468 iter->batch[iter->end_sk++] = start_sk; 3469 3470 for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { 3471 if (iter->end_sk < iter->max_sk) { 3472 sock_hold(sk); 3473 iter->batch[iter->end_sk++] = sk; 3474 } 3475 3476 expected++; 3477 } 3478 3479 spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); 3480 3481 return expected; 3482 } 3483 3484 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) 3485 { 3486 while (iter->cur_sk < iter->end_sk) 3487 sock_put(iter->batch[iter->cur_sk++]); 3488 } 3489 3490 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, 3491 unsigned int new_batch_sz) 3492 { 3493 struct sock **new_batch; 3494 3495 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, 3496 GFP_USER | __GFP_NOWARN); 3497 if (!new_batch) 3498 return -ENOMEM; 3499 3500 bpf_iter_unix_put_batch(iter); 3501 kvfree(iter->batch); 3502 iter->batch = new_batch; 3503 iter->max_sk = new_batch_sz; 3504 3505 return 0; 3506 } 3507 3508 static struct sock *bpf_iter_unix_batch(struct seq_file *seq, 3509 loff_t *pos) 3510 { 3511 struct bpf_unix_iter_state *iter = seq->private; 3512 unsigned int expected; 3513 bool resized = false; 3514 struct sock *sk; 3515 3516 if (iter->st_bucket_done) 3517 *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); 3518 3519 again: 3520 /* Get a new batch */ 3521 iter->cur_sk = 0; 3522 iter->end_sk = 0; 3523 3524 sk = unix_get_first(seq, pos); 3525 if (!sk) 3526 return NULL; /* Done */ 3527 3528 expected = bpf_iter_unix_hold_batch(seq, sk); 3529 3530 if (iter->end_sk == expected) { 3531 iter->st_bucket_done = true; 3532 return sk; 3533 } 3534 3535 if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { 3536 resized = true; 3537 goto again; 3538 } 3539 3540 return sk; 3541 } 3542 3543 static void *bpf_iter_unix_seq_start(struct 
seq_file *seq, loff_t *pos) 3544 { 3545 if (!*pos) 3546 return SEQ_START_TOKEN; 3547 3548 /* bpf iter does not support lseek, so it always 3549 * continue from where it was stop()-ped. 3550 */ 3551 return bpf_iter_unix_batch(seq, pos); 3552 } 3553 3554 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3555 { 3556 struct bpf_unix_iter_state *iter = seq->private; 3557 struct sock *sk; 3558 3559 /* Whenever seq_next() is called, the iter->cur_sk is 3560 * done with seq_show(), so advance to the next sk in 3561 * the batch. 3562 */ 3563 if (iter->cur_sk < iter->end_sk) 3564 sock_put(iter->batch[iter->cur_sk++]); 3565 3566 ++*pos; 3567 3568 if (iter->cur_sk < iter->end_sk) 3569 sk = iter->batch[iter->cur_sk]; 3570 else 3571 sk = bpf_iter_unix_batch(seq, pos); 3572 3573 return sk; 3574 } 3575 3576 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) 3577 { 3578 struct bpf_iter_meta meta; 3579 struct bpf_prog *prog; 3580 struct sock *sk = v; 3581 uid_t uid; 3582 bool slow; 3583 int ret; 3584 3585 if (v == SEQ_START_TOKEN) 3586 return 0; 3587 3588 slow = lock_sock_fast(sk); 3589 3590 if (unlikely(sk_unhashed(sk))) { 3591 ret = SEQ_SKIP; 3592 goto unlock; 3593 } 3594 3595 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); 3596 meta.seq = seq; 3597 prog = bpf_iter_get_info(&meta, false); 3598 ret = unix_prog_seq_show(prog, &meta, v, uid); 3599 unlock: 3600 unlock_sock_fast(sk, slow); 3601 return ret; 3602 } 3603 3604 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) 3605 { 3606 struct bpf_unix_iter_state *iter = seq->private; 3607 struct bpf_iter_meta meta; 3608 struct bpf_prog *prog; 3609 3610 if (!v) { 3611 meta.seq = seq; 3612 prog = bpf_iter_get_info(&meta, true); 3613 if (prog) 3614 (void)unix_prog_seq_show(prog, &meta, v, 0); 3615 } 3616 3617 if (iter->cur_sk < iter->end_sk) 3618 bpf_iter_unix_put_batch(iter); 3619 } 3620 3621 static const struct seq_operations bpf_iter_unix_seq_ops = { 3622 .start = bpf_iter_unix_seq_start, 3623 .next = bpf_iter_unix_seq_next, 3624 .stop = bpf_iter_unix_seq_stop, 3625 .show = bpf_iter_unix_seq_show, 3626 }; 3627 #endif 3628 #endif 3629 3630 static const struct net_proto_family unix_family_ops = { 3631 .family = PF_UNIX, 3632 .create = unix_create, 3633 .owner = THIS_MODULE, 3634 }; 3635 3636 3637 static int __net_init unix_net_init(struct net *net) 3638 { 3639 int i; 3640 3641 net->unx.sysctl_max_dgram_qlen = 10; 3642 if (unix_sysctl_register(net)) 3643 goto out; 3644 3645 #ifdef CONFIG_PROC_FS 3646 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, 3647 sizeof(struct seq_net_private))) 3648 goto err_sysctl; 3649 #endif 3650 3651 net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, 3652 sizeof(spinlock_t), GFP_KERNEL); 3653 if (!net->unx.table.locks) 3654 goto err_proc; 3655 3656 net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, 3657 sizeof(struct hlist_head), 3658 GFP_KERNEL); 3659 if (!net->unx.table.buckets) 3660 goto free_locks; 3661 3662 for (i = 0; i < UNIX_HASH_SIZE; i++) { 3663 spin_lock_init(&net->unx.table.locks[i]); 3664 lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL); 3665 INIT_HLIST_HEAD(&net->unx.table.buckets[i]); 3666 } 3667 3668 return 0; 3669 3670 free_locks: 3671 kvfree(net->unx.table.locks); 3672 err_proc: 3673 #ifdef CONFIG_PROC_FS 3674 remove_proc_entry("unix", net->proc_net); 3675 err_sysctl: 3676 #endif 3677 unix_sysctl_unregister(net); 3678 out: 3679 return -ENOMEM; 3680 } 3681 3682 static void __net_exit unix_net_exit(struct net 
*net) 3683 { 3684 kvfree(net->unx.table.buckets); 3685 kvfree(net->unx.table.locks); 3686 unix_sysctl_unregister(net); 3687 remove_proc_entry("unix", net->proc_net); 3688 } 3689 3690 static struct pernet_operations unix_net_ops = { 3691 .init = unix_net_init, 3692 .exit = unix_net_exit, 3693 }; 3694 3695 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3696 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, 3697 struct unix_sock *unix_sk, uid_t uid) 3698 3699 #define INIT_BATCH_SZ 16 3700 3701 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) 3702 { 3703 struct bpf_unix_iter_state *iter = priv_data; 3704 int err; 3705 3706 err = bpf_iter_init_seq_net(priv_data, aux); 3707 if (err) 3708 return err; 3709 3710 err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); 3711 if (err) { 3712 bpf_iter_fini_seq_net(priv_data); 3713 return err; 3714 } 3715 3716 return 0; 3717 } 3718 3719 static void bpf_iter_fini_unix(void *priv_data) 3720 { 3721 struct bpf_unix_iter_state *iter = priv_data; 3722 3723 bpf_iter_fini_seq_net(priv_data); 3724 kvfree(iter->batch); 3725 } 3726 3727 static const struct bpf_iter_seq_info unix_seq_info = { 3728 .seq_ops = &bpf_iter_unix_seq_ops, 3729 .init_seq_private = bpf_iter_init_unix, 3730 .fini_seq_private = bpf_iter_fini_unix, 3731 .seq_priv_size = sizeof(struct bpf_unix_iter_state), 3732 }; 3733 3734 static const struct bpf_func_proto * 3735 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, 3736 const struct bpf_prog *prog) 3737 { 3738 switch (func_id) { 3739 case BPF_FUNC_setsockopt: 3740 return &bpf_sk_setsockopt_proto; 3741 case BPF_FUNC_getsockopt: 3742 return &bpf_sk_getsockopt_proto; 3743 default: 3744 return NULL; 3745 } 3746 } 3747 3748 static struct bpf_iter_reg unix_reg_info = { 3749 .target = "unix", 3750 .ctx_arg_info_size = 1, 3751 .ctx_arg_info = { 3752 { offsetof(struct bpf_iter__unix, unix_sk), 3753 PTR_TO_BTF_ID_OR_NULL }, 3754 }, 3755 .get_func_proto = bpf_iter_unix_get_func_proto, 3756 .seq_info = &unix_seq_info, 3757 }; 3758 3759 static void __init bpf_iter_register(void) 3760 { 3761 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; 3762 if (bpf_iter_reg_target(&unix_reg_info)) 3763 pr_warn("Warning: could not register bpf iterator unix\n"); 3764 } 3765 #endif 3766 3767 static int __init af_unix_init(void) 3768 { 3769 int i, rc = -1; 3770 3771 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); 3772 3773 for (i = 0; i < UNIX_HASH_SIZE / 2; i++) { 3774 spin_lock_init(&bsd_socket_locks[i]); 3775 INIT_HLIST_HEAD(&bsd_socket_buckets[i]); 3776 } 3777 3778 rc = proto_register(&unix_dgram_proto, 1); 3779 if (rc != 0) { 3780 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3781 goto out; 3782 } 3783 3784 rc = proto_register(&unix_stream_proto, 1); 3785 if (rc != 0) { 3786 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); 3787 proto_unregister(&unix_dgram_proto); 3788 goto out; 3789 } 3790 3791 sock_register(&unix_family_ops); 3792 register_pernet_subsys(&unix_net_ops); 3793 unix_bpf_build_proto(); 3794 3795 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) 3796 bpf_iter_register(); 3797 #endif 3798 3799 out: 3800 return rc; 3801 } 3802 3803 /* Later than subsys_initcall() because we depend on stuff initialised there */ 3804 fs_initcall(af_unix_init); 3805
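
/*
 * For illustration only: a minimal (hypothetical) userspace sketch of the
 * SCM_RIGHTS descriptor passing that unix_attach_fds()/unix_detach_fds()
 * above service on the kernel side.  Error handling is omitted and the
 * descriptor/path choices are arbitrary.
 *
 *	int sv[2], fd_to_pass = open("/dev/null", O_RDONLY);
 *	char cbuf[CMSG_SPACE(sizeof(int))], dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg;
 *
 *	memset(cbuf, 0, sizeof(cbuf));
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *	sendmsg(sv[0], &msg, 0);
 *
 * The receiver calls recvmsg() on sv[1] and finds the duplicated
 * descriptor via CMSG_FIRSTHDR()/CMSG_DATA().
 */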