xref: /linux/net/unix/af_unix.c (revision b803c4a4f78834b31ebfbbcea350473333760559)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
29  *					by above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
32  *					has been reached. This won't break
33  *					old apps and it will avoid a huge amount
34  *					of hashed socks (this is for unix_gc()
35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
74  *		  started by 0, so that this name space does not intersect
75  *		  with BSD names.
76  */
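
/* For illustration, assuming the usual <sys/un.h> definitions, the two
 * name flavours look roughly like this from userspace:
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	Filesystem name (NUL-terminated path, visible in the VFS):
 *		strcpy(a.sun_path, "/tmp/mysock");
 *		bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	Abstract name (leading 0 byte, the address length delimits the
 *	name, no trailing NUL implied):
 *		a.sun_path[0] = 0;
 *		memcpy(a.sun_path + 1, "mysock", 6);
 *		bind(fd, (struct sockaddr *)&a,
 *		     offsetof(struct sockaddr_un, sun_path) + 1 + 6);
 */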
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/init.h>
89 #include <linux/kernel.h>
90 #include <linux/mount.h>
91 #include <linux/namei.h>
92 #include <linux/poll.h>
93 #include <linux/proc_fs.h>
94 #include <linux/sched/signal.h>
95 #include <linux/security.h>
96 #include <linux/seq_file.h>
97 #include <linux/skbuff.h>
98 #include <linux/slab.h>
99 #include <linux/socket.h>
100 #include <linux/splice.h>
101 #include <linux/string.h>
102 #include <linux/uaccess.h>
103 #include <net/af_unix.h>
104 #include <net/net_namespace.h>
105 #include <net/scm.h>
106 #include <net/tcp_states.h>
107 #include <uapi/linux/sockios.h>
108 #include <uapi/linux/termios.h>
109 
110 #include "af_unix.h"
111 
112 static atomic_long_t unix_nr_socks;
113 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
114 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
115 
116 /* SMP locking strategy:
117  *    hash table is protected with spinlock.
118  *    each socket state is protected by separate spinlock.
119  */
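
/* Roughly, the lockdep cmp_fn helpers below encode this ordering:
 *   - unix_table_double_lock() takes the two bucket locks in ascending
 *     hash order.
 *   - unix_state_lock(): a listener's lock may be taken before the lock
 *     of a connecting (embryo) socket; otherwise ascending address
 *     order, as in unix_state_double_lock().
 *   - sk_receive_queue.lock: a listener's queue lock is taken before
 *     the queue lock of one of its embryos (unix_collect_skb()).
 */
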
120 #ifdef CONFIG_PROVE_LOCKING
121 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
122 
123 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
124 				  const struct lockdep_map *b)
125 {
126 	return cmp_ptr(a, b);
127 }
128 
129 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
130 				  const struct lockdep_map *_b)
131 {
132 	const struct unix_sock *a, *b;
133 
134 	a = container_of(_a, struct unix_sock, lock.dep_map);
135 	b = container_of(_b, struct unix_sock, lock.dep_map);
136 
137 	if (a->sk.sk_state == TCP_LISTEN) {
138 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
139 		 *
140 		 *   1. a is TCP_LISTEN.
141 		 *   2. b is not a.
142 		 *   3. concurrent connect(b -> a) must fail.
143 		 *
144 		 * Except for 2. & 3., the b's state can be any possible
145 		 * value due to concurrent connect() or listen().
146 		 *
147 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
148 		 * be expressed as lock_cmp_fn.
149 		 */
150 		switch (b->sk.sk_state) {
151 		case TCP_CLOSE:
152 		case TCP_ESTABLISHED:
153 		case TCP_LISTEN:
154 			return -1;
155 		default:
156 			/* Invalid case. */
157 			return 0;
158 		}
159 	}
160 
161 	/* Should never happen.  Just to be symmetric. */
162 	if (b->sk.sk_state == TCP_LISTEN) {
163 		switch (a->sk.sk_state) {
164 		case TCP_CLOSE:
165 		case TCP_ESTABLISHED:
166 			return 1;
167 		default:
168 			return 0;
169 		}
170 	}
171 
172 	/* unix_state_double_lock(): ascending address order. */
173 	return cmp_ptr(a, b);
174 }
175 
176 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
177 				  const struct lockdep_map *_b)
178 {
179 	const struct sock *a, *b;
180 
181 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
182 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
183 
184 	/* unix_collect_skb(): listener -> embryo order. */
185 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
186 		return -1;
187 
188 	/* Should never happen.  Just to be symmetric. */
189 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
190 		return 1;
191 
192 	return 0;
193 }
194 #endif
195 
196 static unsigned int unix_unbound_hash(struct sock *sk)
197 {
198 	unsigned long hash = (unsigned long)sk;
199 
200 	hash ^= hash >> 16;
201 	hash ^= hash >> 8;
202 	hash ^= sk->sk_type;
203 
204 	return hash & UNIX_HASH_MOD;
205 }
206 
207 static unsigned int unix_bsd_hash(struct inode *i)
208 {
209 	return i->i_ino & UNIX_HASH_MOD;
210 }
211 
212 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
213 				       int addr_len, int type)
214 {
215 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
216 	unsigned int hash;
217 
218 	hash = (__force unsigned int)csum_fold(csum);
219 	hash ^= hash >> 8;
220 	hash ^= type;
221 
222 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
223 }
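
/* Assuming the usual UNIX_HASH_MOD (255) and UNIX_HASH_SIZE (512)
 * definitions, the hash space is partitioned: unbound and pathname (BSD)
 * sockets hash into [0, UNIX_HASH_MOD], abstract sockets into
 * [UNIX_HASH_MOD + 1, UNIX_HASH_SIZE - 1], so the two namespaces never
 * share a bucket.  bsd_socket_buckets[] reuses only the low half of the
 * range, hence its UNIX_HASH_SIZE / 2 size.
 */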
224 
225 static void unix_table_double_lock(struct net *net,
226 				   unsigned int hash1, unsigned int hash2)
227 {
228 	if (hash1 == hash2) {
229 		spin_lock(&net->unx.table.locks[hash1]);
230 		return;
231 	}
232 
233 	if (hash1 > hash2)
234 		swap(hash1, hash2);
235 
236 	spin_lock(&net->unx.table.locks[hash1]);
237 	spin_lock(&net->unx.table.locks[hash2]);
238 }
239 
240 static void unix_table_double_unlock(struct net *net,
241 				     unsigned int hash1, unsigned int hash2)
242 {
243 	if (hash1 == hash2) {
244 		spin_unlock(&net->unx.table.locks[hash1]);
245 		return;
246 	}
247 
248 	spin_unlock(&net->unx.table.locks[hash1]);
249 	spin_unlock(&net->unx.table.locks[hash2]);
250 }
251 
252 #ifdef CONFIG_SECURITY_NETWORK
253 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
254 {
255 	UNIXCB(skb).secid = scm->secid;
256 }
257 
258 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
259 {
260 	scm->secid = UNIXCB(skb).secid;
261 }
262 
263 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
264 {
265 	return (scm->secid == UNIXCB(skb).secid);
266 }
267 #else
268 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
269 { }
270 
271 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273 
274 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
275 {
276 	return true;
277 }
278 #endif /* CONFIG_SECURITY_NETWORK */
279 
280 static inline int unix_may_send(struct sock *sk, struct sock *osk)
281 {
282 	return !unix_peer(osk) || unix_peer(osk) == sk;
283 }
284 
285 static inline int unix_recvq_full_lockless(const struct sock *sk)
286 {
287 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
288 }
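
/* Note that sk_max_ack_backlog doubles as the flow-control limit here:
 * for a listening stream/seqpacket socket it bounds the embryo queue
 * (set by unix_listen()), while for a dgram receiver it bounds queued
 * datagrams (seeded from net.unix.max_dgram_qlen in unix_create1()).
 */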
289 
290 struct sock *unix_peer_get(struct sock *s)
291 {
292 	struct sock *peer;
293 
294 	unix_state_lock(s);
295 	peer = unix_peer(s);
296 	if (peer)
297 		sock_hold(peer);
298 	unix_state_unlock(s);
299 	return peer;
300 }
301 EXPORT_SYMBOL_GPL(unix_peer_get);
302 
303 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
304 					     int addr_len)
305 {
306 	struct unix_address *addr;
307 
308 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
309 	if (!addr)
310 		return NULL;
311 
312 	refcount_set(&addr->refcnt, 1);
313 	addr->len = addr_len;
314 	memcpy(addr->name, sunaddr, addr_len);
315 
316 	return addr;
317 }
318 
319 static inline void unix_release_addr(struct unix_address *addr)
320 {
321 	if (refcount_dec_and_test(&addr->refcnt))
322 		kfree(addr);
323 }
324 
325 /*
326  *	Check unix socket name:
327  *		- it should not be zero length.
328  *	        - if it does not start with zero, it should be NUL terminated (FS object)
329  *		- if it starts with zero, it is an abstract name.
330  */
331 
332 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
333 {
334 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
335 	    addr_len > sizeof(*sunaddr))
336 		return -EINVAL;
337 
338 	if (sunaddr->sun_family != AF_UNIX)
339 		return -EINVAL;
340 
341 	return 0;
342 }
343 
344 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
345 {
346 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
347 	short offset = offsetof(struct sockaddr_storage, __data);
348 
349 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
350 
351 	/* This may look like an off by one error but it is a bit more
352 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
353 	 * sun_path[108] doesn't as such exist.  However in kernel space
354 	 * we are guaranteed that it is a valid memory location in our
355 	 * kernel address buffer because syscall functions always pass
356 	 * a pointer of struct sockaddr_storage which has a bigger buffer
357 	 * than 108.  Also, we must terminate sun_path for strlen() in
358 	 * getname_kernel().
359 	 */
360 	addr->__data[addr_len - offset] = 0;
361 
362 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
363 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
364 	 * know the actual buffer.
365 	 */
366 	return strlen(addr->__data) + offset + 1;
367 }
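
/* A worked example of a hypothetical call: with sun_path[] holding
 * "/tmp/foo" and addr_len == offsetof(struct sockaddr_un, sun_path) + 8
 * (no trailing NUL supplied by the caller), unix_mkname_bsd() stores the
 * terminating NUL at __data[8] and returns offset + 8 + 1, i.e. a length
 * that now covers the NUL as well.
 */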
368 
369 static void __unix_remove_socket(struct sock *sk)
370 {
371 	sk_del_node_init(sk);
372 }
373 
374 static void __unix_insert_socket(struct net *net, struct sock *sk)
375 {
376 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
377 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
378 }
379 
380 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
381 				 struct unix_address *addr, unsigned int hash)
382 {
383 	__unix_remove_socket(sk);
384 	smp_store_release(&unix_sk(sk)->addr, addr);
385 
386 	sk->sk_hash = hash;
387 	__unix_insert_socket(net, sk);
388 }
389 
390 static void unix_remove_socket(struct net *net, struct sock *sk)
391 {
392 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
393 	__unix_remove_socket(sk);
394 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
395 }
396 
397 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
398 {
399 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
400 	__unix_insert_socket(net, sk);
401 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
402 }
403 
404 static void unix_insert_bsd_socket(struct sock *sk)
405 {
406 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
407 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
408 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
409 }
410 
411 static void unix_remove_bsd_socket(struct sock *sk)
412 {
413 	if (!hlist_unhashed(&sk->sk_bind_node)) {
414 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
415 		__sk_del_bind_node(sk);
416 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
417 
418 		sk_node_init(&sk->sk_bind_node);
419 	}
420 }
421 
422 static struct sock *__unix_find_socket_byname(struct net *net,
423 					      struct sockaddr_un *sunname,
424 					      int len, unsigned int hash)
425 {
426 	struct sock *s;
427 
428 	sk_for_each(s, &net->unx.table.buckets[hash]) {
429 		struct unix_sock *u = unix_sk(s);
430 
431 		if (u->addr->len == len &&
432 		    !memcmp(u->addr->name, sunname, len))
433 			return s;
434 	}
435 	return NULL;
436 }
437 
438 static inline struct sock *unix_find_socket_byname(struct net *net,
439 						   struct sockaddr_un *sunname,
440 						   int len, unsigned int hash)
441 {
442 	struct sock *s;
443 
444 	spin_lock(&net->unx.table.locks[hash]);
445 	s = __unix_find_socket_byname(net, sunname, len, hash);
446 	if (s)
447 		sock_hold(s);
448 	spin_unlock(&net->unx.table.locks[hash]);
449 	return s;
450 }
451 
452 static struct sock *unix_find_socket_byinode(struct inode *i)
453 {
454 	unsigned int hash = unix_bsd_hash(i);
455 	struct sock *s;
456 
457 	spin_lock(&bsd_socket_locks[hash]);
458 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
459 		struct dentry *dentry = unix_sk(s)->path.dentry;
460 
461 		if (dentry && d_backing_inode(dentry) == i) {
462 			sock_hold(s);
463 			spin_unlock(&bsd_socket_locks[hash]);
464 			return s;
465 		}
466 	}
467 	spin_unlock(&bsd_socket_locks[hash]);
468 	return NULL;
469 }
470 
471 /* Support code for asymmetrically connected dgram sockets
472  *
473  * If a datagram socket is connected to a socket not itself connected
474  * to the first socket (eg, /dev/log), clients may only enqueue more
475  * messages if the present receive queue of the server socket is not
476  * "too large". This means there's a second writeability condition
477  * poll and sendmsg need to test. The dgram recv code will do a wake
478  * up on the peer_wait wait queue of a socket upon reception of a
479  * datagram which needs to be propagated to sleeping would-be writers
480  * since these might not have sent anything so far. This can't be
481  * accomplished via poll_wait because the lifetime of the server
482  * socket might be less than that of its clients if these break their
483  * association with it or if the server socket is closed while clients
484  * are still connected to it and there's no way to inform "a polling
485  * implementation" that it should let go of a certain wait queue
486  *
487  * In order to propagate a wake up, a wait_queue_entry_t of the client
488  * socket is enqueued on the peer_wait queue of the server socket
489  * whose wake function does a wake_up on the ordinary client socket
490  * wait queue. This connection is established whenever a write (or
491  * poll for write) hits the flow control condition and is broken when the
492  * association to the server socket is dissolved or after a wake up
493  * was relayed.
494  */
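
/* A rough sketch of the flow the helpers below implement:
 *
 *	dgram sendmsg()/poll() finds the peer's receive queue full
 *	  -> unix_dgram_peer_wake_me()
 *	       -> unix_dgram_peer_wake_connect() hooks sk's peer_wake
 *		  entry onto the peer's peer_wait queue
 *	the peer dequeues a datagram and wakes its peer_wait queue
 *	  -> unix_dgram_peer_wake_relay() unhooks the entry and wakes
 *	     the writer's own wait queue (EPOLLOUT)
 *	the writer disconnects, or after the wake up was relayed
 *	  -> unix_dgram_peer_wake_disconnect[_wakeup]() drops the hook
 */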
495 
496 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
497 				      void *key)
498 {
499 	struct unix_sock *u;
500 	wait_queue_head_t *u_sleep;
501 
502 	u = container_of(q, struct unix_sock, peer_wake);
503 
504 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
505 			    q);
506 	u->peer_wake.private = NULL;
507 
508 	/* relaying can only happen while the wq still exists */
509 	u_sleep = sk_sleep(&u->sk);
510 	if (u_sleep)
511 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
512 
513 	return 0;
514 }
515 
516 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
517 {
518 	struct unix_sock *u, *u_other;
519 	int rc;
520 
521 	u = unix_sk(sk);
522 	u_other = unix_sk(other);
523 	rc = 0;
524 	spin_lock(&u_other->peer_wait.lock);
525 
526 	if (!u->peer_wake.private) {
527 		u->peer_wake.private = other;
528 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
529 
530 		rc = 1;
531 	}
532 
533 	spin_unlock(&u_other->peer_wait.lock);
534 	return rc;
535 }
536 
537 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
538 					    struct sock *other)
539 {
540 	struct unix_sock *u, *u_other;
541 
542 	u = unix_sk(sk);
543 	u_other = unix_sk(other);
544 	spin_lock(&u_other->peer_wait.lock);
545 
546 	if (u->peer_wake.private == other) {
547 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
548 		u->peer_wake.private = NULL;
549 	}
550 
551 	spin_unlock(&u_other->peer_wait.lock);
552 }
553 
554 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
555 						   struct sock *other)
556 {
557 	unix_dgram_peer_wake_disconnect(sk, other);
558 	wake_up_interruptible_poll(sk_sleep(sk),
559 				   EPOLLOUT |
560 				   EPOLLWRNORM |
561 				   EPOLLWRBAND);
562 }
563 
564 /* preconditions:
565  *	- unix_peer(sk) == other
566  *	- association is stable
567  */
568 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
569 {
570 	int connected;
571 
572 	connected = unix_dgram_peer_wake_connect(sk, other);
573 
574 	/* If other is SOCK_DEAD, we want to make sure we signal
575 	 * POLLOUT, such that a subsequent write() can get a
576 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
577 	 * to other and it's full, we will hang waiting for POLLOUT.
578 	 */
579 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
580 		return 1;
581 
582 	if (connected)
583 		unix_dgram_peer_wake_disconnect(sk, other);
584 
585 	return 0;
586 }
587 
588 static int unix_writable(const struct sock *sk, unsigned char state)
589 {
590 	return state != TCP_LISTEN &&
591 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
592 }
593 
594 static void unix_write_space(struct sock *sk)
595 {
596 	struct socket_wq *wq;
597 
598 	rcu_read_lock();
599 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
600 		wq = rcu_dereference(sk->sk_wq);
601 		if (skwq_has_sleeper(wq))
602 			wake_up_interruptible_sync_poll(&wq->wait,
603 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
604 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
605 	}
606 	rcu_read_unlock();
607 }
608 
609 /* When a dgram socket disconnects (or changes its peer), we clear its receive
610  * queue of packets that arrived from the previous peer. First, this allows
611  * flow control based only on wmem_alloc; second, an sk connected to a peer
612  * may receive messages only from that peer. */
613 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
614 {
615 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
616 		skb_queue_purge_reason(&sk->sk_receive_queue,
617 				       SKB_DROP_REASON_UNIX_DISCONNECT);
618 
619 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
620 
621 		/* If one link of bidirectional dgram pipe is disconnected,
622 		 * we signal error. Messages are lost. Do not make this,
623 		 * when peer was not connected to us.
624 		 */
625 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
626 			WRITE_ONCE(other->sk_err, ECONNRESET);
627 			sk_error_report(other);
628 		}
629 	}
630 }
631 
632 static void unix_sock_destructor(struct sock *sk)
633 {
634 	struct unix_sock *u = unix_sk(sk);
635 
636 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
637 
638 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
639 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
640 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
641 	if (!sock_flag(sk, SOCK_DEAD)) {
642 		pr_info("Attempt to release alive unix socket: %p\n", sk);
643 		return;
644 	}
645 
646 	if (u->addr)
647 		unix_release_addr(u->addr);
648 
649 	atomic_long_dec(&unix_nr_socks);
650 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
651 #ifdef UNIX_REFCNT_DEBUG
652 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
653 		atomic_long_read(&unix_nr_socks));
654 #endif
655 }
656 
657 static void unix_release_sock(struct sock *sk, int embrion)
658 {
659 	struct unix_sock *u = unix_sk(sk);
660 	struct sock *skpair;
661 	struct sk_buff *skb;
662 	struct path path;
663 	int state;
664 
665 	unix_remove_socket(sock_net(sk), sk);
666 	unix_remove_bsd_socket(sk);
667 
668 	/* Clear state */
669 	unix_state_lock(sk);
670 	sock_orphan(sk);
671 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
672 	path	     = u->path;
673 	u->path.dentry = NULL;
674 	u->path.mnt = NULL;
675 	state = sk->sk_state;
676 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
677 
678 	skpair = unix_peer(sk);
679 	unix_peer(sk) = NULL;
680 
681 	unix_state_unlock(sk);
682 
683 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
684 	u->oob_skb = NULL;
685 #endif
686 
687 	wake_up_interruptible_all(&u->peer_wait);
688 
689 	if (skpair != NULL) {
690 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
691 			unix_state_lock(skpair);
692 			/* No more writes */
693 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
694 			if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion)
695 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
696 			unix_state_unlock(skpair);
697 			skpair->sk_state_change(skpair);
698 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
699 		}
700 
701 		unix_dgram_peer_wake_disconnect(sk, skpair);
702 		sock_put(skpair); /* It may now die */
703 	}
704 
705 	/* Try to flush out this socket. Throw out buffers at least */
706 
707 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
708 		if (state == TCP_LISTEN)
709 			unix_release_sock(skb->sk, 1);
710 
711 		/* passed fds are erased in the kfree_skb hook */
712 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
713 	}
714 
715 	if (path.dentry)
716 		path_put(&path);
717 
718 	sock_put(sk);
719 
720 	/* ---- Socket is dead now and most probably destroyed ---- */
721 
722 	/*
723 	 * Fixme: BSD difference: In BSD all sockets connected to us get
724 	 *	  ECONNRESET and we die on the spot. In Linux we behave
725 	 *	  like files and pipes do and wait for the last
726 	 *	  dereference.
727 	 *
728 	 * Can't we simply set sock->err?
729 	 *
730 	 *	  What the above comment does talk about? --ANK(980817)
731 	 */
732 
733 	if (READ_ONCE(unix_tot_inflight))
734 		unix_gc();		/* Garbage collect fds */
735 }
736 
737 static void init_peercred(struct sock *sk)
738 {
739 	sk->sk_peer_pid = get_pid(task_tgid(current));
740 	sk->sk_peer_cred = get_current_cred();
741 }
742 
743 static void update_peercred(struct sock *sk)
744 {
745 	const struct cred *old_cred;
746 	struct pid *old_pid;
747 
748 	spin_lock(&sk->sk_peer_lock);
749 	old_pid = sk->sk_peer_pid;
750 	old_cred = sk->sk_peer_cred;
751 	init_peercred(sk);
752 	spin_unlock(&sk->sk_peer_lock);
753 
754 	put_pid(old_pid);
755 	put_cred(old_cred);
756 }
757 
758 static void copy_peercred(struct sock *sk, struct sock *peersk)
759 {
760 	lockdep_assert_held(&unix_sk(peersk)->lock);
761 
762 	spin_lock(&sk->sk_peer_lock);
763 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
764 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
765 	spin_unlock(&sk->sk_peer_lock);
766 }
767 
768 static int unix_listen(struct socket *sock, int backlog)
769 {
770 	int err;
771 	struct sock *sk = sock->sk;
772 	struct unix_sock *u = unix_sk(sk);
773 
774 	err = -EOPNOTSUPP;
775 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
776 		goto out;	/* Only stream/seqpacket sockets accept */
777 	err = -EINVAL;
778 	if (!READ_ONCE(u->addr))
779 		goto out;	/* No listens on an unbound socket */
780 	unix_state_lock(sk);
781 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
782 		goto out_unlock;
783 	if (backlog > sk->sk_max_ack_backlog)
784 		wake_up_interruptible_all(&u->peer_wait);
785 	sk->sk_max_ack_backlog	= backlog;
786 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
787 
788 	/* set credentials so connect can copy them */
789 	update_peercred(sk);
790 	err = 0;
791 
792 out_unlock:
793 	unix_state_unlock(sk);
794 out:
795 	return err;
796 }
797 
798 static int unix_release(struct socket *);
799 static int unix_bind(struct socket *, struct sockaddr *, int);
800 static int unix_stream_connect(struct socket *, struct sockaddr *,
801 			       int addr_len, int flags);
802 static int unix_socketpair(struct socket *, struct socket *);
803 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
804 static int unix_getname(struct socket *, struct sockaddr *, int);
805 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
806 static __poll_t unix_dgram_poll(struct file *, struct socket *,
807 				    poll_table *);
808 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
809 #ifdef CONFIG_COMPAT
810 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
811 #endif
812 static int unix_shutdown(struct socket *, int);
813 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
814 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
815 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
816 				       struct pipe_inode_info *, size_t size,
817 				       unsigned int flags);
818 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
819 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
820 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
821 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
822 static int unix_dgram_connect(struct socket *, struct sockaddr *,
823 			      int, int);
824 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
825 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
826 				  int);
827 
828 #ifdef CONFIG_PROC_FS
829 static int unix_count_nr_fds(struct sock *sk)
830 {
831 	struct sk_buff *skb;
832 	struct unix_sock *u;
833 	int nr_fds = 0;
834 
835 	spin_lock(&sk->sk_receive_queue.lock);
836 	skb = skb_peek(&sk->sk_receive_queue);
837 	while (skb) {
838 		u = unix_sk(skb->sk);
839 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
840 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
841 	}
842 	spin_unlock(&sk->sk_receive_queue.lock);
843 
844 	return nr_fds;
845 }
846 
847 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
848 {
849 	struct sock *sk = sock->sk;
850 	unsigned char s_state;
851 	struct unix_sock *u;
852 	int nr_fds = 0;
853 
854 	if (sk) {
855 		s_state = READ_ONCE(sk->sk_state);
856 		u = unix_sk(sk);
857 
858 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
859 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
860 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
861 		 */
862 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
863 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
864 		else if (s_state == TCP_LISTEN)
865 			nr_fds = unix_count_nr_fds(sk);
866 
867 		seq_printf(m, "scm_fds: %u\n", nr_fds);
868 	}
869 }
870 #else
871 #define unix_show_fdinfo NULL
872 #endif
873 
874 static const struct proto_ops unix_stream_ops = {
875 	.family =	PF_UNIX,
876 	.owner =	THIS_MODULE,
877 	.release =	unix_release,
878 	.bind =		unix_bind,
879 	.connect =	unix_stream_connect,
880 	.socketpair =	unix_socketpair,
881 	.accept =	unix_accept,
882 	.getname =	unix_getname,
883 	.poll =		unix_poll,
884 	.ioctl =	unix_ioctl,
885 #ifdef CONFIG_COMPAT
886 	.compat_ioctl =	unix_compat_ioctl,
887 #endif
888 	.listen =	unix_listen,
889 	.shutdown =	unix_shutdown,
890 	.sendmsg =	unix_stream_sendmsg,
891 	.recvmsg =	unix_stream_recvmsg,
892 	.read_skb =	unix_stream_read_skb,
893 	.mmap =		sock_no_mmap,
894 	.splice_read =	unix_stream_splice_read,
895 	.set_peek_off =	sk_set_peek_off,
896 	.show_fdinfo =	unix_show_fdinfo,
897 };
898 
899 static const struct proto_ops unix_dgram_ops = {
900 	.family =	PF_UNIX,
901 	.owner =	THIS_MODULE,
902 	.release =	unix_release,
903 	.bind =		unix_bind,
904 	.connect =	unix_dgram_connect,
905 	.socketpair =	unix_socketpair,
906 	.accept =	sock_no_accept,
907 	.getname =	unix_getname,
908 	.poll =		unix_dgram_poll,
909 	.ioctl =	unix_ioctl,
910 #ifdef CONFIG_COMPAT
911 	.compat_ioctl =	unix_compat_ioctl,
912 #endif
913 	.listen =	sock_no_listen,
914 	.shutdown =	unix_shutdown,
915 	.sendmsg =	unix_dgram_sendmsg,
916 	.read_skb =	unix_read_skb,
917 	.recvmsg =	unix_dgram_recvmsg,
918 	.mmap =		sock_no_mmap,
919 	.set_peek_off =	sk_set_peek_off,
920 	.show_fdinfo =	unix_show_fdinfo,
921 };
922 
923 static const struct proto_ops unix_seqpacket_ops = {
924 	.family =	PF_UNIX,
925 	.owner =	THIS_MODULE,
926 	.release =	unix_release,
927 	.bind =		unix_bind,
928 	.connect =	unix_stream_connect,
929 	.socketpair =	unix_socketpair,
930 	.accept =	unix_accept,
931 	.getname =	unix_getname,
932 	.poll =		unix_dgram_poll,
933 	.ioctl =	unix_ioctl,
934 #ifdef CONFIG_COMPAT
935 	.compat_ioctl =	unix_compat_ioctl,
936 #endif
937 	.listen =	unix_listen,
938 	.shutdown =	unix_shutdown,
939 	.sendmsg =	unix_seqpacket_sendmsg,
940 	.recvmsg =	unix_seqpacket_recvmsg,
941 	.mmap =		sock_no_mmap,
942 	.set_peek_off =	sk_set_peek_off,
943 	.show_fdinfo =	unix_show_fdinfo,
944 };
945 
946 static void unix_close(struct sock *sk, long timeout)
947 {
948 	/* Nothing to do here, unix socket does not need a ->close().
949 	 * This is merely for sockmap.
950 	 */
951 }
952 
953 static bool unix_bpf_bypass_getsockopt(int level, int optname)
954 {
955 	if (level == SOL_SOCKET) {
956 		switch (optname) {
957 		case SO_PEERPIDFD:
958 			return true;
959 		default:
960 			return false;
961 		}
962 	}
963 
964 	return false;
965 }
966 
967 struct proto unix_dgram_proto = {
968 	.name			= "UNIX",
969 	.owner			= THIS_MODULE,
970 	.obj_size		= sizeof(struct unix_sock),
971 	.close			= unix_close,
972 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
973 #ifdef CONFIG_BPF_SYSCALL
974 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
975 #endif
976 };
977 
978 struct proto unix_stream_proto = {
979 	.name			= "UNIX-STREAM",
980 	.owner			= THIS_MODULE,
981 	.obj_size		= sizeof(struct unix_sock),
982 	.close			= unix_close,
983 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
984 #ifdef CONFIG_BPF_SYSCALL
985 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
986 #endif
987 };
988 
989 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
990 {
991 	struct unix_sock *u;
992 	struct sock *sk;
993 	int err;
994 
995 	atomic_long_inc(&unix_nr_socks);
996 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
997 		err = -ENFILE;
998 		goto err;
999 	}
1000 
1001 	if (type == SOCK_STREAM)
1002 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1003 	else /*dgram and  seqpacket */
1004 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1005 
1006 	if (!sk) {
1007 		err = -ENOMEM;
1008 		goto err;
1009 	}
1010 
1011 	sock_init_data(sock, sk);
1012 
1013 	sk->sk_hash		= unix_unbound_hash(sk);
1014 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1015 	sk->sk_write_space	= unix_write_space;
1016 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1017 	sk->sk_destruct		= unix_sock_destructor;
1018 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1019 
1020 	u = unix_sk(sk);
1021 	u->listener = NULL;
1022 	u->vertex = NULL;
1023 	u->path.dentry = NULL;
1024 	u->path.mnt = NULL;
1025 	spin_lock_init(&u->lock);
1026 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1027 	mutex_init(&u->iolock); /* single task reading lock */
1028 	mutex_init(&u->bindlock); /* single task binding lock */
1029 	init_waitqueue_head(&u->peer_wait);
1030 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1031 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1032 	unix_insert_unbound_socket(net, sk);
1033 
1034 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1035 
1036 	return sk;
1037 
1038 err:
1039 	atomic_long_dec(&unix_nr_socks);
1040 	return ERR_PTR(err);
1041 }
1042 
1043 static int unix_create(struct net *net, struct socket *sock, int protocol,
1044 		       int kern)
1045 {
1046 	struct sock *sk;
1047 
1048 	if (protocol && protocol != PF_UNIX)
1049 		return -EPROTONOSUPPORT;
1050 
1051 	sock->state = SS_UNCONNECTED;
1052 
1053 	switch (sock->type) {
1054 	case SOCK_STREAM:
1055 		sock->ops = &unix_stream_ops;
1056 		break;
1057 		/*
1058 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1059 		 *	nothing uses it.
1060 		 */
1061 	case SOCK_RAW:
1062 		sock->type = SOCK_DGRAM;
1063 		fallthrough;
1064 	case SOCK_DGRAM:
1065 		sock->ops = &unix_dgram_ops;
1066 		break;
1067 	case SOCK_SEQPACKET:
1068 		sock->ops = &unix_seqpacket_ops;
1069 		break;
1070 	default:
1071 		return -ESOCKTNOSUPPORT;
1072 	}
1073 
1074 	sk = unix_create1(net, sock, kern, sock->type);
1075 	if (IS_ERR(sk))
1076 		return PTR_ERR(sk);
1077 
1078 	return 0;
1079 }
1080 
1081 static int unix_release(struct socket *sock)
1082 {
1083 	struct sock *sk = sock->sk;
1084 
1085 	if (!sk)
1086 		return 0;
1087 
1088 	sk->sk_prot->close(sk, 0);
1089 	unix_release_sock(sk, 0);
1090 	sock->sk = NULL;
1091 
1092 	return 0;
1093 }
1094 
1095 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1096 				  int type)
1097 {
1098 	struct inode *inode;
1099 	struct path path;
1100 	struct sock *sk;
1101 	int err;
1102 
1103 	unix_mkname_bsd(sunaddr, addr_len);
1104 	err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1105 	if (err)
1106 		goto fail;
1107 
1108 	err = path_permission(&path, MAY_WRITE);
1109 	if (err)
1110 		goto path_put;
1111 
1112 	err = -ECONNREFUSED;
1113 	inode = d_backing_inode(path.dentry);
1114 	if (!S_ISSOCK(inode->i_mode))
1115 		goto path_put;
1116 
1117 	sk = unix_find_socket_byinode(inode);
1118 	if (!sk)
1119 		goto path_put;
1120 
1121 	err = -EPROTOTYPE;
1122 	if (sk->sk_type == type)
1123 		touch_atime(&path);
1124 	else
1125 		goto sock_put;
1126 
1127 	path_put(&path);
1128 
1129 	return sk;
1130 
1131 sock_put:
1132 	sock_put(sk);
1133 path_put:
1134 	path_put(&path);
1135 fail:
1136 	return ERR_PTR(err);
1137 }
1138 
1139 static struct sock *unix_find_abstract(struct net *net,
1140 				       struct sockaddr_un *sunaddr,
1141 				       int addr_len, int type)
1142 {
1143 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1144 	struct dentry *dentry;
1145 	struct sock *sk;
1146 
1147 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1148 	if (!sk)
1149 		return ERR_PTR(-ECONNREFUSED);
1150 
1151 	dentry = unix_sk(sk)->path.dentry;
1152 	if (dentry)
1153 		touch_atime(&unix_sk(sk)->path);
1154 
1155 	return sk;
1156 }
1157 
1158 static struct sock *unix_find_other(struct net *net,
1159 				    struct sockaddr_un *sunaddr,
1160 				    int addr_len, int type)
1161 {
1162 	struct sock *sk;
1163 
1164 	if (sunaddr->sun_path[0])
1165 		sk = unix_find_bsd(sunaddr, addr_len, type);
1166 	else
1167 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1168 
1169 	return sk;
1170 }
1171 
1172 static int unix_autobind(struct sock *sk)
1173 {
1174 	struct unix_sock *u = unix_sk(sk);
1175 	unsigned int new_hash, old_hash;
1176 	struct net *net = sock_net(sk);
1177 	struct unix_address *addr;
1178 	u32 lastnum, ordernum;
1179 	int err;
1180 
1181 	err = mutex_lock_interruptible(&u->bindlock);
1182 	if (err)
1183 		return err;
1184 
1185 	if (u->addr)
1186 		goto out;
1187 
1188 	err = -ENOMEM;
1189 	addr = kzalloc(sizeof(*addr) +
1190 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1191 	if (!addr)
1192 		goto out;
1193 
1194 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1195 	addr->name->sun_family = AF_UNIX;
1196 	refcount_set(&addr->refcnt, 1);
1197 
1198 	old_hash = sk->sk_hash;
1199 	ordernum = get_random_u32();
1200 	lastnum = ordernum & 0xFFFFF;
1201 retry:
1202 	ordernum = (ordernum + 1) & 0xFFFFF;
1203 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1204 
1205 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1206 	unix_table_double_lock(net, old_hash, new_hash);
1207 
1208 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1209 		unix_table_double_unlock(net, old_hash, new_hash);
1210 
1211 		/* __unix_find_socket_byname() may take long time if many names
1212 		 * are already in use.
1213 		 */
1214 		cond_resched();
1215 
1216 		if (ordernum == lastnum) {
1217 			/* Give up if all names seem to be in use. */
1218 			err = -ENOSPC;
1219 			unix_release_addr(addr);
1220 			goto out;
1221 		}
1222 
1223 		goto retry;
1224 	}
1225 
1226 	__unix_set_addr_hash(net, sk, addr, new_hash);
1227 	unix_table_double_unlock(net, old_hash, new_hash);
1228 	err = 0;
1229 
1230 out:	mutex_unlock(&u->bindlock);
1231 	return err;
1232 }
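
/* Autobind assigns an abstract name of the form "\0xxxxx" (five hex
 * digits), hence addr->len == offsetof(struct sockaddr_un, sun_path) + 6
 * above.  The 20-bit ordernum space allows at most 0x100000 candidates
 * to be tried before -ENOSPC is returned.
 */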
1233 
1234 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1235 			 int addr_len)
1236 {
1237 	umode_t mode = S_IFSOCK |
1238 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1239 	struct unix_sock *u = unix_sk(sk);
1240 	unsigned int new_hash, old_hash;
1241 	struct net *net = sock_net(sk);
1242 	struct mnt_idmap *idmap;
1243 	struct unix_address *addr;
1244 	struct dentry *dentry;
1245 	struct path parent;
1246 	int err;
1247 
1248 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1249 	addr = unix_create_addr(sunaddr, addr_len);
1250 	if (!addr)
1251 		return -ENOMEM;
1252 
1253 	/*
1254 	 * Get the parent directory, calculate the hash for last
1255 	 * component.
1256 	 */
1257 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1258 	if (IS_ERR(dentry)) {
1259 		err = PTR_ERR(dentry);
1260 		goto out;
1261 	}
1262 
1263 	/*
1264 	 * All right, let's create it.
1265 	 */
1266 	idmap = mnt_idmap(parent.mnt);
1267 	err = security_path_mknod(&parent, dentry, mode, 0);
1268 	if (!err)
1269 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1270 	if (err)
1271 		goto out_path;
1272 	err = mutex_lock_interruptible(&u->bindlock);
1273 	if (err)
1274 		goto out_unlink;
1275 	if (u->addr)
1276 		goto out_unlock;
1277 
1278 	old_hash = sk->sk_hash;
1279 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1280 	unix_table_double_lock(net, old_hash, new_hash);
1281 	u->path.mnt = mntget(parent.mnt);
1282 	u->path.dentry = dget(dentry);
1283 	__unix_set_addr_hash(net, sk, addr, new_hash);
1284 	unix_table_double_unlock(net, old_hash, new_hash);
1285 	unix_insert_bsd_socket(sk);
1286 	mutex_unlock(&u->bindlock);
1287 	done_path_create(&parent, dentry);
1288 	return 0;
1289 
1290 out_unlock:
1291 	mutex_unlock(&u->bindlock);
1292 	err = -EINVAL;
1293 out_unlink:
1294 	/* failed after successful mknod?  unlink what we'd created... */
1295 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1296 out_path:
1297 	done_path_create(&parent, dentry);
1298 out:
1299 	unix_release_addr(addr);
1300 	return err == -EEXIST ? -EADDRINUSE : err;
1301 }
1302 
1303 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1304 			      int addr_len)
1305 {
1306 	struct unix_sock *u = unix_sk(sk);
1307 	unsigned int new_hash, old_hash;
1308 	struct net *net = sock_net(sk);
1309 	struct unix_address *addr;
1310 	int err;
1311 
1312 	addr = unix_create_addr(sunaddr, addr_len);
1313 	if (!addr)
1314 		return -ENOMEM;
1315 
1316 	err = mutex_lock_interruptible(&u->bindlock);
1317 	if (err)
1318 		goto out;
1319 
1320 	if (u->addr) {
1321 		err = -EINVAL;
1322 		goto out_mutex;
1323 	}
1324 
1325 	old_hash = sk->sk_hash;
1326 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1327 	unix_table_double_lock(net, old_hash, new_hash);
1328 
1329 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1330 		goto out_spin;
1331 
1332 	__unix_set_addr_hash(net, sk, addr, new_hash);
1333 	unix_table_double_unlock(net, old_hash, new_hash);
1334 	mutex_unlock(&u->bindlock);
1335 	return 0;
1336 
1337 out_spin:
1338 	unix_table_double_unlock(net, old_hash, new_hash);
1339 	err = -EADDRINUSE;
1340 out_mutex:
1341 	mutex_unlock(&u->bindlock);
1342 out:
1343 	unix_release_addr(addr);
1344 	return err;
1345 }
1346 
1347 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1348 {
1349 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1350 	struct sock *sk = sock->sk;
1351 	int err;
1352 
1353 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1354 	    sunaddr->sun_family == AF_UNIX)
1355 		return unix_autobind(sk);
1356 
1357 	err = unix_validate_addr(sunaddr, addr_len);
1358 	if (err)
1359 		return err;
1360 
1361 	if (sunaddr->sun_path[0])
1362 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1363 	else
1364 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1365 
1366 	return err;
1367 }
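
/* Dispatch rule: a bare address (addr_len == offsetof(struct sockaddr_un,
 * sun_path), i.e. just sun_family) triggers autobind; a non-zero first
 * path byte selects the filesystem namespace (unix_bind_bsd()); a leading
 * zero byte selects the abstract namespace (unix_bind_abstract()).
 */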
1368 
1369 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1370 {
1371 	if (unlikely(sk1 == sk2) || !sk2) {
1372 		unix_state_lock(sk1);
1373 		return;
1374 	}
1375 
1376 	if (sk1 > sk2)
1377 		swap(sk1, sk2);
1378 
1379 	unix_state_lock(sk1);
1380 	unix_state_lock(sk2);
1381 }
1382 
1383 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1384 {
1385 	if (unlikely(sk1 == sk2) || !sk2) {
1386 		unix_state_unlock(sk1);
1387 		return;
1388 	}
1389 	unix_state_unlock(sk1);
1390 	unix_state_unlock(sk2);
1391 }
1392 
1393 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1394 			      int alen, int flags)
1395 {
1396 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1397 	struct sock *sk = sock->sk;
1398 	struct sock *other;
1399 	int err;
1400 
1401 	err = -EINVAL;
1402 	if (alen < offsetofend(struct sockaddr, sa_family))
1403 		goto out;
1404 
1405 	if (addr->sa_family != AF_UNSPEC) {
1406 		err = unix_validate_addr(sunaddr, alen);
1407 		if (err)
1408 			goto out;
1409 
1410 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1411 		if (err)
1412 			goto out;
1413 
1414 		if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1415 		     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1416 		    !READ_ONCE(unix_sk(sk)->addr)) {
1417 			err = unix_autobind(sk);
1418 			if (err)
1419 				goto out;
1420 		}
1421 
1422 restart:
1423 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1424 		if (IS_ERR(other)) {
1425 			err = PTR_ERR(other);
1426 			goto out;
1427 		}
1428 
1429 		unix_state_double_lock(sk, other);
1430 
1431 		/* Apparently VFS overslept socket death. Retry. */
1432 		if (sock_flag(other, SOCK_DEAD)) {
1433 			unix_state_double_unlock(sk, other);
1434 			sock_put(other);
1435 			goto restart;
1436 		}
1437 
1438 		err = -EPERM;
1439 		if (!unix_may_send(sk, other))
1440 			goto out_unlock;
1441 
1442 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1443 		if (err)
1444 			goto out_unlock;
1445 
1446 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1447 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1448 	} else {
1449 		/*
1450 		 *	1003.1g breaking connected state with AF_UNSPEC
1451 		 */
1452 		other = NULL;
1453 		unix_state_double_lock(sk, other);
1454 	}
1455 
1456 	/*
1457 	 * If it was connected, reconnect.
1458 	 */
1459 	if (unix_peer(sk)) {
1460 		struct sock *old_peer = unix_peer(sk);
1461 
1462 		unix_peer(sk) = other;
1463 		if (!other)
1464 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1465 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1466 
1467 		unix_state_double_unlock(sk, other);
1468 
1469 		if (other != old_peer) {
1470 			unix_dgram_disconnected(sk, old_peer);
1471 
1472 			unix_state_lock(old_peer);
1473 			if (!unix_peer(old_peer))
1474 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1475 			unix_state_unlock(old_peer);
1476 		}
1477 
1478 		sock_put(old_peer);
1479 	} else {
1480 		unix_peer(sk) = other;
1481 		unix_state_double_unlock(sk, other);
1482 	}
1483 
1484 	return 0;
1485 
1486 out_unlock:
1487 	unix_state_double_unlock(sk, other);
1488 	sock_put(other);
1489 out:
1490 	return err;
1491 }
1492 
1493 static long unix_wait_for_peer(struct sock *other, long timeo)
1494 {
1495 	struct unix_sock *u = unix_sk(other);
1496 	int sched;
1497 	DEFINE_WAIT(wait);
1498 
1499 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1500 
1501 	sched = !sock_flag(other, SOCK_DEAD) &&
1502 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1503 		unix_recvq_full_lockless(other);
1504 
1505 	unix_state_unlock(other);
1506 
1507 	if (sched)
1508 		timeo = schedule_timeout(timeo);
1509 
1510 	finish_wait(&u->peer_wait, &wait);
1511 	return timeo;
1512 }
1513 
1514 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1515 			       int addr_len, int flags)
1516 {
1517 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1518 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1519 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1520 	struct net *net = sock_net(sk);
1521 	struct sk_buff *skb = NULL;
1522 	unsigned char state;
1523 	long timeo;
1524 	int err;
1525 
1526 	err = unix_validate_addr(sunaddr, addr_len);
1527 	if (err)
1528 		goto out;
1529 
1530 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1531 	if (err)
1532 		goto out;
1533 
1534 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1535 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1536 	    !READ_ONCE(u->addr)) {
1537 		err = unix_autobind(sk);
1538 		if (err)
1539 			goto out;
1540 	}
1541 
1542 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1543 
1544 	/* First of all allocate resources.
1545 	 * If we will make it after state is locked,
1546 	 * we will have to recheck all again in any case.
1547 	 */
1548 
1549 	/* create new sock for complete connection */
1550 	newsk = unix_create1(net, NULL, 0, sock->type);
1551 	if (IS_ERR(newsk)) {
1552 		err = PTR_ERR(newsk);
1553 		goto out;
1554 	}
1555 
1556 	/* Allocate skb for sending to listening sock */
1557 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1558 	if (!skb) {
1559 		err = -ENOMEM;
1560 		goto out_free_sk;
1561 	}
1562 
1563 restart:
1564 	/*  Find listening sock. */
1565 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1566 	if (IS_ERR(other)) {
1567 		err = PTR_ERR(other);
1568 		goto out_free_skb;
1569 	}
1570 
1571 	unix_state_lock(other);
1572 
1573 	/* Apparently VFS overslept socket death. Retry. */
1574 	if (sock_flag(other, SOCK_DEAD)) {
1575 		unix_state_unlock(other);
1576 		sock_put(other);
1577 		goto restart;
1578 	}
1579 
1580 	if (other->sk_state != TCP_LISTEN ||
1581 	    other->sk_shutdown & RCV_SHUTDOWN) {
1582 		err = -ECONNREFUSED;
1583 		goto out_unlock;
1584 	}
1585 
1586 	if (unix_recvq_full_lockless(other)) {
1587 		if (!timeo) {
1588 			err = -EAGAIN;
1589 			goto out_unlock;
1590 		}
1591 
1592 		timeo = unix_wait_for_peer(other, timeo);
1593 		sock_put(other);
1594 
1595 		err = sock_intr_errno(timeo);
1596 		if (signal_pending(current))
1597 			goto out_free_skb;
1598 
1599 		goto restart;
1600 	}
1601 
1602 	/* self connect and simultaneous connect are eliminated
1603 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1604 	 */
1605 	state = READ_ONCE(sk->sk_state);
1606 	if (unlikely(state != TCP_CLOSE)) {
1607 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1608 		goto out_unlock;
1609 	}
1610 
1611 	unix_state_lock(sk);
1612 
1613 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1614 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1615 		unix_state_unlock(sk);
1616 		goto out_unlock;
1617 	}
1618 
1619 	err = security_unix_stream_connect(sk, other, newsk);
1620 	if (err) {
1621 		unix_state_unlock(sk);
1622 		goto out_unlock;
1623 	}
1624 
1625 	/* The way is open! Fastly set all the necessary fields... */
1626 
1627 	sock_hold(sk);
1628 	unix_peer(newsk)	= sk;
1629 	newsk->sk_state		= TCP_ESTABLISHED;
1630 	newsk->sk_type		= sk->sk_type;
1631 	init_peercred(newsk);
1632 	newu = unix_sk(newsk);
1633 	newu->listener = other;
1634 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1635 	otheru = unix_sk(other);
1636 
1637 	/* copy address information from listening to new sock
1638 	 *
1639 	 * The contents of *(otheru->addr) and otheru->path
1640 	 * are seen fully set up here, since we have found
1641 	 * otheru in hash under its lock.  Insertion into the
1642 	 * hash chain we'd found it in had been done in an
1643 	 * earlier critical area protected by the chain's lock,
1644 	 * the same one where we'd set *(otheru->addr) contents,
1645 	 * as well as otheru->path and otheru->addr itself.
1646 	 *
1647 	 * Using smp_store_release() here to set newu->addr
1648 	 * is enough to make those stores, as well as stores
1649 	 * to newu->path visible to anyone who gets newu->addr
1650 	 * by smp_load_acquire().  IOW, the same warranties
1651 	 * as for unix_sock instances bound in unix_bind() or
1652 	 * in unix_autobind().
1653 	 */
1654 	if (otheru->path.dentry) {
1655 		path_get(&otheru->path);
1656 		newu->path = otheru->path;
1657 	}
1658 	refcount_inc(&otheru->addr->refcnt);
1659 	smp_store_release(&newu->addr, otheru->addr);
1660 
1661 	/* Set credentials */
1662 	copy_peercred(sk, other);
1663 
1664 	sock->state	= SS_CONNECTED;
1665 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1666 	sock_hold(newsk);
1667 
1668 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1669 	unix_peer(sk)	= newsk;
1670 
1671 	unix_state_unlock(sk);
1672 
1673 	/* take ten and send info to listening sock */
1674 	spin_lock(&other->sk_receive_queue.lock);
1675 	__skb_queue_tail(&other->sk_receive_queue, skb);
1676 	spin_unlock(&other->sk_receive_queue.lock);
1677 	unix_state_unlock(other);
1678 	other->sk_data_ready(other);
1679 	sock_put(other);
1680 	return 0;
1681 
1682 out_unlock:
1683 	unix_state_unlock(other);
1684 	sock_put(other);
1685 out_free_skb:
1686 	consume_skb(skb);
1687 out_free_sk:
1688 	unix_release_sock(newsk, 0);
1689 out:
1690 	return err;
1691 }
1692 
1693 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1694 {
1695 	struct sock *ska = socka->sk, *skb = sockb->sk;
1696 
1697 	/* Join our sockets back to back */
1698 	sock_hold(ska);
1699 	sock_hold(skb);
1700 	unix_peer(ska) = skb;
1701 	unix_peer(skb) = ska;
1702 	init_peercred(ska);
1703 	init_peercred(skb);
1704 
1705 	ska->sk_state = TCP_ESTABLISHED;
1706 	skb->sk_state = TCP_ESTABLISHED;
1707 	socka->state  = SS_CONNECTED;
1708 	sockb->state  = SS_CONNECTED;
1709 	return 0;
1710 }
1711 
1712 static void unix_sock_inherit_flags(const struct socket *old,
1713 				    struct socket *new)
1714 {
1715 	if (test_bit(SOCK_PASSCRED, &old->flags))
1716 		set_bit(SOCK_PASSCRED, &new->flags);
1717 	if (test_bit(SOCK_PASSPIDFD, &old->flags))
1718 		set_bit(SOCK_PASSPIDFD, &new->flags);
1719 	if (test_bit(SOCK_PASSSEC, &old->flags))
1720 		set_bit(SOCK_PASSSEC, &new->flags);
1721 }
1722 
1723 static int unix_accept(struct socket *sock, struct socket *newsock,
1724 		       struct proto_accept_arg *arg)
1725 {
1726 	struct sock *sk = sock->sk;
1727 	struct sk_buff *skb;
1728 	struct sock *tsk;
1729 
1730 	arg->err = -EOPNOTSUPP;
1731 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1732 		goto out;
1733 
1734 	arg->err = -EINVAL;
1735 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1736 		goto out;
1737 
1738 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1739 	 * so that no locks are necessary.
1740 	 */
1741 
1742 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1743 				&arg->err);
1744 	if (!skb) {
1745 		/* This means receive shutdown. */
1746 		if (arg->err == 0)
1747 			arg->err = -EINVAL;
1748 		goto out;
1749 	}
1750 
1751 	tsk = skb->sk;
1752 	skb_free_datagram(sk, skb);
1753 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1754 
1755 	/* attach accepted sock to socket */
1756 	unix_state_lock(tsk);
1757 	unix_update_edges(unix_sk(tsk));
1758 	newsock->state = SS_CONNECTED;
1759 	unix_sock_inherit_flags(sock, newsock);
1760 	sock_graft(tsk, newsock);
1761 	unix_state_unlock(tsk);
1762 	return 0;
1763 
1764 out:
1765 	return arg->err;
1766 }
1767 
1768 
1769 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1770 {
1771 	struct sock *sk = sock->sk;
1772 	struct unix_address *addr;
1773 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1774 	int err = 0;
1775 
1776 	if (peer) {
1777 		sk = unix_peer_get(sk);
1778 
1779 		err = -ENOTCONN;
1780 		if (!sk)
1781 			goto out;
1782 		err = 0;
1783 	} else {
1784 		sock_hold(sk);
1785 	}
1786 
1787 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1788 	if (!addr) {
1789 		sunaddr->sun_family = AF_UNIX;
1790 		sunaddr->sun_path[0] = 0;
1791 		err = offsetof(struct sockaddr_un, sun_path);
1792 	} else {
1793 		err = addr->len;
1794 		memcpy(sunaddr, addr->name, addr->len);
1795 
1796 		if (peer)
1797 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1798 					       CGROUP_UNIX_GETPEERNAME);
1799 		else
1800 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1801 					       CGROUP_UNIX_GETSOCKNAME);
1802 	}
1803 	sock_put(sk);
1804 out:
1805 	return err;
1806 }
1807 
1808 /* The "user->unix_inflight" variable is protected by the garbage
1809  * collection lock, and we just read it locklessly here. If you go
1810  * over the limit, there might be a tiny race in actually noticing
1811  * it across threads. Tough.
1812  */
1813 static inline bool too_many_unix_fds(struct task_struct *p)
1814 {
1815 	struct user_struct *user = current_user();
1816 
1817 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1818 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1819 	return false;
1820 }
1821 
1822 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1823 {
1824 	if (too_many_unix_fds(current))
1825 		return -ETOOMANYREFS;
1826 
1827 	UNIXCB(skb).fp = scm->fp;
1828 	scm->fp = NULL;
1829 
1830 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1831 		return -ENOMEM;
1832 
1833 	return 0;
1834 }
1835 
1836 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1837 {
1838 	scm->fp = UNIXCB(skb).fp;
1839 	UNIXCB(skb).fp = NULL;
1840 
1841 	unix_destroy_fpl(scm->fp);
1842 }
1843 
1844 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1845 {
1846 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1847 }
1848 
1849 static void unix_destruct_scm(struct sk_buff *skb)
1850 {
1851 	struct scm_cookie scm;
1852 
1853 	memset(&scm, 0, sizeof(scm));
1854 	scm.pid  = UNIXCB(skb).pid;
1855 	if (UNIXCB(skb).fp)
1856 		unix_detach_fds(&scm, skb);
1857 
1858 	/* Alas, it calls VFS */
1859 	/* So fscking what? fput() had been SMP-safe since the last Summer */
1860 	scm_destroy(&scm);
1861 	sock_wfree(skb);
1862 }
1863 
1864 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1865 {
1866 	int err = 0;
1867 
1868 	UNIXCB(skb).pid  = get_pid(scm->pid);
1869 	UNIXCB(skb).uid = scm->creds.uid;
1870 	UNIXCB(skb).gid = scm->creds.gid;
1871 	UNIXCB(skb).fp = NULL;
1872 	unix_get_secdata(scm, skb);
1873 	if (scm->fp && send_fds)
1874 		err = unix_attach_fds(scm, skb);
1875 
1876 	skb->destructor = unix_destruct_scm;
1877 	return err;
1878 }
1879 
1880 static bool unix_passcred_enabled(const struct socket *sock,
1881 				  const struct sock *other)
1882 {
1883 	return test_bit(SOCK_PASSCRED, &sock->flags) ||
1884 	       test_bit(SOCK_PASSPIDFD, &sock->flags) ||
1885 	       !other->sk_socket ||
1886 	       test_bit(SOCK_PASSCRED, &other->sk_socket->flags) ||
1887 	       test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags);
1888 }
1889 
1890 /*
1891  * Some apps rely on write() giving SCM_CREDENTIALS.
1892  * We include credentials if the source or destination socket
1893  * asserted SOCK_PASSCRED (or SOCK_PASSPIDFD).
1894  */
1895 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1896 			    const struct sock *other)
1897 {
1898 	if (UNIXCB(skb).pid)
1899 		return;
1900 	if (unix_passcred_enabled(sock, other)) {
1901 		UNIXCB(skb).pid  = get_pid(task_tgid(current));
1902 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1903 	}
1904 }
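/* Receivers opt in to the credentials added above with SO_PASSCRED (or
 * SO_PASSPIDFD) and then read them back as SCM_CREDENTIALS (or SCM_PIDFD)
 * control messages.  An illustrative userspace sketch of the opt-in side
 * (unix_sock_fd is a placeholder) might look like:
 *
 *	int one = 1;
 *
 *	setsockopt(unix_sock_fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *
 * after which recvmsg() can find a struct ucred in an SCM_CREDENTIALS
 * cmsg attached to each message.
 */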
1905 
1906 static bool unix_skb_scm_eq(struct sk_buff *skb,
1907 			    struct scm_cookie *scm)
1908 {
1909 	return UNIXCB(skb).pid == scm->pid &&
1910 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1911 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1912 	       unix_secdata_eq(scm, skb);
1913 }
1914 
1915 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1916 {
1917 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1918 	struct unix_sock *u = unix_sk(sk);
1919 
1920 	if (unlikely(fp && fp->count)) {
1921 		atomic_add(fp->count, &u->scm_stat.nr_fds);
1922 		unix_add_edges(fp, u);
1923 	}
1924 }
1925 
1926 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1927 {
1928 	struct scm_fp_list *fp = UNIXCB(skb).fp;
1929 	struct unix_sock *u = unix_sk(sk);
1930 
1931 	if (unlikely(fp && fp->count)) {
1932 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
1933 		unix_del_edges(fp);
1934 	}
1935 }
1936 
1937 /*
1938  *	Send AF_UNIX data.
1939  */
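/* Roughly: copy control data and credentials into an skb, look up the
 * destination (explicit address or connected peer), and queue the skb on
 * the receiver's sk_receive_queue, blocking or returning -EAGAIN when the
 * receiver's backlog is full, and re-doing the lookup if a named
 * destination died while we were waiting.
 */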
1940 
1941 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1942 			      size_t len)
1943 {
1944 	struct sock *sk = sock->sk, *other = NULL;
1945 	struct unix_sock *u = unix_sk(sk);
1946 	struct scm_cookie scm;
1947 	struct sk_buff *skb;
1948 	int data_len = 0;
1949 	int sk_locked;
1950 	long timeo;
1951 	int err;
1952 
1953 	err = scm_send(sock, msg, &scm, false);
1954 	if (err < 0)
1955 		return err;
1956 
1957 	wait_for_unix_gc(scm.fp);
1958 
1959 	if (msg->msg_flags & MSG_OOB) {
1960 		err = -EOPNOTSUPP;
1961 		goto out;
1962 	}
1963 
1964 	if (msg->msg_namelen) {
1965 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
1966 		if (err)
1967 			goto out;
1968 
1969 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
1970 							    msg->msg_name,
1971 							    &msg->msg_namelen,
1972 							    NULL);
1973 		if (err)
1974 			goto out;
1975 	}
1976 
1977 	if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
1978 	     test_bit(SOCK_PASSPIDFD, &sock->flags)) &&
1979 	    !READ_ONCE(u->addr)) {
1980 		err = unix_autobind(sk);
1981 		if (err)
1982 			goto out;
1983 	}
1984 
1985 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
1986 		err = -EMSGSIZE;
1987 		goto out;
1988 	}
1989 
1990 	if (len > SKB_MAX_ALLOC) {
1991 		data_len = min_t(size_t,
1992 				 len - SKB_MAX_ALLOC,
1993 				 MAX_SKB_FRAGS * PAGE_SIZE);
1994 		data_len = PAGE_ALIGN(data_len);
1995 
1996 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1997 	}
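	/* For example, assuming SKB_MAX_ALLOC were 16 KiB on a 4 KiB-page
	 * system and len were 20 KiB, data_len would be PAGE_ALIGN(4 KiB) =
	 * 4 KiB, so sock_alloc_send_pskb() below builds a 16 KiB linear area
	 * plus one 4 KiB page fragment.
	 */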
1998 
1999 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2000 				   msg->msg_flags & MSG_DONTWAIT, &err,
2001 				   PAGE_ALLOC_COSTLY_ORDER);
2002 	if (!skb)
2003 		goto out;
2004 
2005 	err = unix_scm_to_skb(&scm, skb, true);
2006 	if (err < 0)
2007 		goto out_free;
2008 
2009 	skb_put(skb, len - data_len);
2010 	skb->data_len = data_len;
2011 	skb->len = len;
2012 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2013 	if (err)
2014 		goto out_free;
2015 
2016 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2017 
2018 	if (msg->msg_namelen) {
2019 lookup:
2020 		other = unix_find_other(sock_net(sk), msg->msg_name,
2021 					msg->msg_namelen, sk->sk_type);
2022 		if (IS_ERR(other)) {
2023 			err = PTR_ERR(other);
2024 			goto out_free;
2025 		}
2026 	} else {
2027 		other = unix_peer_get(sk);
2028 		if (!other) {
2029 			err = -ENOTCONN;
2030 			goto out_free;
2031 		}
2032 	}
2033 
2034 	if (sk_filter(other, skb) < 0) {
2035 		/* Toss the packet but do not return any error to the sender */
2036 		err = len;
2037 		goto out_sock_put;
2038 	}
2039 
2040 restart:
2041 	sk_locked = 0;
2042 	unix_state_lock(other);
2043 restart_locked:
2044 
2045 	if (!unix_may_send(sk, other)) {
2046 		err = -EPERM;
2047 		goto out_unlock;
2048 	}
2049 
2050 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2051 		/* Check with POSIX 1003.1g - what should a datagram error do here? */
2052 
2053 		unix_state_unlock(other);
2054 
2055 		if (sk->sk_type == SOCK_SEQPACKET) {
2056 			/* We get here only when racing with unix_release_sock()
2057 			 * while it is clearing @other. Unlike SOCK_DGRAM,
2058 			 * never change the state to TCP_CLOSE.
2059 			 */
2060 			err = -EPIPE;
2061 			goto out_sock_put;
2062 		}
2063 
2064 		if (!sk_locked)
2065 			unix_state_lock(sk);
2066 
2067 		if (unix_peer(sk) == other) {
2068 			unix_peer(sk) = NULL;
2069 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2070 
2071 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2072 			unix_state_unlock(sk);
2073 
2074 			unix_dgram_disconnected(sk, other);
2075 			sock_put(other);
2076 			err = -ECONNREFUSED;
2077 			goto out_sock_put;
2078 		}
2079 
2080 		unix_state_unlock(sk);
2081 
2082 		if (!msg->msg_namelen) {
2083 			err = -ECONNRESET;
2084 			goto out_sock_put;
2085 		}
2086 
2087 		sock_put(other);
2088 		goto lookup;
2089 	}
2090 
2091 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2092 		err = -EPIPE;
2093 		goto out_unlock;
2094 	}
2095 
2096 	if (sk->sk_type != SOCK_SEQPACKET) {
2097 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2098 		if (err)
2099 			goto out_unlock;
2100 	}
2101 
2102 	/* other == sk && unix_peer(other) != sk can happen if
2103 	 * - unix_peer(sk) == NULL and the destination address is bound to sk, or
2104 	 * - unix_peer(sk) == sk at the time of the get but disconnected before the lock.
2105 	 */
2106 	if (other != sk &&
2107 	    unlikely(unix_peer(other) != sk &&
2108 	    unix_recvq_full_lockless(other))) {
2109 		if (timeo) {
2110 			timeo = unix_wait_for_peer(other, timeo);
2111 
2112 			err = sock_intr_errno(timeo);
2113 			if (signal_pending(current))
2114 				goto out_sock_put;
2115 
2116 			goto restart;
2117 		}
2118 
2119 		if (!sk_locked) {
2120 			unix_state_unlock(other);
2121 			unix_state_double_lock(sk, other);
2122 		}
2123 
2124 		if (unix_peer(sk) != other ||
2125 		    unix_dgram_peer_wake_me(sk, other)) {
2126 			err = -EAGAIN;
2127 			sk_locked = 1;
2128 			goto out_unlock;
2129 		}
2130 
2131 		if (!sk_locked) {
2132 			sk_locked = 1;
2133 			goto restart_locked;
2134 		}
2135 	}
2136 
2137 	if (unlikely(sk_locked))
2138 		unix_state_unlock(sk);
2139 
2140 	if (sock_flag(other, SOCK_RCVTSTAMP))
2141 		__net_timestamp(skb);
2142 	maybe_add_creds(skb, sock, other);
2143 	scm_stat_add(other, skb);
2144 	skb_queue_tail(&other->sk_receive_queue, skb);
2145 	unix_state_unlock(other);
2146 	other->sk_data_ready(other);
2147 	sock_put(other);
2148 	scm_destroy(&scm);
2149 	return len;
2150 
2151 out_unlock:
2152 	if (sk_locked)
2153 		unix_state_unlock(sk);
2154 	unix_state_unlock(other);
2155 out_sock_put:
2156 	sock_put(other);
2157 out_free:
2158 	consume_skb(skb);
2159 out:
2160 	scm_destroy(&scm);
2161 	return err;
2162 }
2163 
2164 /* We use paged skbs for stream sockets, and limit the paged area to 32768
2165  * bytes, with a minimum of one full page.
2166  */
2167 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
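/* With 4 KiB pages, get_order(32768) is 3, so UNIX_SKB_FRAGS_SZ is 32 KiB;
 * with 64 KiB pages get_order(32768) is 0 and the limit is a single page,
 * which is where the "minimum of one full page" above comes from.
 */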
2168 
2169 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2170 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other,
2171 		     struct scm_cookie *scm, bool fds_sent)
2172 {
2173 	struct unix_sock *ousk = unix_sk(other);
2174 	struct sk_buff *skb;
2175 	int err;
2176 
2177 	skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2178 
2179 	if (!skb)
2180 		return err;
2181 
2182 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2183 	if (err < 0)
2184 		goto out;
2185 
2186 	skb_put(skb, 1);
2187 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2188 
2189 	if (err)
2190 		goto out;
2191 
2192 	unix_state_lock(other);
2193 
2194 	if (sock_flag(other, SOCK_DEAD) ||
2195 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2196 		unix_state_unlock(other);
2197 		err = -EPIPE;
2198 		goto out;
2199 	}
2200 
2201 	maybe_add_creds(skb, sock, other);
2202 	scm_stat_add(other, skb);
2203 
2204 	spin_lock(&other->sk_receive_queue.lock);
2205 	WRITE_ONCE(ousk->oob_skb, skb);
2206 	__skb_queue_tail(&other->sk_receive_queue, skb);
2207 	spin_unlock(&other->sk_receive_queue.lock);
2208 
2209 	sk_send_sigurg(other);
2210 	unix_state_unlock(other);
2211 	other->sk_data_ready(other);
2212 
2213 	return 0;
2214 out:
2215 	consume_skb(skb);
2216 	return err;
2217 }
2218 #endif
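/* AF_UNIX out-of-band data (CONFIG_AF_UNIX_OOB) mirrors the TCP model: only
 * the latest OOB byte is readable with recv(..., MSG_OOB), or inline when
 * SO_OOBINLINE is set.  An illustrative exchange (tx_fd/rx_fd are
 * placeholders) might look like:
 *
 *	send(tx_fd, "x", 1, MSG_OOB);
 *	...
 *	char c;
 *	recv(rx_fd, &c, 1, MSG_OOB);
 */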
2219 
2220 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2221 			       size_t len)
2222 {
2223 	struct sock *sk = sock->sk;
2224 	struct sk_buff *skb = NULL;
2225 	struct sock *other = NULL;
2226 	struct scm_cookie scm;
2227 	bool fds_sent = false;
2228 	int err, sent = 0;
2229 
2230 	err = scm_send(sock, msg, &scm, false);
2231 	if (err < 0)
2232 		return err;
2233 
2234 	wait_for_unix_gc(scm.fp);
2235 
2236 	if (msg->msg_flags & MSG_OOB) {
2237 		err = -EOPNOTSUPP;
2238 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2239 		if (len)
2240 			len--;
2241 		else
2242 #endif
2243 			goto out_err;
2244 	}
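	/* With MSG_OOB, the final byte of the buffer is held back here and
	 * sent separately through queue_oob() after the regular data below.
	 */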
2245 
2246 	if (msg->msg_namelen) {
2247 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2248 		goto out_err;
2249 	} else {
2250 		other = unix_peer(sk);
2251 		if (!other) {
2252 			err = -ENOTCONN;
2253 			goto out_err;
2254 		}
2255 	}
2256 
2257 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2258 		goto out_pipe;
2259 
2260 	while (sent < len) {
2261 		int size = len - sent;
2262 		int data_len;
2263 
2264 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2265 			skb = sock_alloc_send_pskb(sk, 0, 0,
2266 						   msg->msg_flags & MSG_DONTWAIT,
2267 						   &err, 0);
2268 		} else {
2269 			/* Keep two messages in the pipe so it schedules better */
2270 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2271 
2272 			/* allow fallback to order-0 allocations */
2273 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2274 
2275 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2276 
2277 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2278 
2279 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2280 						   msg->msg_flags & MSG_DONTWAIT, &err,
2281 						   get_order(UNIX_SKB_FRAGS_SZ));
2282 		}
2283 		if (!skb)
2284 			goto out_err;
2285 
2286 		/* Only send the fds in the first buffer */
2287 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2288 		if (err < 0)
2289 			goto out_free;
2290 
2291 		fds_sent = true;
2292 
2293 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2294 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2295 			err = skb_splice_from_iter(skb, &msg->msg_iter, size,
2296 						   sk->sk_allocation);
2297 			if (err < 0)
2298 				goto out_free;
2299 
2300 			size = err;
2301 			refcount_add(size, &sk->sk_wmem_alloc);
2302 		} else {
2303 			skb_put(skb, size - data_len);
2304 			skb->data_len = data_len;
2305 			skb->len = size;
2306 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2307 			if (err)
2308 				goto out_free;
2309 		}
2310 
2311 		unix_state_lock(other);
2312 
2313 		if (sock_flag(other, SOCK_DEAD) ||
2314 		    (other->sk_shutdown & RCV_SHUTDOWN))
2315 			goto out_pipe_unlock;
2316 
2317 		maybe_add_creds(skb, sock, other);
2318 		scm_stat_add(other, skb);
2319 		skb_queue_tail(&other->sk_receive_queue, skb);
2320 		unix_state_unlock(other);
2321 		other->sk_data_ready(other);
2322 		sent += size;
2323 	}
2324 
2325 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2326 	if (msg->msg_flags & MSG_OOB) {
2327 		err = queue_oob(sock, msg, other, &scm, fds_sent);
2328 		if (err)
2329 			goto out_err;
2330 		sent++;
2331 	}
2332 #endif
2333 
2334 	scm_destroy(&scm);
2335 
2336 	return sent;
2337 
2338 out_pipe_unlock:
2339 	unix_state_unlock(other);
2340 out_pipe:
2341 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2342 		send_sig(SIGPIPE, current, 0);
2343 	err = -EPIPE;
2344 out_free:
2345 	consume_skb(skb);
2346 out_err:
2347 	scm_destroy(&scm);
2348 	return sent ? : err;
2349 }
2350 
2351 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2352 				  size_t len)
2353 {
2354 	int err;
2355 	struct sock *sk = sock->sk;
2356 
2357 	err = sock_error(sk);
2358 	if (err)
2359 		return err;
2360 
2361 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2362 		return -ENOTCONN;
2363 
2364 	if (msg->msg_namelen)
2365 		msg->msg_namelen = 0;
2366 
2367 	return unix_dgram_sendmsg(sock, msg, len);
2368 }
2369 
2370 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2371 				  size_t size, int flags)
2372 {
2373 	struct sock *sk = sock->sk;
2374 
2375 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2376 		return -ENOTCONN;
2377 
2378 	return unix_dgram_recvmsg(sock, msg, size, flags);
2379 }
2380 
2381 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2382 {
2383 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2384 
2385 	if (addr) {
2386 		msg->msg_namelen = addr->len;
2387 		memcpy(msg->msg_name, addr->name, addr->len);
2388 	}
2389 }
2390 
2391 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2392 			 int flags)
2393 {
2394 	struct scm_cookie scm;
2395 	struct socket *sock = sk->sk_socket;
2396 	struct unix_sock *u = unix_sk(sk);
2397 	struct sk_buff *skb, *last;
2398 	long timeo;
2399 	int skip;
2400 	int err;
2401 
2402 	err = -EOPNOTSUPP;
2403 	if (flags & MSG_OOB)
2404 		goto out;
2405 
2406 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2407 
2408 	do {
2409 		mutex_lock(&u->iolock);
2410 
2411 		skip = sk_peek_offset(sk, flags);
2412 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2413 					      &skip, &err, &last);
2414 		if (skb) {
2415 			if (!(flags & MSG_PEEK))
2416 				scm_stat_del(sk, skb);
2417 			break;
2418 		}
2419 
2420 		mutex_unlock(&u->iolock);
2421 
2422 		if (err != -EAGAIN)
2423 			break;
2424 	} while (timeo &&
2425 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2426 					      &err, &timeo, last));
2427 
2428 	if (!skb) { /* implies iolock unlocked */
2429 		unix_state_lock(sk);
2430 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2431 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2432 		    (sk->sk_shutdown & RCV_SHUTDOWN))
2433 			err = 0;
2434 		unix_state_unlock(sk);
2435 		goto out;
2436 	}
2437 
2438 	if (wq_has_sleeper(&u->peer_wait))
2439 		wake_up_interruptible_sync_poll(&u->peer_wait,
2440 						EPOLLOUT | EPOLLWRNORM |
2441 						EPOLLWRBAND);
2442 
2443 	if (msg->msg_name) {
2444 		unix_copy_addr(msg, skb->sk);
2445 
2446 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2447 						      msg->msg_name,
2448 						      &msg->msg_namelen);
2449 	}
2450 
2451 	if (size > skb->len - skip)
2452 		size = skb->len - skip;
2453 	else if (size < skb->len - skip)
2454 		msg->msg_flags |= MSG_TRUNC;
2455 
2456 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2457 	if (err)
2458 		goto out_free;
2459 
2460 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2461 		__sock_recv_timestamp(msg, sk, skb);
2462 
2463 	memset(&scm, 0, sizeof(scm));
2464 
2465 	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2466 	unix_set_secdata(&scm, skb);
2467 
2468 	if (!(flags & MSG_PEEK)) {
2469 		if (UNIXCB(skb).fp)
2470 			unix_detach_fds(&scm, skb);
2471 
2472 		sk_peek_offset_bwd(sk, skb->len);
2473 	} else {
2474 		/* It is questionable what to do on PEEK. We could:
2475 		   - not return fds - good, but too simplistic 8)
2476 		   - return fds, but not return them again on the subsequent
2477 		     read (the old strategy, apparently wrong)
2478 		   - clone the fds (chosen here, as it is the most universal
2479 		     solution)
2480 
2481 		   POSIX 1003.1g does not actually define this clearly
2482 		   at all. POSIX 1003.1g doesn't define a lot of things
2483 		   clearly, however!
2484 
2485 		*/
2486 
2487 		sk_peek_offset_fwd(sk, size);
2488 
2489 		if (UNIXCB(skb).fp)
2490 			unix_peek_fds(&scm, skb);
2491 	}
2492 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2493 
2494 	scm_recv_unix(sock, msg, &scm, flags);
2495 
2496 out_free:
2497 	skb_free_datagram(sk, skb);
2498 	mutex_unlock(&u->iolock);
2499 out:
2500 	return err;
2501 }
2502 
2503 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2504 			      int flags)
2505 {
2506 	struct sock *sk = sock->sk;
2507 
2508 #ifdef CONFIG_BPF_SYSCALL
2509 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2510 
2511 	if (prot != &unix_dgram_proto)
2512 		return prot->recvmsg(sk, msg, size, flags, NULL);
2513 #endif
2514 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2515 }
2516 
2517 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2518 {
2519 	struct unix_sock *u = unix_sk(sk);
2520 	struct sk_buff *skb;
2521 	int err;
2522 
2523 	mutex_lock(&u->iolock);
2524 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2525 	mutex_unlock(&u->iolock);
2526 	if (!skb)
2527 		return err;
2528 
2529 	return recv_actor(sk, skb);
2530 }
2531 
2532 /*
2533  *	Sleep until more data has arrived. But check for races.
2534  */
2535 static long unix_stream_data_wait(struct sock *sk, long timeo,
2536 				  struct sk_buff *last, unsigned int last_len,
2537 				  bool freezable)
2538 {
2539 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2540 	struct sk_buff *tail;
2541 	DEFINE_WAIT(wait);
2542 
2543 	unix_state_lock(sk);
2544 
2545 	for (;;) {
2546 		prepare_to_wait(sk_sleep(sk), &wait, state);
2547 
2548 		tail = skb_peek_tail(&sk->sk_receive_queue);
2549 		if (tail != last ||
2550 		    (tail && tail->len != last_len) ||
2551 		    sk->sk_err ||
2552 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2553 		    signal_pending(current) ||
2554 		    !timeo)
2555 			break;
2556 
2557 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2558 		unix_state_unlock(sk);
2559 		timeo = schedule_timeout(timeo);
2560 		unix_state_lock(sk);
2561 
2562 		if (sock_flag(sk, SOCK_DEAD))
2563 			break;
2564 
2565 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2566 	}
2567 
2568 	finish_wait(sk_sleep(sk), &wait);
2569 	unix_state_unlock(sk);
2570 	return timeo;
2571 }
2572 
2573 static unsigned int unix_skb_len(const struct sk_buff *skb)
2574 {
2575 	return skb->len - UNIXCB(skb).consumed;
2576 }
2577 
2578 struct unix_stream_read_state {
2579 	int (*recv_actor)(struct sk_buff *, int, int,
2580 			  struct unix_stream_read_state *);
2581 	struct socket *socket;
2582 	struct msghdr *msg;
2583 	struct pipe_inode_info *pipe;
2584 	size_t size;
2585 	int flags;
2586 	unsigned int splice_flags;
2587 };
2588 
2589 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2590 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2591 {
2592 	struct socket *sock = state->socket;
2593 	struct sock *sk = sock->sk;
2594 	struct unix_sock *u = unix_sk(sk);
2595 	int chunk = 1;
2596 	struct sk_buff *oob_skb;
2597 
2598 	mutex_lock(&u->iolock);
2599 	unix_state_lock(sk);
2600 	spin_lock(&sk->sk_receive_queue.lock);
2601 
2602 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2603 		spin_unlock(&sk->sk_receive_queue.lock);
2604 		unix_state_unlock(sk);
2605 		mutex_unlock(&u->iolock);
2606 		return -EINVAL;
2607 	}
2608 
2609 	oob_skb = u->oob_skb;
2610 
2611 	if (!(state->flags & MSG_PEEK))
2612 		WRITE_ONCE(u->oob_skb, NULL);
2613 
2614 	spin_unlock(&sk->sk_receive_queue.lock);
2615 	unix_state_unlock(sk);
2616 
2617 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2618 
2619 	if (!(state->flags & MSG_PEEK))
2620 		UNIXCB(oob_skb).consumed += 1;
2621 
2622 	mutex_unlock(&u->iolock);
2623 
2624 	if (chunk < 0)
2625 		return -EFAULT;
2626 
2627 	state->msg->msg_flags |= MSG_OOB;
2628 	return 1;
2629 }
2630 
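/* Decide which skb the stream receive loop should consume next when the
 * queue head is empty or is the pending OOB skb: discard a fully-consumed
 * head, skip or drop the OOB skb for normal reads (unless SO_OOBINLINE),
 * and stop before it once some data has already been copied so the OOB
 * byte marks a read boundary.
 */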
2631 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2632 				  int flags, int copied)
2633 {
2634 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2635 	struct unix_sock *u = unix_sk(sk);
2636 
2637 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2638 		return skb;
2639 
2640 	spin_lock(&sk->sk_receive_queue.lock);
2641 
2642 	if (!unix_skb_len(skb)) {
2643 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2644 			skb = NULL;
2645 		} else if (flags & MSG_PEEK) {
2646 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2647 		} else {
2648 			read_skb = skb;
2649 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2650 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2651 		}
2652 
2653 		if (!skb)
2654 			goto unlock;
2655 	}
2656 
2657 	if (skb != u->oob_skb)
2658 		goto unlock;
2659 
2660 	if (copied) {
2661 		skb = NULL;
2662 	} else if (!(flags & MSG_PEEK)) {
2663 		WRITE_ONCE(u->oob_skb, NULL);
2664 
2665 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2666 			__skb_unlink(skb, &sk->sk_receive_queue);
2667 			unread_skb = skb;
2668 			skb = skb_peek(&sk->sk_receive_queue);
2669 		}
2670 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2671 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2672 	}
2673 
2674 unlock:
2675 	spin_unlock(&sk->sk_receive_queue.lock);
2676 
2677 	consume_skb(read_skb);
2678 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2679 
2680 	return skb;
2681 }
2682 #endif
2683 
2684 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2685 {
2686 	struct unix_sock *u = unix_sk(sk);
2687 	struct sk_buff *skb;
2688 	int err;
2689 
2690 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2691 		return -ENOTCONN;
2692 
2693 	mutex_lock(&u->iolock);
2694 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2695 	mutex_unlock(&u->iolock);
2696 	if (!skb)
2697 		return err;
2698 
2699 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2700 	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
2701 		bool drop = false;
2702 
2703 		unix_state_lock(sk);
2704 
2705 		if (sock_flag(sk, SOCK_DEAD)) {
2706 			unix_state_unlock(sk);
2707 			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
2708 			return -ECONNRESET;
2709 		}
2710 
2711 		spin_lock(&sk->sk_receive_queue.lock);
2712 		if (likely(skb == u->oob_skb)) {
2713 			WRITE_ONCE(u->oob_skb, NULL);
2714 			drop = true;
2715 		}
2716 		spin_unlock(&sk->sk_receive_queue.lock);
2717 
2718 		unix_state_unlock(sk);
2719 
2720 		if (drop) {
2721 			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2722 			return -EAGAIN;
2723 		}
2724 	}
2725 #endif
2726 
2727 	return recv_actor(sk, skb);
2728 }
2729 
2730 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2731 				    bool freezable)
2732 {
2733 	struct scm_cookie scm;
2734 	struct socket *sock = state->socket;
2735 	struct sock *sk = sock->sk;
2736 	struct unix_sock *u = unix_sk(sk);
2737 	int copied = 0;
2738 	int flags = state->flags;
2739 	int noblock = flags & MSG_DONTWAIT;
2740 	bool check_creds = false;
2741 	int target;
2742 	int err = 0;
2743 	long timeo;
2744 	int skip;
2745 	size_t size = state->size;
2746 	unsigned int last_len;
2747 
2748 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2749 		err = -EINVAL;
2750 		goto out;
2751 	}
2752 
2753 	if (unlikely(flags & MSG_OOB)) {
2754 		err = -EOPNOTSUPP;
2755 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2756 		err = unix_stream_recv_urg(state);
2757 #endif
2758 		goto out;
2759 	}
2760 
2761 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2762 	timeo = sock_rcvtimeo(sk, noblock);
2763 
2764 	memset(&scm, 0, sizeof(scm));
2765 
2766 	/* Lock the socket to prevent the queue from being reordered
2767 	 * while we sleep copying data out to the message.
2768 	 */
2769 	mutex_lock(&u->iolock);
2770 
2771 	skip = max(sk_peek_offset(sk, flags), 0);
2772 
2773 	do {
2774 		struct sk_buff *skb, *last;
2775 		int chunk;
2776 
2777 redo:
2778 		unix_state_lock(sk);
2779 		if (sock_flag(sk, SOCK_DEAD)) {
2780 			err = -ECONNRESET;
2781 			goto unlock;
2782 		}
2783 		last = skb = skb_peek(&sk->sk_receive_queue);
2784 		last_len = last ? last->len : 0;
2785 
2786 again:
2787 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2788 		if (skb) {
2789 			skb = manage_oob(skb, sk, flags, copied);
2790 			if (!skb && copied) {
2791 				unix_state_unlock(sk);
2792 				break;
2793 			}
2794 		}
2795 #endif
2796 		if (skb == NULL) {
2797 			if (copied >= target)
2798 				goto unlock;
2799 
2800 			/*
2801 			 *	POSIX 1003.1g mandates this order.
2802 			 */
2803 
2804 			err = sock_error(sk);
2805 			if (err)
2806 				goto unlock;
2807 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2808 				goto unlock;
2809 
2810 			unix_state_unlock(sk);
2811 			if (!timeo) {
2812 				err = -EAGAIN;
2813 				break;
2814 			}
2815 
2816 			mutex_unlock(&u->iolock);
2817 
2818 			timeo = unix_stream_data_wait(sk, timeo, last,
2819 						      last_len, freezable);
2820 
2821 			if (signal_pending(current)) {
2822 				err = sock_intr_errno(timeo);
2823 				scm_destroy(&scm);
2824 				goto out;
2825 			}
2826 
2827 			mutex_lock(&u->iolock);
2828 			goto redo;
2829 unlock:
2830 			unix_state_unlock(sk);
2831 			break;
2832 		}
2833 
2834 		while (skip >= unix_skb_len(skb)) {
2835 			skip -= unix_skb_len(skb);
2836 			last = skb;
2837 			last_len = skb->len;
2838 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2839 			if (!skb)
2840 				goto again;
2841 		}
2842 
2843 		unix_state_unlock(sk);
2844 
2845 		if (check_creds) {
2846 			/* Never glue messages from different writers */
2847 			if (!unix_skb_scm_eq(skb, &scm))
2848 				break;
2849 		} else if (test_bit(SOCK_PASSCRED, &sock->flags) ||
2850 			   test_bit(SOCK_PASSPIDFD, &sock->flags)) {
2851 			/* Copy credentials */
2852 			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2853 			unix_set_secdata(&scm, skb);
2854 			check_creds = true;
2855 		}
2856 
2857 		/* Copy address just once */
2858 		if (state->msg && state->msg->msg_name) {
2859 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2860 					 state->msg->msg_name);
2861 			unix_copy_addr(state->msg, skb->sk);
2862 
2863 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2864 							      state->msg->msg_name,
2865 							      &state->msg->msg_namelen);
2866 
2867 			sunaddr = NULL;
2868 		}
2869 
2870 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2871 		chunk = state->recv_actor(skb, skip, chunk, state);
2872 		if (chunk < 0) {
2873 			if (copied == 0)
2874 				copied = -EFAULT;
2875 			break;
2876 		}
2877 		copied += chunk;
2878 		size -= chunk;
2879 
2880 		/* Mark read part of skb as used */
2881 		if (!(flags & MSG_PEEK)) {
2882 			UNIXCB(skb).consumed += chunk;
2883 
2884 			sk_peek_offset_bwd(sk, chunk);
2885 
2886 			if (UNIXCB(skb).fp) {
2887 				scm_stat_del(sk, skb);
2888 				unix_detach_fds(&scm, skb);
2889 			}
2890 
2891 			if (unix_skb_len(skb))
2892 				break;
2893 
2894 			skb_unlink(skb, &sk->sk_receive_queue);
2895 			consume_skb(skb);
2896 
2897 			if (scm.fp)
2898 				break;
2899 		} else {
2900 			/* It is questionable, see note in unix_dgram_recvmsg.
2901 			 */
2902 			if (UNIXCB(skb).fp)
2903 				unix_peek_fds(&scm, skb);
2904 
2905 			sk_peek_offset_fwd(sk, chunk);
2906 
2907 			if (UNIXCB(skb).fp)
2908 				break;
2909 
2910 			skip = 0;
2911 			last = skb;
2912 			last_len = skb->len;
2913 			unix_state_lock(sk);
2914 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2915 			if (skb)
2916 				goto again;
2917 			unix_state_unlock(sk);
2918 			break;
2919 		}
2920 	} while (size);
2921 
2922 	mutex_unlock(&u->iolock);
2923 	if (state->msg)
2924 		scm_recv_unix(sock, state->msg, &scm, flags);
2925 	else
2926 		scm_destroy(&scm);
2927 out:
2928 	return copied ? : err;
2929 }
2930 
2931 static int unix_stream_read_actor(struct sk_buff *skb,
2932 				  int skip, int chunk,
2933 				  struct unix_stream_read_state *state)
2934 {
2935 	int ret;
2936 
2937 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2938 				    state->msg, chunk);
2939 	return ret ?: chunk;
2940 }
2941 
2942 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2943 			  size_t size, int flags)
2944 {
2945 	struct unix_stream_read_state state = {
2946 		.recv_actor = unix_stream_read_actor,
2947 		.socket = sk->sk_socket,
2948 		.msg = msg,
2949 		.size = size,
2950 		.flags = flags
2951 	};
2952 
2953 	return unix_stream_read_generic(&state, true);
2954 }
2955 
2956 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2957 			       size_t size, int flags)
2958 {
2959 	struct unix_stream_read_state state = {
2960 		.recv_actor = unix_stream_read_actor,
2961 		.socket = sock,
2962 		.msg = msg,
2963 		.size = size,
2964 		.flags = flags
2965 	};
2966 
2967 #ifdef CONFIG_BPF_SYSCALL
2968 	struct sock *sk = sock->sk;
2969 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2970 
2971 	if (prot != &unix_stream_proto)
2972 		return prot->recvmsg(sk, msg, size, flags, NULL);
2973 #endif
2974 	return unix_stream_read_generic(&state, true);
2975 }
2976 
2977 static int unix_stream_splice_actor(struct sk_buff *skb,
2978 				    int skip, int chunk,
2979 				    struct unix_stream_read_state *state)
2980 {
2981 	return skb_splice_bits(skb, state->socket->sk,
2982 			       UNIXCB(skb).consumed + skip,
2983 			       state->pipe, chunk, state->splice_flags);
2984 }
2985 
2986 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2987 				       struct pipe_inode_info *pipe,
2988 				       size_t size, unsigned int flags)
2989 {
2990 	struct unix_stream_read_state state = {
2991 		.recv_actor = unix_stream_splice_actor,
2992 		.socket = sock,
2993 		.pipe = pipe,
2994 		.size = size,
2995 		.splice_flags = flags,
2996 	};
2997 
2998 	if (unlikely(*ppos))
2999 		return -ESPIPE;
3000 
3001 	if (sock->file->f_flags & O_NONBLOCK ||
3002 	    flags & SPLICE_F_NONBLOCK)
3003 		state.flags = MSG_DONTWAIT;
3004 
3005 	return unix_stream_read_generic(&state, false);
3006 }
3007 
3008 static int unix_shutdown(struct socket *sock, int mode)
3009 {
3010 	struct sock *sk = sock->sk;
3011 	struct sock *other;
3012 
3013 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3014 		return -EINVAL;
3015 	/* This maps:
3016 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3017 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3018 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3019 	 */
3020 	++mode;
3021 
3022 	unix_state_lock(sk);
3023 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3024 	other = unix_peer(sk);
3025 	if (other)
3026 		sock_hold(other);
3027 	unix_state_unlock(sk);
3028 	sk->sk_state_change(sk);
3029 
3030 	if (other &&
3031 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3032 
3033 		int peer_mode = 0;
3034 		const struct proto *prot = READ_ONCE(other->sk_prot);
3035 
3036 		if (prot->unhash)
3037 			prot->unhash(other);
3038 		if (mode & RCV_SHUTDOWN)
3039 			peer_mode |= SEND_SHUTDOWN;
3040 		if (mode & SEND_SHUTDOWN)
3041 			peer_mode |= RCV_SHUTDOWN;
3042 		unix_state_lock(other);
3043 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3044 		unix_state_unlock(other);
3045 		other->sk_state_change(other);
3046 		if (peer_mode == SHUTDOWN_MASK)
3047 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3048 		else if (peer_mode & RCV_SHUTDOWN)
3049 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3050 	}
3051 	if (other)
3052 		sock_put(other);
3053 
3054 	return 0;
3055 }
3056 
3057 long unix_inq_len(struct sock *sk)
3058 {
3059 	struct sk_buff *skb;
3060 	long amount = 0;
3061 
3062 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3063 		return -EINVAL;
3064 
3065 	spin_lock(&sk->sk_receive_queue.lock);
3066 	if (sk->sk_type == SOCK_STREAM ||
3067 	    sk->sk_type == SOCK_SEQPACKET) {
3068 		skb_queue_walk(&sk->sk_receive_queue, skb)
3069 			amount += unix_skb_len(skb);
3070 	} else {
3071 		skb = skb_peek(&sk->sk_receive_queue);
3072 		if (skb)
3073 			amount = skb->len;
3074 	}
3075 	spin_unlock(&sk->sk_receive_queue.lock);
3076 
3077 	return amount;
3078 }
3079 EXPORT_SYMBOL_GPL(unix_inq_len);
3080 
3081 long unix_outq_len(struct sock *sk)
3082 {
3083 	return sk_wmem_alloc_get(sk);
3084 }
3085 EXPORT_SYMBOL_GPL(unix_outq_len);
3086 
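/* SIOCUNIXFILE: give a privileged caller an O_PATH file descriptor that
 * refers to the filesystem object a peer socket is bound to, so tools can
 * inspect the bound inode (e.g. via /proc/self/fd) without re-resolving
 * the name.
 */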
3087 static int unix_open_file(struct sock *sk)
3088 {
3089 	struct path path;
3090 	struct file *f;
3091 	int fd;
3092 
3093 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3094 		return -EPERM;
3095 
3096 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3097 		return -ENOENT;
3098 
3099 	path = unix_sk(sk)->path;
3100 	if (!path.dentry)
3101 		return -ENOENT;
3102 
3103 	path_get(&path);
3104 
3105 	fd = get_unused_fd_flags(O_CLOEXEC);
3106 	if (fd < 0)
3107 		goto out;
3108 
3109 	f = dentry_open(&path, O_PATH, current_cred());
3110 	if (IS_ERR(f)) {
3111 		put_unused_fd(fd);
3112 		fd = PTR_ERR(f);
3113 		goto out;
3114 	}
3115 
3116 	fd_install(fd, f);
3117 out:
3118 	path_put(&path);
3119 
3120 	return fd;
3121 }
3122 
3123 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3124 {
3125 	struct sock *sk = sock->sk;
3126 	long amount = 0;
3127 	int err;
3128 
3129 	switch (cmd) {
3130 	case SIOCOUTQ:
3131 		amount = unix_outq_len(sk);
3132 		err = put_user(amount, (int __user *)arg);
3133 		break;
3134 	case SIOCINQ:
3135 		amount = unix_inq_len(sk);
3136 		if (amount < 0)
3137 			err = amount;
3138 		else
3139 			err = put_user(amount, (int __user *)arg);
3140 		break;
3141 	case SIOCUNIXFILE:
3142 		err = unix_open_file(sk);
3143 		break;
3144 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
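	/* SIOCATMARK reports whether the next read would start at the OOB
	 * mark, mirroring TCP.  An illustrative sketch (unix_sock_fd is a
	 * placeholder):
	 *
	 *	int at_mark;
	 *	ioctl(unix_sock_fd, SIOCATMARK, &at_mark);
	 */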
3145 	case SIOCATMARK:
3146 		{
3147 			struct unix_sock *u = unix_sk(sk);
3148 			struct sk_buff *skb;
3149 			int answ = 0;
3150 
3151 			mutex_lock(&u->iolock);
3152 
3153 			skb = skb_peek(&sk->sk_receive_queue);
3154 			if (skb) {
3155 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3156 				struct sk_buff *next_skb;
3157 
3158 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3159 
3160 				if (skb == oob_skb ||
3161 				    (!unix_skb_len(skb) &&
3162 				     (!oob_skb || next_skb == oob_skb)))
3163 					answ = 1;
3164 			}
3165 
3166 			mutex_unlock(&u->iolock);
3167 
3168 			err = put_user(answ, (int __user *)arg);
3169 		}
3170 		break;
3171 #endif
3172 	default:
3173 		err = -ENOIOCTLCMD;
3174 		break;
3175 	}
3176 	return err;
3177 }
3178 
3179 #ifdef CONFIG_COMPAT
3180 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3181 {
3182 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3183 }
3184 #endif
3185 
3186 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3187 {
3188 	struct sock *sk = sock->sk;
3189 	unsigned char state;
3190 	__poll_t mask;
3191 	u8 shutdown;
3192 
3193 	sock_poll_wait(file, sock, wait);
3194 	mask = 0;
3195 	shutdown = READ_ONCE(sk->sk_shutdown);
3196 	state = READ_ONCE(sk->sk_state);
3197 
3198 	/* exceptional events? */
3199 	if (READ_ONCE(sk->sk_err))
3200 		mask |= EPOLLERR;
3201 	if (shutdown == SHUTDOWN_MASK)
3202 		mask |= EPOLLHUP;
3203 	if (shutdown & RCV_SHUTDOWN)
3204 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3205 
3206 	/* readable? */
3207 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3208 		mask |= EPOLLIN | EPOLLRDNORM;
3209 	if (sk_is_readable(sk))
3210 		mask |= EPOLLIN | EPOLLRDNORM;
3211 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3212 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3213 		mask |= EPOLLPRI;
3214 #endif
3215 
3216 	/* Connection-based sockets need to check for termination and startup */
3217 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3218 	    state == TCP_CLOSE)
3219 		mask |= EPOLLHUP;
3220 
3221 	/*
3222 	 * We set writable also when the other side has shut down the
3223 	 * connection. This prevents stuck sockets.
3224 	 */
3225 	if (unix_writable(sk, state))
3226 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3227 
3228 	return mask;
3229 }
3230 
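/* Datagram poll differs from unix_poll() mainly on the write side: a
 * connected sender is reported unwritable while its peer's receive queue
 * is full, and unix_dgram_peer_wake_me() hooks it onto the peer's wait
 * queue so it gets woken when space becomes available.
 */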
3231 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3232 				    poll_table *wait)
3233 {
3234 	struct sock *sk = sock->sk, *other;
3235 	unsigned int writable;
3236 	unsigned char state;
3237 	__poll_t mask;
3238 	u8 shutdown;
3239 
3240 	sock_poll_wait(file, sock, wait);
3241 	mask = 0;
3242 	shutdown = READ_ONCE(sk->sk_shutdown);
3243 	state = READ_ONCE(sk->sk_state);
3244 
3245 	/* exceptional events? */
3246 	if (READ_ONCE(sk->sk_err) ||
3247 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3248 		mask |= EPOLLERR |
3249 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3250 
3251 	if (shutdown & RCV_SHUTDOWN)
3252 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3253 	if (shutdown == SHUTDOWN_MASK)
3254 		mask |= EPOLLHUP;
3255 
3256 	/* readable? */
3257 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3258 		mask |= EPOLLIN | EPOLLRDNORM;
3259 	if (sk_is_readable(sk))
3260 		mask |= EPOLLIN | EPOLLRDNORM;
3261 
3262 	/* Connection-based sockets need to check for termination and startup */
3263 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3264 		mask |= EPOLLHUP;
3265 
3266 	/* No write status requested, avoid expensive OUT tests. */
3267 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3268 		return mask;
3269 
3270 	writable = unix_writable(sk, state);
3271 	if (writable) {
3272 		unix_state_lock(sk);
3273 
3274 		other = unix_peer(sk);
3275 		if (other && unix_peer(other) != sk &&
3276 		    unix_recvq_full_lockless(other) &&
3277 		    unix_dgram_peer_wake_me(sk, other))
3278 			writable = 0;
3279 
3280 		unix_state_unlock(sk);
3281 	}
3282 
3283 	if (writable)
3284 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3285 	else
3286 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3287 
3288 	return mask;
3289 }
3290 
3291 #ifdef CONFIG_PROC_FS
3292 
3293 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3294 
3295 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3296 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3297 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
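/* *pos packs a hash bucket and a 1-based offset within that bucket.  For
 * example, assuming a 64-bit build with UNIX_HASH_BITS == 9, BUCKET_SPACE
 * would be 64 - 10 - 1 = 53 bits, and set_bucket_offset(2, 5) would encode
 * bucket 2 in the high bits with offset 5 in the low bits.
 */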
3298 
3299 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3300 {
3301 	unsigned long offset = get_offset(*pos);
3302 	unsigned long bucket = get_bucket(*pos);
3303 	unsigned long count = 0;
3304 	struct sock *sk;
3305 
3306 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3307 	     sk; sk = sk_next(sk)) {
3308 		if (++count == offset)
3309 			break;
3310 	}
3311 
3312 	return sk;
3313 }
3314 
3315 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3316 {
3317 	unsigned long bucket = get_bucket(*pos);
3318 	struct net *net = seq_file_net(seq);
3319 	struct sock *sk;
3320 
3321 	while (bucket < UNIX_HASH_SIZE) {
3322 		spin_lock(&net->unx.table.locks[bucket]);
3323 
3324 		sk = unix_from_bucket(seq, pos);
3325 		if (sk)
3326 			return sk;
3327 
3328 		spin_unlock(&net->unx.table.locks[bucket]);
3329 
3330 		*pos = set_bucket_offset(++bucket, 1);
3331 	}
3332 
3333 	return NULL;
3334 }
3335 
3336 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3337 				  loff_t *pos)
3338 {
3339 	unsigned long bucket = get_bucket(*pos);
3340 
3341 	sk = sk_next(sk);
3342 	if (sk)
3343 		return sk;
3344 
3346 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3347 
3348 	*pos = set_bucket_offset(++bucket, 1);
3349 
3350 	return unix_get_first(seq, pos);
3351 }
3352 
3353 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3354 {
3355 	if (!*pos)
3356 		return SEQ_START_TOKEN;
3357 
3358 	return unix_get_first(seq, pos);
3359 }
3360 
3361 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3362 {
3363 	++*pos;
3364 
3365 	if (v == SEQ_START_TOKEN)
3366 		return unix_get_first(seq, pos);
3367 
3368 	return unix_get_next(seq, v, pos);
3369 }
3370 
3371 static void unix_seq_stop(struct seq_file *seq, void *v)
3372 {
3373 	struct sock *sk = v;
3374 
3375 	if (sk)
3376 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3377 }
3378 
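/* Each socket becomes one line of /proc/net/unix; an illustrative entry
 * (values are made up) could look like:
 *
 *	0000000012345678: 00000003 00000000 00000000 0001 03 12345 /run/mysock
 *
 * i.e. address, refcount, protocol, flags, type, state, inode and,
 * if bound, the path (abstract names are shown with a leading '@').
 */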
3379 static int unix_seq_show(struct seq_file *seq, void *v)
3380 {
3381 
3382 	if (v == SEQ_START_TOKEN)
3383 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3384 			 "Inode Path\n");
3385 	else {
3386 		struct sock *s = v;
3387 		struct unix_sock *u = unix_sk(s);
3388 		unix_state_lock(s);
3389 
3390 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3391 			s,
3392 			refcount_read(&s->sk_refcnt),
3393 			0,
3394 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3395 			s->sk_type,
3396 			s->sk_socket ?
3397 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3398 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3399 			sock_i_ino(s));
3400 
3401 		if (u->addr) {	/* under a hash table lock here */
3402 			int i, len;
3403 			seq_putc(seq, ' ');
3404 
3405 			i = 0;
3406 			len = u->addr->len -
3407 				offsetof(struct sockaddr_un, sun_path);
3408 			if (u->addr->name->sun_path[0]) {
3409 				len--;
3410 			} else {
3411 				seq_putc(seq, '@');
3412 				i++;
3413 			}
3414 			for ( ; i < len; i++)
3415 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3416 					 '@');
3417 		}
3418 		unix_state_unlock(s);
3419 		seq_putc(seq, '\n');
3420 	}
3421 
3422 	return 0;
3423 }
3424 
3425 static const struct seq_operations unix_seq_ops = {
3426 	.start  = unix_seq_start,
3427 	.next   = unix_seq_next,
3428 	.stop   = unix_seq_stop,
3429 	.show   = unix_seq_show,
3430 };
3431 
3432 #ifdef CONFIG_BPF_SYSCALL
3433 struct bpf_unix_iter_state {
3434 	struct seq_net_private p;
3435 	unsigned int cur_sk;
3436 	unsigned int end_sk;
3437 	unsigned int max_sk;
3438 	struct sock **batch;
3439 	bool st_bucket_done;
3440 };
3441 
3442 struct bpf_iter__unix {
3443 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3444 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3445 	uid_t uid __aligned(8);
3446 };
3447 
3448 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3449 			      struct unix_sock *unix_sk, uid_t uid)
3450 {
3451 	struct bpf_iter__unix ctx;
3452 
3453 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3454 	ctx.meta = meta;
3455 	ctx.unix_sk = unix_sk;
3456 	ctx.uid = uid;
3457 	return bpf_iter_run_prog(prog, &ctx);
3458 }
3459 
3460 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3462 {
3463 	struct bpf_unix_iter_state *iter = seq->private;
3464 	unsigned int expected = 1;
3465 	struct sock *sk;
3466 
3467 	sock_hold(start_sk);
3468 	iter->batch[iter->end_sk++] = start_sk;
3469 
3470 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3471 		if (iter->end_sk < iter->max_sk) {
3472 			sock_hold(sk);
3473 			iter->batch[iter->end_sk++] = sk;
3474 		}
3475 
3476 		expected++;
3477 	}
3478 
3479 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3480 
3481 	return expected;
3482 }
3483 
3484 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3485 {
3486 	while (iter->cur_sk < iter->end_sk)
3487 		sock_put(iter->batch[iter->cur_sk++]);
3488 }
3489 
3490 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3491 				       unsigned int new_batch_sz)
3492 {
3493 	struct sock **new_batch;
3494 
3495 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3496 			     GFP_USER | __GFP_NOWARN);
3497 	if (!new_batch)
3498 		return -ENOMEM;
3499 
3500 	bpf_iter_unix_put_batch(iter);
3501 	kvfree(iter->batch);
3502 	iter->batch = new_batch;
3503 	iter->max_sk = new_batch_sz;
3504 
3505 	return 0;
3506 }
3507 
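/* Grab references to a whole bucket's worth of sockets at once so the
 * bucket spinlock can be dropped before the bpf program (which may sleep)
 * runs; if the preallocated batch turns out too small, grow it and retry
 * the bucket once.
 */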
3508 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3509 					loff_t *pos)
3510 {
3511 	struct bpf_unix_iter_state *iter = seq->private;
3512 	unsigned int expected;
3513 	bool resized = false;
3514 	struct sock *sk;
3515 
3516 	if (iter->st_bucket_done)
3517 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3518 
3519 again:
3520 	/* Get a new batch */
3521 	iter->cur_sk = 0;
3522 	iter->end_sk = 0;
3523 
3524 	sk = unix_get_first(seq, pos);
3525 	if (!sk)
3526 		return NULL; /* Done */
3527 
3528 	expected = bpf_iter_unix_hold_batch(seq, sk);
3529 
3530 	if (iter->end_sk == expected) {
3531 		iter->st_bucket_done = true;
3532 		return sk;
3533 	}
3534 
3535 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3536 		resized = true;
3537 		goto again;
3538 	}
3539 
3540 	return sk;
3541 }
3542 
3543 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3544 {
3545 	if (!*pos)
3546 		return SEQ_START_TOKEN;
3547 
3548 	/* bpf iter does not support lseek, so it always
3549 	 * continues from where it was stop()-ped.
3550 	 */
3551 	return bpf_iter_unix_batch(seq, pos);
3552 }
3553 
3554 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3555 {
3556 	struct bpf_unix_iter_state *iter = seq->private;
3557 	struct sock *sk;
3558 
3559 	/* Whenever seq_next() is called, the socket at iter->cur_sk has
3560 	 * been handled by seq_show(), so advance to the next sk in
3561 	 * the batch.
3562 	 */
3563 	if (iter->cur_sk < iter->end_sk)
3564 		sock_put(iter->batch[iter->cur_sk++]);
3565 
3566 	++*pos;
3567 
3568 	if (iter->cur_sk < iter->end_sk)
3569 		sk = iter->batch[iter->cur_sk];
3570 	else
3571 		sk = bpf_iter_unix_batch(seq, pos);
3572 
3573 	return sk;
3574 }
3575 
3576 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3577 {
3578 	struct bpf_iter_meta meta;
3579 	struct bpf_prog *prog;
3580 	struct sock *sk = v;
3581 	uid_t uid;
3582 	bool slow;
3583 	int ret;
3584 
3585 	if (v == SEQ_START_TOKEN)
3586 		return 0;
3587 
3588 	slow = lock_sock_fast(sk);
3589 
3590 	if (unlikely(sk_unhashed(sk))) {
3591 		ret = SEQ_SKIP;
3592 		goto unlock;
3593 	}
3594 
3595 	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3596 	meta.seq = seq;
3597 	prog = bpf_iter_get_info(&meta, false);
3598 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3599 unlock:
3600 	unlock_sock_fast(sk, slow);
3601 	return ret;
3602 }
3603 
3604 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3605 {
3606 	struct bpf_unix_iter_state *iter = seq->private;
3607 	struct bpf_iter_meta meta;
3608 	struct bpf_prog *prog;
3609 
3610 	if (!v) {
3611 		meta.seq = seq;
3612 		prog = bpf_iter_get_info(&meta, true);
3613 		if (prog)
3614 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3615 	}
3616 
3617 	if (iter->cur_sk < iter->end_sk)
3618 		bpf_iter_unix_put_batch(iter);
3619 }
3620 
3621 static const struct seq_operations bpf_iter_unix_seq_ops = {
3622 	.start	= bpf_iter_unix_seq_start,
3623 	.next	= bpf_iter_unix_seq_next,
3624 	.stop	= bpf_iter_unix_seq_stop,
3625 	.show	= bpf_iter_unix_seq_show,
3626 };
3627 #endif
3628 #endif
3629 
3630 static const struct net_proto_family unix_family_ops = {
3631 	.family = PF_UNIX,
3632 	.create = unix_create,
3633 	.owner	= THIS_MODULE,
3634 };
3635 
3637 static int __net_init unix_net_init(struct net *net)
3638 {
3639 	int i;
3640 
3641 	net->unx.sysctl_max_dgram_qlen = 10;
3642 	if (unix_sysctl_register(net))
3643 		goto out;
3644 
3645 #ifdef CONFIG_PROC_FS
3646 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3647 			     sizeof(struct seq_net_private)))
3648 		goto err_sysctl;
3649 #endif
3650 
3651 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3652 					      sizeof(spinlock_t), GFP_KERNEL);
3653 	if (!net->unx.table.locks)
3654 		goto err_proc;
3655 
3656 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3657 						sizeof(struct hlist_head),
3658 						GFP_KERNEL);
3659 	if (!net->unx.table.buckets)
3660 		goto free_locks;
3661 
3662 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3663 		spin_lock_init(&net->unx.table.locks[i]);
3664 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3665 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3666 	}
3667 
3668 	return 0;
3669 
3670 free_locks:
3671 	kvfree(net->unx.table.locks);
3672 err_proc:
3673 #ifdef CONFIG_PROC_FS
3674 	remove_proc_entry("unix", net->proc_net);
3675 err_sysctl:
3676 #endif
3677 	unix_sysctl_unregister(net);
3678 out:
3679 	return -ENOMEM;
3680 }
3681 
3682 static void __net_exit unix_net_exit(struct net *net)
3683 {
3684 	kvfree(net->unx.table.buckets);
3685 	kvfree(net->unx.table.locks);
3686 	unix_sysctl_unregister(net);
3687 	remove_proc_entry("unix", net->proc_net);
3688 }
3689 
3690 static struct pernet_operations unix_net_ops = {
3691 	.init = unix_net_init,
3692 	.exit = unix_net_exit,
3693 };
3694 
3695 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3696 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3697 		     struct unix_sock *unix_sk, uid_t uid)
3698 
3699 #define INIT_BATCH_SZ 16
3700 
3701 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3702 {
3703 	struct bpf_unix_iter_state *iter = priv_data;
3704 	int err;
3705 
3706 	err = bpf_iter_init_seq_net(priv_data, aux);
3707 	if (err)
3708 		return err;
3709 
3710 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3711 	if (err) {
3712 		bpf_iter_fini_seq_net(priv_data);
3713 		return err;
3714 	}
3715 
3716 	return 0;
3717 }
3718 
3719 static void bpf_iter_fini_unix(void *priv_data)
3720 {
3721 	struct bpf_unix_iter_state *iter = priv_data;
3722 
3723 	bpf_iter_fini_seq_net(priv_data);
3724 	kvfree(iter->batch);
3725 }
3726 
3727 static const struct bpf_iter_seq_info unix_seq_info = {
3728 	.seq_ops		= &bpf_iter_unix_seq_ops,
3729 	.init_seq_private	= bpf_iter_init_unix,
3730 	.fini_seq_private	= bpf_iter_fini_unix,
3731 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3732 };
3733 
3734 static const struct bpf_func_proto *
3735 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3736 			     const struct bpf_prog *prog)
3737 {
3738 	switch (func_id) {
3739 	case BPF_FUNC_setsockopt:
3740 		return &bpf_sk_setsockopt_proto;
3741 	case BPF_FUNC_getsockopt:
3742 		return &bpf_sk_getsockopt_proto;
3743 	default:
3744 		return NULL;
3745 	}
3746 }
3747 
3748 static struct bpf_iter_reg unix_reg_info = {
3749 	.target			= "unix",
3750 	.ctx_arg_info_size	= 1,
3751 	.ctx_arg_info		= {
3752 		{ offsetof(struct bpf_iter__unix, unix_sk),
3753 		  PTR_TO_BTF_ID_OR_NULL },
3754 	},
3755 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3756 	.seq_info		= &unix_seq_info,
3757 };
3758 
3759 static void __init bpf_iter_register(void)
3760 {
3761 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3762 	if (bpf_iter_reg_target(&unix_reg_info))
3763 		pr_warn("Warning: could not register bpf iterator unix\n");
3764 }
3765 #endif
3766 
3767 static int __init af_unix_init(void)
3768 {
3769 	int i, rc = -1;
3770 
3771 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3772 
3773 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3774 		spin_lock_init(&bsd_socket_locks[i]);
3775 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3776 	}
3777 
3778 	rc = proto_register(&unix_dgram_proto, 1);
3779 	if (rc != 0) {
3780 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3781 		goto out;
3782 	}
3783 
3784 	rc = proto_register(&unix_stream_proto, 1);
3785 	if (rc != 0) {
3786 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3787 		proto_unregister(&unix_dgram_proto);
3788 		goto out;
3789 	}
3790 
3791 	sock_register(&unix_family_ops);
3792 	register_pernet_subsys(&unix_net_ops);
3793 	unix_bpf_build_proto();
3794 
3795 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3796 	bpf_iter_register();
3797 #endif
3798 
3799 out:
3800 	return rc;
3801 }
3802 
3803 /* Later than subsys_initcall() because we depend on stuff initialised there */
3804 fs_initcall(af_unix_init);
3805