xref: /linux/net/unix/af_unix.c (revision ab93e0dd72c37d378dd936f031ffb83ff2bd87ce)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * NET4:	Implementation of BSD Unix domain sockets.
4  *
5  * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
6  *
7  * Fixes:
8  *		Linus Torvalds	:	Assorted bug cures.
9  *		Niibe Yutaka	:	async I/O support.
10  *		Carsten Paeth	:	PF_UNIX check, address fixes.
11  *		Alan Cox	:	Limit size of allocated blocks.
12  *		Alan Cox	:	Fixed the stupid socketpair bug.
13  *		Alan Cox	:	BSD compatibility fine tuning.
14  *		Alan Cox	:	Fixed a bug in connect when interrupted.
15  *		Alan Cox	:	Sorted out a proper draft version of
16  *					file descriptor passing hacked up from
17  *					Mike Shaver's work.
18  *		Marty Leisner	:	Fixes to fd passing
19  *		Nick Nevin	:	recvmsg bugfix.
20  *		Alan Cox	:	Started proper garbage collector
21  *		Heiko EiBfeldt	:	Missing verify_area check
22  *		Alan Cox	:	Started POSIXisms
23  *		Andreas Schwab	:	Replace inode by dentry for proper
24  *					reference counting
25  *		Kirk Petersen	:	Made this a module
26  *	    Christoph Rohland	:	Elegant non-blocking accept/connect algorithm.
27  *					Lots of bug fixes.
 28  *	     Alexey Kuznetsov	:	Repaired (I hope) bugs introduced
 29  *					by the above two patches.
30  *	     Andrea Arcangeli	:	If possible we block in connect(2)
31  *					if the max backlog of the listen socket
 32  *					has been reached. This won't break
 33  *					old apps and it will avoid a huge amount
 34  *					of hashed socks (for unix_gc()
 35  *					performance reasons).
36  *					Security fix that limits the max
37  *					number of socks to 2*max_files and
38  *					the number of skb queueable in the
39  *					dgram receiver.
40  *		Artur Skawina   :	Hash function optimizations
41  *	     Alexey Kuznetsov   :	Full scale SMP. Lot of bugs are introduced 8)
42  *	      Malcolm Beattie   :	Set peercred for socketpair
43  *	     Michal Ostrowski   :       Module initialization cleanup.
44  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
45  *	     				the core infrastructure is doing that
46  *	     				for all net proto families now (2.5.69+)
47  *
48  * Known differences from reference BSD that was tested:
49  *
50  *	[TO FIX]
51  *	ECONNREFUSED is not returned from one end of a connected() socket to the
52  *		other the moment one end closes.
53  *	fstat() doesn't return st_dev=0, and give the blksize as high water mark
54  *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
55  *	[NOT TO FIX]
56  *	accept() returns a path name even if the connecting socket has closed
57  *		in the meantime (BSD loses the path and gives up).
58  *	accept() returns 0 length path for an unbound connector. BSD returns 16
59  *		and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60  *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
61  *	BSD af_unix apparently has connect forgetting to block properly.
62  *		(need to check this with the POSIX spec in detail)
63  *
64  * Differences from 2.0.0-11-... (ANK)
65  *	Bug fixes and improvements.
66  *		- client shutdown killed server socket.
67  *		- removed all useless cli/sti pairs.
68  *
69  *	Semantic changes/extensions.
70  *		- generic control message passing.
71  *		- SCM_CREDENTIALS control message.
72  *		- "Abstract" (not FS based) socket bindings.
73  *		  Abstract names are sequences of bytes (not zero terminated)
 74  *		  starting with a 0 byte, so that this name space does not intersect
75  *		  with BSD names.
76  */
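
/* Illustrative userspace sketch (not part of this file): the two binding
 * flavours described above.  The path "/tmp/example.sock" and the abstract
 * name "example" are made up.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *
 *	// Filesystem binding: NUL-terminated path, creates an inode.
 *	strcpy(a.sun_path, "/tmp/example.sock");
 *	bind(fd, (struct sockaddr *)&a, sizeof(a));
 *
 *	// Abstract binding: sun_path starts with a 0 byte and the name is
 *	// the raw byte sequence, so the exact length must be passed.
 *	memset(&a, 0, sizeof(a));
 *	a.sun_family = AF_UNIX;
 *	memcpy(a.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&a,
 *	     offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 */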
77 
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
79 
80 #include <linux/bpf-cgroup.h>
81 #include <linux/btf_ids.h>
82 #include <linux/dcache.h>
83 #include <linux/errno.h>
84 #include <linux/fcntl.h>
85 #include <linux/file.h>
86 #include <linux/filter.h>
87 #include <linux/fs.h>
88 #include <linux/fs_struct.h>
89 #include <linux/init.h>
90 #include <linux/kernel.h>
91 #include <linux/mount.h>
92 #include <linux/namei.h>
93 #include <linux/net.h>
94 #include <linux/pidfs.h>
95 #include <linux/poll.h>
96 #include <linux/proc_fs.h>
97 #include <linux/sched/signal.h>
98 #include <linux/security.h>
99 #include <linux/seq_file.h>
100 #include <linux/skbuff.h>
101 #include <linux/slab.h>
102 #include <linux/socket.h>
103 #include <linux/splice.h>
104 #include <linux/string.h>
105 #include <linux/uaccess.h>
106 #include <net/af_unix.h>
107 #include <net/net_namespace.h>
108 #include <net/scm.h>
109 #include <net/tcp_states.h>
110 #include <uapi/linux/sockios.h>
111 #include <uapi/linux/termios.h>
112 
113 #include "af_unix.h"
114 
115 static atomic_long_t unix_nr_socks;
116 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
117 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
118 
119 /* SMP locking strategy:
120  *    hash table is protected with spinlock.
121  *    each socket state is protected by separate spinlock.
122  */
123 #ifdef CONFIG_PROVE_LOCKING
124 #define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))
125 
126 static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
127 				  const struct lockdep_map *b)
128 {
129 	return cmp_ptr(a, b);
130 }
131 
132 static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
133 				  const struct lockdep_map *_b)
134 {
135 	const struct unix_sock *a, *b;
136 
137 	a = container_of(_a, struct unix_sock, lock.dep_map);
138 	b = container_of(_b, struct unix_sock, lock.dep_map);
139 
140 	if (a->sk.sk_state == TCP_LISTEN) {
141 		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
142 		 *
143 		 *   1. a is TCP_LISTEN.
144 		 *   2. b is not a.
145 		 *   3. concurrent connect(b -> a) must fail.
146 		 *
147 		 * Except for 2. & 3., the b's state can be any possible
148 		 * value due to concurrent connect() or listen().
149 		 *
150 		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
151 		 * be expressed as lock_cmp_fn.
152 		 */
153 		switch (b->sk.sk_state) {
154 		case TCP_CLOSE:
155 		case TCP_ESTABLISHED:
156 		case TCP_LISTEN:
157 			return -1;
158 		default:
159 			/* Invalid case. */
160 			return 0;
161 		}
162 	}
163 
164 	/* Should never happen.  Just to be symmetric. */
165 	if (b->sk.sk_state == TCP_LISTEN) {
166 		switch (b->sk.sk_state) {
167 		case TCP_CLOSE:
168 		case TCP_ESTABLISHED:
169 			return 1;
170 		default:
171 			return 0;
172 		}
173 	}
174 
175 	/* unix_state_double_lock(): ascending address order. */
176 	return cmp_ptr(a, b);
177 }
178 
179 static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
180 				  const struct lockdep_map *_b)
181 {
182 	const struct sock *a, *b;
183 
184 	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
185 	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);
186 
187 	/* unix_collect_skb(): listener -> embryo order. */
188 	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
189 		return -1;
190 
191 	/* Should never happen.  Just to be symmetric. */
192 	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
193 		return 1;
194 
195 	return 0;
196 }
197 #endif
198 
199 static unsigned int unix_unbound_hash(struct sock *sk)
200 {
201 	unsigned long hash = (unsigned long)sk;
202 
203 	hash ^= hash >> 16;
204 	hash ^= hash >> 8;
205 	hash ^= sk->sk_type;
206 
207 	return hash & UNIX_HASH_MOD;
208 }
209 
210 static unsigned int unix_bsd_hash(struct inode *i)
211 {
212 	return i->i_ino & UNIX_HASH_MOD;
213 }
214 
215 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
216 				       int addr_len, int type)
217 {
218 	__wsum csum = csum_partial(sunaddr, addr_len, 0);
219 	unsigned int hash;
220 
221 	hash = (__force unsigned int)csum_fold(csum);
222 	hash ^= hash >> 8;
223 	hash ^= type;
224 
225 	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
226 }
227 
228 static void unix_table_double_lock(struct net *net,
229 				   unsigned int hash1, unsigned int hash2)
230 {
231 	if (hash1 == hash2) {
232 		spin_lock(&net->unx.table.locks[hash1]);
233 		return;
234 	}
235 
236 	if (hash1 > hash2)
237 		swap(hash1, hash2);
238 
239 	spin_lock(&net->unx.table.locks[hash1]);
240 	spin_lock(&net->unx.table.locks[hash2]);
241 }
242 
243 static void unix_table_double_unlock(struct net *net,
244 				     unsigned int hash1, unsigned int hash2)
245 {
246 	if (hash1 == hash2) {
247 		spin_unlock(&net->unx.table.locks[hash1]);
248 		return;
249 	}
250 
251 	spin_unlock(&net->unx.table.locks[hash1]);
252 	spin_unlock(&net->unx.table.locks[hash2]);
253 }
254 
255 #ifdef CONFIG_SECURITY_NETWORK
256 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
257 {
258 	UNIXCB(skb).secid = scm->secid;
259 }
260 
261 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
262 {
263 	scm->secid = UNIXCB(skb).secid;
264 }
265 
266 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
267 {
268 	return (scm->secid == UNIXCB(skb).secid);
269 }
270 #else
271 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
272 { }
273 
274 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
275 { }
276 
277 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
278 {
279 	return true;
280 }
281 #endif /* CONFIG_SECURITY_NETWORK */
282 
283 static inline int unix_may_send(struct sock *sk, struct sock *osk)
284 {
285 	return !unix_peer(osk) || unix_peer(osk) == sk;
286 }
287 
288 static inline int unix_recvq_full_lockless(const struct sock *sk)
289 {
290 	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
291 }
292 
293 struct sock *unix_peer_get(struct sock *s)
294 {
295 	struct sock *peer;
296 
297 	unix_state_lock(s);
298 	peer = unix_peer(s);
299 	if (peer)
300 		sock_hold(peer);
301 	unix_state_unlock(s);
302 	return peer;
303 }
304 EXPORT_SYMBOL_GPL(unix_peer_get);
305 
306 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
307 					     int addr_len)
308 {
309 	struct unix_address *addr;
310 
311 	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
312 	if (!addr)
313 		return NULL;
314 
315 	refcount_set(&addr->refcnt, 1);
316 	addr->len = addr_len;
317 	memcpy(addr->name, sunaddr, addr_len);
318 
319 	return addr;
320 }
321 
322 static inline void unix_release_addr(struct unix_address *addr)
323 {
324 	if (refcount_dec_and_test(&addr->refcnt))
325 		kfree(addr);
326 }
327 
328 /*
329  *	Check unix socket name:
 330  *		- should not be zero length.
 331  *		- if it does not start with a zero byte, it should be NUL terminated (FS object)
 332  *		- if it starts with a zero byte, it is an abstract name.
333  */
334 
335 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
336 {
337 	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
338 	    addr_len > sizeof(*sunaddr))
339 		return -EINVAL;
340 
341 	if (sunaddr->sun_family != AF_UNIX)
342 		return -EINVAL;
343 
344 	return 0;
345 }
346 
347 static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
348 {
349 	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
350 	short offset = offsetof(struct sockaddr_storage, __data);
351 
352 	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));
353 
354 	/* This may look like an off by one error but it is a bit more
355 	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
356 	 * sun_path[108] doesn't as such exist.  However in kernel space
357 	 * we are guaranteed that it is a valid memory location in our
358 	 * kernel address buffer because syscall functions always pass
359 	 * a pointer of struct sockaddr_storage which has a bigger buffer
360 	 * than 108.  Also, we must terminate sun_path for strlen() in
361 	 * getname_kernel().
362 	 */
363 	addr->__data[addr_len - offset] = 0;
364 
365 	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
366 	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
367 	 * know the actual buffer.
368 	 */
369 	return strlen(addr->__data) + offset + 1;
370 }
371 
372 static void __unix_remove_socket(struct sock *sk)
373 {
374 	sk_del_node_init(sk);
375 }
376 
377 static void __unix_insert_socket(struct net *net, struct sock *sk)
378 {
379 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
380 	sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
381 }
382 
383 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
384 				 struct unix_address *addr, unsigned int hash)
385 {
386 	__unix_remove_socket(sk);
387 	smp_store_release(&unix_sk(sk)->addr, addr);
388 
389 	sk->sk_hash = hash;
390 	__unix_insert_socket(net, sk);
391 }
392 
393 static void unix_remove_socket(struct net *net, struct sock *sk)
394 {
395 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
396 	__unix_remove_socket(sk);
397 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
398 }
399 
400 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
401 {
402 	spin_lock(&net->unx.table.locks[sk->sk_hash]);
403 	__unix_insert_socket(net, sk);
404 	spin_unlock(&net->unx.table.locks[sk->sk_hash]);
405 }
406 
407 static void unix_insert_bsd_socket(struct sock *sk)
408 {
409 	spin_lock(&bsd_socket_locks[sk->sk_hash]);
410 	sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
411 	spin_unlock(&bsd_socket_locks[sk->sk_hash]);
412 }
413 
414 static void unix_remove_bsd_socket(struct sock *sk)
415 {
416 	if (!hlist_unhashed(&sk->sk_bind_node)) {
417 		spin_lock(&bsd_socket_locks[sk->sk_hash]);
418 		__sk_del_bind_node(sk);
419 		spin_unlock(&bsd_socket_locks[sk->sk_hash]);
420 
421 		sk_node_init(&sk->sk_bind_node);
422 	}
423 }
424 
425 static struct sock *__unix_find_socket_byname(struct net *net,
426 					      struct sockaddr_un *sunname,
427 					      int len, unsigned int hash)
428 {
429 	struct sock *s;
430 
431 	sk_for_each(s, &net->unx.table.buckets[hash]) {
432 		struct unix_sock *u = unix_sk(s);
433 
434 		if (u->addr->len == len &&
435 		    !memcmp(u->addr->name, sunname, len))
436 			return s;
437 	}
438 	return NULL;
439 }
440 
441 static inline struct sock *unix_find_socket_byname(struct net *net,
442 						   struct sockaddr_un *sunname,
443 						   int len, unsigned int hash)
444 {
445 	struct sock *s;
446 
447 	spin_lock(&net->unx.table.locks[hash]);
448 	s = __unix_find_socket_byname(net, sunname, len, hash);
449 	if (s)
450 		sock_hold(s);
451 	spin_unlock(&net->unx.table.locks[hash]);
452 	return s;
453 }
454 
455 static struct sock *unix_find_socket_byinode(struct inode *i)
456 {
457 	unsigned int hash = unix_bsd_hash(i);
458 	struct sock *s;
459 
460 	spin_lock(&bsd_socket_locks[hash]);
461 	sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
462 		struct dentry *dentry = unix_sk(s)->path.dentry;
463 
464 		if (dentry && d_backing_inode(dentry) == i) {
465 			sock_hold(s);
466 			spin_unlock(&bsd_socket_locks[hash]);
467 			return s;
468 		}
469 	}
470 	spin_unlock(&bsd_socket_locks[hash]);
471 	return NULL;
472 }
473 
474 /* Support code for asymmetrically connected dgram sockets
475  *
476  * If a datagram socket is connected to a socket not itself connected
477  * to the first socket (eg, /dev/log), clients may only enqueue more
478  * messages if the present receive queue of the server socket is not
479  * "too large". This means there's a second writeability condition
480  * poll and sendmsg need to test. The dgram recv code will do a wake
481  * up on the peer_wait wait queue of a socket upon reception of a
482  * datagram which needs to be propagated to sleeping would-be writers
483  * since these might not have sent anything so far. This can't be
484  * accomplished via poll_wait because the lifetime of the server
485  * socket might be less than that of its clients if these break their
486  * association with it or if the server socket is closed while clients
487  * are still connected to it and there's no way to inform "a polling
488  * implementation" that it should let go of a certain wait queue
489  *
490  * In order to propagate a wake up, a wait_queue_entry_t of the client
491  * socket is enqueued on the peer_wait queue of the server socket
492  * whose wake function does a wake_up on the ordinary client socket
493  * wait queue. This connection is established whenever a write (or
 494  * poll for write) hits the flow control condition and is broken when the
495  * association to the server socket is dissolved or after a wake up
496  * was relayed.
497  */
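
/* Illustrative userspace view of the above (not part of this file, names are
 * made up): a dgram client connected to a busy receiver blocks in poll()
 * until the receiver drains its queue; the relay below is what turns the
 * receiver's read into the client's EPOLLOUT wakeup.
 *
 *	connect(cfd, (struct sockaddr *)&server_addr, server_len);
 *	...
 *	struct pollfd pfd = { .fd = cfd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);	// sleeps while the receiver's queue is full
 *	send(cfd, buf, len, 0);
 */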
498 
499 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
500 				      void *key)
501 {
502 	struct unix_sock *u;
503 	wait_queue_head_t *u_sleep;
504 
505 	u = container_of(q, struct unix_sock, peer_wake);
506 
507 	__remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
508 			    q);
509 	u->peer_wake.private = NULL;
510 
511 	/* relaying can only happen while the wq still exists */
512 	u_sleep = sk_sleep(&u->sk);
513 	if (u_sleep)
514 		wake_up_interruptible_poll(u_sleep, key_to_poll(key));
515 
516 	return 0;
517 }
518 
519 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
520 {
521 	struct unix_sock *u, *u_other;
522 	int rc;
523 
524 	u = unix_sk(sk);
525 	u_other = unix_sk(other);
526 	rc = 0;
527 	spin_lock(&u_other->peer_wait.lock);
528 
529 	if (!u->peer_wake.private) {
530 		u->peer_wake.private = other;
531 		__add_wait_queue(&u_other->peer_wait, &u->peer_wake);
532 
533 		rc = 1;
534 	}
535 
536 	spin_unlock(&u_other->peer_wait.lock);
537 	return rc;
538 }
539 
540 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
541 					    struct sock *other)
542 {
543 	struct unix_sock *u, *u_other;
544 
545 	u = unix_sk(sk);
546 	u_other = unix_sk(other);
547 	spin_lock(&u_other->peer_wait.lock);
548 
549 	if (u->peer_wake.private == other) {
550 		__remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
551 		u->peer_wake.private = NULL;
552 	}
553 
554 	spin_unlock(&u_other->peer_wait.lock);
555 }
556 
557 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
558 						   struct sock *other)
559 {
560 	unix_dgram_peer_wake_disconnect(sk, other);
561 	wake_up_interruptible_poll(sk_sleep(sk),
562 				   EPOLLOUT |
563 				   EPOLLWRNORM |
564 				   EPOLLWRBAND);
565 }
566 
567 /* preconditions:
568  *	- unix_peer(sk) == other
569  *	- association is stable
570  */
571 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
572 {
573 	int connected;
574 
575 	connected = unix_dgram_peer_wake_connect(sk, other);
576 
577 	/* If other is SOCK_DEAD, we want to make sure we signal
578 	 * POLLOUT, such that a subsequent write() can get a
579 	 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
 580  * to other and it's full, we will hang waiting for POLLOUT.
581 	 */
582 	if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
583 		return 1;
584 
585 	if (connected)
586 		unix_dgram_peer_wake_disconnect(sk, other);
587 
588 	return 0;
589 }
590 
591 static int unix_writable(const struct sock *sk, unsigned char state)
592 {
593 	return state != TCP_LISTEN &&
594 		(refcount_read(&sk->sk_wmem_alloc) << 2) <= READ_ONCE(sk->sk_sndbuf);
595 }
596 
597 static void unix_write_space(struct sock *sk)
598 {
599 	struct socket_wq *wq;
600 
601 	rcu_read_lock();
602 	if (unix_writable(sk, READ_ONCE(sk->sk_state))) {
603 		wq = rcu_dereference(sk->sk_wq);
604 		if (skwq_has_sleeper(wq))
605 			wake_up_interruptible_sync_poll(&wq->wait,
606 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
607 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
608 	}
609 	rcu_read_unlock();
610 }
611 
612 /* When a dgram socket disconnects (or changes its peer), we clear its
613  * receive queue of packets that arrived from the previous peer. First, this
614  * allows flow control based only on wmem_alloc; second, an sk connected
615  * to a peer may receive messages only from that peer. */
616 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
617 {
618 	if (!skb_queue_empty(&sk->sk_receive_queue)) {
619 		skb_queue_purge_reason(&sk->sk_receive_queue,
620 				       SKB_DROP_REASON_UNIX_DISCONNECT);
621 
622 		wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
623 
624 		/* If one link of a bidirectional dgram pipe is disconnected,
625 		 * we signal an error. Messages are lost. Do not do this
626 		 * when the peer was not connected to us.
627 		 */
628 		if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
629 			WRITE_ONCE(other->sk_err, ECONNRESET);
630 			sk_error_report(other);
631 		}
632 	}
633 }
634 
635 static void unix_sock_destructor(struct sock *sk)
636 {
637 	struct unix_sock *u = unix_sk(sk);
638 
639 	skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
640 
641 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
642 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
643 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
644 	if (!sock_flag(sk, SOCK_DEAD)) {
645 		pr_info("Attempt to release alive unix socket: %p\n", sk);
646 		return;
647 	}
648 
649 	if (u->addr)
650 		unix_release_addr(u->addr);
651 
652 	atomic_long_dec(&unix_nr_socks);
653 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
654 #ifdef UNIX_REFCNT_DEBUG
655 	pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
656 		atomic_long_read(&unix_nr_socks));
657 #endif
658 }
659 
660 static unsigned int unix_skb_len(const struct sk_buff *skb)
661 {
662 	return skb->len - UNIXCB(skb).consumed;
663 }
664 
665 static void unix_release_sock(struct sock *sk, int embrion)
666 {
667 	struct unix_sock *u = unix_sk(sk);
668 	struct sock *skpair;
669 	struct sk_buff *skb;
670 	struct path path;
671 	int state;
672 
673 	unix_remove_socket(sock_net(sk), sk);
674 	unix_remove_bsd_socket(sk);
675 
676 	/* Clear state */
677 	unix_state_lock(sk);
678 	sock_orphan(sk);
679 	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
680 	path	     = u->path;
681 	u->path.dentry = NULL;
682 	u->path.mnt = NULL;
683 	state = sk->sk_state;
684 	WRITE_ONCE(sk->sk_state, TCP_CLOSE);
685 
686 	skpair = unix_peer(sk);
687 	unix_peer(sk) = NULL;
688 
689 	unix_state_unlock(sk);
690 
691 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
692 	u->oob_skb = NULL;
693 #endif
694 
695 	wake_up_interruptible_all(&u->peer_wait);
696 
697 	if (skpair != NULL) {
698 		if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
699 			struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
700 
701 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
702 			if (skb && !unix_skb_len(skb))
703 				skb = skb_peek_next(skb, &sk->sk_receive_queue);
704 #endif
705 			unix_state_lock(skpair);
706 			/* No more writes */
707 			WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK);
708 			if (skb || embrion)
709 				WRITE_ONCE(skpair->sk_err, ECONNRESET);
710 			unix_state_unlock(skpair);
711 			skpair->sk_state_change(skpair);
712 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
713 		}
714 
715 		unix_dgram_peer_wake_disconnect(sk, skpair);
716 		sock_put(skpair); /* It may now die */
717 	}
718 
719 	/* Try to flush out this socket. Throw out buffers at least */
720 
721 	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
722 		if (state == TCP_LISTEN)
723 			unix_release_sock(skb->sk, 1);
724 
725 		/* passed fds are erased in the kfree_skb hook */
726 		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
727 	}
728 
729 	if (path.dentry)
730 		path_put(&path);
731 
732 	sock_put(sk);
733 
734 	/* ---- Socket is dead now and most probably destroyed ---- */
735 
736 	/*
737 	 * Fixme: BSD difference: In BSD all sockets connected to us get
738 	 *	  ECONNRESET and we die on the spot. In Linux we behave
739 	 *	  like files and pipes do and wait for the last
740 	 *	  dereference.
741 	 *
742 	 * Can't we simply set sock->err?
743 	 *
744 	 *	  What does the above comment talk about? --ANK(980817)
745 	 */
746 
747 	if (READ_ONCE(unix_tot_inflight))
748 		unix_gc();		/* Garbage collect fds */
749 }
750 
751 struct unix_peercred {
752 	struct pid *peer_pid;
753 	const struct cred *peer_cred;
754 };
755 
756 static inline int prepare_peercred(struct unix_peercred *peercred)
757 {
758 	struct pid *pid;
759 	int err;
760 
761 	pid = task_tgid(current);
762 	err = pidfs_register_pid(pid);
763 	if (likely(!err)) {
764 		peercred->peer_pid = get_pid(pid);
765 		peercred->peer_cred = get_current_cred();
766 	}
767 	return err;
768 }
769 
770 static void drop_peercred(struct unix_peercred *peercred)
771 {
772 	const struct cred *cred = NULL;
773 	struct pid *pid = NULL;
774 
775 	might_sleep();
776 
777 	swap(peercred->peer_pid, pid);
778 	swap(peercred->peer_cred, cred);
779 
780 	put_pid(pid);
781 	put_cred(cred);
782 }
783 
784 static inline void init_peercred(struct sock *sk,
785 				 const struct unix_peercred *peercred)
786 {
787 	sk->sk_peer_pid = peercred->peer_pid;
788 	sk->sk_peer_cred = peercred->peer_cred;
789 }
790 
791 static void update_peercred(struct sock *sk, struct unix_peercred *peercred)
792 {
793 	const struct cred *old_cred;
794 	struct pid *old_pid;
795 
796 	spin_lock(&sk->sk_peer_lock);
797 	old_pid = sk->sk_peer_pid;
798 	old_cred = sk->sk_peer_cred;
799 	init_peercred(sk, peercred);
800 	spin_unlock(&sk->sk_peer_lock);
801 
802 	peercred->peer_pid = old_pid;
803 	peercred->peer_cred = old_cred;
804 }
805 
806 static void copy_peercred(struct sock *sk, struct sock *peersk)
807 {
808 	lockdep_assert_held(&unix_sk(peersk)->lock);
809 
810 	spin_lock(&sk->sk_peer_lock);
811 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
812 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
813 	spin_unlock(&sk->sk_peer_lock);
814 }
815 
816 static bool unix_may_passcred(const struct sock *sk)
817 {
818 	return sk->sk_scm_credentials || sk->sk_scm_pidfd;
819 }
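
/* Illustrative userspace sketch (not part of this file): the receive-side
 * credential options that, to the best of our reading, correspond to the two
 * flags tested above; with either set on an unbound socket, connect() below
 * autobinds it first.
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));	// SCM_CREDENTIALS
 *	setsockopt(fd, SOL_SOCKET, SO_PASSPIDFD, &on, sizeof(on));	// SCM_PIDFD
 */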
820 
821 static int unix_listen(struct socket *sock, int backlog)
822 {
823 	int err;
824 	struct sock *sk = sock->sk;
825 	struct unix_sock *u = unix_sk(sk);
826 	struct unix_peercred peercred = {};
827 
828 	err = -EOPNOTSUPP;
829 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
830 		goto out;	/* Only stream/seqpacket sockets accept */
831 	err = -EINVAL;
832 	if (!READ_ONCE(u->addr))
833 		goto out;	/* No listens on an unbound socket */
834 	err = prepare_peercred(&peercred);
835 	if (err)
836 		goto out;
837 	unix_state_lock(sk);
838 	if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
839 		goto out_unlock;
840 	if (backlog > sk->sk_max_ack_backlog)
841 		wake_up_interruptible_all(&u->peer_wait);
842 	sk->sk_max_ack_backlog	= backlog;
843 	WRITE_ONCE(sk->sk_state, TCP_LISTEN);
844 
845 	/* set credentials so connect can copy them */
846 	update_peercred(sk, &peercred);
847 	err = 0;
848 
849 out_unlock:
850 	unix_state_unlock(sk);
851 	drop_peercred(&peercred);
852 out:
853 	return err;
854 }
855 
856 static int unix_release(struct socket *);
857 static int unix_bind(struct socket *, struct sockaddr *, int);
858 static int unix_stream_connect(struct socket *, struct sockaddr *,
859 			       int addr_len, int flags);
860 static int unix_socketpair(struct socket *, struct socket *);
861 static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
862 static int unix_getname(struct socket *, struct sockaddr *, int);
863 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
864 static __poll_t unix_dgram_poll(struct file *, struct socket *,
865 				    poll_table *);
866 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
867 #ifdef CONFIG_COMPAT
868 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
869 #endif
870 static int unix_shutdown(struct socket *, int);
871 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
872 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
873 static ssize_t unix_stream_splice_read(struct socket *,  loff_t *ppos,
874 				       struct pipe_inode_info *, size_t size,
875 				       unsigned int flags);
876 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
877 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
878 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
879 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
880 static int unix_dgram_connect(struct socket *, struct sockaddr *,
881 			      int, int);
882 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
883 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
884 				  int);
885 
886 #ifdef CONFIG_PROC_FS
887 static int unix_count_nr_fds(struct sock *sk)
888 {
889 	struct sk_buff *skb;
890 	struct unix_sock *u;
891 	int nr_fds = 0;
892 
893 	spin_lock(&sk->sk_receive_queue.lock);
894 	skb = skb_peek(&sk->sk_receive_queue);
895 	while (skb) {
896 		u = unix_sk(skb->sk);
897 		nr_fds += atomic_read(&u->scm_stat.nr_fds);
898 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
899 	}
900 	spin_unlock(&sk->sk_receive_queue.lock);
901 
902 	return nr_fds;
903 }
904 
905 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
906 {
907 	struct sock *sk = sock->sk;
908 	unsigned char s_state;
909 	struct unix_sock *u;
910 	int nr_fds = 0;
911 
912 	if (sk) {
913 		s_state = READ_ONCE(sk->sk_state);
914 		u = unix_sk(sk);
915 
916 		/* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
917 		 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
918 		 * SOCK_DGRAM is ordinary. So, no lock is needed.
919 		 */
920 		if (sock->type == SOCK_DGRAM || s_state == TCP_ESTABLISHED)
921 			nr_fds = atomic_read(&u->scm_stat.nr_fds);
922 		else if (s_state == TCP_LISTEN)
923 			nr_fds = unix_count_nr_fds(sk);
924 
925 		seq_printf(m, "scm_fds: %u\n", nr_fds);
926 	}
927 }
928 #else
929 #define unix_show_fdinfo NULL
930 #endif
931 
932 static bool unix_custom_sockopt(int optname)
933 {
934 	switch (optname) {
935 	case SO_INQ:
936 		return true;
937 	default:
938 		return false;
939 	}
940 }
941 
942 static int unix_setsockopt(struct socket *sock, int level, int optname,
943 			   sockptr_t optval, unsigned int optlen)
944 {
945 	struct unix_sock *u = unix_sk(sock->sk);
946 	struct sock *sk = sock->sk;
947 	int val;
948 
949 	if (level != SOL_SOCKET)
950 		return -EOPNOTSUPP;
951 
952 	if (!unix_custom_sockopt(optname))
953 		return sock_setsockopt(sock, level, optname, optval, optlen);
954 
955 	if (optlen != sizeof(int))
956 		return -EINVAL;
957 
958 	if (copy_from_sockptr(&val, optval, sizeof(val)))
959 		return -EFAULT;
960 
961 	switch (optname) {
962 	case SO_INQ:
963 		if (sk->sk_type != SOCK_STREAM)
964 			return -EINVAL;
965 
966 		if (val > 1 || val < 0)
967 			return -EINVAL;
968 
969 		WRITE_ONCE(u->recvmsg_inq, val);
970 		break;
971 	default:
972 		return -ENOPROTOOPT;
973 	}
974 
975 	return 0;
976 }
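
/* Illustrative userspace sketch (not part of this file): the only option
 * handled privately above.  SO_INQ takes a boolean int and is accepted for
 * SOCK_STREAM sockets only; everything else falls through to
 * sock_setsockopt().
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_INQ, &on, sizeof(on));
 */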
977 
978 static const struct proto_ops unix_stream_ops = {
979 	.family =	PF_UNIX,
980 	.owner =	THIS_MODULE,
981 	.release =	unix_release,
982 	.bind =		unix_bind,
983 	.connect =	unix_stream_connect,
984 	.socketpair =	unix_socketpair,
985 	.accept =	unix_accept,
986 	.getname =	unix_getname,
987 	.poll =		unix_poll,
988 	.ioctl =	unix_ioctl,
989 #ifdef CONFIG_COMPAT
990 	.compat_ioctl =	unix_compat_ioctl,
991 #endif
992 	.listen =	unix_listen,
993 	.shutdown =	unix_shutdown,
994 	.setsockopt =	unix_setsockopt,
995 	.sendmsg =	unix_stream_sendmsg,
996 	.recvmsg =	unix_stream_recvmsg,
997 	.read_skb =	unix_stream_read_skb,
998 	.mmap =		sock_no_mmap,
999 	.splice_read =	unix_stream_splice_read,
1000 	.set_peek_off =	sk_set_peek_off,
1001 	.show_fdinfo =	unix_show_fdinfo,
1002 };
1003 
1004 static const struct proto_ops unix_dgram_ops = {
1005 	.family =	PF_UNIX,
1006 	.owner =	THIS_MODULE,
1007 	.release =	unix_release,
1008 	.bind =		unix_bind,
1009 	.connect =	unix_dgram_connect,
1010 	.socketpair =	unix_socketpair,
1011 	.accept =	sock_no_accept,
1012 	.getname =	unix_getname,
1013 	.poll =		unix_dgram_poll,
1014 	.ioctl =	unix_ioctl,
1015 #ifdef CONFIG_COMPAT
1016 	.compat_ioctl =	unix_compat_ioctl,
1017 #endif
1018 	.listen =	sock_no_listen,
1019 	.shutdown =	unix_shutdown,
1020 	.sendmsg =	unix_dgram_sendmsg,
1021 	.read_skb =	unix_read_skb,
1022 	.recvmsg =	unix_dgram_recvmsg,
1023 	.mmap =		sock_no_mmap,
1024 	.set_peek_off =	sk_set_peek_off,
1025 	.show_fdinfo =	unix_show_fdinfo,
1026 };
1027 
1028 static const struct proto_ops unix_seqpacket_ops = {
1029 	.family =	PF_UNIX,
1030 	.owner =	THIS_MODULE,
1031 	.release =	unix_release,
1032 	.bind =		unix_bind,
1033 	.connect =	unix_stream_connect,
1034 	.socketpair =	unix_socketpair,
1035 	.accept =	unix_accept,
1036 	.getname =	unix_getname,
1037 	.poll =		unix_dgram_poll,
1038 	.ioctl =	unix_ioctl,
1039 #ifdef CONFIG_COMPAT
1040 	.compat_ioctl =	unix_compat_ioctl,
1041 #endif
1042 	.listen =	unix_listen,
1043 	.shutdown =	unix_shutdown,
1044 	.sendmsg =	unix_seqpacket_sendmsg,
1045 	.recvmsg =	unix_seqpacket_recvmsg,
1046 	.mmap =		sock_no_mmap,
1047 	.set_peek_off =	sk_set_peek_off,
1048 	.show_fdinfo =	unix_show_fdinfo,
1049 };
1050 
1051 static void unix_close(struct sock *sk, long timeout)
1052 {
1053 	/* Nothing to do here, unix socket does not need a ->close().
1054 	 * This is merely for sockmap.
1055 	 */
1056 }
1057 
1058 static bool unix_bpf_bypass_getsockopt(int level, int optname)
1059 {
1060 	if (level == SOL_SOCKET) {
1061 		switch (optname) {
1062 		case SO_PEERPIDFD:
1063 			return true;
1064 		default:
1065 			return false;
1066 		}
1067 	}
1068 
1069 	return false;
1070 }
1071 
1072 struct proto unix_dgram_proto = {
1073 	.name			= "UNIX",
1074 	.owner			= THIS_MODULE,
1075 	.obj_size		= sizeof(struct unix_sock),
1076 	.close			= unix_close,
1077 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1078 #ifdef CONFIG_BPF_SYSCALL
1079 	.psock_update_sk_prot	= unix_dgram_bpf_update_proto,
1080 #endif
1081 };
1082 
1083 struct proto unix_stream_proto = {
1084 	.name			= "UNIX-STREAM",
1085 	.owner			= THIS_MODULE,
1086 	.obj_size		= sizeof(struct unix_sock),
1087 	.close			= unix_close,
1088 	.bpf_bypass_getsockopt	= unix_bpf_bypass_getsockopt,
1089 #ifdef CONFIG_BPF_SYSCALL
1090 	.psock_update_sk_prot	= unix_stream_bpf_update_proto,
1091 #endif
1092 };
1093 
1094 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
1095 {
1096 	struct unix_sock *u;
1097 	struct sock *sk;
1098 	int err;
1099 
1100 	atomic_long_inc(&unix_nr_socks);
1101 	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
1102 		err = -ENFILE;
1103 		goto err;
1104 	}
1105 
1106 	if (type == SOCK_STREAM)
1107 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
1108 	else /*dgram and  seqpacket */
1109 		sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
1110 
1111 	if (!sk) {
1112 		err = -ENOMEM;
1113 		goto err;
1114 	}
1115 
1116 	sock_init_data(sock, sk);
1117 
1118 	sk->sk_scm_rights	= 1;
1119 	sk->sk_hash		= unix_unbound_hash(sk);
1120 	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
1121 	sk->sk_write_space	= unix_write_space;
1122 	sk->sk_max_ack_backlog	= READ_ONCE(net->unx.sysctl_max_dgram_qlen);
1123 	sk->sk_destruct		= unix_sock_destructor;
1124 	lock_set_cmp_fn(&sk->sk_receive_queue.lock, unix_recvq_lock_cmp_fn, NULL);
1125 
1126 	u = unix_sk(sk);
1127 	u->listener = NULL;
1128 	u->vertex = NULL;
1129 	u->path.dentry = NULL;
1130 	u->path.mnt = NULL;
1131 	spin_lock_init(&u->lock);
1132 	lock_set_cmp_fn(&u->lock, unix_state_lock_cmp_fn, NULL);
1133 	mutex_init(&u->iolock); /* single task reading lock */
1134 	mutex_init(&u->bindlock); /* single task binding lock */
1135 	init_waitqueue_head(&u->peer_wait);
1136 	init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
1137 	memset(&u->scm_stat, 0, sizeof(struct scm_stat));
1138 	unix_insert_unbound_socket(net, sk);
1139 
1140 	sock_prot_inuse_add(net, sk->sk_prot, 1);
1141 
1142 	return sk;
1143 
1144 err:
1145 	atomic_long_dec(&unix_nr_socks);
1146 	return ERR_PTR(err);
1147 }
1148 
1149 static int unix_create(struct net *net, struct socket *sock, int protocol,
1150 		       int kern)
1151 {
1152 	struct sock *sk;
1153 
1154 	if (protocol && protocol != PF_UNIX)
1155 		return -EPROTONOSUPPORT;
1156 
1157 	sock->state = SS_UNCONNECTED;
1158 
1159 	switch (sock->type) {
1160 	case SOCK_STREAM:
1161 		set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags);
1162 		sock->ops = &unix_stream_ops;
1163 		break;
1164 		/*
1165 		 *	Believe it or not BSD has AF_UNIX, SOCK_RAW though
1166 		 *	nothing uses it.
1167 		 */
1168 	case SOCK_RAW:
1169 		sock->type = SOCK_DGRAM;
1170 		fallthrough;
1171 	case SOCK_DGRAM:
1172 		sock->ops = &unix_dgram_ops;
1173 		break;
1174 	case SOCK_SEQPACKET:
1175 		sock->ops = &unix_seqpacket_ops;
1176 		break;
1177 	default:
1178 		return -ESOCKTNOSUPPORT;
1179 	}
1180 
1181 	sk = unix_create1(net, sock, kern, sock->type);
1182 	if (IS_ERR(sk))
1183 		return PTR_ERR(sk);
1184 
1185 	return 0;
1186 }
1187 
1188 static int unix_release(struct socket *sock)
1189 {
1190 	struct sock *sk = sock->sk;
1191 
1192 	if (!sk)
1193 		return 0;
1194 
1195 	sk->sk_prot->close(sk, 0);
1196 	unix_release_sock(sk, 0);
1197 	sock->sk = NULL;
1198 
1199 	return 0;
1200 }
1201 
1202 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1203 				  int type, int flags)
1204 {
1205 	struct inode *inode;
1206 	struct path path;
1207 	struct sock *sk;
1208 	int err;
1209 
1210 	unix_mkname_bsd(sunaddr, addr_len);
1211 
1212 	if (flags & SOCK_COREDUMP) {
1213 		const struct cred *cred;
1214 		struct cred *kcred;
1215 		struct path root;
1216 
1217 		kcred = prepare_kernel_cred(&init_task);
1218 		if (!kcred) {
1219 			err = -ENOMEM;
1220 			goto fail;
1221 		}
1222 
1223 		task_lock(&init_task);
1224 		get_fs_root(init_task.fs, &root);
1225 		task_unlock(&init_task);
1226 
1227 		cred = override_creds(kcred);
1228 		err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path,
1229 				      LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS |
1230 				      LOOKUP_NO_MAGICLINKS, &path);
1231 		put_cred(revert_creds(cred));
1232 		path_put(&root);
1233 		if (err)
1234 			goto fail;
1235 	} else {
1236 		err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1237 		if (err)
1238 			goto fail;
1239 
1240 		err = path_permission(&path, MAY_WRITE);
1241 		if (err)
1242 			goto path_put;
1243 	}
1244 
1245 	err = -ECONNREFUSED;
1246 	inode = d_backing_inode(path.dentry);
1247 	if (!S_ISSOCK(inode->i_mode))
1248 		goto path_put;
1249 
1250 	sk = unix_find_socket_byinode(inode);
1251 	if (!sk)
1252 		goto path_put;
1253 
1254 	err = -EPROTOTYPE;
1255 	if (sk->sk_type == type)
1256 		touch_atime(&path);
1257 	else
1258 		goto sock_put;
1259 
1260 	path_put(&path);
1261 
1262 	return sk;
1263 
1264 sock_put:
1265 	sock_put(sk);
1266 path_put:
1267 	path_put(&path);
1268 fail:
1269 	return ERR_PTR(err);
1270 }
1271 
1272 static struct sock *unix_find_abstract(struct net *net,
1273 				       struct sockaddr_un *sunaddr,
1274 				       int addr_len, int type)
1275 {
1276 	unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1277 	struct dentry *dentry;
1278 	struct sock *sk;
1279 
1280 	sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1281 	if (!sk)
1282 		return ERR_PTR(-ECONNREFUSED);
1283 
1284 	dentry = unix_sk(sk)->path.dentry;
1285 	if (dentry)
1286 		touch_atime(&unix_sk(sk)->path);
1287 
1288 	return sk;
1289 }
1290 
1291 static struct sock *unix_find_other(struct net *net,
1292 				    struct sockaddr_un *sunaddr,
1293 				    int addr_len, int type, int flags)
1294 {
1295 	struct sock *sk;
1296 
1297 	if (sunaddr->sun_path[0])
1298 		sk = unix_find_bsd(sunaddr, addr_len, type, flags);
1299 	else
1300 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
1301 
1302 	return sk;
1303 }
1304 
1305 static int unix_autobind(struct sock *sk)
1306 {
1307 	struct unix_sock *u = unix_sk(sk);
1308 	unsigned int new_hash, old_hash;
1309 	struct net *net = sock_net(sk);
1310 	struct unix_address *addr;
1311 	u32 lastnum, ordernum;
1312 	int err;
1313 
1314 	err = mutex_lock_interruptible(&u->bindlock);
1315 	if (err)
1316 		return err;
1317 
1318 	if (u->addr)
1319 		goto out;
1320 
1321 	err = -ENOMEM;
1322 	addr = kzalloc(sizeof(*addr) +
1323 		       offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1324 	if (!addr)
1325 		goto out;
1326 
1327 	addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1328 	addr->name->sun_family = AF_UNIX;
1329 	refcount_set(&addr->refcnt, 1);
1330 
1331 	old_hash = sk->sk_hash;
1332 	ordernum = get_random_u32();
1333 	lastnum = ordernum & 0xFFFFF;
1334 retry:
1335 	ordernum = (ordernum + 1) & 0xFFFFF;
1336 	sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1337 
1338 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1339 	unix_table_double_lock(net, old_hash, new_hash);
1340 
1341 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1342 		unix_table_double_unlock(net, old_hash, new_hash);
1343 
1344 		/* __unix_find_socket_byname() may take a long time if many names
1345 		 * are already in use.
1346 		 */
1347 		cond_resched();
1348 
1349 		if (ordernum == lastnum) {
1350 			/* Give up if all names seem to be in use. */
1351 			err = -ENOSPC;
1352 			unix_release_addr(addr);
1353 			goto out;
1354 		}
1355 
1356 		goto retry;
1357 	}
1358 
1359 	__unix_set_addr_hash(net, sk, addr, new_hash);
1360 	unix_table_double_unlock(net, old_hash, new_hash);
1361 	err = 0;
1362 
1363 out:	mutex_unlock(&u->bindlock);
1364 	return err;
1365 }
1366 
1367 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1368 			 int addr_len)
1369 {
1370 	umode_t mode = S_IFSOCK |
1371 	       (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1372 	struct unix_sock *u = unix_sk(sk);
1373 	unsigned int new_hash, old_hash;
1374 	struct net *net = sock_net(sk);
1375 	struct mnt_idmap *idmap;
1376 	struct unix_address *addr;
1377 	struct dentry *dentry;
1378 	struct path parent;
1379 	int err;
1380 
1381 	addr_len = unix_mkname_bsd(sunaddr, addr_len);
1382 	addr = unix_create_addr(sunaddr, addr_len);
1383 	if (!addr)
1384 		return -ENOMEM;
1385 
1386 	/*
1387 	 * Get the parent directory, calculate the hash for last
1388 	 * component.
1389 	 */
1390 	dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1391 	if (IS_ERR(dentry)) {
1392 		err = PTR_ERR(dentry);
1393 		goto out;
1394 	}
1395 
1396 	/*
1397 	 * All right, let's create it.
1398 	 */
1399 	idmap = mnt_idmap(parent.mnt);
1400 	err = security_path_mknod(&parent, dentry, mode, 0);
1401 	if (!err)
1402 		err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0);
1403 	if (err)
1404 		goto out_path;
1405 	err = mutex_lock_interruptible(&u->bindlock);
1406 	if (err)
1407 		goto out_unlink;
1408 	if (u->addr)
1409 		goto out_unlock;
1410 
1411 	old_hash = sk->sk_hash;
1412 	new_hash = unix_bsd_hash(d_backing_inode(dentry));
1413 	unix_table_double_lock(net, old_hash, new_hash);
1414 	u->path.mnt = mntget(parent.mnt);
1415 	u->path.dentry = dget(dentry);
1416 	__unix_set_addr_hash(net, sk, addr, new_hash);
1417 	unix_table_double_unlock(net, old_hash, new_hash);
1418 	unix_insert_bsd_socket(sk);
1419 	mutex_unlock(&u->bindlock);
1420 	done_path_create(&parent, dentry);
1421 	return 0;
1422 
1423 out_unlock:
1424 	mutex_unlock(&u->bindlock);
1425 	err = -EINVAL;
1426 out_unlink:
1427 	/* failed after successful mknod?  unlink what we'd created... */
1428 	vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL);
1429 out_path:
1430 	done_path_create(&parent, dentry);
1431 out:
1432 	unix_release_addr(addr);
1433 	return err == -EEXIST ? -EADDRINUSE : err;
1434 }
1435 
1436 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1437 			      int addr_len)
1438 {
1439 	struct unix_sock *u = unix_sk(sk);
1440 	unsigned int new_hash, old_hash;
1441 	struct net *net = sock_net(sk);
1442 	struct unix_address *addr;
1443 	int err;
1444 
1445 	addr = unix_create_addr(sunaddr, addr_len);
1446 	if (!addr)
1447 		return -ENOMEM;
1448 
1449 	err = mutex_lock_interruptible(&u->bindlock);
1450 	if (err)
1451 		goto out;
1452 
1453 	if (u->addr) {
1454 		err = -EINVAL;
1455 		goto out_mutex;
1456 	}
1457 
1458 	old_hash = sk->sk_hash;
1459 	new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1460 	unix_table_double_lock(net, old_hash, new_hash);
1461 
1462 	if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1463 		goto out_spin;
1464 
1465 	__unix_set_addr_hash(net, sk, addr, new_hash);
1466 	unix_table_double_unlock(net, old_hash, new_hash);
1467 	mutex_unlock(&u->bindlock);
1468 	return 0;
1469 
1470 out_spin:
1471 	unix_table_double_unlock(net, old_hash, new_hash);
1472 	err = -EADDRINUSE;
1473 out_mutex:
1474 	mutex_unlock(&u->bindlock);
1475 out:
1476 	unix_release_addr(addr);
1477 	return err;
1478 }
1479 
1480 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1481 {
1482 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1483 	struct sock *sk = sock->sk;
1484 	int err;
1485 
1486 	if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1487 	    sunaddr->sun_family == AF_UNIX)
1488 		return unix_autobind(sk);
1489 
1490 	err = unix_validate_addr(sunaddr, addr_len);
1491 	if (err)
1492 		return err;
1493 
1494 	if (sunaddr->sun_path[0])
1495 		err = unix_bind_bsd(sk, sunaddr, addr_len);
1496 	else
1497 		err = unix_bind_abstract(sk, sunaddr, addr_len);
1498 
1499 	return err;
1500 }
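
/* Illustrative userspace sketch (not part of this file): passing only the
 * address family (addr_len == offsetof(struct sockaddr_un, sun_path)) takes
 * the unix_autobind() path above, which picks an abstract "\0" + five hex
 * digit name.
 *
 *	struct sockaddr_un a = { .sun_family = AF_UNIX };
 *	bind(fd, (struct sockaddr *)&a, sizeof(sa_family_t));
 */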
1501 
1502 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1503 {
1504 	if (unlikely(sk1 == sk2) || !sk2) {
1505 		unix_state_lock(sk1);
1506 		return;
1507 	}
1508 
1509 	if (sk1 > sk2)
1510 		swap(sk1, sk2);
1511 
1512 	unix_state_lock(sk1);
1513 	unix_state_lock(sk2);
1514 }
1515 
1516 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1517 {
1518 	if (unlikely(sk1 == sk2) || !sk2) {
1519 		unix_state_unlock(sk1);
1520 		return;
1521 	}
1522 	unix_state_unlock(sk1);
1523 	unix_state_unlock(sk2);
1524 }
1525 
1526 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1527 			      int alen, int flags)
1528 {
1529 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1530 	struct sock *sk = sock->sk;
1531 	struct sock *other;
1532 	int err;
1533 
1534 	err = -EINVAL;
1535 	if (alen < offsetofend(struct sockaddr, sa_family))
1536 		goto out;
1537 
1538 	if (addr->sa_family != AF_UNSPEC) {
1539 		err = unix_validate_addr(sunaddr, alen);
1540 		if (err)
1541 			goto out;
1542 
1543 		err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, addr, &alen);
1544 		if (err)
1545 			goto out;
1546 
1547 		if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) {
1548 			err = unix_autobind(sk);
1549 			if (err)
1550 				goto out;
1551 		}
1552 
1553 restart:
1554 		other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0);
1555 		if (IS_ERR(other)) {
1556 			err = PTR_ERR(other);
1557 			goto out;
1558 		}
1559 
1560 		unix_state_double_lock(sk, other);
1561 
1562 		/* Apparently VFS overslept socket death. Retry. */
1563 		if (sock_flag(other, SOCK_DEAD)) {
1564 			unix_state_double_unlock(sk, other);
1565 			sock_put(other);
1566 			goto restart;
1567 		}
1568 
1569 		err = -EPERM;
1570 		if (!unix_may_send(sk, other))
1571 			goto out_unlock;
1572 
1573 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1574 		if (err)
1575 			goto out_unlock;
1576 
1577 		WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1578 		WRITE_ONCE(other->sk_state, TCP_ESTABLISHED);
1579 	} else {
1580 		/*
1581 		 *	1003.1g breaking connected state with AF_UNSPEC
1582 		 */
1583 		other = NULL;
1584 		unix_state_double_lock(sk, other);
1585 	}
1586 
1587 	/*
1588 	 * If it was connected, reconnect.
1589 	 */
1590 	if (unix_peer(sk)) {
1591 		struct sock *old_peer = unix_peer(sk);
1592 
1593 		unix_peer(sk) = other;
1594 		if (!other)
1595 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
1596 		unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1597 
1598 		unix_state_double_unlock(sk, other);
1599 
1600 		if (other != old_peer) {
1601 			unix_dgram_disconnected(sk, old_peer);
1602 
1603 			unix_state_lock(old_peer);
1604 			if (!unix_peer(old_peer))
1605 				WRITE_ONCE(old_peer->sk_state, TCP_CLOSE);
1606 			unix_state_unlock(old_peer);
1607 		}
1608 
1609 		sock_put(old_peer);
1610 	} else {
1611 		unix_peer(sk) = other;
1612 		unix_state_double_unlock(sk, other);
1613 	}
1614 
1615 	return 0;
1616 
1617 out_unlock:
1618 	unix_state_double_unlock(sk, other);
1619 	sock_put(other);
1620 out:
1621 	return err;
1622 }
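
/* Illustrative userspace sketch (not part of this file): per the 1003.1g
 * note above, a connected dgram socket is disconnected by "connecting" to
 * the AF_UNSPEC family.
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *	connect(dfd, &sa, sizeof(sa));
 */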
1623 
1624 static long unix_wait_for_peer(struct sock *other, long timeo)
1625 {
1626 	struct unix_sock *u = unix_sk(other);
1627 	int sched;
1628 	DEFINE_WAIT(wait);
1629 
1630 	prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1631 
1632 	sched = !sock_flag(other, SOCK_DEAD) &&
1633 		!(other->sk_shutdown & RCV_SHUTDOWN) &&
1634 		unix_recvq_full_lockless(other);
1635 
1636 	unix_state_unlock(other);
1637 
1638 	if (sched)
1639 		timeo = schedule_timeout(timeo);
1640 
1641 	finish_wait(&u->peer_wait, &wait);
1642 	return timeo;
1643 }
1644 
1645 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1646 			       int addr_len, int flags)
1647 {
1648 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1649 	struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1650 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1651 	struct unix_peercred peercred = {};
1652 	struct net *net = sock_net(sk);
1653 	struct sk_buff *skb = NULL;
1654 	unsigned char state;
1655 	long timeo;
1656 	int err;
1657 
1658 	err = unix_validate_addr(sunaddr, addr_len);
1659 	if (err)
1660 		goto out;
1661 
1662 	err = BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, &addr_len);
1663 	if (err)
1664 		goto out;
1665 
1666 	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
1667 		err = unix_autobind(sk);
1668 		if (err)
1669 			goto out;
1670 	}
1671 
1672 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1673 
1674 	/* First of all allocate resources.
1675 	 * If we did this after the state is locked,
1676 	 * we would have to recheck everything again in any case.
1677 	 */
1678 
1679 	/* create new sock for complete connection */
1680 	newsk = unix_create1(net, NULL, 0, sock->type);
1681 	if (IS_ERR(newsk)) {
1682 		err = PTR_ERR(newsk);
1683 		goto out;
1684 	}
1685 
1686 	err = prepare_peercred(&peercred);
1687 	if (err)
1688 		goto out;
1689 
1690 	/* Allocate skb for sending to listening sock */
1691 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1692 	if (!skb) {
1693 		err = -ENOMEM;
1694 		goto out_free_sk;
1695 	}
1696 
1697 restart:
1698 	/*  Find listening sock. */
1699 	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags);
1700 	if (IS_ERR(other)) {
1701 		err = PTR_ERR(other);
1702 		goto out_free_skb;
1703 	}
1704 
1705 	unix_state_lock(other);
1706 
1707 	/* Apparently VFS overslept socket death. Retry. */
1708 	if (sock_flag(other, SOCK_DEAD)) {
1709 		unix_state_unlock(other);
1710 		sock_put(other);
1711 		goto restart;
1712 	}
1713 
1714 	if (other->sk_state != TCP_LISTEN ||
1715 	    other->sk_shutdown & RCV_SHUTDOWN) {
1716 		err = -ECONNREFUSED;
1717 		goto out_unlock;
1718 	}
1719 
1720 	if (unix_recvq_full_lockless(other)) {
1721 		if (!timeo) {
1722 			err = -EAGAIN;
1723 			goto out_unlock;
1724 		}
1725 
1726 		timeo = unix_wait_for_peer(other, timeo);
1727 		sock_put(other);
1728 
1729 		err = sock_intr_errno(timeo);
1730 		if (signal_pending(current))
1731 			goto out_free_skb;
1732 
1733 		goto restart;
1734 	}
1735 
1736 	/* self connect and simultaneous connect are eliminated
1737 	 * by rejecting TCP_LISTEN socket to avoid deadlock.
1738 	 */
1739 	state = READ_ONCE(sk->sk_state);
1740 	if (unlikely(state != TCP_CLOSE)) {
1741 		err = state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1742 		goto out_unlock;
1743 	}
1744 
1745 	unix_state_lock(sk);
1746 
1747 	if (unlikely(sk->sk_state != TCP_CLOSE)) {
1748 		err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EINVAL;
1749 		unix_state_unlock(sk);
1750 		goto out_unlock;
1751 	}
1752 
1753 	err = security_unix_stream_connect(sk, other, newsk);
1754 	if (err) {
1755 		unix_state_unlock(sk);
1756 		goto out_unlock;
1757 	}
1758 
1759 	/* The way is open! Quickly set all the necessary fields... */
1760 
1761 	sock_hold(sk);
1762 	unix_peer(newsk) = sk;
1763 	newsk->sk_state = TCP_ESTABLISHED;
1764 	newsk->sk_type = sk->sk_type;
1765 	newsk->sk_scm_recv_flags = other->sk_scm_recv_flags;
1766 	init_peercred(newsk, &peercred);
1767 
1768 	newu = unix_sk(newsk);
1769 	newu->listener = other;
1770 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1771 	otheru = unix_sk(other);
1772 
1773 	/* copy address information from listening to new sock
1774 	 *
1775 	 * The contents of *(otheru->addr) and otheru->path
1776 	 * are seen fully set up here, since we have found
1777 	 * otheru in hash under its lock.  Insertion into the
1778 	 * hash chain we'd found it in had been done in an
1779 	 * earlier critical area protected by the chain's lock,
1780 	 * the same one where we'd set *(otheru->addr) contents,
1781 	 * as well as otheru->path and otheru->addr itself.
1782 	 *
1783 	 * Using smp_store_release() here to set newu->addr
1784 	 * is enough to make those stores, as well as stores
1785 	 * to newu->path visible to anyone who gets newu->addr
1786 	 * by smp_load_acquire().  IOW, the same warranties
1787 	 * as for unix_sock instances bound in unix_bind() or
1788 	 * in unix_autobind().
1789 	 */
1790 	if (otheru->path.dentry) {
1791 		path_get(&otheru->path);
1792 		newu->path = otheru->path;
1793 	}
1794 	refcount_inc(&otheru->addr->refcnt);
1795 	smp_store_release(&newu->addr, otheru->addr);
1796 
1797 	/* Set credentials */
1798 	copy_peercred(sk, other);
1799 
1800 	sock->state	= SS_CONNECTED;
1801 	WRITE_ONCE(sk->sk_state, TCP_ESTABLISHED);
1802 	sock_hold(newsk);
1803 
1804 	smp_mb__after_atomic();	/* sock_hold() does an atomic_inc() */
1805 	unix_peer(sk)	= newsk;
1806 
1807 	unix_state_unlock(sk);
1808 
1809 	/* take ten and send info to listening sock */
1810 	spin_lock(&other->sk_receive_queue.lock);
1811 	__skb_queue_tail(&other->sk_receive_queue, skb);
1812 	spin_unlock(&other->sk_receive_queue.lock);
1813 	unix_state_unlock(other);
1814 	other->sk_data_ready(other);
1815 	sock_put(other);
1816 	return 0;
1817 
1818 out_unlock:
1819 	unix_state_unlock(other);
1820 	sock_put(other);
1821 out_free_skb:
1822 	consume_skb(skb);
1823 out_free_sk:
1824 	unix_release_sock(newsk, 0);
1825 out:
1826 	drop_peercred(&peercred);
1827 	return err;
1828 }
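/* Illustrative userspace sketch (not part of this file): the path above
 * serves connect(2) on a SOCK_STREAM AF_UNIX socket; the socket path below
 * is an assumption used only for the example.  With O_NONBLOCK set and the
 * listener's backlog full, the code above fails with -EAGAIN instead of
 * blocking in unix_wait_for_peer().
 *
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *	#include <string.h>
 *
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *	struct sockaddr_un sa = { .sun_family = AF_UNIX };
 *
 *	strcpy(sa.sun_path, "/run/example.sock");
 *	if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
 *		perror("connect");	// e.g. EAGAIN when nonblocking and backlog full
 */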
1829 
1830 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1831 {
1832 	struct unix_peercred ska_peercred = {}, skb_peercred = {};
1833 	struct sock *ska = socka->sk, *skb = sockb->sk;
1834 	int err;
1835 
1836 	err = prepare_peercred(&ska_peercred);
1837 	if (err)
1838 		return err;
1839 
1840 	err = prepare_peercred(&skb_peercred);
1841 	if (err) {
1842 		drop_peercred(&ska_peercred);
1843 		return err;
1844 	}
1845 
1846 	/* Join our sockets back to back */
1847 	sock_hold(ska);
1848 	sock_hold(skb);
1849 	unix_peer(ska) = skb;
1850 	unix_peer(skb) = ska;
1851 	init_peercred(ska, &ska_peercred);
1852 	init_peercred(skb, &skb_peercred);
1853 
1854 	ska->sk_state = TCP_ESTABLISHED;
1855 	skb->sk_state = TCP_ESTABLISHED;
1856 	socka->state  = SS_CONNECTED;
1857 	sockb->state  = SS_CONNECTED;
1858 	return 0;
1859 }
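/* Illustrative userspace sketch (not part of this file): unix_socketpair()
 * above is reached via socketpair(2):
 *
 *	#include <sys/socket.h>
 *
 *	int sv[2];
 *
 *	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0) {
 *		// sv[0] and sv[1] are already connected peers; both sockets
 *		// start out in TCP_ESTABLISHED / SS_CONNECTED as set above.
 *	}
 */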
1860 
1861 static int unix_accept(struct socket *sock, struct socket *newsock,
1862 		       struct proto_accept_arg *arg)
1863 {
1864 	struct sock *sk = sock->sk;
1865 	struct sk_buff *skb;
1866 	struct sock *tsk;
1867 
1868 	arg->err = -EOPNOTSUPP;
1869 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1870 		goto out;
1871 
1872 	arg->err = -EINVAL;
1873 	if (READ_ONCE(sk->sk_state) != TCP_LISTEN)
1874 		goto out;
1875 
1876 	/* If socket state is TCP_LISTEN it cannot change (for now...),
1877 	 * so that no locks are necessary.
1878 	 */
1879 
1880 	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1881 				&arg->err);
1882 	if (!skb) {
1883 		/* This means receive shutdown. */
1884 		if (arg->err == 0)
1885 			arg->err = -EINVAL;
1886 		goto out;
1887 	}
1888 
1889 	tsk = skb->sk;
1890 	skb_free_datagram(sk, skb);
1891 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
1892 
1893 	if (tsk->sk_type == SOCK_STREAM)
1894 		set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags);
1895 
1896 	/* attach accepted sock to socket */
1897 	unix_state_lock(tsk);
1898 	unix_update_edges(unix_sk(tsk));
1899 	newsock->state = SS_CONNECTED;
1900 	sock_graft(tsk, newsock);
1901 	unix_state_unlock(tsk);
1902 	return 0;
1903 
1904 out:
1905 	return arg->err;
1906 }
1907 
1908 
1909 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1910 {
1911 	struct sock *sk = sock->sk;
1912 	struct unix_address *addr;
1913 	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1914 	int err = 0;
1915 
1916 	if (peer) {
1917 		sk = unix_peer_get(sk);
1918 
1919 		err = -ENOTCONN;
1920 		if (!sk)
1921 			goto out;
1922 		err = 0;
1923 	} else {
1924 		sock_hold(sk);
1925 	}
1926 
1927 	addr = smp_load_acquire(&unix_sk(sk)->addr);
1928 	if (!addr) {
1929 		sunaddr->sun_family = AF_UNIX;
1930 		sunaddr->sun_path[0] = 0;
1931 		err = offsetof(struct sockaddr_un, sun_path);
1932 	} else {
1933 		err = addr->len;
1934 		memcpy(sunaddr, addr->name, addr->len);
1935 
1936 		if (peer)
1937 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1938 					       CGROUP_UNIX_GETPEERNAME);
1939 		else
1940 			BPF_CGROUP_RUN_SA_PROG(sk, uaddr, &err,
1941 					       CGROUP_UNIX_GETSOCKNAME);
1942 	}
1943 	sock_put(sk);
1944 out:
1945 	return err;
1946 }
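/* Illustrative userspace sketch (not part of this file): unix_getname()
 * above backs getsockname(2) and getpeername(2).  For an unbound socket,
 * only the address family is reported, as the !addr branch shows:
 *
 *	struct sockaddr_un sa;
 *	socklen_t len = sizeof(sa);
 *
 *	getsockname(fd, (struct sockaddr *)&sa, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path) if the socket is unbound
 */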
1947 
1948 /* The "user->unix_inflight" variable is protected by the garbage
1949  * collection lock, and we just read it locklessly here. If you go
1950  * over the limit, there might be a tiny race in actually noticing
1951  * it across threads. Tough.
1952  */
1953 static inline bool too_many_unix_fds(struct task_struct *p)
1954 {
1955 	struct user_struct *user = current_user();
1956 
1957 	if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
1958 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
1959 	return false;
1960 }
1961 
1962 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1963 {
1964 	if (too_many_unix_fds(current))
1965 		return -ETOOMANYREFS;
1966 
1967 	UNIXCB(skb).fp = scm->fp;
1968 	scm->fp = NULL;
1969 
1970 	if (unix_prepare_fpl(UNIXCB(skb).fp))
1971 		return -ENOMEM;
1972 
1973 	return 0;
1974 }
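/* Illustrative userspace sketch (not part of this file): the fd-passing
 * path above (too_many_unix_fds() / unix_attach_fds()) is driven by an
 * SCM_RIGHTS control message from the sender.  A minimal example; the
 * helper name send_fd() is an assumption made for the sketch:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static void send_fd(int sock, int fd_to_pass)
 *	{
 *		char data = 'x';
 *		struct iovec iov = { .iov_base = &data, .iov_len = 1 };
 *		union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
 *		};
 *		struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *		cmsg->cmsg_level = SOL_SOCKET;
 *		cmsg->cmsg_type = SCM_RIGHTS;
 *		cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *		memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));
 *
 *		sendmsg(sock, &msg, 0);	// may fail with ETOOMANYREFS, see above
 *	}
 */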
1975 
1976 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1977 {
1978 	scm->fp = UNIXCB(skb).fp;
1979 	UNIXCB(skb).fp = NULL;
1980 
1981 	unix_destroy_fpl(scm->fp);
1982 }
1983 
1984 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1985 {
1986 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1987 }
1988 
1989 static void unix_destruct_scm(struct sk_buff *skb)
1990 {
1991 	struct scm_cookie scm;
1992 
1993 	memset(&scm, 0, sizeof(scm));
1994 	scm.pid = UNIXCB(skb).pid;
1995 	if (UNIXCB(skb).fp)
1996 		unix_detach_fds(&scm, skb);
1997 
1998 	/* Alas, it calls VFS */
1999 	/* So fscking what? fput() had been SMP-safe since the last Summer */
2000 	scm_destroy(&scm);
2001 	sock_wfree(skb);
2002 }
2003 
2004 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
2005 {
2006 	int err = 0;
2007 
2008 	UNIXCB(skb).pid = get_pid(scm->pid);
2009 	UNIXCB(skb).uid = scm->creds.uid;
2010 	UNIXCB(skb).gid = scm->creds.gid;
2011 	UNIXCB(skb).fp = NULL;
2012 	unix_get_secdata(scm, skb);
2013 	if (scm->fp && send_fds)
2014 		err = unix_attach_fds(scm, skb);
2015 
2016 	skb->destructor = unix_destruct_scm;
2017 	return err;
2018 }
2019 
2020 static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm)
2021 {
2022 	scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2023 	unix_set_secdata(scm, skb);
2024 }
2025 
2026 /**
2027  * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed.
2028  * @skb: skb to attach creds to.
2029  * @sk: Sender sock.
2030  * @other: Receiver sock.
2031  *
2032 	 * Some apps rely on write() giving SCM_CREDENTIALS.
2033 	 * We include credentials if the source or destination
2034 	 * socket asserted SOCK_PASSCRED.
2035  *
2036  * Context: May sleep.
2037  * Return: On success zero, on error a negative error code is returned.
2038  */
2039 static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk,
2040 				const struct sock *other)
2041 {
2042 	if (UNIXCB(skb).pid)
2043 		return 0;
2044 
2045 	if (unix_may_passcred(sk) || unix_may_passcred(other) ||
2046 	    !other->sk_socket) {
2047 		struct pid *pid;
2048 		int err;
2049 
2050 		pid = task_tgid(current);
2051 		err = pidfs_register_pid(pid);
2052 		if (unlikely(err))
2053 			return err;
2054 
2055 		UNIXCB(skb).pid = get_pid(pid);
2056 		current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
2057 	}
2058 
2059 	return 0;
2060 }
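/* Illustrative userspace sketch (not part of this file): the credentials
 * attached by unix_maybe_add_creds() surface as SCM_CREDENTIALS once the
 * receiver enables SO_PASSCRED (assumes _GNU_SOURCE and <sys/socket.h>):
 *
 *	int on = 1;
 *	struct cmsghdr *c;
 *
 *	setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *
 *	// after a later recvmsg(sock, &msg, 0):
 *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
 *		if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_CREDENTIALS) {
 *			struct ucred *uc = (struct ucred *)CMSG_DATA(c);
 *			// uc->pid, uc->uid, uc->gid describe the sender
 *		}
 *	}
 */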
2061 
2062 static bool unix_skb_scm_eq(struct sk_buff *skb,
2063 			    struct scm_cookie *scm)
2064 {
2065 	return UNIXCB(skb).pid == scm->pid &&
2066 	       uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
2067 	       gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
2068 	       unix_secdata_eq(scm, skb);
2069 }
2070 
2071 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
2072 {
2073 	struct scm_fp_list *fp = UNIXCB(skb).fp;
2074 	struct unix_sock *u = unix_sk(sk);
2075 
2076 	if (unlikely(fp && fp->count)) {
2077 		atomic_add(fp->count, &u->scm_stat.nr_fds);
2078 		unix_add_edges(fp, u);
2079 	}
2080 }
2081 
2082 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
2083 {
2084 	struct scm_fp_list *fp = UNIXCB(skb).fp;
2085 	struct unix_sock *u = unix_sk(sk);
2086 
2087 	if (unlikely(fp && fp->count)) {
2088 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
2089 		unix_del_edges(fp);
2090 	}
2091 }
2092 
2093 /*
2094  *	Send AF_UNIX data.
2095  */
2096 
2097 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
2098 			      size_t len)
2099 {
2100 	struct sock *sk = sock->sk, *other = NULL;
2101 	struct unix_sock *u = unix_sk(sk);
2102 	struct scm_cookie scm;
2103 	struct sk_buff *skb;
2104 	int data_len = 0;
2105 	int sk_locked;
2106 	long timeo;
2107 	int err;
2108 
2109 	err = scm_send(sock, msg, &scm, false);
2110 	if (err < 0)
2111 		return err;
2112 
2113 	wait_for_unix_gc(scm.fp);
2114 
2115 	if (msg->msg_flags & MSG_OOB) {
2116 		err = -EOPNOTSUPP;
2117 		goto out;
2118 	}
2119 
2120 	if (msg->msg_namelen) {
2121 		err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
2122 		if (err)
2123 			goto out;
2124 
2125 		err = BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk,
2126 							    msg->msg_name,
2127 							    &msg->msg_namelen,
2128 							    NULL);
2129 		if (err)
2130 			goto out;
2131 	}
2132 
2133 	if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) {
2134 		err = unix_autobind(sk);
2135 		if (err)
2136 			goto out;
2137 	}
2138 
2139 	if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
2140 		err = -EMSGSIZE;
2141 		goto out;
2142 	}
2143 
2144 	if (len > SKB_MAX_ALLOC) {
2145 		data_len = min_t(size_t,
2146 				 len - SKB_MAX_ALLOC,
2147 				 MAX_SKB_FRAGS * PAGE_SIZE);
2148 		data_len = PAGE_ALIGN(data_len);
2149 
2150 		BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
2151 	}
2152 
2153 	skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
2154 				   msg->msg_flags & MSG_DONTWAIT, &err,
2155 				   PAGE_ALLOC_COSTLY_ORDER);
2156 	if (!skb)
2157 		goto out;
2158 
2159 	err = unix_scm_to_skb(&scm, skb, true);
2160 	if (err < 0)
2161 		goto out_free;
2162 
2163 	skb_put(skb, len - data_len);
2164 	skb->data_len = data_len;
2165 	skb->len = len;
2166 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
2167 	if (err)
2168 		goto out_free;
2169 
2170 	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
2171 
2172 	if (msg->msg_namelen) {
2173 lookup:
2174 		other = unix_find_other(sock_net(sk), msg->msg_name,
2175 					msg->msg_namelen, sk->sk_type, 0);
2176 		if (IS_ERR(other)) {
2177 			err = PTR_ERR(other);
2178 			goto out_free;
2179 		}
2180 	} else {
2181 		other = unix_peer_get(sk);
2182 		if (!other) {
2183 			err = -ENOTCONN;
2184 			goto out_free;
2185 		}
2186 	}
2187 
2188 	if (sk_filter(other, skb) < 0) {
2189 		/* Toss the packet but do not return any error to the sender */
2190 		err = len;
2191 		goto out_sock_put;
2192 	}
2193 
2194 	err = unix_maybe_add_creds(skb, sk, other);
2195 	if (err)
2196 		goto out_sock_put;
2197 
2198 restart:
2199 	sk_locked = 0;
2200 	unix_state_lock(other);
2201 restart_locked:
2202 
2203 	if (!unix_may_send(sk, other)) {
2204 		err = -EPERM;
2205 		goto out_unlock;
2206 	}
2207 
2208 	if (unlikely(sock_flag(other, SOCK_DEAD))) {
2209 		/* Check with 1003.1g - what should a datagram error return? */
2210 
2211 		unix_state_unlock(other);
2212 
2213 		if (sk->sk_type == SOCK_SEQPACKET) {
2214 		/* We get here only when racing with unix_release_sock(),
2215 			 * which is clearing @other. Unlike SOCK_DGRAM, never
2216 			 * change the state to TCP_CLOSE.
2217 			 */
2218 			err = -EPIPE;
2219 			goto out_sock_put;
2220 		}
2221 
2222 		if (!sk_locked)
2223 			unix_state_lock(sk);
2224 
2225 		if (unix_peer(sk) == other) {
2226 			unix_peer(sk) = NULL;
2227 			unix_dgram_peer_wake_disconnect_wakeup(sk, other);
2228 
2229 			WRITE_ONCE(sk->sk_state, TCP_CLOSE);
2230 			unix_state_unlock(sk);
2231 
2232 			unix_dgram_disconnected(sk, other);
2233 			sock_put(other);
2234 			err = -ECONNREFUSED;
2235 			goto out_sock_put;
2236 		}
2237 
2238 		unix_state_unlock(sk);
2239 
2240 		if (!msg->msg_namelen) {
2241 			err = -ECONNRESET;
2242 			goto out_sock_put;
2243 		}
2244 
2245 		sock_put(other);
2246 		goto lookup;
2247 	}
2248 
2249 	if (other->sk_shutdown & RCV_SHUTDOWN) {
2250 		err = -EPIPE;
2251 		goto out_unlock;
2252 	}
2253 
2254 	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2255 		err = -EPERM;
2256 		goto out_unlock;
2257 	}
2258 
2259 	if (sk->sk_type != SOCK_SEQPACKET) {
2260 		err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2261 		if (err)
2262 			goto out_unlock;
2263 	}
2264 
2265 	/* other == sk && unix_peer(other) != sk if
2266 	 * - unix_peer(sk) == NULL, destination address bound to sk
2267 	 * - unix_peer(sk) == sk by time of get but disconnected before lock
2268 	 */
2269 	if (other != sk &&
2270 	    unlikely(unix_peer(other) != sk &&
2271 	    unix_recvq_full_lockless(other))) {
2272 		if (timeo) {
2273 			timeo = unix_wait_for_peer(other, timeo);
2274 
2275 			err = sock_intr_errno(timeo);
2276 			if (signal_pending(current))
2277 				goto out_sock_put;
2278 
2279 			goto restart;
2280 		}
2281 
2282 		if (!sk_locked) {
2283 			unix_state_unlock(other);
2284 			unix_state_double_lock(sk, other);
2285 		}
2286 
2287 		if (unix_peer(sk) != other ||
2288 		    unix_dgram_peer_wake_me(sk, other)) {
2289 			err = -EAGAIN;
2290 			sk_locked = 1;
2291 			goto out_unlock;
2292 		}
2293 
2294 		if (!sk_locked) {
2295 			sk_locked = 1;
2296 			goto restart_locked;
2297 		}
2298 	}
2299 
2300 	if (unlikely(sk_locked))
2301 		unix_state_unlock(sk);
2302 
2303 	if (sock_flag(other, SOCK_RCVTSTAMP))
2304 		__net_timestamp(skb);
2305 
2306 	scm_stat_add(other, skb);
2307 	skb_queue_tail(&other->sk_receive_queue, skb);
2308 	unix_state_unlock(other);
2309 	other->sk_data_ready(other);
2310 	sock_put(other);
2311 	scm_destroy(&scm);
2312 	return len;
2313 
2314 out_unlock:
2315 	if (sk_locked)
2316 		unix_state_unlock(sk);
2317 	unix_state_unlock(other);
2318 out_sock_put:
2319 	sock_put(other);
2320 out_free:
2321 	consume_skb(skb);
2322 out:
2323 	scm_destroy(&scm);
2324 	return err;
2325 }
2326 
2327 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2328  * bytes, and a minimum of a full page.
2329  */
2330 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2331 
2332 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2333 static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other,
2334 		     struct scm_cookie *scm, bool fds_sent)
2335 {
2336 	struct unix_sock *ousk = unix_sk(other);
2337 	struct sk_buff *skb;
2338 	int err;
2339 
2340 	skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2341 
2342 	if (!skb)
2343 		return err;
2344 
2345 	err = unix_scm_to_skb(scm, skb, !fds_sent);
2346 	if (err < 0)
2347 		goto out;
2348 
2349 	err = unix_maybe_add_creds(skb, sk, other);
2350 	if (err)
2351 		goto out;
2352 
2353 	skb_put(skb, 1);
2354 	err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2355 
2356 	if (err)
2357 		goto out;
2358 
2359 	unix_state_lock(other);
2360 
2361 	if (sock_flag(other, SOCK_DEAD) ||
2362 	    (other->sk_shutdown & RCV_SHUTDOWN)) {
2363 		err = -EPIPE;
2364 		goto out_unlock;
2365 	}
2366 
2367 	if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2368 		err = -EPERM;
2369 		goto out_unlock;
2370 	}
2371 
2372 	scm_stat_add(other, skb);
2373 
2374 	spin_lock(&other->sk_receive_queue.lock);
2375 	WRITE_ONCE(ousk->oob_skb, skb);
2376 	WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1);
2377 	__skb_queue_tail(&other->sk_receive_queue, skb);
2378 	spin_unlock(&other->sk_receive_queue.lock);
2379 
2380 	sk_send_sigurg(other);
2381 	unix_state_unlock(other);
2382 	other->sk_data_ready(other);
2383 
2384 	return 0;
2385 out_unlock:
2386 	unix_state_unlock(other);
2387 out:
2388 	consume_skb(skb);
2389 	return err;
2390 }
2391 #endif
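/* Illustrative userspace sketch (not part of this file): with
 * CONFIG_AF_UNIX_OOB, the paths above mirror TCP urgent data:
 *
 *	send(peer, "x", 1, MSG_OOB);	// queued via queue_oob(), raises SIGURG
 *
 *	char c;
 *	recv(sock, &c, 1, MSG_OOB);	// unix_stream_recv_urg(); fails with
 *					// EINVAL if SO_OOBINLINE is set or no
 *					// OOB byte is pending
 */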
2392 
2393 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2394 			       size_t len)
2395 {
2396 	struct sock *sk = sock->sk;
2397 	struct sk_buff *skb = NULL;
2398 	struct sock *other = NULL;
2399 	struct unix_sock *otheru;
2400 	struct scm_cookie scm;
2401 	bool fds_sent = false;
2402 	int err, sent = 0;
2403 
2404 	err = scm_send(sock, msg, &scm, false);
2405 	if (err < 0)
2406 		return err;
2407 
2408 	wait_for_unix_gc(scm.fp);
2409 
2410 	if (msg->msg_flags & MSG_OOB) {
2411 		err = -EOPNOTSUPP;
2412 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2413 		if (len)
2414 			len--;
2415 		else
2416 #endif
2417 			goto out_err;
2418 	}
2419 
2420 	if (msg->msg_namelen) {
2421 		err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2422 		goto out_err;
2423 	}
2424 
2425 	other = unix_peer(sk);
2426 	if (!other) {
2427 		err = -ENOTCONN;
2428 		goto out_err;
2429 	}
2430 
2431 	otheru = unix_sk(other);
2432 
2433 	if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2434 		goto out_pipe;
2435 
2436 	while (sent < len) {
2437 		int size = len - sent;
2438 		int data_len;
2439 
2440 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2441 			skb = sock_alloc_send_pskb(sk, 0, 0,
2442 						   msg->msg_flags & MSG_DONTWAIT,
2443 						   &err, 0);
2444 		} else {
2445 			/* Keep two messages in the pipe so it schedules better */
2446 			size = min_t(int, size, (READ_ONCE(sk->sk_sndbuf) >> 1) - 64);
2447 
2448 			/* allow fallback to order-0 allocations */
2449 			size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2450 
2451 			data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2452 
2453 			data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2454 
2455 			skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2456 						   msg->msg_flags & MSG_DONTWAIT, &err,
2457 						   get_order(UNIX_SKB_FRAGS_SZ));
2458 		}
2459 		if (!skb)
2460 			goto out_err;
2461 
2462 		/* Only send the fds in the first buffer */
2463 		err = unix_scm_to_skb(&scm, skb, !fds_sent);
2464 		if (err < 0)
2465 			goto out_free;
2466 
2467 		fds_sent = true;
2468 
2469 		err = unix_maybe_add_creds(skb, sk, other);
2470 		if (err)
2471 			goto out_free;
2472 
2473 		if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
2474 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2475 			err = skb_splice_from_iter(skb, &msg->msg_iter, size);
2476 			if (err < 0)
2477 				goto out_free;
2478 
2479 			size = err;
2480 			refcount_add(size, &sk->sk_wmem_alloc);
2481 		} else {
2482 			skb_put(skb, size - data_len);
2483 			skb->data_len = data_len;
2484 			skb->len = size;
2485 			err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2486 			if (err)
2487 				goto out_free;
2488 		}
2489 
2490 		unix_state_lock(other);
2491 
2492 		if (sock_flag(other, SOCK_DEAD) ||
2493 		    (other->sk_shutdown & RCV_SHUTDOWN))
2494 			goto out_pipe_unlock;
2495 
2496 		if (UNIXCB(skb).fp && !other->sk_scm_rights) {
2497 			unix_state_unlock(other);
2498 			err = -EPERM;
2499 			goto out_free;
2500 		}
2501 
2502 		scm_stat_add(other, skb);
2503 
2504 		spin_lock(&other->sk_receive_queue.lock);
2505 		WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len);
2506 		__skb_queue_tail(&other->sk_receive_queue, skb);
2507 		spin_unlock(&other->sk_receive_queue.lock);
2508 
2509 		unix_state_unlock(other);
2510 		other->sk_data_ready(other);
2511 		sent += size;
2512 	}
2513 
2514 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2515 	if (msg->msg_flags & MSG_OOB) {
2516 		err = queue_oob(sk, msg, other, &scm, fds_sent);
2517 		if (err)
2518 			goto out_err;
2519 		sent++;
2520 	}
2521 #endif
2522 
2523 	scm_destroy(&scm);
2524 
2525 	return sent;
2526 
2527 out_pipe_unlock:
2528 	unix_state_unlock(other);
2529 out_pipe:
2530 	if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
2531 		send_sig(SIGPIPE, current, 0);
2532 	err = -EPIPE;
2533 out_free:
2534 	consume_skb(skb);
2535 out_err:
2536 	scm_destroy(&scm);
2537 	return sent ? : err;
2538 }
2539 
2540 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2541 				  size_t len)
2542 {
2543 	int err;
2544 	struct sock *sk = sock->sk;
2545 
2546 	err = sock_error(sk);
2547 	if (err)
2548 		return err;
2549 
2550 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2551 		return -ENOTCONN;
2552 
2553 	if (msg->msg_namelen)
2554 		msg->msg_namelen = 0;
2555 
2556 	return unix_dgram_sendmsg(sock, msg, len);
2557 }
2558 
2559 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2560 				  size_t size, int flags)
2561 {
2562 	struct sock *sk = sock->sk;
2563 
2564 	if (READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)
2565 		return -ENOTCONN;
2566 
2567 	return unix_dgram_recvmsg(sock, msg, size, flags);
2568 }
2569 
2570 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2571 {
2572 	struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2573 
2574 	if (addr) {
2575 		msg->msg_namelen = addr->len;
2576 		memcpy(msg->msg_name, addr->name, addr->len);
2577 	}
2578 }
2579 
2580 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2581 			 int flags)
2582 {
2583 	struct scm_cookie scm;
2584 	struct socket *sock = sk->sk_socket;
2585 	struct unix_sock *u = unix_sk(sk);
2586 	struct sk_buff *skb, *last;
2587 	long timeo;
2588 	int skip;
2589 	int err;
2590 
2591 	err = -EOPNOTSUPP;
2592 	if (flags&MSG_OOB)
2593 		goto out;
2594 
2595 	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2596 
2597 	do {
2598 		mutex_lock(&u->iolock);
2599 
2600 		skip = sk_peek_offset(sk, flags);
2601 		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2602 					      &skip, &err, &last);
2603 		if (skb) {
2604 			if (!(flags & MSG_PEEK))
2605 				scm_stat_del(sk, skb);
2606 			break;
2607 		}
2608 
2609 		mutex_unlock(&u->iolock);
2610 
2611 		if (err != -EAGAIN)
2612 			break;
2613 	} while (timeo &&
2614 		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2615 					      &err, &timeo, last));
2616 
2617 	if (!skb) { /* implies iolock unlocked */
2618 		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2619 		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2620 		    (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN))
2621 			err = 0;
2622 		goto out;
2623 	}
2624 
2625 	if (wq_has_sleeper(&u->peer_wait))
2626 		wake_up_interruptible_sync_poll(&u->peer_wait,
2627 						EPOLLOUT | EPOLLWRNORM |
2628 						EPOLLWRBAND);
2629 
2630 	if (msg->msg_name) {
2631 		unix_copy_addr(msg, skb->sk);
2632 
2633 		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
2634 						      msg->msg_name,
2635 						      &msg->msg_namelen);
2636 	}
2637 
2638 	if (size > skb->len - skip)
2639 		size = skb->len - skip;
2640 	else if (size < skb->len - skip)
2641 		msg->msg_flags |= MSG_TRUNC;
2642 
2643 	err = skb_copy_datagram_msg(skb, skip, msg, size);
2644 	if (err)
2645 		goto out_free;
2646 
2647 	if (sock_flag(sk, SOCK_RCVTSTAMP))
2648 		__sock_recv_timestamp(msg, sk, skb);
2649 
2650 	memset(&scm, 0, sizeof(scm));
2651 
2652 	unix_skb_to_scm(skb, &scm);
2653 
2654 	if (!(flags & MSG_PEEK)) {
2655 		if (UNIXCB(skb).fp)
2656 			unix_detach_fds(&scm, skb);
2657 
2658 		sk_peek_offset_bwd(sk, skb->len);
2659 	} else {
2660 		/* It is questionable: on PEEK we could:
2661 		   - do not return fds - good, but too simple 8)
2662 		   - return fds, and do not return them on read (old strategy,
2663 		     apparently wrong)
2664 		   - clone fds (I chose it for now, it is the most universal
2665 		     solution)
2666 
2667 		   POSIX 1003.1g does not actually define this clearly
2668 		   at all. POSIX 1003.1g doesn't define a lot of things
2669 		   clearly however!
2670 
2671 		*/
2672 
2673 		sk_peek_offset_fwd(sk, size);
2674 
2675 		if (UNIXCB(skb).fp)
2676 			unix_peek_fds(&scm, skb);
2677 	}
2678 	err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2679 
2680 	scm_recv_unix(sock, msg, &scm, flags);
2681 
2682 out_free:
2683 	skb_free_datagram(sk, skb);
2684 	mutex_unlock(&u->iolock);
2685 out:
2686 	return err;
2687 }
2688 
2689 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2690 			      int flags)
2691 {
2692 	struct sock *sk = sock->sk;
2693 
2694 #ifdef CONFIG_BPF_SYSCALL
2695 	const struct proto *prot = READ_ONCE(sk->sk_prot);
2696 
2697 	if (prot != &unix_dgram_proto)
2698 		return prot->recvmsg(sk, msg, size, flags, NULL);
2699 #endif
2700 	return __unix_dgram_recvmsg(sk, msg, size, flags);
2701 }
2702 
2703 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2704 {
2705 	struct unix_sock *u = unix_sk(sk);
2706 	struct sk_buff *skb;
2707 	int err;
2708 
2709 	mutex_lock(&u->iolock);
2710 	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2711 	mutex_unlock(&u->iolock);
2712 	if (!skb)
2713 		return err;
2714 
2715 	return recv_actor(sk, skb);
2716 }
2717 
2718 /*
2719  *	Sleep until more data has arrived. But check for races..
2720  */
2721 static long unix_stream_data_wait(struct sock *sk, long timeo,
2722 				  struct sk_buff *last, unsigned int last_len,
2723 				  bool freezable)
2724 {
2725 	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
2726 	struct sk_buff *tail;
2727 	DEFINE_WAIT(wait);
2728 
2729 	unix_state_lock(sk);
2730 
2731 	for (;;) {
2732 		prepare_to_wait(sk_sleep(sk), &wait, state);
2733 
2734 		tail = skb_peek_tail(&sk->sk_receive_queue);
2735 		if (tail != last ||
2736 		    (tail && tail->len != last_len) ||
2737 		    sk->sk_err ||
2738 		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
2739 		    signal_pending(current) ||
2740 		    !timeo)
2741 			break;
2742 
2743 		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2744 		unix_state_unlock(sk);
2745 		timeo = schedule_timeout(timeo);
2746 		unix_state_lock(sk);
2747 
2748 		if (sock_flag(sk, SOCK_DEAD))
2749 			break;
2750 
2751 		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2752 	}
2753 
2754 	finish_wait(sk_sleep(sk), &wait);
2755 	unix_state_unlock(sk);
2756 	return timeo;
2757 }
2758 
2759 struct unix_stream_read_state {
2760 	int (*recv_actor)(struct sk_buff *, int, int,
2761 			  struct unix_stream_read_state *);
2762 	struct socket *socket;
2763 	struct msghdr *msg;
2764 	struct pipe_inode_info *pipe;
2765 	size_t size;
2766 	int flags;
2767 	unsigned int splice_flags;
2768 };
2769 
2770 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2771 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2772 {
2773 	struct sk_buff *oob_skb, *read_skb = NULL;
2774 	struct socket *sock = state->socket;
2775 	struct sock *sk = sock->sk;
2776 	struct unix_sock *u = unix_sk(sk);
2777 	int chunk = 1;
2778 
2779 	mutex_lock(&u->iolock);
2780 	unix_state_lock(sk);
2781 	spin_lock(&sk->sk_receive_queue.lock);
2782 
2783 	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2784 		spin_unlock(&sk->sk_receive_queue.lock);
2785 		unix_state_unlock(sk);
2786 		mutex_unlock(&u->iolock);
2787 		return -EINVAL;
2788 	}
2789 
2790 	oob_skb = u->oob_skb;
2791 
2792 	if (!(state->flags & MSG_PEEK)) {
2793 		WRITE_ONCE(u->oob_skb, NULL);
2794 		WRITE_ONCE(u->inq_len, u->inq_len - 1);
2795 
2796 		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
2797 		    !unix_skb_len(oob_skb->prev)) {
2798 			read_skb = oob_skb->prev;
2799 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2800 		}
2801 	}
2802 
2803 	spin_unlock(&sk->sk_receive_queue.lock);
2804 	unix_state_unlock(sk);
2805 
2806 	chunk = state->recv_actor(oob_skb, 0, chunk, state);
2807 
2808 	if (!(state->flags & MSG_PEEK))
2809 		UNIXCB(oob_skb).consumed += 1;
2810 
2811 	mutex_unlock(&u->iolock);
2812 
2813 	consume_skb(read_skb);
2814 
2815 	if (chunk < 0)
2816 		return -EFAULT;
2817 
2818 	state->msg->msg_flags |= MSG_OOB;
2819 	return 1;
2820 }
2821 
2822 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2823 				  int flags, int copied)
2824 {
2825 	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
2826 	struct unix_sock *u = unix_sk(sk);
2827 
2828 	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
2829 		return skb;
2830 
2831 	spin_lock(&sk->sk_receive_queue.lock);
2832 
2833 	if (!unix_skb_len(skb)) {
2834 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
2835 			skb = NULL;
2836 		} else if (flags & MSG_PEEK) {
2837 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2838 		} else {
2839 			read_skb = skb;
2840 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
2841 			__skb_unlink(read_skb, &sk->sk_receive_queue);
2842 		}
2843 
2844 		if (!skb)
2845 			goto unlock;
2846 	}
2847 
2848 	if (skb != u->oob_skb)
2849 		goto unlock;
2850 
2851 	if (copied) {
2852 		skb = NULL;
2853 	} else if (!(flags & MSG_PEEK)) {
2854 		WRITE_ONCE(u->oob_skb, NULL);
2855 
2856 		if (!sock_flag(sk, SOCK_URGINLINE)) {
2857 			__skb_unlink(skb, &sk->sk_receive_queue);
2858 			unread_skb = skb;
2859 			skb = skb_peek(&sk->sk_receive_queue);
2860 		}
2861 	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
2862 		skb = skb_peek_next(skb, &sk->sk_receive_queue);
2863 	}
2864 
2865 unlock:
2866 	spin_unlock(&sk->sk_receive_queue.lock);
2867 
2868 	consume_skb(read_skb);
2869 	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2870 
2871 	return skb;
2872 }
2873 #endif
2874 
2875 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2876 {
2877 	struct sk_buff_head *queue = &sk->sk_receive_queue;
2878 	struct unix_sock *u = unix_sk(sk);
2879 	struct sk_buff *skb;
2880 	int err;
2881 
2882 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
2883 		return -ENOTCONN;
2884 
2885 	err = sock_error(sk);
2886 	if (err)
2887 		return err;
2888 
2889 	mutex_lock(&u->iolock);
2890 	spin_lock(&queue->lock);
2891 
2892 	skb = __skb_dequeue(queue);
2893 	if (!skb) {
2894 		spin_unlock(&queue->lock);
2895 		mutex_unlock(&u->iolock);
2896 		return -EAGAIN;
2897 	}
2898 
2899 	WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
2900 
2901 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2902 	if (skb == u->oob_skb) {
2903 		WRITE_ONCE(u->oob_skb, NULL);
2904 		spin_unlock(&queue->lock);
2905 		mutex_unlock(&u->iolock);
2906 
2907 		kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
2908 		return -EAGAIN;
2909 	}
2910 #endif
2911 
2912 	spin_unlock(&queue->lock);
2913 	mutex_unlock(&u->iolock);
2914 
2915 	return recv_actor(sk, skb);
2916 }
2917 
2918 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2919 				    bool freezable)
2920 {
2921 	int noblock = state->flags & MSG_DONTWAIT;
2922 	struct socket *sock = state->socket;
2923 	struct msghdr *msg = state->msg;
2924 	struct sock *sk = sock->sk;
2925 	size_t size = state->size;
2926 	int flags = state->flags;
2927 	bool check_creds = false;
2928 	struct scm_cookie scm;
2929 	unsigned int last_len;
2930 	struct unix_sock *u;
2931 	int copied = 0;
2932 	int err = 0;
2933 	long timeo;
2934 	int target;
2935 	int skip;
2936 
2937 	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
2938 		err = -EINVAL;
2939 		goto out;
2940 	}
2941 
2942 	if (unlikely(flags & MSG_OOB)) {
2943 		err = -EOPNOTSUPP;
2944 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2945 		err = unix_stream_recv_urg(state);
2946 #endif
2947 		goto out;
2948 	}
2949 
2950 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2951 	timeo = sock_rcvtimeo(sk, noblock);
2952 
2953 	memset(&scm, 0, sizeof(scm));
2954 
2955 	u = unix_sk(sk);
2956 
2957 	/* Lock the socket to prevent queue disordering
2958 	 * while we sleep in memcpy_tomsg
2959 	 */
2960 	mutex_lock(&u->iolock);
2961 
2962 	skip = max(sk_peek_offset(sk, flags), 0);
2963 
2964 	do {
2965 		struct sk_buff *skb, *last;
2966 		int chunk;
2967 
2968 redo:
2969 		unix_state_lock(sk);
2970 		if (sock_flag(sk, SOCK_DEAD)) {
2971 			err = -ECONNRESET;
2972 			goto unlock;
2973 		}
2974 		last = skb = skb_peek(&sk->sk_receive_queue);
2975 		last_len = last ? last->len : 0;
2976 
2977 again:
2978 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2979 		if (skb) {
2980 			skb = manage_oob(skb, sk, flags, copied);
2981 			if (!skb && copied) {
2982 				unix_state_unlock(sk);
2983 				break;
2984 			}
2985 		}
2986 #endif
2987 		if (skb == NULL) {
2988 			if (copied >= target)
2989 				goto unlock;
2990 
2991 			/*
2992 			 *	POSIX 1003.1g mandates this order.
2993 			 */
2994 
2995 			err = sock_error(sk);
2996 			if (err)
2997 				goto unlock;
2998 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2999 				goto unlock;
3000 
3001 			unix_state_unlock(sk);
3002 			if (!timeo) {
3003 				err = -EAGAIN;
3004 				break;
3005 			}
3006 
3007 			mutex_unlock(&u->iolock);
3008 
3009 			timeo = unix_stream_data_wait(sk, timeo, last,
3010 						      last_len, freezable);
3011 
3012 			if (signal_pending(current)) {
3013 				err = sock_intr_errno(timeo);
3014 				scm_destroy(&scm);
3015 				goto out;
3016 			}
3017 
3018 			mutex_lock(&u->iolock);
3019 			goto redo;
3020 unlock:
3021 			unix_state_unlock(sk);
3022 			break;
3023 		}
3024 
3025 		while (skip >= unix_skb_len(skb)) {
3026 			skip -= unix_skb_len(skb);
3027 			last = skb;
3028 			last_len = skb->len;
3029 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
3030 			if (!skb)
3031 				goto again;
3032 		}
3033 
3034 		unix_state_unlock(sk);
3035 
3036 		if (check_creds) {
3037 			/* Never glue messages from different writers */
3038 			if (!unix_skb_scm_eq(skb, &scm))
3039 				break;
3040 		} else if (unix_may_passcred(sk)) {
3041 			/* Copy credentials */
3042 			unix_skb_to_scm(skb, &scm);
3043 			check_creds = true;
3044 		}
3045 
3046 		/* Copy address just once */
3047 		if (msg && msg->msg_name) {
3048 			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
3049 
3050 			unix_copy_addr(msg, skb->sk);
3051 			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name,
3052 							      &msg->msg_namelen);
3053 
3054 			sunaddr = NULL;
3055 		}
3056 
3057 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
3058 		chunk = state->recv_actor(skb, skip, chunk, state);
3059 		if (chunk < 0) {
3060 			if (copied == 0)
3061 				copied = -EFAULT;
3062 			break;
3063 		}
3064 		copied += chunk;
3065 		size -= chunk;
3066 
3067 		/* Mark read part of skb as used */
3068 		if (!(flags & MSG_PEEK)) {
3069 			UNIXCB(skb).consumed += chunk;
3070 
3071 			sk_peek_offset_bwd(sk, chunk);
3072 
3073 			if (UNIXCB(skb).fp) {
3074 				scm_stat_del(sk, skb);
3075 				unix_detach_fds(&scm, skb);
3076 			}
3077 
3078 			if (unix_skb_len(skb))
3079 				break;
3080 
3081 			spin_lock(&sk->sk_receive_queue.lock);
3082 			WRITE_ONCE(u->inq_len, u->inq_len - skb->len);
3083 			__skb_unlink(skb, &sk->sk_receive_queue);
3084 			spin_unlock(&sk->sk_receive_queue.lock);
3085 
3086 			consume_skb(skb);
3087 
3088 			if (scm.fp)
3089 				break;
3090 		} else {
3091 			/* It is questionable, see note in unix_dgram_recvmsg.
3092 			 */
3093 			if (UNIXCB(skb).fp)
3094 				unix_peek_fds(&scm, skb);
3095 
3096 			sk_peek_offset_fwd(sk, chunk);
3097 
3098 			if (UNIXCB(skb).fp)
3099 				break;
3100 
3101 			skip = 0;
3102 			last = skb;
3103 			last_len = skb->len;
3104 			unix_state_lock(sk);
3105 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
3106 			if (skb)
3107 				goto again;
3108 			unix_state_unlock(sk);
3109 			break;
3110 		}
3111 	} while (size);
3112 
3113 	mutex_unlock(&u->iolock);
3114 	if (msg) {
3115 		scm_recv_unix(sock, msg, &scm, flags);
3116 
3117 		if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) {
3118 			msg->msg_inq = READ_ONCE(u->inq_len);
3119 			put_cmsg(msg, SOL_SOCKET, SCM_INQ,
3120 				 sizeof(msg->msg_inq), &msg->msg_inq);
3121 		}
3122 	} else {
3123 		scm_destroy(&scm);
3124 	}
3125 out:
3126 	return copied ? : err;
3127 }
3128 
3129 static int unix_stream_read_actor(struct sk_buff *skb,
3130 				  int skip, int chunk,
3131 				  struct unix_stream_read_state *state)
3132 {
3133 	int ret;
3134 
3135 	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
3136 				    state->msg, chunk);
3137 	return ret ?: chunk;
3138 }
3139 
3140 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
3141 			  size_t size, int flags)
3142 {
3143 	struct unix_stream_read_state state = {
3144 		.recv_actor = unix_stream_read_actor,
3145 		.socket = sk->sk_socket,
3146 		.msg = msg,
3147 		.size = size,
3148 		.flags = flags
3149 	};
3150 
3151 	return unix_stream_read_generic(&state, true);
3152 }
3153 
3154 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
3155 			       size_t size, int flags)
3156 {
3157 	struct unix_stream_read_state state = {
3158 		.recv_actor = unix_stream_read_actor,
3159 		.socket = sock,
3160 		.msg = msg,
3161 		.size = size,
3162 		.flags = flags
3163 	};
3164 
3165 #ifdef CONFIG_BPF_SYSCALL
3166 	struct sock *sk = sock->sk;
3167 	const struct proto *prot = READ_ONCE(sk->sk_prot);
3168 
3169 	if (prot != &unix_stream_proto)
3170 		return prot->recvmsg(sk, msg, size, flags, NULL);
3171 #endif
3172 	return unix_stream_read_generic(&state, true);
3173 }
3174 
3175 static int unix_stream_splice_actor(struct sk_buff *skb,
3176 				    int skip, int chunk,
3177 				    struct unix_stream_read_state *state)
3178 {
3179 	return skb_splice_bits(skb, state->socket->sk,
3180 			       UNIXCB(skb).consumed + skip,
3181 			       state->pipe, chunk, state->splice_flags);
3182 }
3183 
3184 static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
3185 				       struct pipe_inode_info *pipe,
3186 				       size_t size, unsigned int flags)
3187 {
3188 	struct unix_stream_read_state state = {
3189 		.recv_actor = unix_stream_splice_actor,
3190 		.socket = sock,
3191 		.pipe = pipe,
3192 		.size = size,
3193 		.splice_flags = flags,
3194 	};
3195 
3196 	if (unlikely(*ppos))
3197 		return -ESPIPE;
3198 
3199 	if (sock->file->f_flags & O_NONBLOCK ||
3200 	    flags & SPLICE_F_NONBLOCK)
3201 		state.flags = MSG_DONTWAIT;
3202 
3203 	return unix_stream_read_generic(&state, false);
3204 }
3205 
3206 static int unix_shutdown(struct socket *sock, int mode)
3207 {
3208 	struct sock *sk = sock->sk;
3209 	struct sock *other;
3210 
3211 	if (mode < SHUT_RD || mode > SHUT_RDWR)
3212 		return -EINVAL;
3213 	/* This maps:
3214 	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
3215 	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
3216 	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
3217 	 */
3218 	++mode;
3219 
3220 	unix_state_lock(sk);
3221 	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
3222 	other = unix_peer(sk);
3223 	if (other)
3224 		sock_hold(other);
3225 	unix_state_unlock(sk);
3226 	sk->sk_state_change(sk);
3227 
3228 	if (other &&
3229 		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
3230 
3231 		int peer_mode = 0;
3232 		const struct proto *prot = READ_ONCE(other->sk_prot);
3233 
3234 		if (prot->unhash)
3235 			prot->unhash(other);
3236 		if (mode&RCV_SHUTDOWN)
3237 			peer_mode |= SEND_SHUTDOWN;
3238 		if (mode&SEND_SHUTDOWN)
3239 			peer_mode |= RCV_SHUTDOWN;
3240 		unix_state_lock(other);
3241 		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
3242 		unix_state_unlock(other);
3243 		other->sk_state_change(other);
3244 		if (peer_mode == SHUTDOWN_MASK)
3245 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
3246 		else if (peer_mode & RCV_SHUTDOWN)
3247 			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
3248 	}
3249 	if (other)
3250 		sock_put(other);
3251 
3252 	return 0;
3253 }
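/* Illustrative userspace sketch (not part of this file): following the
 * mapping above, shutdown(fd, SHUT_WR) sets SEND_SHUTDOWN on this socket
 * and RCV_SHUTDOWN on the peer, so the peer's read() returns 0 (EOF) once
 * its queue drains while it can still write back:
 *
 *	shutdown(fd, SHUT_WR);	// SHUT_WR (1) -> SEND_SHUTDOWN (2)
 */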
3254 
3255 long unix_inq_len(struct sock *sk)
3256 {
3257 	struct sk_buff *skb;
3258 	long amount = 0;
3259 
3260 	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
3261 		return -EINVAL;
3262 
3263 	if (sk->sk_type == SOCK_STREAM)
3264 		return READ_ONCE(unix_sk(sk)->inq_len);
3265 
3266 	spin_lock(&sk->sk_receive_queue.lock);
3267 	if (sk->sk_type == SOCK_SEQPACKET) {
3268 		skb_queue_walk(&sk->sk_receive_queue, skb)
3269 			amount += unix_skb_len(skb);
3270 	} else {
3271 		skb = skb_peek(&sk->sk_receive_queue);
3272 		if (skb)
3273 			amount = skb->len;
3274 	}
3275 	spin_unlock(&sk->sk_receive_queue.lock);
3276 
3277 	return amount;
3278 }
3279 EXPORT_SYMBOL_GPL(unix_inq_len);
3280 
3281 long unix_outq_len(struct sock *sk)
3282 {
3283 	return sk_wmem_alloc_get(sk);
3284 }
3285 EXPORT_SYMBOL_GPL(unix_outq_len);
3286 
3287 static int unix_open_file(struct sock *sk)
3288 {
3289 	struct file *f;
3290 	int fd;
3291 
3292 	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3293 		return -EPERM;
3294 
3295 	if (!smp_load_acquire(&unix_sk(sk)->addr))
3296 		return -ENOENT;
3297 
3298 	if (!unix_sk(sk)->path.dentry)
3299 		return -ENOENT;
3300 
3301 	fd = get_unused_fd_flags(O_CLOEXEC);
3302 	if (fd < 0)
3303 		return fd;
3304 
3305 	f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred());
3306 	if (IS_ERR(f)) {
3307 		put_unused_fd(fd);
3308 		return PTR_ERR(f);
3309 	}
3310 
3311 	fd_install(fd, f);
3312 	return fd;
3313 }
3314 
3315 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3316 {
3317 	struct sock *sk = sock->sk;
3318 	long amount = 0;
3319 	int err;
3320 
3321 	switch (cmd) {
3322 	case SIOCOUTQ:
3323 		amount = unix_outq_len(sk);
3324 		err = put_user(amount, (int __user *)arg);
3325 		break;
3326 	case SIOCINQ:
3327 		amount = unix_inq_len(sk);
3328 		if (amount < 0)
3329 			err = amount;
3330 		else
3331 			err = put_user(amount, (int __user *)arg);
3332 		break;
3333 	case SIOCUNIXFILE:
3334 		err = unix_open_file(sk);
3335 		break;
3336 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3337 	case SIOCATMARK:
3338 		{
3339 			struct unix_sock *u = unix_sk(sk);
3340 			struct sk_buff *skb;
3341 			int answ = 0;
3342 
3343 			mutex_lock(&u->iolock);
3344 
3345 			skb = skb_peek(&sk->sk_receive_queue);
3346 			if (skb) {
3347 				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
3348 				struct sk_buff *next_skb;
3349 
3350 				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
3351 
3352 				if (skb == oob_skb ||
3353 				    (!unix_skb_len(skb) &&
3354 				     (!oob_skb || next_skb == oob_skb)))
3355 					answ = 1;
3356 			}
3357 
3358 			mutex_unlock(&u->iolock);
3359 
3360 			err = put_user(answ, (int __user *)arg);
3361 		}
3362 		break;
3363 #endif
3364 	default:
3365 		err = -ENOIOCTLCMD;
3366 		break;
3367 	}
3368 	return err;
3369 }
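/* Illustrative userspace sketch (not part of this file): the ioctls handled
 * above, assuming <sys/ioctl.h> and <linux/sockios.h>:
 *
 *	int n;
 *
 *	ioctl(fd, SIOCOUTQ, &n);	// bytes queued but not yet consumed by the peer
 *	ioctl(fd, SIOCINQ, &n);		// bytes available to read (unix_inq_len())
 *	ioctl(fd, SIOCATMARK, &n);	// nonzero when the OOB byte is next to read
 */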
3370 
3371 #ifdef CONFIG_COMPAT
3372 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3373 {
3374 	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
3375 }
3376 #endif
3377 
3378 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3379 {
3380 	struct sock *sk = sock->sk;
3381 	unsigned char state;
3382 	__poll_t mask;
3383 	u8 shutdown;
3384 
3385 	sock_poll_wait(file, sock, wait);
3386 	mask = 0;
3387 	shutdown = READ_ONCE(sk->sk_shutdown);
3388 	state = READ_ONCE(sk->sk_state);
3389 
3390 	/* exceptional events? */
3391 	if (READ_ONCE(sk->sk_err))
3392 		mask |= EPOLLERR;
3393 	if (shutdown == SHUTDOWN_MASK)
3394 		mask |= EPOLLHUP;
3395 	if (shutdown & RCV_SHUTDOWN)
3396 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3397 
3398 	/* readable? */
3399 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3400 		mask |= EPOLLIN | EPOLLRDNORM;
3401 	if (sk_is_readable(sk))
3402 		mask |= EPOLLIN | EPOLLRDNORM;
3403 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
3404 	if (READ_ONCE(unix_sk(sk)->oob_skb))
3405 		mask |= EPOLLPRI;
3406 #endif
3407 
3408 	/* Connection-based need to check for termination and startup */
3409 	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3410 	    state == TCP_CLOSE)
3411 		mask |= EPOLLHUP;
3412 
3413 	/*
3414 	 * We also report the socket as writable when the other side has
3415 	 * shut down the connection. This prevents stuck sockets.
3416 	 */
3417 	if (unix_writable(sk, state))
3418 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3419 
3420 	return mask;
3421 }
3422 
3423 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3424 				    poll_table *wait)
3425 {
3426 	struct sock *sk = sock->sk, *other;
3427 	unsigned int writable;
3428 	unsigned char state;
3429 	__poll_t mask;
3430 	u8 shutdown;
3431 
3432 	sock_poll_wait(file, sock, wait);
3433 	mask = 0;
3434 	shutdown = READ_ONCE(sk->sk_shutdown);
3435 	state = READ_ONCE(sk->sk_state);
3436 
3437 	/* exceptional events? */
3438 	if (READ_ONCE(sk->sk_err) ||
3439 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
3440 		mask |= EPOLLERR |
3441 			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3442 
3443 	if (shutdown & RCV_SHUTDOWN)
3444 		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3445 	if (shutdown == SHUTDOWN_MASK)
3446 		mask |= EPOLLHUP;
3447 
3448 	/* readable? */
3449 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3450 		mask |= EPOLLIN | EPOLLRDNORM;
3451 	if (sk_is_readable(sk))
3452 		mask |= EPOLLIN | EPOLLRDNORM;
3453 
3454 	/* Connection-based need to check for termination and startup */
3455 	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
3456 		mask |= EPOLLHUP;
3457 
3458 	/* No write status requested, avoid expensive OUT tests. */
3459 	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3460 		return mask;
3461 
3462 	writable = unix_writable(sk, state);
3463 	if (writable) {
3464 		unix_state_lock(sk);
3465 
3466 		other = unix_peer(sk);
3467 		if (other && unix_peer(other) != sk &&
3468 		    unix_recvq_full_lockless(other) &&
3469 		    unix_dgram_peer_wake_me(sk, other))
3470 			writable = 0;
3471 
3472 		unix_state_unlock(sk);
3473 	}
3474 
3475 	if (writable)
3476 		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3477 	else
3478 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3479 
3480 	return mask;
3481 }
3482 
3483 #ifdef CONFIG_PROC_FS
3484 
3485 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3486 
3487 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3488 #define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
3489 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
3490 
3491 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3492 {
3493 	unsigned long offset = get_offset(*pos);
3494 	unsigned long bucket = get_bucket(*pos);
3495 	unsigned long count = 0;
3496 	struct sock *sk;
3497 
3498 	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
3499 	     sk; sk = sk_next(sk)) {
3500 		if (++count == offset)
3501 			break;
3502 	}
3503 
3504 	return sk;
3505 }
3506 
3507 static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
3508 {
3509 	unsigned long bucket = get_bucket(*pos);
3510 	struct net *net = seq_file_net(seq);
3511 	struct sock *sk;
3512 
3513 	while (bucket < UNIX_HASH_SIZE) {
3514 		spin_lock(&net->unx.table.locks[bucket]);
3515 
3516 		sk = unix_from_bucket(seq, pos);
3517 		if (sk)
3518 			return sk;
3519 
3520 		spin_unlock(&net->unx.table.locks[bucket]);
3521 
3522 		*pos = set_bucket_offset(++bucket, 1);
3523 	}
3524 
3525 	return NULL;
3526 }
3527 
3528 static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
3529 				  loff_t *pos)
3530 {
3531 	unsigned long bucket = get_bucket(*pos);
3532 
3533 	sk = sk_next(sk);
3534 	if (sk)
3535 		return sk;
3536 
3537 
3538 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
3539 
3540 	*pos = set_bucket_offset(++bucket, 1);
3541 
3542 	return unix_get_first(seq, pos);
3543 }
3544 
3545 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3546 {
3547 	if (!*pos)
3548 		return SEQ_START_TOKEN;
3549 
3550 	return unix_get_first(seq, pos);
3551 }
3552 
3553 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3554 {
3555 	++*pos;
3556 
3557 	if (v == SEQ_START_TOKEN)
3558 		return unix_get_first(seq, pos);
3559 
3560 	return unix_get_next(seq, v, pos);
3561 }
3562 
3563 static void unix_seq_stop(struct seq_file *seq, void *v)
3564 {
3565 	struct sock *sk = v;
3566 
3567 	if (sk)
3568 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
3569 }
3570 
3571 static int unix_seq_show(struct seq_file *seq, void *v)
3572 {
3573 
3574 	if (v == SEQ_START_TOKEN)
3575 		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
3576 			 "Inode Path\n");
3577 	else {
3578 		struct sock *s = v;
3579 		struct unix_sock *u = unix_sk(s);
3580 		unix_state_lock(s);
3581 
3582 		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3583 			s,
3584 			refcount_read(&s->sk_refcnt),
3585 			0,
3586 			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
3587 			s->sk_type,
3588 			s->sk_socket ?
3589 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3590 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3591 			sock_i_ino(s));
3592 
3593 		if (u->addr) {	// under a hash table lock here
3594 			int i, len;
3595 			seq_putc(seq, ' ');
3596 
3597 			i = 0;
3598 			len = u->addr->len -
3599 				offsetof(struct sockaddr_un, sun_path);
3600 			if (u->addr->name->sun_path[0]) {
3601 				len--;
3602 			} else {
3603 				seq_putc(seq, '@');
3604 				i++;
3605 			}
3606 			for ( ; i < len; i++)
3607 				seq_putc(seq, u->addr->name->sun_path[i] ?:
3608 					 '@');
3609 		}
3610 		unix_state_unlock(s);
3611 		seq_putc(seq, '\n');
3612 	}
3613 
3614 	return 0;
3615 }
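/* Illustrative only (not part of this file): unix_seq_show() produces
 * /proc/net/unix.  A hypothetical output line, with made-up values, for a
 * listening SOCK_STREAM socket:
 *
 *	Num       RefCount Protocol Flags    Type St Inode Path
 *	0000000000000000: 00000002 00000000 00010000 0001 01 12345 /run/example.sock
 *
 * Abstract names are printed with '@' substituted for NUL bytes, per the
 * loop above.
 */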
3616 
3617 static const struct seq_operations unix_seq_ops = {
3618 	.start  = unix_seq_start,
3619 	.next   = unix_seq_next,
3620 	.stop   = unix_seq_stop,
3621 	.show   = unix_seq_show,
3622 };
3623 
3624 #ifdef CONFIG_BPF_SYSCALL
3625 struct bpf_unix_iter_state {
3626 	struct seq_net_private p;
3627 	unsigned int cur_sk;
3628 	unsigned int end_sk;
3629 	unsigned int max_sk;
3630 	struct sock **batch;
3631 	bool st_bucket_done;
3632 };
3633 
3634 struct bpf_iter__unix {
3635 	__bpf_md_ptr(struct bpf_iter_meta *, meta);
3636 	__bpf_md_ptr(struct unix_sock *, unix_sk);
3637 	uid_t uid __aligned(8);
3638 };
3639 
3640 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3641 			      struct unix_sock *unix_sk, uid_t uid)
3642 {
3643 	struct bpf_iter__unix ctx;
3644 
3645 	meta->seq_num--;  /* skip SEQ_START_TOKEN */
3646 	ctx.meta = meta;
3647 	ctx.unix_sk = unix_sk;
3648 	ctx.uid = uid;
3649 	return bpf_iter_run_prog(prog, &ctx);
3650 }
3651 
3652 static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
3653 
3654 {
3655 	struct bpf_unix_iter_state *iter = seq->private;
3656 	unsigned int expected = 1;
3657 	struct sock *sk;
3658 
3659 	sock_hold(start_sk);
3660 	iter->batch[iter->end_sk++] = start_sk;
3661 
3662 	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
3663 		if (iter->end_sk < iter->max_sk) {
3664 			sock_hold(sk);
3665 			iter->batch[iter->end_sk++] = sk;
3666 		}
3667 
3668 		expected++;
3669 	}
3670 
3671 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
3672 
3673 	return expected;
3674 }
3675 
3676 static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
3677 {
3678 	while (iter->cur_sk < iter->end_sk)
3679 		sock_put(iter->batch[iter->cur_sk++]);
3680 }
3681 
3682 static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
3683 				       unsigned int new_batch_sz)
3684 {
3685 	struct sock **new_batch;
3686 
3687 	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3688 			     GFP_USER | __GFP_NOWARN);
3689 	if (!new_batch)
3690 		return -ENOMEM;
3691 
3692 	bpf_iter_unix_put_batch(iter);
3693 	kvfree(iter->batch);
3694 	iter->batch = new_batch;
3695 	iter->max_sk = new_batch_sz;
3696 
3697 	return 0;
3698 }
3699 
3700 static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
3701 					loff_t *pos)
3702 {
3703 	struct bpf_unix_iter_state *iter = seq->private;
3704 	unsigned int expected;
3705 	bool resized = false;
3706 	struct sock *sk;
3707 
3708 	if (iter->st_bucket_done)
3709 		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);
3710 
3711 again:
3712 	/* Get a new batch */
3713 	iter->cur_sk = 0;
3714 	iter->end_sk = 0;
3715 
3716 	sk = unix_get_first(seq, pos);
3717 	if (!sk)
3718 		return NULL; /* Done */
3719 
3720 	expected = bpf_iter_unix_hold_batch(seq, sk);
3721 
3722 	if (iter->end_sk == expected) {
3723 		iter->st_bucket_done = true;
3724 		return sk;
3725 	}
3726 
3727 	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
3728 		resized = true;
3729 		goto again;
3730 	}
3731 
3732 	return sk;
3733 }
3734 
3735 static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
3736 {
3737 	if (!*pos)
3738 		return SEQ_START_TOKEN;
3739 
3740 	/* bpf iter does not support lseek, so it always
3741 	 * continues from where it was stop()-ped.
3742 	 */
3743 	return bpf_iter_unix_batch(seq, pos);
3744 }
3745 
3746 static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3747 {
3748 	struct bpf_unix_iter_state *iter = seq->private;
3749 	struct sock *sk;
3750 
3751 	/* Whenever seq_next() is called, the socket at iter->cur_sk has
3752 	 * already been consumed by seq_show(), so drop its reference and
3753 	 * advance to the next sk in the batch.
3754 	 */
3755 	if (iter->cur_sk < iter->end_sk)
3756 		sock_put(iter->batch[iter->cur_sk++]);
3757 
3758 	++*pos;
3759 
3760 	if (iter->cur_sk < iter->end_sk)
3761 		sk = iter->batch[iter->cur_sk];
3762 	else
3763 		sk = bpf_iter_unix_batch(seq, pos);
3764 
3765 	return sk;
3766 }
3767 
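/* Run the attached BPF program for one socket.  The socket is locked with
 * lock_sock_fast() and skipped if it was unhashed (closed) after batching.
 */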
3768 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3769 {
3770 	struct bpf_iter_meta meta;
3771 	struct bpf_prog *prog;
3772 	struct sock *sk = v;
3773 	uid_t uid;
3774 	bool slow;
3775 	int ret;
3776 
3777 	if (v == SEQ_START_TOKEN)
3778 		return 0;
3779 
3780 	slow = lock_sock_fast(sk);
3781 
3782 	if (unlikely(sk_unhashed(sk))) {
3783 		ret = SEQ_SKIP;
3784 		goto unlock;
3785 	}
3786 
3787 	uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
3788 	meta.seq = seq;
3789 	prog = bpf_iter_get_info(&meta, false);
3790 	ret = unix_prog_seq_show(prog, &meta, v, uid);
3791 unlock:
3792 	unlock_sock_fast(sk, slow);
3793 	return ret;
3794 }
3795 
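/* stop() sees v == NULL once iteration has reached the end (or failed); in
 * that case the BPF program gets one final call so it can emit any trailing
 * output.  Any references still held in the batch are then dropped.
 */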
3796 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3797 {
3798 	struct bpf_unix_iter_state *iter = seq->private;
3799 	struct bpf_iter_meta meta;
3800 	struct bpf_prog *prog;
3801 
3802 	if (!v) {
3803 		meta.seq = seq;
3804 		prog = bpf_iter_get_info(&meta, true);
3805 		if (prog)
3806 			(void)unix_prog_seq_show(prog, &meta, v, 0);
3807 	}
3808 
3809 	if (iter->cur_sk < iter->end_sk)
3810 		bpf_iter_unix_put_batch(iter);
3811 }
3812 
3813 static const struct seq_operations bpf_iter_unix_seq_ops = {
3814 	.start	= bpf_iter_unix_seq_start,
3815 	.next	= bpf_iter_unix_seq_next,
3816 	.stop	= bpf_iter_unix_seq_stop,
3817 	.show	= bpf_iter_unix_seq_show,
3818 };
3819 #endif
3820 #endif
3821 
3822 static const struct net_proto_family unix_family_ops = {
3823 	.family = PF_UNIX,
3824 	.create = unix_create,
3825 	.owner	= THIS_MODULE,
3826 };
3827 
3828 
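/* Per-netns setup: register the sysctls and /proc/net/unix, then allocate
 * this namespace's hash table (UNIX_HASH_SIZE buckets with one lock each).
 */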
3829 static int __net_init unix_net_init(struct net *net)
3830 {
3831 	int i;
3832 
3833 	net->unx.sysctl_max_dgram_qlen = 10;
3834 	if (unix_sysctl_register(net))
3835 		goto out;
3836 
3837 #ifdef CONFIG_PROC_FS
3838 	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3839 			     sizeof(struct seq_net_private)))
3840 		goto err_sysctl;
3841 #endif
3842 
3843 	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
3844 					      sizeof(spinlock_t), GFP_KERNEL);
3845 	if (!net->unx.table.locks)
3846 		goto err_proc;
3847 
3848 	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
3849 						sizeof(struct hlist_head),
3850 						GFP_KERNEL);
3851 	if (!net->unx.table.buckets)
3852 		goto free_locks;
3853 
3854 	for (i = 0; i < UNIX_HASH_SIZE; i++) {
3855 		spin_lock_init(&net->unx.table.locks[i]);
3856 		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
3857 		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
3858 	}
3859 
3860 	return 0;
3861 
3862 free_locks:
3863 	kvfree(net->unx.table.locks);
3864 err_proc:
3865 #ifdef CONFIG_PROC_FS
3866 	remove_proc_entry("unix", net->proc_net);
3867 err_sysctl:
3868 #endif
3869 	unix_sysctl_unregister(net);
3870 out:
3871 	return -ENOMEM;
3872 }
3873 
3874 static void __net_exit unix_net_exit(struct net *net)
3875 {
3876 	kvfree(net->unx.table.buckets);
3877 	kvfree(net->unx.table.locks);
3878 	unix_sysctl_unregister(net);
3879 	remove_proc_entry("unix", net->proc_net);
3880 }
3881 
3882 static struct pernet_operations unix_net_ops = {
3883 	.init = unix_net_init,
3884 	.exit = unix_net_exit,
3885 };
3886 
3887 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
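/* Declare the "unix" iterator target; the signature below is what BPF
 * programs attach to.  A minimal consumer might look roughly like the
 * sketch below (not from this tree; dump_unix is a made-up name).  Note
 * that ctx->unix_sk is NULL on the final stop() invocation:
 *
 *	SEC("iter/unix")
 *	int dump_unix(struct bpf_iter__unix *ctx)
 *	{
 *		struct unix_sock *unix_sk = ctx->unix_sk;
 *
 *		if (!unix_sk)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "uid=%u\n", ctx->uid);
 *		return 0;
 *	}
 */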
3888 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3889 		     struct unix_sock *unix_sk, uid_t uid)
3890 
3891 #define INIT_BATCH_SZ 16
3892 
3893 static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
3894 {
3895 	struct bpf_unix_iter_state *iter = priv_data;
3896 	int err;
3897 
3898 	err = bpf_iter_init_seq_net(priv_data, aux);
3899 	if (err)
3900 		return err;
3901 
3902 	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
3903 	if (err) {
3904 		bpf_iter_fini_seq_net(priv_data);
3905 		return err;
3906 	}
3907 
3908 	return 0;
3909 }
3910 
3911 static void bpf_iter_fini_unix(void *priv_data)
3912 {
3913 	struct bpf_unix_iter_state *iter = priv_data;
3914 
3915 	bpf_iter_fini_seq_net(priv_data);
3916 	kvfree(iter->batch);
3917 }
3918 
3919 static const struct bpf_iter_seq_info unix_seq_info = {
3920 	.seq_ops		= &bpf_iter_unix_seq_ops,
3921 	.init_seq_private	= bpf_iter_init_unix,
3922 	.fini_seq_private	= bpf_iter_fini_unix,
3923 	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
3924 };
3925 
3926 static const struct bpf_func_proto *
3927 bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
3928 			     const struct bpf_prog *prog)
3929 {
3930 	switch (func_id) {
3931 	case BPF_FUNC_setsockopt:
3932 		return &bpf_sk_setsockopt_proto;
3933 	case BPF_FUNC_getsockopt:
3934 		return &bpf_sk_getsockopt_proto;
3935 	default:
3936 		return NULL;
3937 	}
3938 }
3939 
3940 static struct bpf_iter_reg unix_reg_info = {
3941 	.target			= "unix",
3942 	.ctx_arg_info_size	= 1,
3943 	.ctx_arg_info		= {
3944 		{ offsetof(struct bpf_iter__unix, unix_sk),
3945 		  PTR_TO_BTF_ID_OR_NULL },
3946 	},
3947 	.get_func_proto         = bpf_iter_unix_get_func_proto,
3948 	.seq_info		= &unix_seq_info,
3949 };
3950 
3951 static void __init bpf_iter_register(void)
3952 {
3953 	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3954 	if (bpf_iter_reg_target(&unix_reg_info))
3955 		pr_warn("Warning: could not register bpf iterator unix\n");
3956 }
3957 #endif
3958 
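/* Module init: register the dgram and stream proto caches, the PF_UNIX
 * socket family, the per-netns hooks, the sockmap proto overrides and,
 * where configured, the BPF "unix" iterator.
 */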
3959 static int __init af_unix_init(void)
3960 {
3961 	int i, rc = -1;
3962 
3963 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3964 
3965 	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
3966 		spin_lock_init(&bsd_socket_locks[i]);
3967 		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
3968 	}
3969 
3970 	rc = proto_register(&unix_dgram_proto, 1);
3971 	if (rc != 0) {
3972 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3973 		goto out;
3974 	}
3975 
3976 	rc = proto_register(&unix_stream_proto, 1);
3977 	if (rc != 0) {
3978 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3979 		proto_unregister(&unix_dgram_proto);
3980 		goto out;
3981 	}
3982 
3983 	sock_register(&unix_family_ops);
3984 	register_pernet_subsys(&unix_net_ops);
3985 	unix_bpf_build_proto();
3986 
3987 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3988 	bpf_iter_register();
3989 #endif
3990 
3991 out:
3992 	return rc;
3993 }
3994 
3995 /* Later than subsys_initcall() because we depend on stuff initialised there */
3996 fs_initcall(af_unix_init);
3997