1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/compat.h>
8 #include <net/compat.h>
9 #include <linux/io_uring.h>
10 
11 #include <uapi/linux/io_uring.h>
12 
13 #include "filetable.h"
14 #include "io_uring.h"
15 #include "kbuf.h"
16 #include "alloc_cache.h"
17 #include "net.h"
18 #include "notif.h"
19 #include "rsrc.h"
20 #include "zcrx.h"
21 
22 struct io_shutdown {
23 	struct file			*file;
24 	int				how;
25 };
26 
27 struct io_accept {
28 	struct file			*file;
29 	struct sockaddr __user		*addr;
30 	int __user			*addr_len;
31 	int				flags;
32 	int				iou_flags;
33 	u32				file_slot;
34 	unsigned long			nofile;
35 };
36 
37 struct io_socket {
38 	struct file			*file;
39 	int				domain;
40 	int				type;
41 	int				protocol;
42 	int				flags;
43 	u32				file_slot;
44 	unsigned long			nofile;
45 };
46 
47 struct io_connect {
48 	struct file			*file;
49 	struct sockaddr __user		*addr;
50 	int				addr_len;
51 	bool				in_progress;
52 	bool				seen_econnaborted;
53 };
54 
55 struct io_bind {
56 	struct file			*file;
57 	int				addr_len;
58 };
59 
60 struct io_listen {
61 	struct file			*file;
62 	int				backlog;
63 };
64 
65 struct io_sr_msg {
66 	struct file			*file;
67 	union {
68 		struct compat_msghdr __user	*umsg_compat;
69 		struct user_msghdr __user	*umsg;
70 		void __user			*buf;
71 	};
72 	int				len;
73 	unsigned			done_io;
74 	unsigned			msg_flags;
75 	unsigned			nr_multishot_loops;
76 	u16				flags;
77 	/* initialised and used only by !msg send variants */
78 	u16				buf_group;
79 	/* per-invocation mshot limit */
80 	unsigned			mshot_len;
81 	/* overall mshot byte limit */
82 	unsigned			mshot_total_len;
83 	void __user			*msg_control;
84 	/* used only for send zerocopy */
85 	struct io_kiocb 		*notif;
86 };
87 
88 /*
89  * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold
90  * anyway. Use the upper 8 bits for internal uses.
91  */
92 enum sr_retry_flags {
93 	IORING_RECV_RETRY	= (1U << 15),
94 	IORING_RECV_PARTIAL_MAP	= (1U << 14),
95 	IORING_RECV_MSHOT_CAP	= (1U << 13),
96 	IORING_RECV_MSHOT_LIM	= (1U << 12),
97 	IORING_RECV_MSHOT_DONE	= (1U << 11),
98 
99 	IORING_RECV_RETRY_CLEAR	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP,
100 	IORING_RECV_NO_RETRY	= IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP |
101 				  IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE,
102 };
103 
104 /*
105  * Number of times we'll try to do receives if there's more data. If we
106  * exceed this limit, then add us to the back of the queue and retry from
107  * there. This helps fairness between flooding clients.
108  */
109 #define MULTISHOT_MAX_RETRY	32
110 
111 struct io_recvzc {
112 	struct file			*file;
113 	u16				flags;
114 	u32				len;
115 	struct io_zcrx_ifq		*ifq;
116 };
117 
118 static int io_sg_from_iter_iovec(struct sk_buff *skb,
119 				 struct iov_iter *from, size_t length);
120 static int io_sg_from_iter(struct sk_buff *skb,
121 			   struct iov_iter *from, size_t length);
122 
123 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
124 {
125 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
126 
127 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
128 		     sqe->buf_index || sqe->splice_fd_in))
129 		return -EINVAL;
130 
131 	shutdown->how = READ_ONCE(sqe->len);
132 	req->flags |= REQ_F_FORCE_ASYNC;
133 	return 0;
134 }
135 
136 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
137 {
138 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
139 	struct socket *sock;
140 	int ret;
141 
142 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
143 
144 	sock = sock_from_file(req->file);
145 	if (unlikely(!sock))
146 		return -ENOTSOCK;
147 
148 	ret = __sys_shutdown_sock(sock, shutdown->how);
149 	io_req_set_res(req, ret, 0);
150 	return IOU_COMPLETE;
151 }
152 
153 static bool io_net_retry(struct socket *sock, int flags)
154 {
155 	if (!(flags & MSG_WAITALL))
156 		return false;
157 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
158 }
159 
160 static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
161 {
162 	if (kmsg->vec.iovec)
163 		io_vec_free(&kmsg->vec);
164 }
165 
166 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
167 {
168 	struct io_async_msghdr *hdr = req->async_data;
169 
170 	/* can't recycle, ensure we free the iovec if we have one */
171 	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
172 		io_netmsg_iovec_free(hdr);
173 		return;
174 	}
175 
176 	/* Let normal cleanup path reap it if we fail adding to the cache */
177 	io_alloc_cache_vec_kasan(&hdr->vec);
178 	if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
179 		io_vec_free(&hdr->vec);
180 
181 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr))
182 		io_req_async_data_clear(req, REQ_F_NEED_CLEANUP);
183 }
184 
185 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
186 {
187 	struct io_ring_ctx *ctx = req->ctx;
188 	struct io_async_msghdr *hdr;
189 
190 	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req);
191 	if (!hdr)
192 		return NULL;
193 
194 	/* If the async data was cached, we might have an iov cached inside. */
195 	if (hdr->vec.iovec)
196 		req->flags |= REQ_F_NEED_CLEANUP;
197 	return hdr;
198 }
199 
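/*
 * Reset the per-iteration state of a multishot/bundle request before it is
 * rearmed: clear the buffer-list-empty and retry flags, zero the partial
 * transfer count, and restore the per-shot length limit.
 */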
200 static inline void io_mshot_prep_retry(struct io_kiocb *req,
201 				       struct io_async_msghdr *kmsg)
202 {
203 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
204 
205 	req->flags &= ~REQ_F_BL_EMPTY;
206 	sr->done_io = 0;
207 	sr->flags &= ~IORING_RECV_RETRY_CLEAR;
208 	sr->len = sr->mshot_len;
209 }
210 
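/*
 * Import a user iovec array into the request's msg_iter. A cached kernel
 * iovec is reused if one is attached to the async data, otherwise the inline
 * fast_iov is offered and __import_iovec() may allocate a larger array,
 * which is then recorded for later cleanup.
 */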
211 static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
212 			     const struct iovec __user *uiov, unsigned uvec_seg,
213 			     int ddir)
214 {
215 	struct iovec *iov;
216 	int ret, nr_segs;
217 
218 	if (iomsg->vec.iovec) {
219 		nr_segs = iomsg->vec.nr;
220 		iov = iomsg->vec.iovec;
221 	} else {
222 		nr_segs = 1;
223 		iov = &iomsg->fast_iov;
224 	}
225 
226 	ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov,
227 			     &iomsg->msg.msg_iter, io_is_compat(req->ctx));
228 	if (unlikely(ret < 0))
229 		return ret;
230 
231 	if (iov) {
232 		req->flags |= REQ_F_NEED_CLEANUP;
233 		io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs);
234 	}
235 	return 0;
236 }
237 
238 static int io_compat_msg_copy_hdr(struct io_kiocb *req,
239 				  struct io_async_msghdr *iomsg,
240 				  struct compat_msghdr *msg, int ddir,
241 				  struct sockaddr __user **save_addr)
242 {
243 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
244 	struct compat_iovec __user *uiov;
245 	int ret;
246 
247 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
248 		return -EFAULT;
249 
250 	ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr);
251 	if (ret)
252 		return ret;
253 
254 	uiov = compat_ptr(msg->msg_iov);
255 	if (req->flags & REQ_F_BUFFER_SELECT) {
256 		if (msg->msg_iovlen == 0) {
257 			sr->len = 0;
258 		} else if (msg->msg_iovlen > 1) {
259 			return -EINVAL;
260 		} else {
261 			struct compat_iovec tmp_iov;
262 
263 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
264 				return -EFAULT;
265 			sr->len = tmp_iov.iov_len;
266 		}
267 	}
268 	return 0;
269 }
270 
271 static int io_copy_msghdr_from_user(struct user_msghdr *msg,
272 				    struct user_msghdr __user *umsg)
273 {
274 	if (!user_access_begin(umsg, sizeof(*umsg)))
275 		return -EFAULT;
276 	unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
277 	unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
278 	unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
279 	unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
280 	unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
281 	unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
282 	user_access_end();
283 	return 0;
284 ua_end:
285 	user_access_end();
286 	return -EFAULT;
287 }
288 
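/*
 * Copy the user_msghdr (native or compat) into the kernel and prime
 * iomsg->msg. For provided-buffer requests at most one iovec entry is
 * allowed, and its length is stashed in sr->len for buffer selection.
 */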
289 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
290 			   struct user_msghdr *msg, int ddir,
291 			   struct sockaddr __user **save_addr)
292 {
293 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
294 	struct user_msghdr __user *umsg = sr->umsg;
295 	int ret;
296 
297 	iomsg->msg.msg_name = &iomsg->addr;
298 	iomsg->msg.msg_iter.nr_segs = 0;
299 
300 	if (io_is_compat(req->ctx)) {
301 		struct compat_msghdr cmsg;
302 
303 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
304 		if (ret)
305 			return ret;
306 
307 		memset(msg, 0, sizeof(*msg));
308 		msg->msg_namelen = cmsg.msg_namelen;
309 		msg->msg_controllen = cmsg.msg_controllen;
310 		msg->msg_iov = compat_ptr(cmsg.msg_iov);
311 		msg->msg_iovlen = cmsg.msg_iovlen;
312 		return 0;
313 	}
314 
315 	ret = io_copy_msghdr_from_user(msg, umsg);
316 	if (unlikely(ret))
317 		return ret;
318 
319 	msg->msg_flags = 0;
320 
321 	ret = __copy_msghdr(&iomsg->msg, msg, save_addr);
322 	if (ret)
323 		return ret;
324 
325 	if (req->flags & REQ_F_BUFFER_SELECT) {
326 		if (msg->msg_iovlen == 0) {
327 			sr->len = 0;
328 		} else if (msg->msg_iovlen > 1) {
329 			return -EINVAL;
330 		} else {
331 			struct iovec __user *uiov = msg->msg_iov;
332 			struct iovec tmp_iov;
333 
334 			if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov)))
335 				return -EFAULT;
336 			sr->len = tmp_iov.iov_len;
337 		}
338 	}
339 	return 0;
340 }
341 
342 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
343 {
344 	struct io_async_msghdr *io = req->async_data;
345 
346 	io_netmsg_iovec_free(io);
347 }
348 
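/*
 * Prepare a non-msghdr send: stash the user buffer, copy in an optional
 * destination address, and import the data either as a fixed/registered
 * buffer, a user iovec (vectorized send), or a plain user buffer.
 */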
349 static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
350 {
351 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
352 	struct io_async_msghdr *kmsg = req->async_data;
353 	void __user *addr;
354 	u16 addr_len;
355 	int ret;
356 
357 	sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
358 
359 	if (READ_ONCE(sqe->__pad3[0]))
360 		return -EINVAL;
361 
362 	kmsg->msg.msg_name = NULL;
363 	kmsg->msg.msg_namelen = 0;
364 	kmsg->msg.msg_control = NULL;
365 	kmsg->msg.msg_controllen = 0;
366 	kmsg->msg.msg_ubuf = NULL;
367 
368 	addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
369 	addr_len = READ_ONCE(sqe->addr_len);
370 	if (addr) {
371 		ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
372 		if (unlikely(ret < 0))
373 			return ret;
374 		kmsg->msg.msg_name = &kmsg->addr;
375 		kmsg->msg.msg_namelen = addr_len;
376 	}
377 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
378 		if (!(sr->flags & IORING_SEND_VECTORIZED)) {
379 			req->flags |= REQ_F_IMPORT_BUFFER;
380 			return 0;
381 		}
382 
383 		kmsg->msg.msg_iter.nr_segs = sr->len;
384 		return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len);
385 	}
386 	if (req->flags & REQ_F_BUFFER_SELECT)
387 		return 0;
388 
389 	if (sr->flags & IORING_SEND_VECTORIZED)
390 		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
391 
392 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
393 }
394 
395 static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
396 {
397 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
398 	struct io_async_msghdr *kmsg = req->async_data;
399 	struct user_msghdr msg;
400 	int ret;
401 
402 	sr->flags |= IORING_SEND_VECTORIZED;
403 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
404 	ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
405 	if (unlikely(ret))
406 		return ret;
407 	/* save msg_control as sys_sendmsg() overwrites it */
408 	sr->msg_control = kmsg->msg.msg_control_user;
409 
410 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
411 		kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
412 		return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov,
413 					 msg.msg_iovlen);
414 	}
415 	if (req->flags & REQ_F_BUFFER_SELECT)
416 		return 0;
417 	return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE);
418 }
419 
420 #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED)
421 
422 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
423 {
424 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
425 
426 	sr->done_io = 0;
427 	sr->len = READ_ONCE(sqe->len);
428 	if (unlikely(sr->len < 0))
429 		return -EINVAL;
430 	sr->flags = READ_ONCE(sqe->ioprio);
431 	if (sr->flags & ~SENDMSG_FLAGS)
432 		return -EINVAL;
433 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
434 	if (sr->msg_flags & MSG_DONTWAIT)
435 		req->flags |= REQ_F_NOWAIT;
436 	if (req->flags & REQ_F_BUFFER_SELECT)
437 		sr->buf_group = req->buf_index;
438 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
439 		if (req->opcode == IORING_OP_SENDMSG)
440 			return -EINVAL;
441 		sr->msg_flags |= MSG_WAITALL;
442 		req->flags |= REQ_F_MULTISHOT;
443 	}
444 
445 	if (io_is_compat(req->ctx))
446 		sr->msg_flags |= MSG_CMSG_COMPAT;
447 
448 	if (unlikely(!io_msg_alloc_async(req)))
449 		return -ENOMEM;
450 	if (req->opcode != IORING_OP_SENDMSG)
451 		return io_send_setup(req, sqe);
452 	if (unlikely(sqe->addr2 || sqe->file_index))
453 		return -EINVAL;
454 	return io_sendmsg_setup(req, sqe);
455 }
456 
457 static void io_req_msg_cleanup(struct io_kiocb *req,
458 			       unsigned int issue_flags)
459 {
460 	io_netmsg_recycle(req, issue_flags);
461 }
462 
463 /*
464  * For bundle completions, we need to figure out how many segments we consumed.
465  * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
466  * could be using an ITER_IOVEC. If the latter, and we consumed all of
467  * the segments, then it's a trivial question to answer. If we have residual
468  * data in the iter, then loop the segments to figure out how much we
469  * transferred.
470  */
471 static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
472 {
473 	struct iovec *iov;
474 	int nbufs;
475 
476 	/* no data is always zero segments, and a ubuf is always 1 segment */
477 	if (ret <= 0)
478 		return 0;
479 	if (iter_is_ubuf(&kmsg->msg.msg_iter))
480 		return 1;
481 
482 	iov = kmsg->vec.iovec;
483 	if (!iov)
484 		iov = &kmsg->fast_iov;
485 
486 	/* if all data was transferred, it's basic pointer math */
487 	if (!iov_iter_count(&kmsg->msg.msg_iter))
488 		return iter_iov(&kmsg->msg.msg_iter) - iov;
489 
490 	/* short transfer, count segments */
491 	nbufs = 0;
492 	do {
493 		int this_len = min_t(int, iov[nbufs].iov_len, ret);
494 
495 		nbufs++;
496 		ret -= this_len;
497 	} while (ret);
498 
499 	return nbufs;
500 }
501 
502 static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl,
503 			      struct io_async_msghdr *kmsg, int len)
504 {
505 	req->flags |= REQ_F_BL_NO_RECYCLE;
506 	if (req->flags & REQ_F_BUFFERS_COMMIT)
507 		io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len));
508 	return IOU_RETRY;
509 }
510 
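/*
 * Post the result of a send. For bundle sends, account the consumed buffers;
 * if data was sent, buffers remain and the request did not have to go through
 * polling, an aux CQE is posted and false is returned so the bundle continues.
 */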
511 static inline bool io_send_finish(struct io_kiocb *req,
512 				  struct io_async_msghdr *kmsg,
513 				  struct io_br_sel *sel)
514 {
515 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
516 	bool bundle_finished = sel->val <= 0;
517 	unsigned int cflags;
518 
519 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
520 		cflags = io_put_kbuf(req, sel->val, sel->buf_list);
521 		goto finish;
522 	}
523 
524 	cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val));
525 
526 	/*
527 	 * Don't start new bundles if the buffer list is empty, or if the
528 	 * current operation needed to go through polling to complete.
529 	 */
530 	if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED))
531 		goto finish;
532 
533 	/*
534 	 * Fill CQE for this receive and see if we should keep trying to
535 	 * receive from this socket.
536 	 */
537 	if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
538 		io_mshot_prep_retry(req, kmsg);
539 		return false;
540 	}
541 
542 	/* Otherwise stop bundle and use the current result. */
543 finish:
544 	io_req_set_res(req, sel->val, cflags);
545 	sel->val = IOU_COMPLETE;
546 	return true;
547 }
548 
549 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
550 {
551 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
552 	struct io_async_msghdr *kmsg = req->async_data;
553 	struct socket *sock;
554 	unsigned flags;
555 	int min_ret = 0;
556 	int ret;
557 
558 	sock = sock_from_file(req->file);
559 	if (unlikely(!sock))
560 		return -ENOTSOCK;
561 
562 	if (!(req->flags & REQ_F_POLLED) &&
563 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
564 		return -EAGAIN;
565 
566 	flags = sr->msg_flags;
567 	if (issue_flags & IO_URING_F_NONBLOCK)
568 		flags |= MSG_DONTWAIT;
569 	if (flags & MSG_WAITALL)
570 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
571 
572 	kmsg->msg.msg_control_user = sr->msg_control;
573 
574 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
575 
576 	if (ret < min_ret) {
577 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
578 			return -EAGAIN;
579 		if (ret > 0 && io_net_retry(sock, flags)) {
580 			kmsg->msg.msg_controllen = 0;
581 			kmsg->msg.msg_control = NULL;
582 			sr->done_io += ret;
583 			return -EAGAIN;
584 		}
585 		if (ret == -ERESTARTSYS)
586 			ret = -EINTR;
587 		req_set_fail(req);
588 	}
589 	io_req_msg_cleanup(req, issue_flags);
590 	if (ret >= 0)
591 		ret += sr->done_io;
592 	else if (sr->done_io)
593 		ret = sr->done_io;
594 	io_req_set_res(req, ret, 0);
595 	return IOU_COMPLETE;
596 }
597 
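/*
 * Pick provided buffer(s) for a plain send and map them into the msghdr
 * iterator. Bundle sends may select multiple buffers in one go; a single
 * selected buffer is mapped as an ITER_UBUF fast path.
 */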
598 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
599 				 struct io_br_sel *sel, struct io_async_msghdr *kmsg)
600 {
601 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
602 	struct buf_sel_arg arg = {
603 		.iovs = &kmsg->fast_iov,
604 		.max_len = min_not_zero(sr->len, INT_MAX),
605 		.nr_iovs = 1,
606 		.buf_group = sr->buf_group,
607 	};
608 	int ret;
609 
610 	if (kmsg->vec.iovec) {
611 		arg.nr_iovs = kmsg->vec.nr;
612 		arg.iovs = kmsg->vec.iovec;
613 		arg.mode = KBUF_MODE_FREE;
614 	}
615 
616 	if (!(sr->flags & IORING_RECVSEND_BUNDLE))
617 		arg.nr_iovs = 1;
618 	else
619 		arg.mode |= KBUF_MODE_EXPAND;
620 
621 	ret = io_buffers_select(req, &arg, sel, issue_flags);
622 	if (unlikely(ret < 0))
623 		return ret;
624 
625 	if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
626 		kmsg->vec.nr = ret;
627 		kmsg->vec.iovec = arg.iovs;
628 		req->flags |= REQ_F_NEED_CLEANUP;
629 	}
630 	sr->len = arg.out_len;
631 
632 	if (ret == 1) {
633 		sr->buf = arg.iovs[0].iov_base;
634 		ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
635 					&kmsg->msg.msg_iter);
636 		if (unlikely(ret))
637 			return ret;
638 	} else {
639 		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
640 				arg.iovs, ret, arg.out_len);
641 	}
642 
643 	return 0;
644 }
645 
646 int io_send(struct io_kiocb *req, unsigned int issue_flags)
647 {
648 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
649 	struct io_async_msghdr *kmsg = req->async_data;
650 	struct io_br_sel sel = { };
651 	struct socket *sock;
652 	unsigned flags;
653 	int min_ret = 0;
654 	int ret;
655 
656 	sock = sock_from_file(req->file);
657 	if (unlikely(!sock))
658 		return -ENOTSOCK;
659 
660 	if (!(req->flags & REQ_F_POLLED) &&
661 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
662 		return -EAGAIN;
663 
664 	flags = sr->msg_flags;
665 	if (issue_flags & IO_URING_F_NONBLOCK)
666 		flags |= MSG_DONTWAIT;
667 
668 retry_bundle:
669 	sel.buf_list = NULL;
670 	if (io_do_buffer_select(req)) {
671 		ret = io_send_select_buffer(req, issue_flags, &sel, kmsg);
672 		if (ret)
673 			return ret;
674 	}
675 
676 	/*
677 	 * If MSG_WAITALL is set, or this is a bundle send, then we need
678 	 * the full amount. If just bundle is set, if we do a short send
679 	 * then we complete the bundle sequence rather than continue on.
680 	 */
681 	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
682 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
683 
684 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
685 	kmsg->msg.msg_flags = flags;
686 	ret = sock_sendmsg(sock, &kmsg->msg);
687 	if (ret < min_ret) {
688 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
689 			return -EAGAIN;
690 
691 		if (ret > 0 && io_net_retry(sock, flags)) {
692 			sr->len -= ret;
693 			sr->buf += ret;
694 			sr->done_io += ret;
695 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
696 		}
697 		if (ret == -ERESTARTSYS)
698 			ret = -EINTR;
699 		req_set_fail(req);
700 	}
701 	if (ret >= 0)
702 		ret += sr->done_io;
703 	else if (sr->done_io)
704 		ret = sr->done_io;
705 
706 	sel.val = ret;
707 	if (!io_send_finish(req, kmsg, &sel))
708 		goto retry_bundle;
709 
710 	io_req_msg_cleanup(req, issue_flags);
711 	return sel.val;
712 }
713 
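/*
 * For multishot recvmsg with provided buffers, validate that the
 * io_uring_recvmsg_out header plus name and control data fits in an int and
 * record the name/control sizes for later buffer carving.
 */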
714 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
715 				 struct io_async_msghdr *iomsg,
716 				 int namelen, size_t controllen)
717 {
718 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
719 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
720 		int hdr;
721 
722 		if (unlikely(namelen < 0))
723 			return -EOVERFLOW;
724 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
725 					namelen, &hdr))
726 			return -EOVERFLOW;
727 		if (check_add_overflow(hdr, controllen, &hdr))
728 			return -EOVERFLOW;
729 
730 		iomsg->namelen = namelen;
731 		iomsg->controllen = controllen;
732 		return 0;
733 	}
734 
735 	return 0;
736 }
737 
738 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
739 			       struct io_async_msghdr *iomsg)
740 {
741 	struct user_msghdr msg;
742 	int ret;
743 
744 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
745 	if (unlikely(ret))
746 		return ret;
747 
748 	if (!(req->flags & REQ_F_BUFFER_SELECT)) {
749 		ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
750 					ITER_DEST);
751 		if (unlikely(ret))
752 			return ret;
753 	}
754 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
755 					msg.msg_controllen);
756 }
757 
758 static int io_recvmsg_prep_setup(struct io_kiocb *req)
759 {
760 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
761 	struct io_async_msghdr *kmsg;
762 
763 	kmsg = io_msg_alloc_async(req);
764 	if (unlikely(!kmsg))
765 		return -ENOMEM;
766 
767 	if (req->opcode == IORING_OP_RECV) {
768 		kmsg->msg.msg_name = NULL;
769 		kmsg->msg.msg_namelen = 0;
770 		kmsg->msg.msg_inq = 0;
771 		kmsg->msg.msg_control = NULL;
772 		kmsg->msg.msg_get_inq = 1;
773 		kmsg->msg.msg_controllen = 0;
774 		kmsg->msg.msg_iocb = NULL;
775 		kmsg->msg.msg_ubuf = NULL;
776 
777 		if (req->flags & REQ_F_BUFFER_SELECT)
778 			return 0;
779 		return import_ubuf(ITER_DEST, sr->buf, sr->len,
780 				   &kmsg->msg.msg_iter);
781 	}
782 
783 	return io_recvmsg_copy_hdr(req, kmsg);
784 }
785 
786 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
787 			IORING_RECVSEND_BUNDLE)
788 
789 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
790 {
791 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
792 
793 	sr->done_io = 0;
794 
795 	if (unlikely(sqe->addr2))
796 		return -EINVAL;
797 
798 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
799 	sr->len = READ_ONCE(sqe->len);
800 	if (unlikely(sr->len < 0))
801 		return -EINVAL;
802 	sr->flags = READ_ONCE(sqe->ioprio);
803 	if (sr->flags & ~RECVMSG_FLAGS)
804 		return -EINVAL;
805 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
806 	if (sr->msg_flags & MSG_DONTWAIT)
807 		req->flags |= REQ_F_NOWAIT;
808 	if (sr->msg_flags & MSG_ERRQUEUE)
809 		req->flags |= REQ_F_CLEAR_POLLIN;
810 	if (req->flags & REQ_F_BUFFER_SELECT)
811 		sr->buf_group = req->buf_index;
812 	sr->mshot_total_len = sr->mshot_len = 0;
813 	if (sr->flags & IORING_RECV_MULTISHOT) {
814 		if (!(req->flags & REQ_F_BUFFER_SELECT))
815 			return -EINVAL;
816 		if (sr->msg_flags & MSG_WAITALL)
817 			return -EINVAL;
818 		if (req->opcode == IORING_OP_RECV) {
819 			sr->mshot_len = sr->len;
820 			sr->mshot_total_len = READ_ONCE(sqe->optlen);
821 			if (sr->mshot_total_len)
822 				sr->flags |= IORING_RECV_MSHOT_LIM;
823 		} else if (sqe->optlen) {
824 			return -EINVAL;
825 		}
826 		req->flags |= REQ_F_APOLL_MULTISHOT;
827 	} else if (sqe->optlen) {
828 		return -EINVAL;
829 	}
830 
831 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
832 		if (req->opcode == IORING_OP_RECVMSG)
833 			return -EINVAL;
834 	}
835 
836 	if (io_is_compat(req->ctx))
837 		sr->msg_flags |= MSG_CMSG_COMPAT;
838 
839 	sr->nr_multishot_loops = 0;
840 	return io_recvmsg_prep_setup(req);
841 }
842 
843 /* bits to clear in old and inherit in new cflags on bundle retry */
844 #define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
845 
846 /*
847  * Finishes io_recv and io_recvmsg.
848  *
849  * Returns true if it is actually finished, or false if it should run
850  * again (for multishot).
851  */
852 static inline bool io_recv_finish(struct io_kiocb *req,
853 				  struct io_async_msghdr *kmsg,
854 				  struct io_br_sel *sel, bool mshot_finished,
855 				  unsigned issue_flags)
856 {
857 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
858 	unsigned int cflags = 0;
859 
860 	if (kmsg->msg.msg_inq > 0)
861 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
862 
863 	if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) {
864 		/*
865 		 * If sr->len hits zero, the limit has been reached. Mark
866 		 * mshot as finished, and flag MSHOT_DONE as well to prevent
867 		 * a potential bundle from being retried.
868 		 */
869 		sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len);
870 		if (!sr->mshot_total_len) {
871 			sr->flags |= IORING_RECV_MSHOT_DONE;
872 			mshot_finished = true;
873 		}
874 	}
875 
876 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
877 		size_t this_ret = sel->val - sr->done_io;
878 
879 		cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret));
880 		if (sr->flags & IORING_RECV_RETRY)
881 			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
882 		if (sr->mshot_len && sel->val >= sr->mshot_len)
883 			sr->flags |= IORING_RECV_MSHOT_CAP;
884 		/* bundle with no more immediate buffers, we're done */
885 		if (req->flags & REQ_F_BL_EMPTY)
886 			goto finish;
887 		/*
888 		 * If more is available AND it was a full transfer, retry and
889 		 * append to this one
890 		 */
891 		if (!(sr->flags & IORING_RECV_NO_RETRY) &&
892 		    kmsg->msg.msg_inq > 1 && this_ret > 0 &&
893 		    !iov_iter_count(&kmsg->msg.msg_iter)) {
894 			req->cqe.flags = cflags & ~CQE_F_MASK;
895 			sr->len = kmsg->msg.msg_inq;
896 			sr->done_io += this_ret;
897 			sr->flags |= IORING_RECV_RETRY;
898 			return false;
899 		}
900 	} else {
901 		cflags |= io_put_kbuf(req, sel->val, sel->buf_list);
902 	}
903 
904 	/*
905 	 * Fill CQE for this receive and see if we should keep trying to
906 	 * receive from this socket.
907 	 */
908 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
909 	    io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) {
910 		sel->val = IOU_RETRY;
911 		io_mshot_prep_retry(req, kmsg);
912 		/* Known not-empty or unknown state, retry */
913 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
914 			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY &&
915 			    !(sr->flags & IORING_RECV_MSHOT_CAP)) {
916 				return false;
917 			}
918 			/* mshot retries exceeded, force a requeue */
919 			sr->nr_multishot_loops = 0;
920 			sr->flags &= ~IORING_RECV_MSHOT_CAP;
921 			if (issue_flags & IO_URING_F_MULTISHOT)
922 				sel->val = IOU_REQUEUE;
923 		}
924 		return true;
925 	}
926 
927 	/* Finish the request / stop multishot. */
928 finish:
929 	io_req_set_res(req, sel->val, cflags);
930 	sel->val = IOU_COMPLETE;
931 	io_req_msg_cleanup(req, issue_flags);
932 	return true;
933 }
934 
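/*
 * Carve the selected provided buffer into the io_uring_recvmsg_out header,
 * the control data area and the payload, adjusting *buf and *len to point at
 * the payload portion.
 */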
935 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
936 				     struct io_sr_msg *sr, void __user **buf,
937 				     size_t *len)
938 {
939 	unsigned long ubuf = (unsigned long) *buf;
940 	unsigned long hdr;
941 
942 	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
943 		kmsg->controllen;
944 	if (*len < hdr)
945 		return -EFAULT;
946 
947 	if (kmsg->controllen) {
948 		unsigned long control = ubuf + hdr - kmsg->controllen;
949 
950 		kmsg->msg.msg_control_user = (void __user *) control;
951 		kmsg->msg.msg_controllen = kmsg->controllen;
952 	}
953 
954 	sr->buf = *buf; /* stash for later copy */
955 	*buf = (void __user *) (ubuf + hdr);
956 	kmsg->payloadlen = *len = *len - hdr;
957 	return 0;
958 }
959 
960 struct io_recvmsg_multishot_hdr {
961 	struct io_uring_recvmsg_out msg;
962 	struct sockaddr_storage addr;
963 };
964 
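/*
 * One multishot recvmsg iteration: receive into the carved-up provided
 * buffer, then copy a struct io_uring_recvmsg_out header (plus any source
 * address) back to the start of the user buffer.
 */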
965 static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
966 				struct io_async_msghdr *kmsg,
967 				unsigned int flags, bool *finished)
968 {
969 	int err;
970 	int copy_len;
971 	struct io_recvmsg_multishot_hdr hdr;
972 
973 	if (kmsg->namelen)
974 		kmsg->msg.msg_name = &hdr.addr;
975 	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
976 	kmsg->msg.msg_namelen = 0;
977 
978 	if (sock->file->f_flags & O_NONBLOCK)
979 		flags |= MSG_DONTWAIT;
980 
981 	err = sock_recvmsg(sock, &kmsg->msg, flags);
982 	*finished = err <= 0;
983 	if (err < 0)
984 		return err;
985 
986 	hdr.msg = (struct io_uring_recvmsg_out) {
987 		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
988 		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
989 	};
990 
991 	hdr.msg.payloadlen = err;
992 	if (err > kmsg->payloadlen)
993 		err = kmsg->payloadlen;
994 
995 	copy_len = sizeof(struct io_uring_recvmsg_out);
996 	if (kmsg->msg.msg_namelen > kmsg->namelen)
997 		copy_len += kmsg->namelen;
998 	else
999 		copy_len += kmsg->msg.msg_namelen;
1000 
1001 	/*
1002 	 *      "fromlen shall refer to the value before truncation.."
1003 	 *                      1003.1g
1004 	 */
1005 	hdr.msg.namelen = kmsg->msg.msg_namelen;
1006 
1007 	/* ensure that there is no gap between hdr and sockaddr_storage */
1008 	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
1009 		     sizeof(struct io_uring_recvmsg_out));
1010 	if (copy_to_user(io->buf, &hdr, copy_len)) {
1011 		*finished = true;
1012 		return -EFAULT;
1013 	}
1014 
1015 	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
1016 			kmsg->controllen + err;
1017 }
1018 
1019 int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
1020 {
1021 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1022 	struct io_async_msghdr *kmsg = req->async_data;
1023 	struct io_br_sel sel = { };
1024 	struct socket *sock;
1025 	unsigned flags;
1026 	int ret, min_ret = 0;
1027 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1028 	bool mshot_finished = true;
1029 
1030 	sock = sock_from_file(req->file);
1031 	if (unlikely(!sock))
1032 		return -ENOTSOCK;
1033 
1034 	if (!(req->flags & REQ_F_POLLED) &&
1035 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1036 		return -EAGAIN;
1037 
1038 	flags = sr->msg_flags;
1039 	if (force_nonblock)
1040 		flags |= MSG_DONTWAIT;
1041 
1042 retry_multishot:
1043 	sel.buf_list = NULL;
1044 	if (io_do_buffer_select(req)) {
1045 		size_t len = sr->len;
1046 
1047 		sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1048 		if (!sel.addr)
1049 			return -ENOBUFS;
1050 
1051 		if (req->flags & REQ_F_APOLL_MULTISHOT) {
1052 			ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len);
1053 			if (ret) {
1054 				io_kbuf_recycle(req, sel.buf_list, issue_flags);
1055 				return ret;
1056 			}
1057 		}
1058 
1059 		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len);
1060 	}
1061 
1062 	kmsg->msg.msg_get_inq = 1;
1063 	kmsg->msg.msg_inq = -1;
1064 	if (req->flags & REQ_F_APOLL_MULTISHOT) {
1065 		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
1066 					   &mshot_finished);
1067 	} else {
1068 		/* disable partial retry for recvmsg with cmsg attached */
1069 		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
1070 			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1071 
1072 		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
1073 					 kmsg->uaddr, flags);
1074 	}
1075 
1076 	if (ret < min_ret) {
1077 		if (ret == -EAGAIN && force_nonblock) {
1078 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1079 			return IOU_RETRY;
1080 		}
1081 		if (ret > 0 && io_net_retry(sock, flags)) {
1082 			sr->done_io += ret;
1083 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1084 		}
1085 		if (ret == -ERESTARTSYS)
1086 			ret = -EINTR;
1087 		req_set_fail(req);
1088 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1089 		req_set_fail(req);
1090 	}
1091 
1092 	if (ret > 0)
1093 		ret += sr->done_io;
1094 	else if (sr->done_io)
1095 		ret = sr->done_io;
1096 	else
1097 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1098 
1099 	sel.val = ret;
1100 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1101 		goto retry_multishot;
1102 
1103 	return sel.val;
1104 }
1105 
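/*
 * Select provided buffer(s) for a recv. With the ring lock held, bundle
 * receives can peek multiple buffers up front (bounded by msg_inq and any
 * multishot byte limit); otherwise a single buffer is selected and mapped.
 */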
1106 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
1107 			      struct io_br_sel *sel, unsigned int issue_flags)
1108 {
1109 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1110 	int ret;
1111 
1112 	/*
1113 	 * If the ring isn't locked, then don't use the peek interface
1114 	 * to grab multiple buffers as we will lock/unlock between
1115 	 * this selection and posting the buffers.
1116 	 */
1117 	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
1118 	    sr->flags & IORING_RECVSEND_BUNDLE) {
1119 		struct buf_sel_arg arg = {
1120 			.iovs = &kmsg->fast_iov,
1121 			.nr_iovs = 1,
1122 			.mode = KBUF_MODE_EXPAND,
1123 			.buf_group = sr->buf_group,
1124 		};
1125 
1126 		if (kmsg->vec.iovec) {
1127 			arg.nr_iovs = kmsg->vec.nr;
1128 			arg.iovs = kmsg->vec.iovec;
1129 			arg.mode |= KBUF_MODE_FREE;
1130 		}
1131 
1132 		if (sel->val)
1133 			arg.max_len = sel->val;
1134 		else if (kmsg->msg.msg_inq > 1)
1135 			arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq);
1136 
1137 		/* if mshot limited, ensure we don't go over */
1138 		if (sr->flags & IORING_RECV_MSHOT_LIM)
1139 			arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len);
1140 		ret = io_buffers_peek(req, &arg, sel);
1141 		if (unlikely(ret < 0))
1142 			return ret;
1143 
1144 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
1145 			kmsg->vec.nr = ret;
1146 			kmsg->vec.iovec = arg.iovs;
1147 			req->flags |= REQ_F_NEED_CLEANUP;
1148 		}
1149 		if (arg.partial_map)
1150 			sr->flags |= IORING_RECV_PARTIAL_MAP;
1151 
1152 		/* special case 1 vec, can be a fast path */
1153 		if (ret == 1) {
1154 			sr->buf = arg.iovs[0].iov_base;
1155 			sr->len = arg.iovs[0].iov_len;
1156 			goto map_ubuf;
1157 		}
1158 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
1159 				arg.out_len);
1160 	} else {
1161 		size_t len = sel->val;
1162 
1163 		*sel = io_buffer_select(req, &len, sr->buf_group, issue_flags);
1164 		if (!sel->addr)
1165 			return -ENOBUFS;
1166 		sr->buf = sel->addr;
1167 		sr->len = len;
1168 map_ubuf:
1169 		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
1170 				  &kmsg->msg.msg_iter);
1171 		if (unlikely(ret))
1172 			return ret;
1173 	}
1174 
1175 	return 0;
1176 }
1177 
1178 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
1179 {
1180 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1181 	struct io_async_msghdr *kmsg = req->async_data;
1182 	struct io_br_sel sel;
1183 	struct socket *sock;
1184 	unsigned flags;
1185 	int ret, min_ret = 0;
1186 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1187 	bool mshot_finished;
1188 
1189 	if (!(req->flags & REQ_F_POLLED) &&
1190 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1191 		return -EAGAIN;
1192 
1193 	sock = sock_from_file(req->file);
1194 	if (unlikely(!sock))
1195 		return -ENOTSOCK;
1196 
1197 	flags = sr->msg_flags;
1198 	if (force_nonblock)
1199 		flags |= MSG_DONTWAIT;
1200 
1201 retry_multishot:
1202 	sel.buf_list = NULL;
1203 	if (io_do_buffer_select(req)) {
1204 		sel.val = sr->len;
1205 		ret = io_recv_buf_select(req, kmsg, &sel, issue_flags);
1206 		if (unlikely(ret < 0)) {
1207 			kmsg->msg.msg_inq = -1;
1208 			goto out_free;
1209 		}
1210 		sr->buf = NULL;
1211 	}
1212 
1213 	kmsg->msg.msg_flags = 0;
1214 	kmsg->msg.msg_inq = -1;
1215 
1216 	if (flags & MSG_WAITALL)
1217 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1218 
1219 	ret = sock_recvmsg(sock, &kmsg->msg, flags);
1220 	if (ret < min_ret) {
1221 		if (ret == -EAGAIN && force_nonblock) {
1222 			io_kbuf_recycle(req, sel.buf_list, issue_flags);
1223 			return IOU_RETRY;
1224 		}
1225 		if (ret > 0 && io_net_retry(sock, flags)) {
1226 			sr->len -= ret;
1227 			sr->buf += ret;
1228 			sr->done_io += ret;
1229 			return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret);
1230 		}
1231 		if (ret == -ERESTARTSYS)
1232 			ret = -EINTR;
1233 		req_set_fail(req);
1234 	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
1235 out_free:
1236 		req_set_fail(req);
1237 	}
1238 
1239 	mshot_finished = ret <= 0;
1240 	if (ret > 0)
1241 		ret += sr->done_io;
1242 	else if (sr->done_io)
1243 		ret = sr->done_io;
1244 	else
1245 		io_kbuf_recycle(req, sel.buf_list, issue_flags);
1246 
1247 	sel.val = ret;
1248 	if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags))
1249 		goto retry_multishot;
1250 
1251 	return sel.val;
1252 }
1253 
1254 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1255 {
1256 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1257 	unsigned ifq_idx;
1258 
1259 	if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
1260 		return -EINVAL;
1261 
1262 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
1263 	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
1264 	if (!zc->ifq)
1265 		return -EINVAL;
1266 
1267 	zc->len = READ_ONCE(sqe->len);
1268 	zc->flags = READ_ONCE(sqe->ioprio);
1269 	if (READ_ONCE(sqe->msg_flags))
1270 		return -EINVAL;
1271 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
1272 		return -EINVAL;
1273 	/* multishot required */
1274 	if (!(zc->flags & IORING_RECV_MULTISHOT))
1275 		return -EINVAL;
1276 	/* All data completions are posted as aux CQEs. */
1277 	req->flags |= REQ_F_APOLL_MULTISHOT;
1278 
1279 	return 0;
1280 }
1281 
1282 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
1283 {
1284 	struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
1285 	struct socket *sock;
1286 	unsigned int len;
1287 	int ret;
1288 
1289 	if (!(req->flags & REQ_F_POLLED) &&
1290 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1291 		return -EAGAIN;
1292 
1293 	sock = sock_from_file(req->file);
1294 	if (unlikely(!sock))
1295 		return -ENOTSOCK;
1296 
1297 	len = zc->len;
1298 	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
1299 	if (len && zc->len == 0) {
1300 		io_req_set_res(req, 0, 0);
1301 
1302 		return IOU_COMPLETE;
1303 	}
1304 	if (unlikely(ret <= 0) && ret != -EAGAIN) {
1305 		if (ret == -ERESTARTSYS)
1306 			ret = -EINTR;
1307 		if (ret == IOU_REQUEUE)
1308 			return IOU_REQUEUE;
1309 
1310 		req_set_fail(req);
1311 		io_req_set_res(req, ret, 0);
1312 		return IOU_COMPLETE;
1313 	}
1314 	return IOU_RETRY;
1315 }
1316 
1317 void io_send_zc_cleanup(struct io_kiocb *req)
1318 {
1319 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1320 	struct io_async_msghdr *io = req->async_data;
1321 
1322 	if (req_has_async_data(req))
1323 		io_netmsg_iovec_free(io);
1324 	if (zc->notif) {
1325 		io_notif_flush(zc->notif);
1326 		zc->notif = NULL;
1327 	}
1328 }
1329 
1330 #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1331 #define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \
1332 				IORING_SEND_VECTORIZED)
1333 
1334 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1335 {
1336 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1337 	struct io_ring_ctx *ctx = req->ctx;
1338 	struct io_async_msghdr *iomsg;
1339 	struct io_kiocb *notif;
1340 	u64 user_data;
1341 	int ret;
1342 
1343 	zc->done_io = 0;
1344 
1345 	if (unlikely(READ_ONCE(sqe->__pad2[0])))
1346 		return -EINVAL;
1347 	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1348 	if (req->flags & REQ_F_CQE_SKIP)
1349 		return -EINVAL;
1350 
1351 	notif = zc->notif = io_alloc_notif(ctx);
1352 	if (!notif)
1353 		return -ENOMEM;
1354 	user_data = READ_ONCE(sqe->addr3);
1355 	if (!user_data)
1356 		user_data = req->cqe.user_data;
1357 
1358 	notif->cqe.user_data = user_data;
1359 	notif->cqe.res = 0;
1360 	notif->cqe.flags = IORING_CQE_F_NOTIF;
1361 	req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY;
1362 
1363 	zc->flags = READ_ONCE(sqe->ioprio);
1364 	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1365 		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1366 			return -EINVAL;
1367 		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1368 			struct io_notif_data *nd = io_notif_to_data(notif);
1369 
1370 			nd->zc_report = true;
1371 			nd->zc_used = false;
1372 			nd->zc_copied = false;
1373 		}
1374 	}
1375 
1376 	zc->len = READ_ONCE(sqe->len);
1377 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
1378 	req->buf_index = READ_ONCE(sqe->buf_index);
1379 	if (zc->msg_flags & MSG_DONTWAIT)
1380 		req->flags |= REQ_F_NOWAIT;
1381 
1382 	if (io_is_compat(ctx))
1383 		zc->msg_flags |= MSG_CMSG_COMPAT;
1384 
1385 	iomsg = io_msg_alloc_async(req);
1386 	if (unlikely(!iomsg))
1387 		return -ENOMEM;
1388 
1389 	if (req->opcode == IORING_OP_SEND_ZC) {
1390 		ret = io_send_setup(req, sqe);
1391 	} else {
1392 		if (unlikely(sqe->addr2 || sqe->file_index))
1393 			return -EINVAL;
1394 		ret = io_sendmsg_setup(req, sqe);
1395 	}
1396 	if (unlikely(ret))
1397 		return ret;
1398 
1399 	if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) {
1400 		iomsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1401 		return io_notif_account_mem(zc->notif, iomsg->msg.msg_iter.count);
1402 	}
1403 	iomsg->msg.sg_from_iter = io_sg_from_iter;
1404 	return 0;
1405 }
1406 
1407 static int io_sg_from_iter_iovec(struct sk_buff *skb,
1408 				 struct iov_iter *from, size_t length)
1409 {
1410 	skb_zcopy_downgrade_managed(skb);
1411 	return zerocopy_fill_skb_from_iter(skb, from, length);
1412 }
1413 
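/*
 * Fill skb frags straight from the bvec iterator for zerocopy sends with
 * managed frags (SKBFL_MANAGED_FRAG_REFS), i.e. without taking page
 * references. Falls back to zerocopy_fill_skb_from_iter() if the skb already
 * has unmanaged frags.
 */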
1414 static int io_sg_from_iter(struct sk_buff *skb,
1415 			   struct iov_iter *from, size_t length)
1416 {
1417 	struct skb_shared_info *shinfo = skb_shinfo(skb);
1418 	int frag = shinfo->nr_frags;
1419 	int ret = 0;
1420 	struct bvec_iter bi;
1421 	ssize_t copied = 0;
1422 	unsigned long truesize = 0;
1423 
1424 	if (!frag)
1425 		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1426 	else if (unlikely(!skb_zcopy_managed(skb)))
1427 		return zerocopy_fill_skb_from_iter(skb, from, length);
1428 
1429 	bi.bi_size = min(from->count, length);
1430 	bi.bi_bvec_done = from->iov_offset;
1431 	bi.bi_idx = 0;
1432 
1433 	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1434 		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1435 
1436 		copied += v.bv_len;
1437 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1438 		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1439 					   v.bv_offset, v.bv_len);
1440 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1441 	}
1442 	if (bi.bi_size)
1443 		ret = -EMSGSIZE;
1444 
1445 	shinfo->nr_frags = frag;
1446 	from->bvec += bi.bi_idx;
1447 	from->nr_segs -= bi.bi_idx;
1448 	from->count -= copied;
1449 	from->iov_offset = bi.bi_bvec_done;
1450 
1451 	skb->data_len += copied;
1452 	skb->len += copied;
1453 	skb->truesize += truesize;
1454 	return ret;
1455 }
1456 
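/*
 * Resolve the registered buffer for a zerocopy send at issue time, either as
 * a single fixed buffer or, for vectorized sends, as a registered iovec
 * mapped through the notification request.
 */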
1457 static int io_send_zc_import(struct io_kiocb *req,
1458 			     struct io_async_msghdr *kmsg,
1459 			     unsigned int issue_flags)
1460 {
1461 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1462 	struct io_kiocb *notif = sr->notif;
1463 	int ret;
1464 
1465 	WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF));
1466 
1467 	notif->buf_index = req->buf_index;
1468 
1469 	if (!(sr->flags & IORING_SEND_VECTORIZED)) {
1470 		ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter,
1471 					(u64)(uintptr_t)sr->buf, sr->len,
1472 					ITER_SOURCE, issue_flags);
1473 	} else {
1474 		unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
1475 
1476 		ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter,
1477 					notif, &kmsg->vec, uvec_segs,
1478 					issue_flags);
1479 	}
1480 
1481 	if (unlikely(ret))
1482 		return ret;
1483 	req->flags &= ~REQ_F_IMPORT_BUFFER;
1484 	return 0;
1485 }
1486 
1487 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1488 {
1489 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1490 	struct io_async_msghdr *kmsg = req->async_data;
1491 	struct socket *sock;
1492 	unsigned msg_flags;
1493 	int ret, min_ret = 0;
1494 
1495 	sock = sock_from_file(req->file);
1496 	if (unlikely(!sock))
1497 		return -ENOTSOCK;
1498 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1499 		return -EOPNOTSUPP;
1500 	if (!(req->flags & REQ_F_POLLED) &&
1501 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1502 		return -EAGAIN;
1503 
1504 	if (req->flags & REQ_F_IMPORT_BUFFER) {
1505 		ret = io_send_zc_import(req, kmsg, issue_flags);
1506 		if (unlikely(ret))
1507 			return ret;
1508 	}
1509 
1510 	msg_flags = sr->msg_flags;
1511 	if (issue_flags & IO_URING_F_NONBLOCK)
1512 		msg_flags |= MSG_DONTWAIT;
1513 	if (msg_flags & MSG_WAITALL)
1514 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1515 
1516 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1517 
1518 	if (req->opcode == IORING_OP_SEND_ZC) {
1519 		msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1520 		kmsg->msg.msg_flags = msg_flags;
1521 		ret = sock_sendmsg(sock, &kmsg->msg);
1522 	} else {
1523 		kmsg->msg.msg_control_user = sr->msg_control;
1524 		ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags);
1525 	}
1526 
1527 	if (unlikely(ret < min_ret)) {
1528 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1529 			return -EAGAIN;
1530 
1531 		if (ret > 0 && io_net_retry(sock, sr->msg_flags)) {
1532 			sr->done_io += ret;
1533 			return -EAGAIN;
1534 		}
1535 		if (ret == -ERESTARTSYS)
1536 			ret = -EINTR;
1537 		req_set_fail(req);
1538 	}
1539 
1540 	if (ret >= 0)
1541 		ret += sr->done_io;
1542 	else if (sr->done_io)
1543 		ret = sr->done_io;
1544 
1545 	/*
1546 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1547 	 * flushing notif to io_send_zc_cleanup()
1548 	 */
1549 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1550 		io_notif_flush(sr->notif);
1551 		sr->notif = NULL;
1552 		io_req_msg_cleanup(req, 0);
1553 	}
1554 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1555 	return IOU_COMPLETE;
1556 }
1557 
1558 void io_sendrecv_fail(struct io_kiocb *req)
1559 {
1560 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1561 
1562 	if (sr->done_io)
1563 		req->cqe.res = sr->done_io;
1564 
1565 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1566 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1567 		req->cqe.flags |= IORING_CQE_F_MORE;
1568 }
1569 
1570 #define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
1571 			 IORING_ACCEPT_POLL_FIRST)
1572 
1573 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1574 {
1575 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1576 
1577 	if (sqe->len || sqe->buf_index)
1578 		return -EINVAL;
1579 
1580 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1581 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1582 	accept->flags = READ_ONCE(sqe->accept_flags);
1583 	accept->nofile = rlimit(RLIMIT_NOFILE);
1584 	accept->iou_flags = READ_ONCE(sqe->ioprio);
1585 	if (accept->iou_flags & ~ACCEPT_FLAGS)
1586 		return -EINVAL;
1587 
1588 	accept->file_slot = READ_ONCE(sqe->file_index);
1589 	if (accept->file_slot) {
1590 		if (accept->flags & SOCK_CLOEXEC)
1591 			return -EINVAL;
1592 		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
1593 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1594 			return -EINVAL;
1595 	}
1596 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1597 		return -EINVAL;
1598 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1599 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1600 	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
1601 		req->flags |= REQ_F_APOLL_MULTISHOT;
1602 	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
1603 		req->flags |= REQ_F_NOWAIT;
1604 	return 0;
1605 }
1606 
1607 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1608 {
1609 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1610 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1611 	bool fixed = !!accept->file_slot;
1612 	struct proto_accept_arg arg = {
1613 		.flags = force_nonblock ? O_NONBLOCK : 0,
1614 	};
1615 	struct file *file;
1616 	unsigned cflags;
1617 	int ret, fd;
1618 
1619 	if (!(req->flags & REQ_F_POLLED) &&
1620 	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
1621 		return -EAGAIN;
1622 
1623 retry:
1624 	if (!fixed) {
1625 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1626 		if (unlikely(fd < 0))
1627 			return fd;
1628 	}
1629 	arg.err = 0;
1630 	arg.is_empty = -1;
1631 	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
1632 			 accept->flags);
1633 	if (IS_ERR(file)) {
1634 		if (!fixed)
1635 			put_unused_fd(fd);
1636 		ret = PTR_ERR(file);
1637 		if (ret == -EAGAIN && force_nonblock &&
1638 		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
1639 			return IOU_RETRY;
1640 
1641 		if (ret == -ERESTARTSYS)
1642 			ret = -EINTR;
1643 	} else if (!fixed) {
1644 		fd_install(fd, file);
1645 		ret = fd;
1646 	} else {
1647 		ret = io_fixed_fd_install(req, issue_flags, file,
1648 						accept->file_slot);
1649 	}
1650 
1651 	cflags = 0;
1652 	if (!arg.is_empty)
1653 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
1654 
1655 	if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) &&
1656 	    io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
1657 		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
1658 			goto retry;
1659 		return IOU_RETRY;
1660 	}
1661 
1662 	io_req_set_res(req, ret, cflags);
1663 	if (ret < 0)
1664 		req_set_fail(req);
1665 	return IOU_COMPLETE;
1666 }
1667 
1668 void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
1669 {
1670 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1671 
1672 	bctx->socket.family = sock->domain;
1673 	bctx->socket.type = sock->type;
1674 	bctx->socket.protocol = sock->protocol;
1675 }
1676 
1677 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1678 {
1679 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1680 
1681 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1682 		return -EINVAL;
1683 
1684 	sock->domain = READ_ONCE(sqe->fd);
1685 	sock->type = READ_ONCE(sqe->off);
1686 	sock->protocol = READ_ONCE(sqe->len);
1687 	sock->file_slot = READ_ONCE(sqe->file_index);
1688 	sock->nofile = rlimit(RLIMIT_NOFILE);
1689 
1690 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1691 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1692 		return -EINVAL;
1693 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1694 		return -EINVAL;
1695 	return 0;
1696 }
1697 
1698 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1699 {
1700 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1701 	bool fixed = !!sock->file_slot;
1702 	struct file *file;
1703 	int ret, fd;
1704 
1705 	if (!fixed) {
1706 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1707 		if (unlikely(fd < 0))
1708 			return fd;
1709 	}
1710 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1711 	if (IS_ERR(file)) {
1712 		if (!fixed)
1713 			put_unused_fd(fd);
1714 		ret = PTR_ERR(file);
1715 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1716 			return -EAGAIN;
1717 		if (ret == -ERESTARTSYS)
1718 			ret = -EINTR;
1719 		req_set_fail(req);
1720 	} else if (!fixed) {
1721 		fd_install(fd, file);
1722 		ret = fd;
1723 	} else {
1724 		ret = io_fixed_fd_install(req, issue_flags, file,
1725 					    sock->file_slot);
1726 	}
1727 	io_req_set_res(req, ret, 0);
1728 	return IOU_COMPLETE;
1729 }
1730 
1731 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1732 {
1733 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1734 	struct io_async_msghdr *io;
1735 
1736 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1737 		return -EINVAL;
1738 
1739 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1740 	conn->addr_len =  READ_ONCE(sqe->addr2);
1741 	conn->in_progress = conn->seen_econnaborted = false;
1742 
1743 	io = io_msg_alloc_async(req);
1744 	if (unlikely(!io))
1745 		return -ENOMEM;
1746 
1747 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr);
1748 }
1749 
1750 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1751 {
1752 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1753 	struct io_async_msghdr *io = req->async_data;
1754 	unsigned file_flags;
1755 	int ret;
1756 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1757 
1758 	if (connect->in_progress) {
1759 		struct poll_table_struct pt = { ._key = EPOLLERR };
1760 
1761 		if (vfs_poll(req->file, &pt) & EPOLLERR)
1762 			goto get_sock_err;
1763 	}
1764 
1765 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1766 
1767 	ret = __sys_connect_file(req->file, &io->addr, connect->addr_len,
1768 				 file_flags);
1769 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1770 	    && force_nonblock) {
1771 		if (ret == -EINPROGRESS) {
1772 			connect->in_progress = true;
1773 		} else if (ret == -ECONNABORTED) {
1774 			if (connect->seen_econnaborted)
1775 				goto out;
1776 			connect->seen_econnaborted = true;
1777 		}
1778 		return -EAGAIN;
1779 	}
1780 	if (connect->in_progress) {
1781 		/*
1782 		 * At least bluetooth will return -EBADFD on a re-connect
1783 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1784 		 * which means the previous result is good. For both of these,
1785 		 * grab the sock_error() and use that for the completion.
1786 		 */
1787 		if (ret == -EBADFD || ret == -EISCONN) {
1788 get_sock_err:
1789 			ret = sock_error(sock_from_file(req->file)->sk);
1790 		}
1791 	}
1792 	if (ret == -ERESTARTSYS)
1793 		ret = -EINTR;
1794 out:
1795 	if (ret < 0)
1796 		req_set_fail(req);
1797 	io_req_msg_cleanup(req, issue_flags);
1798 	io_req_set_res(req, ret, 0);
1799 	return IOU_COMPLETE;
1800 }
1801 
1802 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1803 {
1804 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1805 	struct sockaddr __user *uaddr;
1806 	struct io_async_msghdr *io;
1807 
1808 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1809 		return -EINVAL;
1810 
1811 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1812 	bind->addr_len =  READ_ONCE(sqe->addr2);
1813 
1814 	io = io_msg_alloc_async(req);
1815 	if (unlikely(!io))
1816 		return -ENOMEM;
1817 	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
1818 }
1819 
1820 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
1821 {
1822 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
1823 	struct io_async_msghdr *io = req->async_data;
1824 	struct socket *sock;
1825 	int ret;
1826 
1827 	sock = sock_from_file(req->file);
1828 	if (unlikely(!sock))
1829 		return -ENOTSOCK;
1830 
1831 	ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
1832 	if (ret < 0)
1833 		req_set_fail(req);
1834 	io_req_set_res(req, ret, 0);
1835 	return 0;
1836 }
1837 
1838 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1839 {
1840 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1841 
1842 	if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
1843 		return -EINVAL;
1844 
1845 	listen->backlog = READ_ONCE(sqe->len);
1846 	return 0;
1847 }
1848 
1849 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
1850 {
1851 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
1852 	struct socket *sock;
1853 	int ret;
1854 
1855 	sock = sock_from_file(req->file);
1856 	if (unlikely(!sock))
1857 		return -ENOTSOCK;
1858 
1859 	ret = __sys_listen_socket(sock, listen->backlog);
1860 	if (ret < 0)
1861 		req_set_fail(req);
1862 	io_req_set_res(req, ret, 0);
1863 	return 0;
1864 }
1865 
1866 void io_netmsg_cache_free(const void *entry)
1867 {
1868 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
1869 
1870 	io_vec_free(&kmsg->vec);
1871 	kfree(kmsg);
1872 }
1873