// SPDX-License-Identifier: GPL-2.0
/*
 * Waiting for completion events
 */
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/io_uring.h>

#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "napi.h"
#include "wait.h"

io_wake_function(struct wait_queue_entry * curr,unsigned int mode,int wake_flags,void * key)17*0105b056SJens Axboe static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
18*0105b056SJens Axboe int wake_flags, void *key)
19*0105b056SJens Axboe {
20*0105b056SJens Axboe struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
21*0105b056SJens Axboe
22*0105b056SJens Axboe /*
23*0105b056SJens Axboe * Cannot safely flush overflowed CQEs from here, ensure we wake up
24*0105b056SJens Axboe * the task, and the next invocation will do it.
25*0105b056SJens Axboe */
26*0105b056SJens Axboe if (io_should_wake(iowq) || io_has_work(iowq->ctx))
27*0105b056SJens Axboe return autoremove_wake_function(curr, mode, wake_flags, key);
28*0105b056SJens Axboe return -1;
29*0105b056SJens Axboe }
30*0105b056SJens Axboe
io_run_task_work_sig(struct io_ring_ctx * ctx)31*0105b056SJens Axboe int io_run_task_work_sig(struct io_ring_ctx *ctx)
32*0105b056SJens Axboe {
33*0105b056SJens Axboe if (io_local_work_pending(ctx)) {
34*0105b056SJens Axboe __set_current_state(TASK_RUNNING);
35*0105b056SJens Axboe if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0)
36*0105b056SJens Axboe return 0;
37*0105b056SJens Axboe }
38*0105b056SJens Axboe if (io_run_task_work() > 0)
39*0105b056SJens Axboe return 0;
40*0105b056SJens Axboe if (task_sigpending(current))
41*0105b056SJens Axboe return -EINTR;
42*0105b056SJens Axboe return 0;
43*0105b056SJens Axboe }
44*0105b056SJens Axboe
current_pending_io(void)45*0105b056SJens Axboe static bool current_pending_io(void)
46*0105b056SJens Axboe {
47*0105b056SJens Axboe struct io_uring_task *tctx = current->io_uring;
48*0105b056SJens Axboe
49*0105b056SJens Axboe if (!tctx)
50*0105b056SJens Axboe return false;
51*0105b056SJens Axboe return percpu_counter_read_positive(&tctx->inflight);
52*0105b056SJens Axboe }
53*0105b056SJens Axboe
/*
 * hrtimer expiry callback for the (general) wait timeout: mark that the
 * timeout fired and wake the sleeping waiter.
 */
static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
{
	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);

	/* paired with the READ_ONCE()s in io_cqring_schedule_timeout() */
	WRITE_ONCE(iowq->hit_timeout, 1);
	/* min timeout phase (if any) is over once we fire */
	iowq->min_timeout = 0;
	/* wq.private holds the waiting task, set up in io_cqring_wait() */
	wake_up_process(iowq->wq.private);
	return HRTIMER_NORESTART;
}
63*0105b056SJens Axboe
/*
 * Doing min_timeout portion. If we saw any timeouts, events, or have work,
 * wake up. If not, and we have a normal timeout, switch to that and keep
 * sleeping.
 */
static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
{
	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
	struct io_ring_ctx *ctx = iowq->ctx;

	/* no general timeout, or shorter (or equal), we are done */
	if (iowq->timeout == KTIME_MAX ||
	    ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
		goto out_wake;
	/* work we may need to run, wake function will see if we need to wake */
	if (io_has_work(ctx))
		goto out_wake;
	/* got events since we started waiting, min timeout is done */
	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
		goto out_wake;
	/* if we have any events and min timeout expired, we're done */
	if (io_cqring_events(ctx))
		goto out_wake;

	/*
	 * If using deferred task_work running and application is waiting on
	 * more than one request, ensure we reset it now where we are switching
	 * to normal sleeps. Any request completion post min_wait should wake
	 * the task and return.
	 */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		/* wake on the very next completion from here on */
		atomic_set(&ctx->cq_wait_nr, 1);
		/* order cq_wait_nr store vs. the work_llist check below */
		smp_mb();
		/* recheck: work may have been queued before we lowered the bar */
		if (!llist_empty(&ctx->work_llist))
			goto out_wake;
	}

	/* any generated CQE posted past this time should wake us up */
	iowq->cq_tail = iowq->cq_min_tail;

	/* switch to the plain timeout callback and re-arm for the full wait */
	hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup);
	hrtimer_set_expires(timer, iowq->timeout);
	return HRTIMER_RESTART;
out_wake:
	/* delegate flagging hit_timeout + waking the task */
	return io_cqring_timer_wakeup(timer);
}
110*0105b056SJens Axboe
/*
 * Sleep with an hrtimer-based timeout. If a min_timeout is set, arm the
 * two-phase callback (min wait first, possibly extending to the full
 * timeout); otherwise arm the plain timeout callback directly.
 *
 * The timer lives on the caller's stack (inside iowq), so it must be
 * cancelled and destroyed before this function returns.
 *
 * Returns -ETIME if the timeout fired, 0 otherwise.
 */
static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
				      clockid_t clock_id, ktime_t start_time)
{
	ktime_t timeout;

	if (iowq->min_timeout) {
		/* min_timeout is relative; convert to absolute expiry */
		timeout = ktime_add_ns(iowq->min_timeout, start_time);
		hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id,
				       HRTIMER_MODE_ABS);
	} else {
		/* iowq->timeout is already absolute (see io_cqring_wait()) */
		timeout = iowq->timeout;
		hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id,
				       HRTIMER_MODE_ABS);
	}

	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);

	/* the timer may already have fired before we get to sleep */
	if (!READ_ONCE(iowq->hit_timeout))
		schedule();

	/* stack timer: must be fully quiesced before we return */
	hrtimer_cancel(&iowq->t);
	destroy_hrtimer_on_stack(&iowq->t);
	__set_current_state(TASK_RUNNING);

	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
}
138*0105b056SJens Axboe
/*
 * Actually go to sleep, either via a timed schedule (when any timeout is
 * set) or a plain schedule(). Returns the result of the timed sleep, or 0.
 */
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
				     struct io_wait_queue *iowq,
				     struct ext_arg *ext_arg,
				     ktime_t start_time)
{
	bool timed = iowq->timeout != KTIME_MAX || iowq->min_timeout;
	int ret;

	/*
	 * Mark us as being in io_wait if we have pending requests, so cpufreq
	 * can take into account that the task is waiting for IO - turns out
	 * to be important for low QD IO.
	 */
	if (ext_arg->iowait && current_pending_io())
		current->in_iowait = 1;

	if (timed) {
		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
	} else {
		schedule();
		ret = 0;
	}

	current->in_iowait = 0;
	return ret;
}
160*0105b056SJens Axboe
161*0105b056SJens Axboe /* If this returns > 0, the caller should retry */
io_cqring_wait_schedule(struct io_ring_ctx * ctx,struct io_wait_queue * iowq,struct ext_arg * ext_arg,ktime_t start_time)162*0105b056SJens Axboe static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
163*0105b056SJens Axboe struct io_wait_queue *iowq,
164*0105b056SJens Axboe struct ext_arg *ext_arg,
165*0105b056SJens Axboe ktime_t start_time)
166*0105b056SJens Axboe {
167*0105b056SJens Axboe if (unlikely(READ_ONCE(ctx->check_cq)))
168*0105b056SJens Axboe return 1;
169*0105b056SJens Axboe if (unlikely(io_local_work_pending(ctx)))
170*0105b056SJens Axboe return 1;
171*0105b056SJens Axboe if (unlikely(task_work_pending(current)))
172*0105b056SJens Axboe return 1;
173*0105b056SJens Axboe if (unlikely(task_sigpending(current)))
174*0105b056SJens Axboe return -EINTR;
175*0105b056SJens Axboe if (unlikely(io_should_wake(iowq)))
176*0105b056SJens Axboe return 0;
177*0105b056SJens Axboe
178*0105b056SJens Axboe return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
179*0105b056SJens Axboe }
180*0105b056SJens Axboe
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 *
 * Returns 0 once at least min_events CQEs are available (or the CQ is
 * non-empty on exit), otherwise a negative error (-EINTR on signal,
 * -ETIME on timeout, -EBADR on dropped CQEs, -EEXIST if task_work can't
 * be run from this context).
 */
int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
		   struct ext_arg *ext_arg)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	ktime_t start_time;
	int ret;

	/* can never wait for more events than the CQ ring holds */
	min_events = min_t(int, min_events, ctx->cq_entries);

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;
	/* flush any already-pending work before deciding whether to sleep */
	if (io_local_work_pending(ctx))
		io_run_local_work(ctx, min_events,
				  max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
	io_run_task_work();

	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
		io_cqring_do_overflow_flush(ctx);
	/* already satisfied without sleeping? */
	if (__io_cqring_events_user(ctx) >= min_events)
		return 0;

	/* set up the on-stack wait state used by wakeups and timers */
	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	/* CQ tail value at which io_should_wake() is satisfied */
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
	/* snapshot of the tail, for min-timeout progress detection */
	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	iowq.hit_timeout = 0;
	iowq.min_timeout = ext_arg->min_time;
	iowq.timeout = KTIME_MAX;
	start_time = io_get_time(ctx);

	if (ext_arg->ts_set) {
		iowq.timeout = timespec64_to_ktime(ext_arg->ts);
		/* relative timeout: convert to absolute against start_time */
		if (!(flags & IORING_ENTER_ABS_TIMER))
			iowq.timeout = ktime_add(iowq.timeout, start_time);
	}

	/* install the caller-provided sigmask for the duration of the wait */
	if (ext_arg->sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
						      ext_arg->argsz);
		else
#endif
			ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);

		if (ret)
			return ret;
	}

	io_napi_busy_loop(ctx, &iowq);

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		unsigned long check_cq;
		int nr_wait;

		/* if min timeout has been hit, don't reset wait count */
		if (!iowq.hit_timeout)
			nr_wait = (int) iowq.cq_tail -
					READ_ONCE(ctx->rings->cq.tail);
		else
			nr_wait = 1;

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
			/* publish how many completions we still need */
			atomic_set(&ctx->cq_wait_nr, nr_wait);
			set_current_state(TASK_INTERRUPTIBLE);
		} else {
			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
							TASK_INTERRUPTIBLE);
		}

		ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
		__set_current_state(TASK_RUNNING);
		/* no longer waiting: reset so posters don't try to wake us */
		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);

		/*
		 * Run task_work after scheduling and before io_should_wake().
		 * If we got woken because of task_work being processed, run it
		 * now rather than let the caller do another wait loop.
		 */
		if (io_local_work_pending(ctx))
			io_run_local_work(ctx, nr_wait, nr_wait);
		io_run_task_work();

		/*
		 * Non-local task_work will be run on exit to userspace, but
		 * if we're using DEFER_TASKRUN, then we could have waited
		 * with a timeout for a number of requests. If the timeout
		 * hits, we could have some requests ready to process. Ensure
		 * this break is _after_ we have run task_work, to avoid
		 * deferring running potentially pending requests until the
		 * next time we wait for events.
		 */
		if (ret < 0)
			break;

		check_cq = READ_ONCE(ctx->check_cq);
		if (unlikely(check_cq)) {
			/* let the caller flush overflows, retry */
			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
				io_cqring_do_overflow_flush(ctx);
			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
				ret = -EBADR;
				break;
			}
		}

		if (io_should_wake(&iowq)) {
			ret = 0;
			break;
		}
		cond_resched();
	} while (1);

	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		finish_wait(&ctx->cq_wait, &iowq.wq);
	/* keep the temporary sigmask unless we're returning -EINTR */
	restore_saved_sigmask_unless(ret == -EINTR);

	/* if the CQ is non-empty, report success regardless of ret */
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
309