// SPDX-License-Identifier: GPL-2.0

#include "blk-rq-qos.h"

__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
	unsigned int cur = atomic_read(v);

	do {
		if (cur >= below)
			return false;
	} while (!atomic_try_cmpxchg(v, &cur, cur + 1));

	return true;
}
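/*
 * Illustrative sketch (not part of this file): with 'below' == 2,
 * atomic_inc_below() admits at most two increments:
 *
 *	atomic_t count = ATOMIC_INIT(0);
 *
 *	atomic_inc_below(&count, 2);	// true,  count == 1
 *	atomic_inc_below(&count, 2);	// true,  count == 2
 *	atomic_inc_below(&count, 2);	// false, count stays at 2
 *
 * The cmpxchg loop only retries if another CPU changed the counter between
 * the read and the update, so the check and the increment happen atomically.
 */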

bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
{
	return atomic_inc_below(&rq_wait->inflight, limit);
}

void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->cleanup)
			rqos->ops->cleanup(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->done)
			rqos->ops->done(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->issue)
			rqos->ops->issue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->requeue)
			rqos->ops->requeue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->throttle)
			rqos->ops->throttle(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->track)
			rqos->ops->track(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->merge)
			rqos->ops->merge(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->done_bio)
			rqos->ops->done_bio(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
{
	do {
		if (rqos->ops->queue_depth_changed)
			rqos->ops->queue_depth_changed(rqos);
		rqos = rqos->next;
	} while (rqos);
}

/*
 * Return true, if we can't increase the depth further by scaling
 */
bool rq_depth_calc_max_depth(struct rq_depth *rqd)
{
	unsigned int depth;
	bool ret = false;

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rqd->queue_depth == 1) {
		if (rqd->scale_step > 0)
			rqd->max_depth = 1;
		else {
			rqd->max_depth = 2;
			ret = true;
		}
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
		depth = min_t(unsigned int, rqd->default_depth,
			      rqd->queue_depth);
		if (rqd->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
		else if (rqd->scale_step < 0) {
			unsigned int maxd = 3 * rqd->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rqd->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		rqd->max_depth = depth;
	}

	return ret;
}
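/*
 * Worked example (illustrative only): with queue_depth == 64 and
 * default_depth == 64:
 *
 *	scale_step ==  0: depth = min(64, 64)         = 64
 *	scale_step ==  2: depth = 1 + ((64 - 1) >> 2) = 16
 *	scale_step == -1: depth = 1 + ((64 - 1) << 1) = 127, clamped to
 *			  3 * 64 / 4 = 48, and the function returns true,
 *			  i.e. the depth can't be increased any further.
 */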

/* Returns true on success and false if scaling up wasn't possible */
bool rq_depth_scale_up(struct rq_depth *rqd)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rqd->scaled_max)
		return false;

	rqd->scale_step--;

	rqd->scaled_max = rq_depth_calc_max_depth(rqd);
	return true;
}

/*
 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
 * had a latency violation. Returns true on success and returns false if
 * scaling down wasn't possible.
 */
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rqd->max_depth == 1)
		return false;

	if (rqd->scale_step < 0 && hard_throttle)
		rqd->scale_step = 0;
	else
		rqd->scale_step++;

	rqd->scaled_max = false;
	rq_depth_calc_max_depth(rqd);
	return true;
}
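/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * latency-based policy drives the two helpers roughly like
 *
 *	if (latency_violated)
 *		rq_depth_scale_down(&rqd, true);
 *	else if (latency_too_high)
 *		rq_depth_scale_down(&rqd, false);
 *	else if (write_only_workload)
 *		rq_depth_scale_up(&rqd);
 *
 * latency_violated etc. are made-up condition names; see blk-wbt.c for a
 * real user of rq_depth_scale_up()/rq_depth_scale_down().
 */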

struct rq_qos_wait_data {
	struct wait_queue_entry wq;
	struct rq_wait *rqw;
	acquire_inflight_cb_t *cb;
	void *private_data;
	bool got_token;
};

static int rq_qos_wake_function(struct wait_queue_entry *curr,
				unsigned int mode, int wake_flags, void *key)
{
	struct rq_qos_wait_data *data = container_of(curr,
						     struct rq_qos_wait_data,
						     wq);

	/*
	 * If we fail to get a budget, return -1 to interrupt the wake up loop
	 * in __wake_up_common.
	 */
	if (!data->cb(data->rqw, data->private_data))
		return -1;

	data->got_token = true;
	/*
	 * autoremove_wake_function() removes the wait entry only when it
	 * actually changed the task state. We want the wait always removed.
	 * Remove explicitly and use default_wake_function().
	 */
	default_wake_function(curr, mode, wake_flags, key);
	/*
	 * Note that the order of operations is important as finish_wait()
	 * tests whether @curr is removed without grabbing the lock. This
	 * should be the last thing to do to make sure we will not have a
	 * UAF access to @data. And the semantics of memory barrier in it
	 * also make sure the waiter will see the latest @data->got_token
	 * once list_empty_careful() in finish_wait() returns true.
	 */
	list_del_init_careful(&curr->entry);
	return 1;
}

/**
 * rq_qos_wait - throttle on a rqw if we need to
 * @rqw: rqw to throttle on
 * @private_data: caller provided specific data
 * @acquire_inflight_cb: inc the rqw->inflight counter if we can
 * @cleanup_cb: the callback to cleanup in case we race with a waker
 *
 * This provides a uniform place for the rq_qos users to do their throttling.
 * Since you can end up with a lot of things sleeping at once, this manages the
 * waking up based on the resources available. The acquire_inflight_cb should
 * inc the rqw->inflight if we have the ability to do so, or return false if not
 * and then we will sleep until the room becomes available.
 *
 * cleanup_cb is in case that we race with a waker and need to cleanup the
 * inflight count accordingly.
 */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
		 acquire_inflight_cb_t *acquire_inflight_cb,
		 cleanup_cb_t *cleanup_cb)
{
	struct rq_qos_wait_data data = {
		.rqw = rqw,
		.cb = acquire_inflight_cb,
		.private_data = private_data,
		.got_token = false,
	};
	bool first_waiter;

	/*
	 * If there are no waiters in the waiting queue, try to increase the
	 * inflight counter if we can. Otherwise, prepare for adding ourselves
	 * to the waiting queue.
	 */
	if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data))
		return;

	init_wait_func(&data.wq, rq_qos_wake_function);
	first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq,
						 TASK_UNINTERRUPTIBLE);
	/*
	 * Make sure there is at least one inflight process; otherwise, waiters
	 * will never be woken up. Since there may be no inflight process before
	 * adding ourselves to the waiting queue above, we need to try to
	 * increase the inflight counter for ourselves. And it is sufficient to
	 * guarantee that at least the first waiter to enter the waiting queue
	 * will re-check the waiting condition before going to sleep, thus
	 * ensuring forward progress.
	 */
	if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) {
		finish_wait(&rqw->wait, &data.wq);
		/*
		 * We raced with rq_qos_wake_function() getting a token,
		 * which means we now have two. Put our local token
		 * and wake anyone else potentially waiting for one.
		 *
		 * Enough memory barrier in list_empty_careful() in
		 * finish_wait() is paired with list_del_init_careful()
		 * in rq_qos_wake_function() to make sure we will see
		 * the latest @data->got_token.
		 */
		if (data.got_token)
			cleanup_cb(rqw, private_data);
		return;
	}

	/* we are now relying on the waker to increase our inflight counter. */
	do {
		if (data.got_token)
			break;
		io_schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	} while (1);
	finish_wait(&rqw->wait, &data.wq);
}
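/*
 * Illustrative sketch (hypothetical policy, not part of this file): a typical
 * rq_qos user pairs rq_qos_wait() with an acquire callback that tries to take
 * an inflight slot and a cleanup callback that puts a slot back if the waiter
 * raced with a waker and ended up with two:
 *
 *	static bool foo_acquire_inflight(struct rq_wait *rqw, void *private_data)
 *	{
 *		return rq_wait_inc_below(rqw, foo_depth_limit(private_data));
 *	}
 *
 *	static void foo_cleanup(struct rq_wait *rqw, void *private_data)
 *	{
 *		atomic_dec(&rqw->inflight);
 *	}
 *
 *	...
 *	rq_qos_wait(rqw, private_data, foo_acquire_inflight, foo_cleanup);
 *
 * foo_* are made-up names; blk-wbt.c and blk-iolatency.c contain the real
 * callers.
 */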

void rq_qos_exit(struct request_queue *q)
{
	mutex_lock(&q->rq_qos_mutex);
	while (q->rq_qos) {
		struct rq_qos *rqos = q->rq_qos;
		q->rq_qos = rqos->next;
		rqos->ops->exit(rqos);
		static_branch_dec(&block_rq_qos);
	}
	mutex_unlock(&q->rq_qos_mutex);
}

int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
	       const struct rq_qos_ops *ops)
{
	struct request_queue *q = disk->queue;
	unsigned int memflags;

	lockdep_assert_held(&q->rq_qos_mutex);

	rqos->disk = disk;
	rqos->id = id;
	rqos->ops = ops;

	/*
	 * No IO can be in-flight when adding rqos, so freeze queue, which
	 * is fine since we only support rq_qos for blk-mq queue.
	 */
	memflags = blk_mq_freeze_queue(q);

	if (rq_qos_id(q, rqos->id))
		goto ebusy;
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;
	static_branch_inc(&block_rq_qos);

	blk_mq_unfreeze_queue(q, memflags);

	if (rqos->ops->debugfs_attrs) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_rqos(rqos);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;
ebusy:
	blk_mq_unfreeze_queue(q, memflags);
	return -EBUSY;
}
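/*
 * Illustrative sketch (hypothetical policy, not part of this file): a policy
 * registers itself while holding q->rq_qos_mutex, e.g.
 *
 *	mutex_lock(&disk->queue->rq_qos_mutex);
 *	ret = rq_qos_add(&foo->rqos, disk, RQ_QOS_WBT, &foo_rqos_ops);
 *	mutex_unlock(&disk->queue->rq_qos_mutex);
 *	if (ret)
 *		goto err_free;
 *
 * RQ_QOS_WBT is a real enum rq_qos_id value; 'foo' and foo_rqos_ops stand in
 * for the policy's own state and ops table (compare wbt_init() in blk-wbt.c).
 */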

void rq_qos_del(struct rq_qos *rqos)
{
	struct request_queue *q = rqos->disk->queue;
	struct rq_qos **cur;
	unsigned int memflags;

	lockdep_assert_held(&q->rq_qos_mutex);

	memflags = blk_mq_freeze_queue(q);
	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
		if (*cur == rqos) {
			*cur = rqos->next;
			break;
		}
	}
	blk_mq_unfreeze_queue(q, memflags);

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_rqos(rqos);
	mutex_unlock(&q->debugfs_mutex);
}