// SPDX-License-Identifier: GPL-2.0

#include "blk-rq-qos.h"

__read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos);

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
	unsigned int cur = atomic_read(v);

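	/*
	 * On failure atomic_try_cmpxchg() updates 'cur' to the current value,
	 * so the limit check is simply retried against the fresh value.
	 */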
	do {
		if (cur >= below)
			return false;
	} while (!atomic_try_cmpxchg(v, &cur, cur + 1));

	return true;
}

bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
{
	return atomic_inc_below(&rq_wait->inflight, limit);
}

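/*
 * The __rq_qos_*() helpers below walk the queue's list of attached rq_qos
 * policies and invoke the matching optional ->ops hook on each of them.
 * They are normally reached through the inline rq_qos_*() wrappers in
 * blk-rq-qos.h, which only call in here once a policy is attached, roughly
 * (sketch of the wrapper pattern, see the header for the exact checks):
 *
 *	if (static_branch_unlikely(&block_rq_qos) && q->rq_qos)
 *		__rq_qos_done(q->rq_qos, rq);
 */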
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->cleanup)
			rqos->ops->cleanup(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->done)
			rqos->ops->done(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->issue)
			rqos->ops->issue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->requeue)
			rqos->ops->requeue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->throttle)
			rqos->ops->throttle(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->track)
			rqos->ops->track(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->merge)
			rqos->ops->merge(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->done_bio)
			rqos->ops->done_bio(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
{
	do {
		if (rqos->ops->queue_depth_changed)
			rqos->ops->queue_depth_changed(rqos);
		rqos = rqos->next;
	} while (rqos);
}

/*
 * Return true, if we can't increase the depth further by scaling
 */
bool rq_depth_calc_max_depth(struct rq_depth *rqd)
{
	unsigned int depth;
	bool ret = false;

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rqd->queue_depth == 1) {
		if (rqd->scale_step > 0)
			rqd->max_depth = 1;
		else {
			rqd->max_depth = 2;
			ret = true;
		}
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
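		/*
		 * Worked example with illustrative numbers: starting from a
		 * depth of 64, scale_step == 2 gives 1 + ((64 - 1) >> 2) == 16,
		 * while scale_step == -1 gives 1 + ((64 - 1) << 1) == 127,
		 * which is then clamped to 3/4 of the queue depth below.
		 */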
		depth = min_t(unsigned int, rqd->default_depth,
			      rqd->queue_depth);
		if (rqd->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
		else if (rqd->scale_step < 0) {
			unsigned int maxd = 3 * rqd->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rqd->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		rqd->max_depth = depth;
	}

	return ret;
}

/* Returns true on success and false if scaling up wasn't possible */
bool rq_depth_scale_up(struct rq_depth *rqd)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rqd->scaled_max)
		return false;

	rqd->scale_step--;

	rqd->scaled_max = rq_depth_calc_max_depth(rqd);
	return true;
}

/*
 * Scale the queue depth down. If 'hard_throttle' is set, do it quicker,
 * since we had a latency violation. Returns true on success and returns
 * false if scaling down wasn't possible.
 */
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rqd->max_depth == 1)
		return false;

	if (rqd->scale_step < 0 && hard_throttle)
		rqd->scale_step = 0;
	else
		rqd->scale_step++;

	rqd->scaled_max = false;
	rq_depth_calc_max_depth(rqd);
	return true;
}

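/*
 * Per-waiter state for rq_qos_wait().  It lives on the waiter's stack and
 * is linked into rqw->wait, so rq_qos_wake_function() must not touch it
 * after removing the wait queue entry (see the comments there).
 */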
struct rq_qos_wait_data {
	struct wait_queue_entry wq;
	struct rq_wait *rqw;
	acquire_inflight_cb_t *cb;
	void *private_data;
	bool got_token;
};

static int rq_qos_wake_function(struct wait_queue_entry *curr,
				unsigned int mode, int wake_flags, void *key)
{
	struct rq_qos_wait_data *data = container_of(curr,
						     struct rq_qos_wait_data,
						     wq);

	/*
	 * If we fail to get a budget, return -1 to interrupt the wake up loop
	 * in __wake_up_common.
	 */
	if (!data->cb(data->rqw, data->private_data))
		return -1;

	data->got_token = true;
	/*
	 * autoremove_wake_function() removes the wait entry only when it
	 * actually changed the task state. We want the wait always removed.
	 * Remove explicitly and use default_wake_function().
	 */
	default_wake_function(curr, mode, wake_flags, key);
	/*
	 * Note that the order of operations is important as finish_wait()
	 * tests whether @curr is removed without grabbing the lock. This
	 * should be the last thing to do to make sure we will not have a
	 * UAF access to @data. The memory barrier semantics in it also make
	 * sure the waiter will see the latest @data->got_token once
	 * list_empty_careful() in finish_wait() returns true.
	 */
	list_del_init_careful(&curr->entry);
	return 1;
}

/**
 * rq_qos_wait - throttle on a rqw if we need to
 * @rqw: rqw to throttle on
 * @private_data: caller provided specific data
 * @acquire_inflight_cb: inc the rqw->inflight counter if we can
 * @cleanup_cb: the callback to cleanup in case we race with a waker
 *
 * This provides a uniform place for the rq_qos users to do their throttling.
 * Since you can end up with a lot of things sleeping at once, this manages the
 * waking up based on the resources available.  The acquire_inflight_cb should
 * inc the rqw->inflight if we have the ability to do so, or return false if not
 * and then we will sleep until the room becomes available.
 *
 * cleanup_cb is there in case we race with a waker and need to clean up the
 * inflight count accordingly.
 */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
		 acquire_inflight_cb_t *acquire_inflight_cb,
		 cleanup_cb_t *cleanup_cb)
{
	struct rq_qos_wait_data data = {
		.rqw		= rqw,
		.cb		= acquire_inflight_cb,
		.private_data	= private_data,
		.got_token	= false,
	};
	bool first_waiter;

	/*
	 * If there are no waiters in the waiting queue, try to increase the
	 * inflight counter if we can. Otherwise, prepare for adding ourselves
	 * to the waiting queue.
	 */
	if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data))
		return;

	init_wait_func(&data.wq, rq_qos_wake_function);
	first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq,
						 TASK_UNINTERRUPTIBLE);
	/*
	 * Make sure there is at least one inflight process; otherwise, waiters
	 * will never be woken up. Since there may be no inflight process before
	 * adding ourselves to the waiting queue above, we need to try to
	 * increase the inflight counter for ourselves. And it is sufficient to
	 * guarantee that at least the first waiter to enter the waiting queue
	 * will re-check the waiting condition before going to sleep, thus
	 * ensuring forward progress.
	 */
	if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) {
		finish_wait(&rqw->wait, &data.wq);
		/*
		 * We raced with rq_qos_wake_function() getting a token,
		 * which means we now have two. Put our local token
		 * and wake anyone else potentially waiting for one.
		 *
		 * The memory barrier in list_empty_careful() in
		 * finish_wait() is paired with list_del_init_careful()
		 * in rq_qos_wake_function() to make sure we will see
		 * the latest @data->got_token.
		 */
		if (data.got_token)
			cleanup_cb(rqw, private_data);
		return;
	}

	/* we are now relying on the waker to increase our inflight counter. */
	do {
		if (data.got_token)
			break;
		io_schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	} while (1);
	finish_wait(&rqw->wait, &data.wq);
}
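
/*
 * A policy typically pairs rq_qos_wait() with rq_wait_inc_below() on the
 * acquire side and a decrement plus wakeup on the completion side.  A
 * minimal sketch of a caller (function and variable names here are
 * illustrative, not taken from an in-tree policy):
 *
 *	static bool my_inflight_cb(struct rq_wait *rqw, void *private_data)
 *	{
 *		return rq_wait_inc_below(rqw, my_depth_limit(private_data));
 *	}
 *
 *	static void my_cleanup_cb(struct rq_wait *rqw, void *private_data)
 *	{
 *		atomic_dec(&rqw->inflight);
 *	}
 *
 *	...
 *	rq_qos_wait(rqw, my_data, my_inflight_cb, my_cleanup_cb);
 *
 * See blk-wbt.c and blk-iolatency.c for real users of this interface.
 */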

void rq_qos_exit(struct request_queue *q)
{
	mutex_lock(&q->rq_qos_mutex);
	while (q->rq_qos) {
		struct rq_qos *rqos = q->rq_qos;
		q->rq_qos = rqos->next;
		rqos->ops->exit(rqos);
		static_branch_dec(&block_rq_qos);
	}
	mutex_unlock(&q->rq_qos_mutex);
}

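/*
 * Attach a new policy instance to the queue's rq_qos list.  In-tree users
 * such as blk-wbt, blk-iolatency and blk-iocost call this from their init
 * paths with q->rq_qos_mutex held.
 */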
int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
		const struct rq_qos_ops *ops)
{
	struct request_queue *q = disk->queue;
	unsigned int memflags;

	lockdep_assert_held(&q->rq_qos_mutex);

	rqos->disk = disk;
	rqos->id = id;
	rqos->ops = ops;

	/*
	 * No IO can be in-flight when adding rqos, so freeze the queue, which
	 * is fine since we only support rq_qos for blk-mq queues.
	 */
	memflags = blk_mq_freeze_queue(q);

	if (rq_qos_id(q, rqos->id))
		goto ebusy;
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;
	static_branch_inc(&block_rq_qos);

	blk_mq_unfreeze_queue(q, memflags);

	if (rqos->ops->debugfs_attrs) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_rqos(rqos);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;
ebusy:
	blk_mq_unfreeze_queue(q, memflags);
	return -EBUSY;
}

void rq_qos_del(struct rq_qos *rqos)
{
	struct request_queue *q = rqos->disk->queue;
	struct rq_qos **cur;
	unsigned int memflags;

	lockdep_assert_held(&q->rq_qos_mutex);

	memflags = blk_mq_freeze_queue(q);
	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
		if (*cur == rqos) {
			*cur = rqos->next;
			break;
		}
	}
	blk_mq_unfreeze_queue(q, memflags);

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_rqos(rqos);
	mutex_unlock(&q->debugfs_mutex);
}
386