// SPDX-License-Identifier: GPL-2.0

#include "blk-rq-qos.h"

/*
 * Increment 'v' if it is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
	unsigned int cur = atomic_read(v);

	do {
		if (cur >= below)
			return false;
	} while (!atomic_try_cmpxchg(v, &cur, cur + 1));

	return true;
}

bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
{
	return atomic_inc_below(&rq_wait->inflight, limit);
}
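
/*
 * A hedged sketch of the completion-side counterpart a policy might pair
 * with rq_wait_inc_below(): the submission path only bumps rqw->inflight
 * while it is under the current limit, and the completion path drops the
 * count and kicks any sleepers. The helper name below is hypothetical and
 * not part of this file:
 *
 *	static void example_rqw_done(struct rq_wait *rqw)
 *	{
 *		atomic_dec(&rqw->inflight);
 *
 *		// A slot was freed; let a sleeper in rq_qos_wait() retry
 *		// its acquire_inflight_cb.
 *		if (waitqueue_active(&rqw->wait))
 *			wake_up_all(&rqw->wait);
 *	}
 */
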
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->cleanup)
			rqos->ops->cleanup(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->done)
			rqos->ops->done(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->issue)
			rqos->ops->issue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
{
	do {
		if (rqos->ops->requeue)
			rqos->ops->requeue(rqos, rq);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->throttle)
			rqos->ops->throttle(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->track)
			rqos->ops->track(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	do {
		if (rqos->ops->merge)
			rqos->ops->merge(rqos, rq, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	do {
		if (rqos->ops->done_bio)
			rqos->ops->done_bio(rqos, bio);
		rqos = rqos->next;
	} while (rqos);
}

void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
{
	do {
		if (rqos->ops->queue_depth_changed)
			rqos->ops->queue_depth_changed(rqos);
		rqos = rqos->next;
	} while (rqos);
}

/*
 * Return true if we can't increase the depth further by scaling.
 */
bool rq_depth_calc_max_depth(struct rq_depth *rqd)
{
	unsigned int depth;
	bool ret = false;

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rqd->queue_depth == 1) {
		if (rqd->scale_step > 0)
			rqd->max_depth = 1;
		else {
			rqd->max_depth = 2;
			ret = true;
		}
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
		depth = min_t(unsigned int, rqd->default_depth,
			      rqd->queue_depth);
		if (rqd->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rqd->scale_step));
		else if (rqd->scale_step < 0) {
			unsigned int maxd = 3 * rqd->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rqd->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		rqd->max_depth = depth;
	}

	return ret;
}

/* Returns true on success and false if scaling up wasn't possible. */
bool rq_depth_scale_up(struct rq_depth *rqd)
{
	/*
	 * Hit max in previous round, stop here.
	 */
	if (rqd->scaled_max)
		return false;

	rqd->scale_step--;

	rqd->scaled_max = rq_depth_calc_max_depth(rqd);
	return true;
}

/*
 * Scale the depth down. If 'hard_throttle' is set, do it quicker, since we
 * had a latency violation. Returns true on success and false if scaling
 * down wasn't possible.
 */
bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rqd->max_depth == 1)
		return false;

	if (rqd->scale_step < 0 && hard_throttle)
		rqd->scale_step = 0;
	else
		rqd->scale_step++;

	rqd->scaled_max = false;
	rq_depth_calc_max_depth(rqd);
	return true;
}
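
/*
 * A worked example of the scaling math above (the numbers are illustrative
 * only): with default_depth = 64 and queue_depth = 128, the base depth is
 * min(64, 128) = 64.
 *
 *	scale_step = 2  (throttled):	depth = 1 + ((64 - 1) >> 2) = 16
 *	scale_step = -1 (boosted):	depth = 1 + ((64 - 1) << 1) = 127,
 *					capped at 3 * 128 / 4 = 96; the cap
 *					makes rq_depth_calc_max_depth()
 *					return true, which rq_depth_scale_up()
 *					records in ->scaled_max so the next
 *					scale-up attempt is refused.
 *
 * A policy's latency evaluation might drive the helpers roughly like this
 * (a sketch only; the condition flags are hypothetical):
 *
 *	if (latency_exceeded)
 *		rq_depth_scale_down(&rqd, hard_throttle);
 *	else if (latencies_look_good)
 *		rq_depth_scale_up(&rqd);
 */
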
struct rq_qos_wait_data {
	struct wait_queue_entry wq;
	struct rq_wait *rqw;
	acquire_inflight_cb_t *cb;
	void *private_data;
	bool got_token;
};

static int rq_qos_wake_function(struct wait_queue_entry *curr,
				unsigned int mode, int wake_flags, void *key)
{
	struct rq_qos_wait_data *data = container_of(curr,
						     struct rq_qos_wait_data,
						     wq);

	/*
	 * If we fail to get a budget, return -1 to interrupt the wake up loop
	 * in __wake_up_common.
	 */
	if (!data->cb(data->rqw, data->private_data))
		return -1;

	data->got_token = true;
	/*
	 * autoremove_wake_function() removes the wait entry only when it
	 * actually changed the task state. We want the wait always removed.
	 * Remove explicitly and use default_wake_function().
	 */
	default_wake_function(curr, mode, wake_flags, key);
	/*
	 * Note that the order of operations is important as finish_wait()
	 * tests whether @curr is removed without grabbing the lock. This
	 * should be the last thing to do to make sure we will not have a
	 * UAF access to @data. The memory barrier semantics in it also make
	 * sure the waiter will see the latest @data->got_token once
	 * list_empty_careful() in finish_wait() returns true.
	 */
	list_del_init_careful(&curr->entry);
	return 1;
}

/**
 * rq_qos_wait - throttle on a rqw if we need to
 * @rqw: rqw to throttle on
 * @private_data: caller provided specific data
 * @acquire_inflight_cb: inc the rqw->inflight counter if we can
 * @cleanup_cb: the callback to cleanup in case we race with a waker
 *
 * This provides a uniform place for the rq_qos users to do their throttling.
 * Since you can end up with a lot of things sleeping at once, this manages the
 * waking up based on the resources available. The acquire_inflight_cb should
 * increment rqw->inflight if we have the ability to do so, or return false if
 * not, in which case we will sleep until room becomes available.
 *
 * cleanup_cb is used in case we race with a waker and need to clean up the
 * inflight count accordingly.
 */
void rq_qos_wait(struct rq_wait *rqw, void *private_data,
		 acquire_inflight_cb_t *acquire_inflight_cb,
		 cleanup_cb_t *cleanup_cb)
{
	struct rq_qos_wait_data data = {
		.rqw = rqw,
		.cb = acquire_inflight_cb,
		.private_data = private_data,
		.got_token = false,
	};
	bool first_waiter;

	/*
	 * If there are no waiters in the waiting queue, try to increase the
	 * inflight counter if we can. Otherwise, prepare for adding ourselves
	 * to the waiting queue.
	 */
	if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data))
		return;

	init_wait_func(&data.wq, rq_qos_wake_function);
	first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq,
						 TASK_UNINTERRUPTIBLE);
	/*
	 * Make sure there is at least one inflight process; otherwise, waiters
	 * will never be woken up. Since there may be no inflight process before
	 * adding ourselves to the waiting queue above, we need to try to
	 * increase the inflight counter for ourselves. It is sufficient to
	 * guarantee that at least the first waiter to enter the waiting queue
	 * re-checks the waiting condition before going to sleep, thus ensuring
	 * forward progress.
	 */
	if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) {
		finish_wait(&rqw->wait, &data.wq);
		/*
		 * We raced with rq_qos_wake_function() getting a token,
		 * which means we now have two. Put our local token
		 * and wake anyone else potentially waiting for one.
		 *
		 * The memory barrier in list_empty_careful() in finish_wait()
		 * is paired with list_del_init_careful() in
		 * rq_qos_wake_function() to make sure we will see the latest
		 * @data->got_token.
		 */
		if (data.got_token)
			cleanup_cb(rqw, private_data);
		return;
	}

	/* We are now relying on the waker to increase our inflight counter. */
	do {
		if (data.got_token)
			break;
		io_schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	} while (1);
	finish_wait(&rqw->wait, &data.wq);
}
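
/*
 * A hedged sketch of how a policy's throttling path might use rq_qos_wait().
 * "struct example_data", its ->limit, and the callback names are hypothetical;
 * real users derive the limit from their own state passed via @private_data.
 *
 *	struct example_data {
 *		struct rq_wait rqw;
 *		unsigned int limit;
 *	};
 *
 *	static bool example_inflight_cb(struct rq_wait *rqw, void *private_data)
 *	{
 *		struct example_data *ed = private_data;
 *
 *		return rq_wait_inc_below(rqw, ed->limit);
 *	}
 *
 *	static void example_cleanup_cb(struct rq_wait *rqw, void *private_data)
 *	{
 *		// We raced with the waker and hold two tokens; put the
 *		// extra one and let another waiter go for it.
 *		atomic_dec(&rqw->inflight);
 *		if (waitqueue_active(&rqw->wait))
 *			wake_up_all(&rqw->wait);
 *	}
 *
 *	// In the policy's ->throttle() path:
 *	rq_qos_wait(&ed->rqw, ed, example_inflight_cb, example_cleanup_cb);
 */
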
void rq_qos_exit(struct request_queue *q)
{
	mutex_lock(&q->rq_qos_mutex);
	while (q->rq_qos) {
		struct rq_qos *rqos = q->rq_qos;
		q->rq_qos = rqos->next;
		rqos->ops->exit(rqos);
	}
	mutex_unlock(&q->rq_qos_mutex);
}

int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
	       const struct rq_qos_ops *ops)
{
	struct request_queue *q = disk->queue;
	unsigned int memflags;

	lockdep_assert_held(&q->rq_qos_mutex);

	rqos->disk = disk;
	rqos->id = id;
	rqos->ops = ops;

	/*
	 * No IO can be in-flight when adding rqos, so freeze the queue, which
	 * is fine since we only support rq_qos for blk-mq queues.
	 */
	memflags = blk_mq_freeze_queue(q);

	if (rq_qos_id(q, rqos->id))
		goto ebusy;
	rqos->next = q->rq_qos;
	q->rq_qos = rqos;

	blk_mq_unfreeze_queue(q, memflags);

	if (rqos->ops->debugfs_attrs) {
		mutex_lock(&q->debugfs_mutex);
		blk_mq_debugfs_register_rqos(rqos);
		mutex_unlock(&q->debugfs_mutex);
	}

	return 0;
ebusy:
	blk_mq_unfreeze_queue(q, memflags);
	return -EBUSY;
}

void rq_qos_del(struct rq_qos *rqos)
{
	struct request_queue *q = rqos->disk->queue;
	struct rq_qos **cur;
	unsigned int memflags;

	lockdep_assert_held(&q->rq_qos_mutex);

	memflags = blk_mq_freeze_queue(q);
	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
		if (*cur == rqos) {
			*cur = rqos->next;
			break;
		}
	}
	blk_mq_unfreeze_queue(q, memflags);

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_unregister_rqos(rqos);
	mutex_unlock(&q->debugfs_mutex);
}
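
/*
 * A hedged sketch of attaching and detaching a policy with the helpers above,
 * reusing the hypothetical "struct example_data" from the rq_qos_wait() sketch
 * (here assumed to also embed a struct rq_qos) plus hypothetical callbacks.
 * Note that the id passed to rq_qos_add() must be one of enum rq_qos_id, each
 * of which is owned by an in-tree policy.
 *
 *	static const struct rq_qos_ops example_ops = {
 *		.throttle	= example_throttle,
 *		.done		= example_done,
 *		.exit		= example_exit,	// mandatory, see rq_qos_exit()
 *	};
 *
 *	static int example_init(struct gendisk *disk)
 *	{
 *		struct example_data *ed = kzalloc(sizeof(*ed), GFP_KERNEL);
 *		int ret;
 *
 *		if (!ed)
 *			return -ENOMEM;
 *
 *		rq_wait_init(&ed->rqw);
 *		mutex_lock(&disk->queue->rq_qos_mutex);
 *		ret = rq_qos_add(&ed->rqos, disk, RQ_QOS_WBT, &example_ops);
 *		mutex_unlock(&disk->queue->rq_qos_mutex);
 *		if (ret)
 *			kfree(ed);
 *		return ret;
 *	}
 *
 * Teardown goes through rq_qos_del() under the same mutex, or through
 * rq_qos_exit() when the whole queue goes away.
 */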