/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
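/*
 * Per-cgroup, per-queue throttling state. An instance is allocated for
 * each blkio cgroup that submits IO against this request queue; the root
 * group is embedded in struct throtl_data below.
 */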
struct throtl_grp {
	/* List of throtl groups on the request queue */
	struct hlist_node tg_node;

	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the
	 * group will be unthrottled and ready to dispatch more bios. It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	struct blkio_group blkg;
	atomic_t ref;
	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	bool limits_changed;
};

struct throtl_data
{
	/* List of throtl groups */
	struct hlist_head tg_list;

	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct throtl_grp root_tg;
	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* Total number of undestroyed groups */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	atomic_t limits_changed;
};

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);
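/*
 * Logging helpers: emit throttling messages into the blktrace stream of
 * the queue, prefixed with the cgroup path when a group is involved.
 */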
#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
			  blkg_path(&(tg)->blkg), ##args);

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct throtl_grp, blkg);

	return NULL;
}

static inline int total_nr_queued(struct throtl_data *td)
{
	return (td->nr_queued[0] + td->nr_queued[1]);
}

static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
{
	atomic_inc(&tg->ref);
	return tg;
}

static void throtl_put_tg(struct throtl_grp *tg)
{
	BUG_ON(atomic_read(&tg->ref) <= 0);
	if (!atomic_dec_and_test(&tg->ref))
		return;
	kfree(tg);
}
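/*
 * Look up the throtl group for @cgroup on this queue, allocating one if
 * it doesn't exist yet. Runs in atomic context on the IO submission path
 * (hence GFP_ATOMIC); on allocation failure NULL is returned and the
 * caller falls back to the root group.
 */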
static struct throtl_grp *throtl_find_alloc_tg(struct throtl_data *td,
					       struct cgroup *cgroup)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct throtl_grp *tg = NULL;
	void *key = td;
	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
	unsigned int major, minor;

	/*
	 * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
	 * tree of blkgs (instead of traversing through the hash list all
	 * the time).
	 */
	tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));

	/* Fill in device details for root group */
	if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		tg->blkg.dev = MKDEV(major, minor);
		goto done;
	}

	if (tg)
		goto done;

	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
	if (!tg)
		goto done;

	INIT_HLIST_NODE(&tg->tg_node);
	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * request queue which will be dropped by either request queue
	 * exit or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&tg->ref, 1);

	/* Add group onto cgroup list */
	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
				MKDEV(major, minor), BLKIO_POLICY_THROTL);

	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);

	hlist_add_head(&tg->tg_node, &td->tg_list);
	td->nr_undestroyed_grps++;
done:
	return tg;
}

static struct throtl_grp *throtl_get_tg(struct throtl_data *td)
{
	struct cgroup *cgroup;
	struct throtl_grp *tg = NULL;

	rcu_read_lock();
	cgroup = task_cgroup(current, blkio_subsys_id);
	tg = throtl_find_alloc_tg(td, cgroup);
	if (!tg)
		tg = &td->root_tg;
	rcu_read_unlock();
	return tg;
}
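/*
 * Service tree helpers. Groups waiting to dispatch are kept on an rbtree
 * sorted by disptime; the leftmost (earliest) node is cached in ->left so
 * the next group to dispatch can be found without walking the tree.
 */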
static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}

static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}
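/*
 * (Re)arm the dispatch work to fire when the earliest queued group is
 * due, or immediately if that time has already passed. A no-op when
 * nothing is queued.
 */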
static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td->queue, 0);
	else
		throtl_schedule_delayed_work(td->queue,
				(st->min_disptime - jiffies));
}

static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if the previously allocated or extended slice has expired */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}
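/*
 * Illustration of the trim arithmetic below (numbers are only an
 * example): with HZ=1000, throtl_slice is 100 jiffies. If tg->bps[rw] is
 * 1048576 (1MB/s) and two full slices have elapsed, bytes_trim is
 * 1048576 * 100 * 2 / 1000 = 209715 bytes, i.e. the budget those two
 * slices were entitled to, which is forgiven from bytes_disp.
 */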
/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps is unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if it has expired; a new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
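/*
 * The two helpers below check one bio against the iops and bps limits
 * respectively. Each returns 1 if the bio fits in the current slice's
 * budget; otherwise it returns 0 and, when @wait is non-NULL, stores an
 * estimate of how many jiffies to wait before the bio can be dispatched.
 */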
static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	io_allowed = (tg->iops[rw] * jiffy_elapsed_rnd) / HZ;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time doesn't take into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}
/*
 * Returns whether one can dispatch a bio or not. Also returns the approx
 * number of jiffies to wait before this bio is within the IO rate and can
 * be dispatched.
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the
	 * first bio queued in the group's bio list. So one should not be
	 * calling this function with a different bio if there are other
	 * bios queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If the previous slice expired, start a new one, otherwise
	 * renew/extend the existing slice to make sure it is at least
	 * throtl_slice interval long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}
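/*
 * Account one dispatched bio against the group's slice budget and update
 * the blkio cgroup dispatch statistics.
 */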
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	bool sync = bio->bi_rw & REQ_SYNC;

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	/*
	 * TODO: This will take blkg->stats_lock. Figure out a way
	 * to avoid this cost.
	 */
	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	throtl_ref_get_tg(tg);
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on tg */
	throtl_put_tg(tg);

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}
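/*
 * Drain as many bios from one group as its limits currently allow, up to
 * throtl_grp_quantum in total, biased roughly 75%/25% between reads and
 * writes. Dispatched bios are collected on @bl for the caller to issue.
 */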
static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}
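/*
 * Apply limit updates made from the cgroup side. Writers set
 * tg->limits_changed and bump td->limits_changed (see the update_*
 * functions below); here we recompute the dispatch time of every flagged
 * group that is on the service tree and clear the flags.
 */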
static void throtl_process_limit_change(struct throtl_data *td)
{
	struct throtl_grp *tg;
	struct hlist_node *pos, *n;

	/*
	 * Make sure the atomic_inc() effects from the
	 * throtl_update_blkio_group_read_bps() group of functions are
	 * visible. Is this required, or was the smp_mb__after_atomic_inc()
	 * after the atomic_inc() sufficient?
	 */
	smp_rmb();
	if (!atomic_read(&td->limits_changed))
		return;

	throtl_log(td, "limit changed =%d", atomic_read(&td->limits_changed));

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		/*
		 * Do I need an smp_rmb() here to make sure the
		 * tg->limits_changed update is visible? I am relying on the
		 * smp_rmb() at the beginning of the function and not putting
		 * a new one here.
		 */

		if (throtl_tg_on_rr(tg) && tg->limits_changed) {
			throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
				" riops=%u wiops=%u", tg->bps[READ],
				tg->bps[WRITE], tg->iops[READ],
				tg->iops[WRITE]);
			tg_update_disptime(td, tg);
			tg->limits_changed = false;
		}
	}

	smp_mb__before_atomic_dec();
	atomic_dec(&td->limits_changed);
	smp_mb__after_atomic_dec();
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to make sure
	 * they get dispatched immediately.
	 */
	if (nr_disp) {
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_unplug(q);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}
/* Call with queue lock held */
void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
{
	struct throtl_data *td = q->td;
	struct delayed_work *dwork = &td->throtl_work;

	if (total_nr_queued(td) > 0) {
		/*
		 * We might have work scheduled to be executed in the future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		kblockd_schedule_delayed_work(q, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}
EXPORT_SYMBOL(throtl_schedule_delayed_work);

static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&tg->tg_node));

	hlist_del_init(&tg->tg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	throtl_put_tg(tg);
	td->nr_undestroyed_grps--;
}

static void throtl_release_tgs(struct throtl_data *td)
{
	struct hlist_node *pos, *n;
	struct throtl_grp *tg;

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		/*
		 * If the cgroup removal path got to the blkio group first
		 * and removed it from the cgroup list, then it will take
		 * care of destroying the throtl group too.
		 */
		if (!blkiocg_del_blkio_group(&tg->blkg))
			throtl_destroy_tg(td, tg);
	}
}

static void throtl_td_free(struct throtl_data *td)
{
	kfree(td);
}
/*
 * Blk cgroup controller notification saying that blkio_group object is
 * being unlinked as the associated cgroup object is going away. That also
 * means that no new IO will come in this group. So get rid of this group
 * as soon as any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). "key" is the rcu
 * protected pointer. That means "key" is a valid throtl_data pointer as
 * long as we are under rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if the queue was going away, the cgroup
 * deletion path got to it first.
 */
void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long flags;
	struct throtl_data *td = key;

	spin_lock_irqsave(td->queue->queue_lock, flags);
	throtl_destroy_tg(td, tg_of_blkg(blkg));
	spin_unlock_irqrestore(td->queue->queue_lock, flags);
}

/*
 * For all update functions, key should be a valid pointer because these
 * update functions are called under blkcg_lock. That means blkg is valid
 * and in turn key is valid. The queue exit path can not race because of
 * blkcg_lock.
 *
 * Can not take the queue lock in update functions, as taking the queue
 * lock under blkcg_lock is not allowed. Under other paths we take
 * blkcg_lock under queue_lock.
 */
static void throtl_update_blkio_group_read_bps(void *key,
				struct blkio_group *blkg, u64 read_bps)
{
	struct throtl_data *td = key;

	tg_of_blkg(blkg)->bps[READ] = read_bps;
	/* Make sure read_bps is updated before setting limits_changed */
	smp_wmb();
	tg_of_blkg(blkg)->limits_changed = true;

	/* Make sure tg->limits_changed is updated before td->limits_changed */
	smp_mb__before_atomic_inc();
	atomic_inc(&td->limits_changed);
	smp_mb__after_atomic_inc();

	/* Schedule a work now to process the limit change */
	throtl_schedule_delayed_work(td->queue, 0);
}

static void throtl_update_blkio_group_write_bps(void *key,
				struct blkio_group *blkg, u64 write_bps)
{
	struct throtl_data *td = key;

	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
	smp_wmb();
	tg_of_blkg(blkg)->limits_changed = true;
	smp_mb__before_atomic_inc();
	atomic_inc(&td->limits_changed);
	smp_mb__after_atomic_inc();
	throtl_schedule_delayed_work(td->queue, 0);
}

static void throtl_update_blkio_group_read_iops(void *key,
				struct blkio_group *blkg, unsigned int read_iops)
{
	struct throtl_data *td = key;

	tg_of_blkg(blkg)->iops[READ] = read_iops;
	smp_wmb();
	tg_of_blkg(blkg)->limits_changed = true;
	smp_mb__before_atomic_inc();
	atomic_inc(&td->limits_changed);
	smp_mb__after_atomic_inc();
	throtl_schedule_delayed_work(td->queue, 0);
}
static void throtl_update_blkio_group_write_iops(void *key,
			struct blkio_group *blkg, unsigned int write_iops)
{
	struct throtl_data *td = key;

	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
	smp_wmb();
	tg_of_blkg(blkg)->limits_changed = true;
	smp_mb__before_atomic_inc();
	atomic_inc(&td->limits_changed);
	smp_mb__after_atomic_inc();
	throtl_schedule_delayed_work(td->queue, 0);
}

void throtl_shutdown_timer_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
		.blkio_update_group_read_bps_fn =
					throtl_update_blkio_group_read_bps,
		.blkio_update_group_write_bps_fn =
					throtl_update_blkio_group_write_bps,
		.blkio_update_group_read_iops_fn =
					throtl_update_blkio_group_read_iops,
		.blkio_update_group_write_iops_fn =
					throtl_update_blkio_group_write_iops,
	},
	.plid = BLKIO_POLICY_THROTL,
};
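/*
 * Main hook into the IO submission path. Returns 0 in all cases; when the
 * bio had to be throttled it is queued internally and *biop is set to
 * NULL, so the caller is expected to check the bio pointer afterwards,
 * roughly (a sketch of the assumed calling convention, not the exact
 * call site):
 *
 *	blk_throtl_bio(q, &bio);
 *	if (!bio)
 *		return;	/- bio was queued, resubmitted on dispatch -/
 *
 * A bio marked REQ_THROTTLED has already been through here once and is
 * let straight through.
 */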
int blk_throtl_bio(struct request_queue *q, struct bio **biop)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	struct bio *bio = *biop;
	bool rw = bio_data_dir(bio), update_disptime = true;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		return 0;
	}

	spin_lock_irq(q->queue_lock);
	tg = throtl_get_tg(td);

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in the same direction.
		 * No need to update the dispatch time. Still update the
		 * disptime if rate limits on this group were changed.
		 */
		if (!tg->limits_changed)
			update_disptime = false;
		else
			tg->limits_changed = false;

		goto queue_bio;
	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);
		goto out;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	throtl_add_bio_tg(q->td, tg, bio);
	*biop = NULL;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out:
	spin_unlock_irq(q->queue_lock);
	return 0;
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	struct throtl_grp *tg;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_HLIST_HEAD(&td->tg_list);
	td->tg_service_tree = THROTL_RB_ROOT;
	atomic_set(&td->limits_changed, 0);

	/* Init root group */
	tg = &td->root_tg;
	INIT_HLIST_NODE(&tg->tg_node);
	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);

	/* Practically unlimited BW */
	tg->bps[0] = tg->bps[1] = -1;
	tg->iops[0] = tg->iops[1] = -1;

	/*
	 * Set root group reference to 2. One reference will be dropped when
	 * all groups on tg_list are being deleted during queue exit. The
	 * other reference will remain there as we don't want to delete this
	 * group, as it is statically allocated and gets destroyed when
	 * throtl_data goes away.
	 */
	atomic_set(&tg->ref, 2);
	hlist_add_head(&tg->tg_node, &td->tg_list);
	td->nr_undestroyed_grps++;

	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	rcu_read_lock();
	blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
					0, BLKIO_POLICY_THROTL);
	rcu_read_unlock();

	/* Attach throtl data to request queue */
	td->queue = q;
	q->td = td;
	return 0;
}

void blk_throtl_exit(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	bool wait = false;

	BUG_ON(!td);

	throtl_shutdown_timer_wq(q);

	spin_lock_irq(q->queue_lock);
	throtl_release_tgs(td);

	/* If there are other groups */
	if (td->nr_undestroyed_grps > 0)
		wait = true;

	spin_unlock_irq(q->queue_lock);

	/*
	 * Wait for tg->blkg->key accessors to exit their grace periods.
	 * Do this wait only if there are other undestroyed groups out
	 * there (other than root group). This can happen if cgroup deletion
	 * path claimed the responsibility of cleaning up a group before
	 * queue cleanup code gets to the group.
	 *
	 * Do not call synchronize_rcu() unconditionally as there are drivers
	 * which create/delete request queues hundreds of times during
	 * scan/boot and synchronize_rcu() can take significant time and
	 * slow down boot.
	 */
	if (wait)
		synchronize_rcu();

	/*
	 * Just to be safe: if somebody updated limits through the cgroup
	 * after the previous flush and another work got queued, cancel it.
	 */
	throtl_shutdown_timer_wq(q);
	throtl_td_free(td);
}

static int __init throtl_init(void)
{
	blkio_policy_register(&blkio_policy_throtl);
	return 0;
}

module_init(throtl_init);