/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

struct throtl_grp {
	/* List of throtl groups on the request queue */
	struct hlist_node tg_node;

	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	struct blkio_group blkg;
	atomic_t ref;
	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	struct rcu_head rcu_head;
};

struct throtl_data
{
	/* List of throtl groups */
	struct hlist_head tg_list;

	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct throtl_grp *root_tg;
	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * number of total undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);
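
/*
 * blktrace logging helpers: throtl_log_tg() tags the message with the
 * group's cgroup path, throtl_log() logs queue-wide throttle events.
 */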
#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
				blkg_path(&(tg)->blkg), ##args);	\

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct throtl_grp, blkg);

	return NULL;
}

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}

static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
{
	atomic_inc(&tg->ref);
	return tg;
}

static void throtl_free_tg(struct rcu_head *head)
{
	struct throtl_grp *tg;

	tg = container_of(head, struct throtl_grp, rcu_head);
	free_percpu(tg->blkg.stats_cpu);
	kfree(tg);
}

static void throtl_put_tg(struct throtl_grp *tg)
{
	BUG_ON(atomic_read(&tg->ref) <= 0);
	if (!atomic_dec_and_test(&tg->ref))
		return;

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits.
	 */
	call_rcu(&tg->rcu_head, throtl_free_tg);
}

static void throtl_init_group(struct throtl_grp *tg)
{
	INIT_HLIST_NODE(&tg->tg_node);
	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	/* Practically unlimited BW */
	tg->bps[0] = tg->bps[1] = -1;
	tg->iops[0] = tg->iops[1] = -1;

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * request queue which will be dropped by either request queue
	 * exit or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&tg->ref, 1);
}
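
/*
 * Group creation and lookup: groups are allocated and initialized first,
 * then linked onto the cgroup list and the per-queue td->tg_list.
 */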

/* Should be called with rcu read lock held (needed for blkcg) */
static void
throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
{
	hlist_add_head(&tg->tg_node, &td->tg_list);
	td->nr_undestroyed_grps++;
}

static void
__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
{
	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
	unsigned int major, minor;

	if (!tg || tg->blkg.dev)
		return;

	/*
	 * Fill in device details for a group which might not have been
	 * filled at group creation time as queue was being instantiated
	 * and driver had not attached a device yet
	 */
	if (bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		tg->blkg.dev = MKDEV(major, minor);
	}
}

/*
 * Should be called without queue lock held. Here queue lock will be
 * taken rarely. It will be taken only once during the life time of a
 * group, if need be.
 */
static void
throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!tg || tg->blkg.dev)
		return;

	spin_lock_irq(td->queue->queue_lock);
	__throtl_tg_fill_dev_details(td, tg);
	spin_unlock_irq(td->queue->queue_lock);
}

static void throtl_init_add_tg_lists(struct throtl_data *td,
			struct throtl_grp *tg, struct blkio_cgroup *blkcg)
{
	__throtl_tg_fill_dev_details(td, tg);

	/* Add group onto cgroup list */
	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
				tg->blkg.dev, BLKIO_POLICY_THROTL);

	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);

	throtl_add_group_to_td_list(td, tg);
}

/* Should be called without queue lock and outside of rcu period */
static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
{
	struct throtl_grp *tg = NULL;
	int ret;

	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
	if (!tg)
		return NULL;

	ret = blkio_alloc_blkg_stats(&tg->blkg);

	if (ret) {
		kfree(tg);
		return NULL;
	}

	throtl_init_group(tg);
	return tg;
}

static struct
throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
	struct throtl_grp *tg = NULL;
	void *key = td;

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup)
		tg = td->root_tg;
	else
		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));

	__throtl_tg_fill_dev_details(td, tg);
	return tg;
}

static struct throtl_grp *throtl_get_tg(struct throtl_data *td)
{
	struct throtl_grp *tg = NULL, *__tg = NULL;
	struct blkio_cgroup *blkcg;
	struct request_queue *q = td->queue;

	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_find_tg(td, blkcg);
	if (tg) {
		rcu_read_unlock();
		return tg;
	}

	/*
	 * Need to allocate a group. Allocation of group also needs allocation
	 * of per cpu stats which in turn takes a mutex() and can block. Hence
	 * we need to drop rcu lock and queue_lock before we call alloc.
	 */
	rcu_read_unlock();
	spin_unlock_irq(q->queue_lock);

	tg = throtl_alloc_tg(td);

	/* Group allocated and queue is still alive. Take the lock */
	spin_lock_irq(q->queue_lock);

	/* Make sure @q is still alive */
	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
		kfree(tg);
		return NULL;
	}

	/*
	 * Initialize the new group. After sleeping, read the blkcg again.
	 */
	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);

	/*
	 * If some other thread already allocated the group while we were
	 * not holding queue lock, free up the group
	 */
	__tg = throtl_find_tg(td, blkcg);

	if (__tg) {
		kfree(tg);
		rcu_read_unlock();
		return __tg;
	}

	/* Group allocation failed. Account the IO to root group */
	if (!tg) {
		tg = td->root_tg;
		return tg;
	}

	throtl_init_add_tg_lists(td, tg, blkcg);
	rcu_read_unlock();
	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}

static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}
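
/*
 * Slice handling: each group accounts dispatched bytes and IOs against a
 * per-direction time slice of throtl_slice jiffies, which is started,
 * extended and trimmed by the helpers below.
 */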
static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}

/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then time slices don't get
	 * renewed. Don't try to trim the slice if slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow new slice to start.
	 */

	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value: as minimum iops can be
	 * 1, at most jiffy_elapsed should be equivalent to 1 second, since we
	 * will allow dispatch after 1 second and after that the slice should
	 * have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is within IO rate and can be dispatched
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	bool sync = rw_is_sync(bio->bi_rw);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	throtl_ref_get_tg(tg);
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}
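
/*
 * Pop the first queued bio in the given direction, charge it to the group
 * and hand it to the caller's dispatch list. REQ_THROTTLED marks the bio
 * so it is not throttled again when it is resubmitted.
 */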
static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on tg */
	throtl_put_tg(tg);

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}

static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}
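
/*
 * Pick up any rate limit changes made through the cgroup files and restart
 * the slices of the affected groups. Called from the dispatch work with the
 * queue lock held.
 */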
static void throtl_process_limit_change(struct throtl_data *td)
{
	struct throtl_grp *tg;
	struct hlist_node *pos, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITE. It
		 * might happen that a group's limits are dropped
		 * suddenly and we don't want to account recently
		 * dispatched IO with new low rate
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to ensure
	 * immediate dispatch
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}
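
/* Work handler: runs the actual dispatch for the queue this throtl_data belongs to. */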
void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}

/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{

	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) || td->limits_changed) {
		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&tg->tg_node));

	hlist_del_init(&tg->tg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	throtl_put_tg(tg);
	td->nr_undestroyed_grps--;
}

static void throtl_release_tgs(struct throtl_data *td)
{
	struct hlist_node *pos, *n;
	struct throtl_grp *tg;

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * cfqg also.
		 */
		if (!blkiocg_del_blkio_group(&tg->blkg))
			throtl_destroy_tg(td, tg);
	}
}

static void throtl_td_free(struct throtl_data *td)
{
	kfree(td);
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid throtl_data pointer as long as we
 * are under rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if queue was going away, cgroup deletion
 * path got to it first.
 */
void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long flags;
	struct throtl_data *td = key;

	spin_lock_irqsave(td->queue->queue_lock, flags);
	throtl_destroy_tg(td, tg_of_blkg(blkg));
	spin_unlock_irqrestore(td->queue->queue_lock, flags);
}

static void throtl_update_blkio_group_common(struct throtl_data *td,
				struct throtl_grp *tg)
{
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	/* Schedule a work now to process the limit change */
	throtl_schedule_delayed_work(td, 0);
}

/*
 * For all update functions, key should be a valid pointer because these
 * update functions are called under blkcg_lock, that means, blkg is
 * valid and in turn key is valid. queue exit path can not race because
 * of blkcg_lock.
 *
 * Can not take queue lock in update functions as queue lock under blkcg_lock
 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
 */
static void throtl_update_blkio_group_read_bps(void *key,
				struct blkio_group *blkg, u64 read_bps)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[READ] = read_bps;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_update_blkio_group_write_bps(void *key,
				struct blkio_group *blkg, u64 write_bps)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[WRITE] = write_bps;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_update_blkio_group_read_iops(void *key,
			struct blkio_group *blkg, unsigned int read_iops)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[READ] = read_iops;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_update_blkio_group_write_iops(void *key,
			struct blkio_group *blkg, unsigned int write_iops)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[WRITE] = write_iops;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
		.blkio_update_group_read_bps_fn =
					throtl_update_blkio_group_read_bps,
		.blkio_update_group_write_bps_fn =
					throtl_update_blkio_group_write_bps,
		.blkio_update_group_read_iops_fn =
					throtl_update_blkio_group_read_iops,
		.blkio_update_group_write_iops_fn =
					throtl_update_blkio_group_write_iops,
	},
	.plid = BLKIO_POLICY_THROTL,
};

bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;
	bool throttled = false;
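
	/*
	 * A bio that has already been through the throttle dispatch work
	 * carries REQ_THROTTLED; clear the flag and let it pass without
	 * being throttled a second time.
	 */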
	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */

	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_find_tg(td, blkcg);
	if (tg) {
		throtl_tg_fill_dev_details(td, tg);

		if (tg_no_rule_group(tg, rw)) {
			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
					rw, rw_is_sync(bio->bi_rw));
			rcu_read_unlock();
			goto out;
		}
	}
	rcu_read_unlock();

	/*
	 * Either group has not been allocated yet or it is not an unlimited
	 * IO group
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_get_tg(td);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;

	}

	/* Bio is within rate limit of group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at new
		 * low rate and newly queued IO gets a really long dispatch
		 * time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out_unlock;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out:
	return throttled;
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	struct throtl_grp *tg;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_HLIST_HEAD(&td->tg_list);
	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	/* Alloc and init root group. */
	td->queue = q;
	tg = throtl_alloc_tg(td);

	if (!tg) {
		kfree(td);
		return -ENOMEM;
	}

	td->root_tg = tg;

	rcu_read_lock();
	throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
	rcu_read_unlock();

	/* Attach throtl data to request queue */
	q->td = td;
	return 0;
}

void blk_throtl_exit(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	bool wait = false;

	BUG_ON(!td);

	throtl_shutdown_wq(q);

	spin_lock_irq(q->queue_lock);
	throtl_release_tgs(td);

	/* If there are other groups */
	if (td->nr_undestroyed_grps > 0)
		wait = true;

	spin_unlock_irq(q->queue_lock);

	/*
	 * Wait for tg->blkg->key accessors to exit their grace periods.
	 * Do this wait only if there are other undestroyed groups out
	 * there (other than root group). This can happen if cgroup deletion
	 * path claimed the responsibility of cleaning up a group before
	 * queue cleanup code got to the group.
	 *
	 * Do not call synchronize_rcu() unconditionally as there are drivers
	 * which create/delete request queue hundreds of times during scan/boot
	 * and synchronize_rcu() can take significant time and slow down boot.
	 */
	if (wait)
		synchronize_rcu();

	/*
	 * Just being safe to make sure after previous flush if somebody did
	 * update limits through cgroup and another work got queued, cancel
	 * it.
	 */
	throtl_shutdown_wq(q);
	throtl_td_free(td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	blkio_policy_register(&blkio_policy_throtl);
	return 0;
}

module_init(throtl_init);