/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
				unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

struct throtl_grp {
	/* List of throtl groups on the request queue */
	struct hlist_node tg_node;

	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	struct blkio_group blkg;
	atomic_t ref;
	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	struct rcu_head rcu_head;
};

struct throtl_data
{
	/* List of throtl groups */
	struct hlist_head tg_list;

	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct throtl_grp *root_tg;
	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * number of total undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);

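/*
 * Logging helpers. Both macros below emit messages into the blktrace
 * stream of the request queue; the per-group variant prefixes the message
 * with the cgroup path of the group.
 */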
#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
				blkg_path(&(tg)->blkg), ##args);	\

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct throtl_grp, blkg);

	return NULL;
}

static inline int total_nr_queued(struct throtl_data *td)
{
	return (td->nr_queued[0] + td->nr_queued[1]);
}

static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
{
	atomic_inc(&tg->ref);
	return tg;
}

static void throtl_free_tg(struct rcu_head *head)
{
	struct throtl_grp *tg;

	tg = container_of(head, struct throtl_grp, rcu_head);
	free_percpu(tg->blkg.stats_cpu);
	kfree(tg);
}

static void throtl_put_tg(struct throtl_grp *tg)
{
	BUG_ON(atomic_read(&tg->ref) <= 0);
	if (!atomic_dec_and_test(&tg->ref))
		return;

	/*
	 * A group is freed in rcu manner. But having an rcu lock does not
	 * mean that one can access all the fields of blkg and assume these
	 * are valid. For example, don't try to follow throtl_data and
	 * request queue links.
	 *
	 * Having a reference to blkg under an rcu allows access to only
	 * values local to groups like group stats and group rate limits
	 */
	call_rcu(&tg->rcu_head, throtl_free_tg);
}

static void throtl_init_group(struct throtl_grp *tg)
{
	INIT_HLIST_NODE(&tg->tg_node);
	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	/* Practically unlimited BW */
	tg->bps[0] = tg->bps[1] = -1;
	tg->iops[0] = tg->iops[1] = -1;

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * request queue which will be dropped by either request queue
	 * exit or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&tg->ref, 1);
}

/* Should be called with rcu read lock held (needed for blkcg) */
static void
throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg)
{
	hlist_add_head(&tg->tg_node, &td->tg_list);
	td->nr_undestroyed_grps++;
}

static void
__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
{
	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
	unsigned int major, minor;

	if (!tg || tg->blkg.dev)
		return;

	/*
	 * Fill in device details for a group which might not have been
	 * filled at group creation time as queue was being instantiated
	 * and driver had not attached a device yet
	 */
	if (bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		tg->blkg.dev = MKDEV(major, minor);
	}
}

/*
 * Should be called without queue lock held. Here queue lock will be
 * taken rarely. It will be taken only once during the lifetime of a group
 * if need be
 */
static void
throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!tg || tg->blkg.dev)
		return;

	spin_lock_irq(td->queue->queue_lock);
	__throtl_tg_fill_dev_details(td, tg);
	spin_unlock_irq(td->queue->queue_lock);
}

static void throtl_init_add_tg_lists(struct throtl_data *td,
			struct throtl_grp *tg, struct blkio_cgroup *blkcg)
{
	__throtl_tg_fill_dev_details(td, tg);

	/* Add group onto cgroup list */
	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
				tg->blkg.dev, BLKIO_POLICY_THROTL);

	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);

	throtl_add_group_to_td_list(td, tg);
}

/* Should be called without queue lock and outside of rcu period */
static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
{
	struct throtl_grp *tg = NULL;
	int ret;

	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
	if (!tg)
		return NULL;

	ret = blkio_alloc_blkg_stats(&tg->blkg);

	if (ret) {
		kfree(tg);
		return NULL;
	}

	throtl_init_group(tg);
	return tg;
}

static struct
throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
	struct throtl_grp *tg = NULL;
	void *key = td;

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup)
		tg = td->root_tg;
	else
		tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));

	__throtl_tg_fill_dev_details(td, tg);
	return tg;
}

/*
 * This function returns with queue lock unlocked in case of error, e.g.
 * when the request queue is gone
 */
static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
{
	struct throtl_grp *tg = NULL, *__tg = NULL;
	struct blkio_cgroup *blkcg;
	struct request_queue *q = td->queue;

	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_find_tg(td, blkcg);
	if (tg) {
		rcu_read_unlock();
		return tg;
	}

	/*
	 * Need to allocate a group. Allocation of group also needs allocation
	 * of per cpu stats which in turn takes a mutex() and can block. Hence
	 * we need to drop rcu lock and queue_lock before we call alloc.
	 *
	 * Take the request queue reference to make sure queue does not
	 * go away once we return from allocation.
	 */
	blk_get_queue(q);
	rcu_read_unlock();
	spin_unlock_irq(q->queue_lock);

	tg = throtl_alloc_tg(td);
	/*
	 * We might have slept in group allocation. Make sure queue is not
	 * dead
	 */
	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
		blk_put_queue(q);
		if (tg)
			kfree(tg);

		return ERR_PTR(-ENODEV);
	}
	blk_put_queue(q);

	/* Group allocated and queue is still alive. take the lock */
	spin_lock_irq(q->queue_lock);

	/*
	 * Initialize the new group. After sleeping, read the blkcg again.
	 */
	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);

	/*
	 * If some other thread already allocated the group while we were
	 * not holding queue lock, free up the group
	 */
	__tg = throtl_find_tg(td, blkcg);

	if (__tg) {
		kfree(tg);
		rcu_read_unlock();
		return __tg;
	}

	/* Group allocation failed. Account the IO to root group */
	if (!tg) {
		tg = td->root_tg;
		return tg;
	}

	throtl_init_add_tg_lists(td, tg, blkcg);
	rcu_read_unlock();
	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}

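/*
 * Insert a group into the service tree, keyed by its dispatch time. The
 * leftmost (earliest disptime) node is cached in st->left so that the next
 * group to dispatch can be found without walking the tree.
 */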
static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}

static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}

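/*
 * Illustrative numbers for the trim arithmetic below: with HZ=1000,
 * throtl_slice is 100 jiffies, so a group limited to bps=1000000 gets
 * bps * throtl_slice * nr_slices / HZ = 100000 bytes (and similarly
 * iops * throtl_slice * nr_slices / HZ IOs) forgiven per elapsed slice,
 * while slice_start advances by nr_slices slices.
 */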
/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow new slice to start.
	 */

	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

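/*
 * The two helpers below check a single bio against the group's iops and
 * bps limits respectively. Each returns 1 if the bio fits in the current
 * slice, otherwise it returns 0 and reports through *wait an estimate of
 * how many jiffies the group must wait before the bio can be dispatched.
 */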
static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as the minimum iops
	 * can be 1; then at most, jiffy elapsed should be equivalent to 1
	 * second as we will allow dispatch after 1 second and after that
	 * the slice should have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is within IO rate and can be dispatched
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	bool sync = bio->bi_rw & REQ_SYNC;

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	throtl_ref_get_tg(tg);
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

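/*
 * Move one bio from the group's queue to the caller's dispatch list,
 * charging it against the group's current slice. REQ_THROTTLED is set so
 * the bio is not throttled a second time when it is resubmitted.
 */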
static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on tg */
	throtl_put_tg(tg);

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}

static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

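/*
 * Pick up limit updates made through the cgroup files. The update
 * callbacks cannot take the queue lock, so they only set the
 * limits_changed flags; the new limits are applied here, from the
 * dispatch work, with the queue lock held.
 */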
static void throtl_process_limit_change(struct throtl_data *td)
{
	struct throtl_grp *tg;
	struct hlist_node *pos, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITES. It
		 * might happen that a group's limits are dropped
		 * suddenly and we don't want to account recently
		 * dispatched IO with new low rate
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to make sure
	 * immediate dispatch
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}

/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{

	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) > 0 || td->limits_changed) {
		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&tg->tg_node));

	hlist_del_init(&tg->tg_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	throtl_put_tg(tg);
	td->nr_undestroyed_grps--;
}

static void throtl_release_tgs(struct throtl_data *td)
{
	struct hlist_node *pos, *n;
	struct throtl_grp *tg;

	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * the group also.
		 */
		if (!blkiocg_del_blkio_group(&tg->blkg))
			throtl_destroy_tg(td, tg);
	}
}

static void throtl_td_free(struct throtl_data *td)
{
	kfree(td);
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid throtl_data pointer as long as we are
 * under rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if queue was going away, cgroup deletion
 * path got to it first.
 */
void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long flags;
	struct throtl_data *td = key;

	spin_lock_irqsave(td->queue->queue_lock, flags);
	throtl_destroy_tg(td, tg_of_blkg(blkg));
	spin_unlock_irqrestore(td->queue->queue_lock, flags);
}

static void throtl_update_blkio_group_common(struct throtl_data *td,
				struct throtl_grp *tg)
{
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	/* Schedule a work now to process the limit change */
	throtl_schedule_delayed_work(td, 0);
}

/*
 * For all update functions, key should be a valid pointer because these
 * update functions are called under blkcg_lock. That means blkg is
 * valid and in turn key is valid. Queue exit path cannot race because
 * of blkcg_lock.
 *
 * Cannot take queue lock in update functions as queue lock under blkcg_lock
 * is not allowed. Under other paths we take blkcg_lock under queue_lock.
 */
static void throtl_update_blkio_group_read_bps(void *key,
				struct blkio_group *blkg, u64 read_bps)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[READ] = read_bps;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_update_blkio_group_write_bps(void *key,
				struct blkio_group *blkg, u64 write_bps)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->bps[WRITE] = write_bps;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_update_blkio_group_read_iops(void *key,
			struct blkio_group *blkg, unsigned int read_iops)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[READ] = read_iops;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_update_blkio_group_write_iops(void *key,
			struct blkio_group *blkg, unsigned int write_iops)
{
	struct throtl_data *td = key;
	struct throtl_grp *tg = tg_of_blkg(blkg);

	tg->iops[WRITE] = write_iops;
	throtl_update_blkio_group_common(td, tg);
}

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
		.blkio_update_group_read_bps_fn =
					throtl_update_blkio_group_read_bps,
		.blkio_update_group_write_bps_fn =
					throtl_update_blkio_group_write_bps,
		.blkio_update_group_read_iops_fn =
					throtl_update_blkio_group_read_iops,
		.blkio_update_group_write_iops_fn =
					throtl_update_blkio_group_write_iops,
	},
	.plid = BLKIO_POLICY_THROTL,
};

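/*
 * Main entry point for throttling a bio. Returns 0 with *biop left intact
 * if the bio may be dispatched right away, returns 0 with *biop set to
 * NULL if the bio was queued for later dispatch, and returns -ENODEV if
 * the queue is going away.
 */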
int blk_throtl_bio(struct request_queue *q, struct bio **biop)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	struct bio *bio = *biop;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		return 0;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */

	rcu_read_lock();
	blkcg = task_blkio_cgroup(current);
	tg = throtl_find_tg(td, blkcg);
	if (tg) {
		throtl_tg_fill_dev_details(td, tg);

		if (tg_no_rule_group(tg, rw)) {
			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
					rw, bio->bi_rw & REQ_SYNC);
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();

	/*
	 * Either group has not been allocated yet or it is not an unlimited
	 * IO group
	 */

	spin_lock_irq(q->queue_lock);
	tg = throtl_get_tg(td);

	if (IS_ERR(tg)) {
		if (PTR_ERR(tg) == -ENODEV) {
			/*
			 * Queue is gone. No queue lock held here.
			 */
			return -ENODEV;
		}
	}

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;

	}

	/* Bio is within rate limit of group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at new
		 * low rate and newly queued IO gets a really long dispatch
		 * time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	throtl_add_bio_tg(q->td, tg, bio);
	*biop = NULL;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out:
	spin_unlock_irq(q->queue_lock);
	return 0;
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	struct throtl_grp *tg;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_HLIST_HEAD(&td->tg_list);
	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	/* Alloc and init root group. */
	td->queue = q;
	tg = throtl_alloc_tg(td);

	if (!tg) {
		kfree(td);
		return -ENOMEM;
	}

	td->root_tg = tg;

	rcu_read_lock();
	throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup);
	rcu_read_unlock();

	/* Attach throtl data to request queue */
	q->td = td;
	return 0;
}

void blk_throtl_exit(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	bool wait = false;

	BUG_ON(!td);

	throtl_shutdown_wq(q);

	spin_lock_irq(q->queue_lock);
	throtl_release_tgs(td);

	/* If there are other groups */
	if (td->nr_undestroyed_grps > 0)
		wait = true;

	spin_unlock_irq(q->queue_lock);

	/*
	 * Wait for tg->blkg->key accessors to exit their grace periods.
	 * Do this wait only if there are other undestroyed groups out
	 * there (other than root group). This can happen if cgroup deletion
	 * path claimed the responsibility of cleaning up a group before
	 * queue cleanup code gets to the group.
	 *
	 * Do not call synchronize_rcu() unconditionally as there are drivers
	 * which create/delete request queue hundreds of times during scan/boot
	 * and synchronize_rcu() can take significant time and slow down boot.
	 */
	if (wait)
		synchronize_rcu();

	/*
	 * Just being safe to make sure after previous flush if somebody did
	 * update limits through cgroup and another work got queued, cancel
	 * it.
	 */
	throtl_shutdown_wq(q);
	throtl_td_free(td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	blkio_policy_register(&blkio_policy_throtl);
	return 0;
}

module_init(throtl_init);