/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a 100ms slice, after which the slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkio_policy_type blkio_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
};
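/*
 * Each group's stats live in percpu memory.  Because the percpu allocator
 * cannot be called from the IO path, the memory is handed out
 * asynchronously: groups park themselves on tg_stats_alloc_list and the
 * tg_stats_alloc_work worker below attaches the allocation.
 */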
struct throtl_grp {
	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the
	 * group will unthrottle and be ready to dispatch more bios. It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* number of total undestroyed groups */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
{
	return blkg_to_pdata(blkg, &blkio_policy_throtl);
}

static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
{
	return pdata_to_blkg(tg);
}

static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
{
	return blkg_to_tg(td->queue->root_blkg);
}
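/*
 * Operation overview: each bio submitted to a throttled queue is charged
 * against its group's per-slice byte and IO budgets.  Bios over budget are
 * queued on the group, the group is queued on the service tree keyed by
 * disptime, and the kthrotld worker dispatches them once their time comes.
 */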
enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);

#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
			  blkg_path(tg_to_blkg(tg)), ##args)

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}
/*
 * Worker for allocating per cpu stat for tgs. This is scheduled on the
 * system_nrt_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			queue_delayed_work(system_nrt_wq, dwork,
					   msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}
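/*
 * Note: a bps/iops value of -1 (i.e. all ones) means "no limit".  New
 * groups start out unlimited in both directions for both READ and WRITE.
 */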
static void throtl_init_blkio_group(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but percpu allocator can't be called from IO path. Queue tg on
	 * tg_stats_alloc_list and allocate from work item.
	 */
	spin_lock(&tg_stats_alloc_lock);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
	spin_unlock(&tg_stats_alloc_lock);
}

static void throtl_exit_blkio_group(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	spin_lock(&tg_stats_alloc_lock);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock(&tg_stats_alloc_lock);

	free_percpu(tg->stats_cpu);
}

static void throtl_reset_group_stats(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
					   struct blkio_cgroup *blkcg)
{
	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup)
		return td_root_tg(td);

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}
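/*
 * Unlike throtl_lookup_tg() above, which is used on the lockless RCU fast
 * path and never allocates, the variant below may create the group and
 * therefore must be called with the queue lock held.
 */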
static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkio_cgroup *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup) {
		tg = td_root_tg(td);
	} else {
		struct blkio_group *blkg;

		blkg = blkg_lookup_create(blkcg, q, false);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dead(q))
			tg = td_root_tg(td);
	}

	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}

static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}
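/*
 * The service tree keeps active groups sorted by disptime.  root->left
 * caches the leftmost (earliest) node so the next group to dispatch can
 * be found without rewalking the tree on every pass.
 */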
static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}
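/*
 * Slice handling: budgets are accounted in fixed throtl_slice windows
 * (100ms by default).  Slice ends are kept aligned to multiples of
 * throtl_slice via roundup(); e.g. with throtl_slice == 100 jiffies, a
 * requested end of 1234 is rounded up to 1300.
 */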
static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}
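/*
 * Worked example for the trim below: with HZ == 1000, throtl_slice == 100
 * jiffies and bps == 1048576, three fully elapsed slices (300 jiffies)
 * trim bytes_trim = 1048576 * 100 * 3 / 1000 = 314572 bytes off the
 * dispatched count and advance slice_start by 300 jiffies.
 */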
/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if the slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially the cgroup limit was very low resulting in a high
	 * slice_end, but later the limit was bumped up and the bio was
	 * dispatched sooner, then we need to reduce slice_end. A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */
	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
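/*
 * Example of the IO budget below: with iops == 100, HZ == 1000 and a
 * rounded elapsed time of 100 jiffies, io_allowed = 100 * 100 / 1000 = 10,
 * so an 11th IO in that window has to wait.
 */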
static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value: the minimum iops
	 * setting is 1, in which case we allow a dispatch after at most one
	 * second, and by then the slice should have been trimmed.
	 */
	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}
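/*
 * The byte budget below works the same way; e.g. with bps == 1048576,
 * HZ == 1000 and a rounded elapsed time of 100 jiffies, bytes_allowed =
 * 1048576 * 100 / 1000 = 104857 bytes.
 */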
static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time doesn't take into consideration the rounding
	 * up we did. Add that time as well.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
{
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}
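/*
 * A bio may be dispatched only if it fits both the byte and the IO
 * budget; when it does not, the reported wait is the larger of the two
 * waits, since both budgets must be satisfied.
 */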
/*
 * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is within IO rate and can be dispatched
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the
	 * first bio queued in the group bio list. So one should not be
	 * calling this function with a different bio if there are other
	 * bios queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If the previous slice expired, start a new one, otherwise
	 * renew/extend the existing slice to make sure it is at least
	 * throtl_slice interval long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}
static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
					 int rw)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct tg_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (tg->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(tg->stats_cpu);

	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	blkg_get(tg_to_blkg(tg));
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on blkg */
	blkg_put(tg_to_blkg(tg));

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}
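/*
 * With the default throtl_grp_quantum of 8, a group dispatches at most
 * 6 reads (8 * 3/4) and 2 writes per visit below, keeping each round
 * biased 75/25 in favour of reads.
 */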
static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}
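/*
 * Limit updates are handed over with xchg(): the config writer flips
 * tg->limits_changed and td->limits_changed to true, and the worker below
 * claims each flag back with xchg() so that a concurrent update is
 * neither lost nor applied twice, without taking extra locks.
 */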
static void throtl_process_limit_change(struct throtl_data *td)
{
	struct request_queue *q = td->queue;
	struct blkio_group *blkg, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITES. It
		 * might happen that a group's limits are dropped
		 * suddenly and we don't want to account recently
		 * dispatched IO with the new low rate.
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to make sure
	 * of immediate dispatch.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}
/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) || td->limits_changed) {
		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
{
	struct throtl_grp *tg = pdata;
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}

static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkio_policy_throtl,
			  cft->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, void *pdata, int off)
{
	u64 v = *(u64 *)(pdata + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pdata, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
{
	unsigned int v = *(unsigned int *)(pdata + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pdata, v);
}
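/*
 * An unlimited value (-1) makes the prfill helpers above print nothing,
 * so reading a throttle file lists only the devices that actually have a
 * limit configured.
 */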
static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_u64,
			  &blkio_policy_throtl, cft->private, false);
	return 0;
}

static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_uint,
			  &blkio_policy_throtl, cft->private, false);
	return 0;
}

static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
		       bool is_u64)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	struct throtl_data *td;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkio_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	td = ctx.blkg->q->td;

	if (!ctx.v)
		ctx.v = -1;

	if (is_u64)
		*(u64 *)((void *)tg + cft->private) = ctx.v;
	else
		*(unsigned int *)((void *)tg + cft->private) = ctx.v;

	/* XXX: we don't need the following deferred processing */
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	throtl_schedule_delayed_work(td, 0);

	blkg_conf_finish(&ctx);
	return 0;
}

static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			   const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, false);
}
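/*
 * Example usage of the files below, assuming the blkio cgroup hierarchy
 * is mounted at /sys/fs/cgroup/blkio: to cap reads from device 8:16 to
 * 1MB/s for a group,
 *
 *	echo "8:16 1048576" > /sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device
 *
 * Writing a value of 0 removes the limit (it is stored as -1, unlimited).
 */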
static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_init_group_fn = throtl_init_blkio_group,
		.blkio_exit_group_fn = throtl_exit_blkio_group,
		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
	},
	.pdata_size = sizeof(struct throtl_grp),
	.cftypes = throtl_files,
};
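/*
 * blk_throtl_bio() is the entry point from the block layer.  A return of
 * true means the bio has been queued for delayed dispatch and the caller
 * must not issue it; false means the bio is within limits (or exempt)
 * and submission should continue as usual.
 */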
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/* bio_associate_current() needs ioc, try creating */
	create_io_context(GFP_ATOMIC, q->node);

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in a lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkio_cgroup(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
						     bio->bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either the group has not been allocated yet or it is not an
	 * unlimited IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in the same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not queued
		 * for a long time and the slice keeps on extending and trim
		 * is not called for a long time. Now if limits are reduced
		 * suddenly we take into account all the IO dispatched so far
		 * at the new low rate and newly queued IO gets a really long
		 * dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out_unlock;
	}
queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	bio_associate_current(bio);
	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_rb_root *st = &td->tg_service_tree;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	WARN_ON_ONCE(!queue_is_locked(q));

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(st))) {
		throtl_dequeue_tg(td, tg);

		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	q->td = td;
	td->queue = q;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkio_policy_throtl);
	if (ret)
		kfree(td);
	return ret;
}
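/*
 * Teardown order below matters: the delayed work is cancelled first so
 * that the worker cannot run again once the policy is deactivated and
 * q->td is freed.
 */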
void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkio_policy_throtl);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkio_policy_register(&blkio_policy_throtl);
}

module_init(throtl_init);