/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over 100ms slice and after that slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkio_policy_type blkio_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
};
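/*
 * Per-cgroup, per-queue throttling state.  A throtl_grp is the policy
 * data hanging off a blkio_group and carries the configured limits plus
 * the bookkeeping for the current dispatch slice.
 */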
struct throtl_grp {
	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the
	 * group will unthrottle and is ready to dispatch more bios. It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bios dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct throtl_grp *root_tg;
	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* number of total undestroyed groups */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
{
	return blkg_to_pdata(blkg, &blkio_policy_throtl);
}

static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
{
	return pdata_to_blkg(tg);
}

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};
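/*
 * THROTL_TG_FNS(name) generates the three flag helpers for a tg state
 * flag, e.g. THROTL_TG_FNS(on_rr) below expands to
 * throtl_mark_tg_on_rr(), throtl_clear_tg_on_rr() and throtl_tg_on_rr(),
 * which set, clear and test THROTL_TG_FLAG_on_rr respectively.
 */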
#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}

THROTL_TG_FNS(on_rr);

#define throtl_log_tg(td, tg, fmt, args...)				\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
			  blkg_path(tg_to_blkg(tg)), ##args);		\

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}

/*
 * Worker for allocating per cpu stats for tgs. This is scheduled on the
 * system_nrt_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			queue_delayed_work(system_nrt_wq, dwork,
					   msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}
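/*
 * New groups start with all four limits at -1, which the dispatch paths
 * treat as "unlimited" (see tg_no_rule_group() and tg_may_dispatch()).
 */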
static void throtl_init_blkio_group(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but percpu allocator can't be called from IO path. Queue tg on
	 * tg_stats_alloc_list and allocate from work item.
	 */
	spin_lock(&tg_stats_alloc_lock);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
	spin_unlock(&tg_stats_alloc_lock);
}

static void throtl_exit_blkio_group(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);

	spin_lock(&tg_stats_alloc_lock);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock(&tg_stats_alloc_lock);

	free_percpu(tg->stats_cpu);
}

static void throtl_reset_group_stats(struct blkio_group *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct
throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
{
	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup)
		return td->root_tg;

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}
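/*
 * Like throtl_lookup_tg() but creates the group on first use; on
 * allocation failure it falls back to the root group as long as the
 * queue is still alive.
 */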
static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkio_cgroup *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkio cgroups.
	 * Avoid lookup in this case
	 */
	if (blkcg == &blkio_root_cgroup) {
		tg = td->root_tg;
	} else {
		struct blkio_group *blkg;

		blkg = blkg_lookup_create(blkcg, q, false);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dead(q))
			tg = td->root_tg;
	}

	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}
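/*
 * The service tree is an rbtree of active groups keyed by tg->disptime;
 * the leftmost (earliest) node is cached in st->left so the next group
 * due for dispatch can be found without walking the tree.
 */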
static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}
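/*
 * Slice handling: each group accounts dispatched bytes/ios per direction
 * against a rolling window of throtl_slice jiffies (100ms).  The helpers
 * below start, extend, test and trim that window.
 */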
static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}
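/*
 * Worked example for the trim below (assuming HZ=1000, so throtl_slice
 * is 100 jiffies): a group with bps=1048576 (1MiB/s) earns
 * 1048576 * 100 / 1000 = 104857 bytes per elapsed slice, and that much
 * is subtracted from bytes_disp[] for every fully elapsed slice.
 */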
/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow a new slice to start.
	 */

	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value: the minimum iops is
	 * 1, so at most the elapsed time is the equivalent of 1 second, as
	 * we will allow a dispatch after 1 second and after that the slice
	 * should have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}
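/*
 * Example for the bps path below (assuming HZ=1000): with bps=1048576
 * and extra_bytes=524288 over the allowance, the wait comes out to
 * 524288 * 1000 / 1048576 = 500 jiffies (~500ms), plus the rounding
 * credit added at the end.
 */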
static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
{
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}
/*
 * Returns whether one can dispatch a bio or not. Also returns the approx
 * number of jiffies to wait before this bio is within the IO rate and can
 * be dispatched.
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the
	 * first bio queued in the group bio list. So one should not be
	 * calling this function with a different bio if there are other
	 * bios queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}
static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
					 int rw)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct tg_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (tg->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(tg->stats_cpu);

	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
}
static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	blkg_get(tg_to_blkg(tg));
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on blkg */
	blkg_put(tg_to_blkg(tg));

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}
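/*
 * With throtl_grp_quantum = 8, the split below dispatches at most 6
 * reads and 2 writes per group per round (the 75%/25% policy), and
 * throtl_quantum caps one round at 32 bios across all groups.
 */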
static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}
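/*
 * Limit updates from userspace are applied asynchronously: tg_set_conf()
 * flags tg->limits_changed and td->limits_changed and kicks the worker,
 * which picks the changes up here before dispatching.
 */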
static void throtl_process_limit_change(struct throtl_data *td)
{
	struct request_queue *q = td->queue;
	struct blkio_group *blkg, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITES. It
		 * might happen that a group's limits are dropped
		 * suddenly and we don't want to account recently
		 * dispatched IO with the new low rate.
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to make sure
	 * of immediate dispatch.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}
/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) || td->limits_changed) {
		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
		 */
		__cancel_delayed_work(dwork);
		queue_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
{
	struct throtl_grp *tg = pdata;
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
}

static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, BLKIO_POLICY_THROTL,
			  cft->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, void *pdata, int off)
{
	u64 v = *(u64 *)(pdata + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pdata, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
{
	unsigned int v = *(unsigned int *)(pdata + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pdata, v);
}

static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_u64,
			  BLKIO_POLICY_THROTL, cft->private, false);
	return 0;
}

static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp), tg_prfill_conf_uint,
			  BLKIO_POLICY_THROTL, cft->private, false);
	return 0;
}
static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
		       bool is_u64)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	int ret;

	ret = blkg_conf_prep(blkcg, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	tg = blkg_to_tg(ctx.blkg);
	if (tg) {
		struct throtl_data *td = ctx.blkg->q->td;

		if (!ctx.v)
			ctx.v = -1;

		if (is_u64)
			*(u64 *)((void *)tg + cft->private) = ctx.v;
		else
			*(unsigned int *)((void *)tg + cft->private) = ctx.v;

		/* XXX: we don't need the following deferred processing */
		xchg(&tg->limits_changed, true);
		xchg(&td->limits_changed, true);
		throtl_schedule_delayed_work(td, 0);

		ret = 0;
	}

	blkg_conf_finish(&ctx);
	return ret;
}

static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			   const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, false);
}
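/*
 * cgroupfs files: the four limit knobs below share tg_set_conf() and
 * locate their field via the offsetof() stored in .private; the two
 * stats files sum the per-cpu counters at read time.
 */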
static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
	.ops = {
		.blkio_init_group_fn = throtl_init_blkio_group,
		.blkio_exit_group_fn = throtl_exit_blkio_group,
		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
	},
	.plid = BLKIO_POLICY_THROTL,
	.pdata_size = sizeof(struct throtl_grp),
	.cftypes = throtl_files,
};
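/*
 * Main entry from the request path.  Returns true if @bio has been
 * queued for delayed dispatch, false if the caller may submit it
 * immediately.
 */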
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkio_cgroup *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/* bio_associate_current() needs ioc, try creating */
	create_io_context(GFP_ATOMIC, q->node);

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkio_cgroup(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
						     bio->bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either group has not been allocated yet or it is not an unlimited
	 * IO group
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim slice even when bios are not being queued
		 * otherwise it might happen that a bio is not queued for
		 * a long time and slice keeps on extending and trim is not
		 * called for a long time. Now if limits are reduced suddenly
		 * we take into account all the IO dispatched so far at the
		 * new low rate and newly queued IO gets a really long
		 * dispatch time.
		 *
		 * So keep on trimming slice even if bio is not queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out_unlock;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	bio_associate_current(bio);
	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}
/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_rb_root *st = &td->tg_service_tree;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	WARN_ON_ONCE(!queue_is_locked(q));

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(st))) {
		throtl_dequeue_tg(td, tg);

		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}
int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	struct blkio_group *blkg;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	q->td = td;
	td->queue = q;

	/* alloc and init root group. */
	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
	if (!IS_ERR(blkg))
		td->root_tg = blkg_to_tg(blkg);

	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	if (!td->root_tg) {
		kfree(td);
		return -ENOMEM;
	}
	return 0;
}

void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	blkio_policy_register(&blkio_policy_throtl);
	return 0;
}

module_init(throtl_init);