/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a 100ms slice and after that the slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */
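/*
 * Worked example (assuming HZ == 1000): throtl_slice is 100 jiffies, so
 * a group limited to 10MB/s may dispatch roughly 1MB per slice before
 * tg_may_dispatch() starts returning a non-zero wait time.
 */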
static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

struct throtl_service_queue {
	/*
	 * Bios queued directly to this service_queue or dispatched from
	 * children throtl_grp's.
	 */
	struct bio_list bio_lists[2];	/* queued bios [READ/WRITE] */
	unsigned int nr_queued[2];	/* number of queued bios */

	/*
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
	struct rb_root pending_tree;	/* RB tree of active tgs */
	struct rb_node *first_pending;	/* first node in the tree */
	unsigned int nr_pending;	/* # queued in the tree */
	unsigned long first_pending_disptime;	/* disptime of the first tg */
};

enum tg_state_flags {
	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
};

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat serviced;
};
struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_queue member */
	struct rb_node rb_node;

	/* throtl_data this group belongs to */
	struct throtl_data *td;

	/* this group's service queue */
	struct throtl_service_queue service_queue;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the
	 * group will unthrottle and is ready to dispatch more bios. It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* number of total undestroyed groups */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work dispatch_work;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
{
	return blkg_to_tg(td->queue->root_blkg);
}
#define throtl_log_tg(tg, fmt, args...)	do {				\
	char __pbuf[128];						\
									\
	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((tg)->td->queue, "throtl %s " fmt, __pbuf, ##args); \
} while (0)

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
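/*
 * Both macros emit blktrace messages; throtl_log_tg() additionally
 * prefixes the message with the group's cgroup path, so a trace line
 * might look like "throtl /grp [R] new slice start=...".
 */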
/*
 * Worker for allocating per cpu stats for tgs. This is scheduled on the
 * system_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			schedule_delayed_work(dwork, msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}

/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
	bio_list_init(&sq->bio_lists[0]);
	bio_list_init(&sq->bio_lists[1]);
	sq->pending_tree = RB_ROOT;
}

static void throtl_pd_init(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	throtl_service_queue_init(&tg->service_queue);
	RB_CLEAR_NODE(&tg->rb_node);
	tg->td = blkg->q->td;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but the percpu allocator can't be called from the IO path. Queue
	 * tg on tg_stats_alloc_list and allocate from the work item.
	 */
	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	schedule_delayed_work(&tg_stats_alloc_work, 0);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}

static void throtl_pd_exit(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);

	free_percpu(tg->stats_cpu);
}

static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
					   struct blkcg *blkcg)
{
	/*
	 * This is the common case when there are no blkcgs. Avoid the
	 * lookup in this case.
	 */
	if (blkcg == &blkcg_root)
		return td_root_tg(td);

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}
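/*
 * Unlike throtl_lookup_tg() above, which runs on the hot path under
 * rcu_read_lock(), the variant below may create a new blkg and is
 * therefore called with the queue lock held.
 */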
static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkcg *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkcgs. Avoid the
	 * lookup in this case.
	 */
	if (blkcg == &blkcg_root) {
		tg = td_root_tg(td);
	} else {
		struct blkcg_gq *blkg;

		blkg = blkg_lookup_create(blkcg, q);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dying(q))
			tg = td_root_tg(td);
	}

	return tg;
}

static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
	/* Service tree is empty */
	if (!parent_sq->nr_pending)
		return NULL;

	if (!parent_sq->first_pending)
		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);

	if (parent_sq->first_pending)
		return rb_entry_tg(parent_sq->first_pending);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n,
			    struct throtl_service_queue *parent_sq)
{
	if (parent_sq->first_pending == n)
		parent_sq->first_pending = NULL;
	rb_erase_init(n, &parent_sq->pending_tree);
	--parent_sq->nr_pending;
}

static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(parent_sq);
	if (!tg)
		return;

	parent_sq->first_pending_disptime = tg->disptime;
}
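/*
 * Active groups are kept sorted by ->disptime in pending_tree.  The
 * leftmost node is cached in ->first_pending so that the next group to
 * dispatch can be found without walking down the tree every time.
 */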
static void tg_service_queue_add(struct throtl_grp *tg,
				 struct throtl_service_queue *parent_sq)
{
	struct rb_node **node = &parent_sq->pending_tree.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		parent_sq->first_pending = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
}

static void __throtl_enqueue_tg(struct throtl_grp *tg,
				struct throtl_service_queue *parent_sq)
{
	tg_service_queue_add(tg, parent_sq);
	tg->flags |= THROTL_TG_PENDING;
	parent_sq->nr_pending++;
}

static void throtl_enqueue_tg(struct throtl_grp *tg,
			      struct throtl_service_queue *parent_sq)
{
	if (!(tg->flags & THROTL_TG_PENDING))
		__throtl_enqueue_tg(tg, parent_sq);
}

static void __throtl_dequeue_tg(struct throtl_grp *tg,
				struct throtl_service_queue *parent_sq)
{
	throtl_rb_erase(&tg->rb_node, parent_sq);
	tg->flags &= ~THROTL_TG_PENDING;
}

static void throtl_dequeue_tg(struct throtl_grp *tg,
			      struct throtl_service_queue *parent_sq)
{
	if (tg->flags & THROTL_TG_PENDING)
		__throtl_dequeue_tg(tg, parent_sq);
}

/* Call with queue lock held */
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay)
{
	struct delayed_work *dwork = &td->dispatch_work;

	mod_delayed_work(kthrotld_workqueue, dwork, delay);
	throtl_log(td, "schedule work. delay=%lu jiffies=%lu", delay, jiffies);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_service_queue *sq = &td->service_queue;

	/* any pending children left? */
	if (!sq->nr_pending)
		return;

	update_min_dispatch_time(sq);

	if (time_before_eq(sq->first_pending_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, sq->first_pending_disptime - jiffies);
}
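/*
 * Slice bookkeeping: per direction, a group counts the bytes and ios it
 * has dispatched since slice_start.  A slice is nominally throtl_slice
 * long; it is extended while bios are still throttled and trimmed as
 * they are dispatched, so only recent history counts against the limits.
 */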
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
		      rw == READ ? 'R' : 'W', tg->slice_start[rw],
		      tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		      rw == READ ? 'R' : 'W', tg->slice_start[rw],
		      tg->slice_end[rw], jiffies);
}

/* Determine if the previously allocated or extended slice is complete or not */
static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return false;

	return true;
}
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if the slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially the cgroup limit was very low resulting in a high
	 * slice_end, but later the limit was bumped up and the bio was
	 * dispatched sooner, then we need to reduce slice_end. A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */
	throtl_set_slice_end(tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
		      " start=%lu end=%lu jiffies=%lu",
		      rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
		      tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
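/*
 * Worked example of the trim above (assuming HZ == 1000): with
 * bps == 1MB/s and 250 jiffies elapsed since slice_start, nr_slices is 2
 * and bytes_trim is ~200KB, so slice_start advances by 200ms and ~200KB
 * of accounted dispatch is forgiven; the 50ms remainder stays accounted.
 */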
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
				  unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as the minimum iops
	 * can be 1; then at max the elapsed jiffies should be equivalent to
	 * 1 second as we will allow dispatch after 1 second and after that
	 * the slice should have been trimmed.
	 */
	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return false;
}

static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
				 unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return false;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
{
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return true;
	return false;
}
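/*
 * Worked example for tg_with_in_bps_limit() above (assuming HZ == 1000):
 * with bps == 1MB/s, a 100 jiffy old slice and nothing dispatched yet,
 * bytes_allowed is ~100KB.  A 512KB bio then yields extra_bytes of
 * ~410KB and a wait of ~400 jiffies, after which the bio fits within
 * the rate.
 */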
/*
 * Returns whether one can dispatch a bio or not. Also returns the approx
 * number of jiffies to wait before this bio is within the IO rate and can
 * be dispatched.
 */
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
			    unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the
	 * first bio queued in the group's bio list. So one should not be
	 * calling this function with a different bio if there are other
	 * bios queued.
	 */
	BUG_ON(tg->service_queue.nr_queued[rw] &&
	       bio != bio_list_peek(&tg->service_queue.bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return true;
	}

	/*
	 * If the previous slice expired, start a new one, otherwise
	 * renew/extend the existing slice to make sure it is at least
	 * throtl_slice long since now.
	 */
	if (throtl_slice_used(tg, rw))
		throtl_start_new_slice(tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return true;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(tg, rw, jiffies + max_wait);

	return false;
}
static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
					 int rw)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct tg_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (tg->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on the same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(tg->stats_cpu);

	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
}

static void throtl_add_bio_tg(struct bio *bio, struct throtl_grp *tg,
			      struct throtl_service_queue *parent_sq)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool rw = bio_data_dir(bio);

	bio_list_add(&sq->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	blkg_get(tg_to_blkg(tg));
	sq->nr_queued[rw]++;
	tg->td->nr_queued[rw]++;
	throtl_enqueue_tg(tg, parent_sq);
}

static void tg_update_disptime(struct throtl_grp *tg,
			       struct throtl_service_queue *parent_sq)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&sq->bio_lists[READ])))
		tg_may_dispatch(tg, bio, &read_wait);

	if ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
		tg_may_dispatch(tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(tg, parent_sq);
	tg->disptime = disptime;
	throtl_enqueue_tg(tg, parent_sq);
}

static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw,
				struct bio_list *bl)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct bio *bio;

	bio = bio_list_pop(&sq->bio_lists[rw]);
	sq->nr_queued[rw]--;
	/* Drop bio reference on blkg */
	blkg_put(tg_to_blkg(tg));

	BUG_ON(tg->td->nr_queued[rw] <= 0);
	tg->td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
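	/*
	 * Mark the bio as having been throttled so that blk_throtl_bio()
	 * passes it straight through when it is resubmitted.
	 */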
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(tg, rw);
}

static int throtl_dispatch_tg(struct throtl_grp *tg, struct bio_list *bl)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&sq->bio_lists[READ])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&sq->bio_lists[WRITE])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_service_queue *parent_sq,
				  struct bio_list *bl)
{
	unsigned int nr_disp = 0;

	while (1) {
		struct throtl_grp *tg = throtl_rb_first(parent_sq);
		struct throtl_service_queue *sq;

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		sq = &tg->service_queue;

		throtl_dequeue_tg(tg, parent_sq);

		nr_disp += throtl_dispatch_tg(tg, bl);

		if (sq->nr_queued[0] || sq->nr_queued[1])
			tg_update_disptime(tg, parent_sq);

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

/* work function to dispatch throttled bios */
void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(to_delayed_work(work),
					      struct throtl_data, dispatch_work);
	struct request_queue *q = td->queue;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

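	/*
	 * Dispatch happens under the queue lock; the collected bios are
	 * submitted below after the lock has been dropped.
	 */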
	spin_lock_irq(q->queue_lock);

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
		   td->nr_queued[READ] + td->nr_queued[WRITE],
		   td->nr_queued[READ], td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(&td->service_queue, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);

	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to ensure
	 * immediate dispatch.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
}

static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
			  cft->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}
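/*
 * The prfill helpers above format a single group's value;
 * blkcg_print_blkgs() invokes them once for every group belonging to the
 * cgroup being read.
 */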
static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, cft->private, false);
	return 0;
}

static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, cft->private, false);
	return 0;
}

static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
		       bool is_u64)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	struct throtl_data *td;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	td = ctx.blkg->q->td;

	if (!ctx.v)
		ctx.v = -1;

	if (is_u64)
		*(u64 *)((void *)tg + cft->private) = ctx.v;
	else
		*(unsigned int *)((void *)tg + cft->private) = ctx.v;

	throtl_log_tg(tg, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
		      tg->bps[READ], tg->bps[WRITE],
		      tg->iops[READ], tg->iops[WRITE]);

	/*
	 * We're already holding queue_lock and know @tg is valid. Let's
	 * apply the new config directly.
	 *
	 * Restart the slices for both READ and WRITES. It might happen
	 * that a group's limits are dropped suddenly and we don't want to
	 * account recently dispatched IO with the new low rate.
	 */
	throtl_start_new_slice(tg, 0);
	throtl_start_new_slice(tg, 1);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(tg, &td->service_queue);
		throtl_schedule_next_dispatch(td);
	}

	blkg_conf_finish(&ctx);
	return 0;
}
static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			   const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, false);
}

static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->dispatch_work);
}
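/*
 * Policy callbacks: pd_init_fn runs when a group's policy data is
 * created, pd_exit_fn when the group is torn down, and pd_reset_stats_fn
 * when the user resets the stats.
 */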
static struct blkcg_policy blkcg_policy_throtl = {
	.pd_size		= sizeof(struct throtl_grp),
	.cftypes		= throtl_files,

	.pd_init_fn		= throtl_pd_init,
	.pd_exit_fn		= throtl_pd_exit,
	.pd_reset_stats_fn	= throtl_pd_reset_stats,
};

bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkcg *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in a lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
						     bio->bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either the group has not been allocated yet or it is not an
	 * unlimited IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	sq = &tg->service_queue;

	if (sq->nr_queued[rw]) {
		/*
		 * There is already another bio queued in the same dir. No
		 * need to update the dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not queued
		 * for a long time and the slice keeps on extending and trim
		 * is not called for a long time. Now if limits are reduced
		 * suddenly we take into account all the IO dispatched so far
		 * at the new low rate and newly queued IO gets a really long
		 * dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not
		 * queued.
		 */
		throtl_trim_slice(tg, rw);
		goto out_unlock;
	}
queue_bio:
	throtl_log_tg(tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
		      " iodisp=%u iops=%u queued=%d/%d",
		      rw == READ ? 'R' : 'W',
		      tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
		      tg->io_disp[rw], tg->iops[rw],
		      sq->nr_queued[READ], sq->nr_queued[WRITE]);

	bio_associate_current(bio);
	throtl_add_bio_tg(bio, tg, &q->td->service_queue);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(tg, &td->service_queue);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_service_queue *parent_sq = &td->service_queue;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	queue_lockdep_assert_held(q);

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(parent_sq))) {
		struct throtl_service_queue *sq = &tg->service_queue;

		throtl_dequeue_tg(tg, parent_sq);

		while ((bio = bio_list_peek(&sq->bio_lists[READ])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&sq->bio_lists[WRITE])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_DELAYED_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
	throtl_service_queue_init(&td->service_queue);

	q->td = td;
	td->queue = q;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret)
		kfree(td);
	return ret;
}

void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);